// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// File utilities that use the ICU library go in this file.
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/file_util_icu.h"
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
92a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "base/files/file_path.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/icu_string_conversions.h"
11c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include "base/i18n/string_compare.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/singleton.h"
15868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h"
162a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "base/strings/sys_string_conversions.h"
17868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h"
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "build/build_config.h"
19ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/uniset.h"
20ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/i18n/unicode/coll.h"
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
226e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)namespace base {
236e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)namespace i18n {
24a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace {
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class IllegalCharacters {
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static IllegalCharacters* GetInstance() {
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return Singleton<IllegalCharacters>::get();
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool contains(UChar32 ucs4) {
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return !!set->contains(ucs4);
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool containsNone(const string16 &s) {
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size()));
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  friend class Singleton<IllegalCharacters>;
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  friend struct DefaultSingletonTraits<IllegalCharacters>;
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  IllegalCharacters();
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~IllegalCharacters() { }
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<icu::UnicodeSet> set;
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)IllegalCharacters::IllegalCharacters() {
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Control characters, formatting characters, non-characters, and
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // See  http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // are legitimate in Arabic and some S/SE Asian scripts. However, when used
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // elsewhere, they can be confusing/problematic.
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Also, consider wrapping the set with our Singleton class to create and
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // freeze it only once. Note that there's a trade-off between memory and
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // speed.
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF16)
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  set.reset(new icu::UnicodeSet(icu::UnicodeString(
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status));
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE(
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(),
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      status));
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(U_SUCCESS(status));
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Add non-characters. If this becomes a performance bottleneck by
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // any chance, do not add these to |set| and change IsFilenameLegal()
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // containsNone().
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  set->add(0xFDD0, 0xFDEF);
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i <= 0x10; ++i) {
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int plane_base = 0x10000 * i;
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    set->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  set->freeze();
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool IsFilenameLegal(const string16& file_name) {
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return IllegalCharacters::GetInstance()->containsNone(file_name);
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
926e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                    char replace_char) {
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(file_name);
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(!(IllegalCharacters::GetInstance()->contains(replace_char)));
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Remove leading and trailing whitespace.
996e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)  TrimWhitespace(*file_name, TRIM_ALL, file_name);
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  IllegalCharacters* illegal = IllegalCharacters::GetInstance();
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int cursor = 0;  // The ICU macros expect an int.
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (cursor < static_cast<int>(file_name->size())) {
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int char_begin = cursor;
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    uint32 code_point;
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(OS_MACOSX)
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Mac uses UTF-8 encoding for filenames.
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            code_point);
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(OS_WIN)
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Windows uses UTF-16 encoding for filenames.
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)             code_point);
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(OS_POSIX)
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Linux doesn't actually define an encoding. It basically allows anything
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // except for a few special ASCII characters.
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]);
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (cur_char >= 0x80)
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    code_point = cur_char;
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    NOTREACHED();
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (illegal->contains(code_point)) {
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      file_name->replace(char_begin, cursor - char_begin, 1, replace_char);
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // We just made the potentially multi-byte/word char into one that only
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // takes one byte/word, so need to adjust the cursor to point to the next
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // character again.
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      cursor = char_begin + 1;
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1356e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
136c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  UErrorCode error_code = U_ZERO_ERROR;
137c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Use the default collator. The default locale should have been properly
138c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // set by the time this constructor is called.
139c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  scoped_ptr<icu::Collator> collator(icu::Collator::createInstance(error_code));
140c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  DCHECK(U_SUCCESS(error_code));
141c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Make it case-sensitive.
142c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  collator->setStrength(icu::Collator::TERTIARY);
143c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(OS_WIN)
1456e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)  return CompareString16WithCollator(collator.get(),
1466e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)      WideToUTF16(a.value()), WideToUTF16(b.value())) == UCOL_LESS;
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(OS_POSIX)
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // On linux, the file system encoding is not defined. We assume
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // SysNativeMBToWide takes care of it.
1516e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)  return CompareString16WithCollator(
1525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      collator.get(),
1536e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)      WideToUTF16(SysNativeMBToWide(a.value().c_str())),
1546e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)      WideToUTF16(SysNativeMBToWide(b.value().c_str()))) == UCOL_LESS;
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  #error Not implemented on your system
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1606e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)void NormalizeFileNameEncoding(FilePath* file_name) {
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(OS_CHROMEOS)
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string normalized_str;
1636e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)  if (ConvertToUtf8AndNormalize(file_name->BaseName().value(),
1646e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)                                kCodepageUTF8,
1656e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)                                &normalized_str)) {
1666e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)    *file_name = file_name->DirName().Append(FilePath(normalized_str));
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1716e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)}  // namespace i18n
1726e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)}  // namespace base
173