15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// File utilities that use the ICU library go in this file.
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/file_util_icu.h"
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
92a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "base/files/file_path.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/icu_string_conversions.h"
11c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include "base/i18n/string_compare.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/singleton.h"
15868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h"
162a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "base/strings/sys_string_conversions.h"
17868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h"
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "build/build_config.h"
19ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/uniset.h"
20ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/i18n/unicode/coll.h"
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
22a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)using base::string16;
23a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace {
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class IllegalCharacters {
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static IllegalCharacters* GetInstance() {
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return Singleton<IllegalCharacters>::get();
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool contains(UChar32 ucs4) {
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return !!set->contains(ucs4);
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool containsNone(const string16 &s) {
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size()));
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  friend class Singleton<IllegalCharacters>;
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  friend struct DefaultSingletonTraits<IllegalCharacters>;
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  IllegalCharacters();
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~IllegalCharacters() { }
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<icu::UnicodeSet> set;
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)IllegalCharacters::IllegalCharacters() {
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UErrorCode status = U_ZERO_ERROR;
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Control characters, formatting characters, non-characters, and
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // See  http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // are legitimate in Arabic and some S/SE Asian scripts. However, when used
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // elsewhere, they can be confusing/problematic.
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Also, consider wrapping the set with our Singleton class to create and
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // freeze it only once. Note that there's a trade-off between memory and
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // speed.
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF16)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  set.reset(new icu::UnicodeSet(icu::UnicodeString(
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status));
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE(
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(),
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      status));
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(U_SUCCESS(status));
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Add non-characters. If this becomes a performance bottleneck by
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // any chance, do not add these to |set| and change IsFilenameLegal()
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // containsNone().
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  set->add(0xFDD0, 0xFDEF);
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i <= 0x10; ++i) {
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int plane_base = 0x10000 * i;
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    set->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  set->freeze();
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace file_util {
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool IsFilenameLegal(const string16& file_name) {
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return IllegalCharacters::GetInstance()->containsNone(file_name);
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
932a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)void ReplaceIllegalCharactersInPath(base::FilePath::StringType* file_name,
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                    char replace_char) {
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(file_name);
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(!(IllegalCharacters::GetInstance()->contains(replace_char)));
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Remove leading and trailing whitespace.
100a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)  base::TrimWhitespace(*file_name, base::TRIM_ALL, file_name);
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  IllegalCharacters* illegal = IllegalCharacters::GetInstance();
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int cursor = 0;  // The ICU macros expect an int.
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (cursor < static_cast<int>(file_name->size())) {
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int char_begin = cursor;
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    uint32 code_point;
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(OS_MACOSX)
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Mac uses UTF-8 encoding for filenames.
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            code_point);
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(OS_WIN)
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Windows uses UTF-16 encoding for filenames.
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)             code_point);
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(OS_POSIX)
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Linux doesn't actually define an encoding. It basically allows anything
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // except for a few special ASCII characters.
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]);
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (cur_char >= 0x80)
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    code_point = cur_char;
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    NOTREACHED();
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (illegal->contains(code_point)) {
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      file_name->replace(char_begin, cursor - char_begin, 1, replace_char);
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // We just made the potentially multi-byte/word char into one that only
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // takes one byte/word, so need to adjust the cursor to point to the next
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // character again.
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      cursor = char_begin + 1;
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1362a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)bool LocaleAwareCompareFilenames(const base::FilePath& a,
1372a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)                                 const base::FilePath& b) {
138c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  UErrorCode error_code = U_ZERO_ERROR;
139c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Use the default collator. The default locale should have been properly
140c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // set by the time this constructor is called.
141c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  scoped_ptr<icu::Collator> collator(icu::Collator::createInstance(error_code));
142c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  DCHECK(U_SUCCESS(error_code));
143c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  // Make it case-sensitive.
144c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  collator->setStrength(icu::Collator::TERTIARY);
145c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(OS_WIN)
147c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  return base::i18n::CompareString16WithCollator(collator.get(),
1485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      base::WideToUTF16(a.value()), base::WideToUTF16(b.value())) == UCOL_LESS;
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(OS_POSIX)
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // On linux, the file system encoding is not defined. We assume
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // SysNativeMBToWide takes care of it.
1535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  return base::i18n::CompareString16WithCollator(
1545d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      collator.get(),
1555d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      base::WideToUTF16(base::SysNativeMBToWide(a.value().c_str())),
1565d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      base::WideToUTF16(base::SysNativeMBToWide(b.value().c_str()))
1575d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      ) == UCOL_LESS;
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  #error Not implemented on your system
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1632a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)void NormalizeFileNameEncoding(base::FilePath* file_name) {
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(OS_CHROMEOS)
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::string normalized_str;
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (base::ConvertToUtf8AndNormalize(file_name->BaseName().value(),
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      base::kCodepageUTF8,
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                      &normalized_str)) {
1692a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    *file_name = file_name->DirName().Append(base::FilePath(normalized_str));
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace
175