file_util_icu.cc revision 6e8cce623b6e4fe0c9e4af605d675dd9d0338c38
1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5// File utilities that use the ICU library go in this file. 6 7#include "base/i18n/file_util_icu.h" 8 9#include "base/files/file_path.h" 10#include "base/i18n/icu_string_conversions.h" 11#include "base/i18n/string_compare.h" 12#include "base/logging.h" 13#include "base/memory/scoped_ptr.h" 14#include "base/memory/singleton.h" 15#include "base/strings/string_util.h" 16#include "base/strings/sys_string_conversions.h" 17#include "base/strings/utf_string_conversions.h" 18#include "build/build_config.h" 19#include "third_party/icu/source/common/unicode/uniset.h" 20#include "third_party/icu/source/i18n/unicode/coll.h" 21 22namespace base { 23namespace i18n { 24 25namespace { 26 27class IllegalCharacters { 28 public: 29 static IllegalCharacters* GetInstance() { 30 return Singleton<IllegalCharacters>::get(); 31 } 32 33 bool contains(UChar32 ucs4) { 34 return !!set->contains(ucs4); 35 } 36 37 bool containsNone(const string16 &s) { 38 return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size())); 39 } 40 41 private: 42 friend class Singleton<IllegalCharacters>; 43 friend struct DefaultSingletonTraits<IllegalCharacters>; 44 45 IllegalCharacters(); 46 ~IllegalCharacters() { } 47 48 scoped_ptr<icu::UnicodeSet> set; 49 50 DISALLOW_COPY_AND_ASSIGN(IllegalCharacters); 51}; 52 53IllegalCharacters::IllegalCharacters() { 54 UErrorCode status = U_ZERO_ERROR; 55 // Control characters, formatting characters, non-characters, and 56 // some printable ASCII characters regarded as dangerous ('"*/:<>?\\'). 57 // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx 58 // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx 59 // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they 60 // are legitimate in Arabic and some S/SE Asian scripts. However, when used 61 // elsewhere, they can be confusing/problematic. 62 // Also, consider wrapping the set with our Singleton class to create and 63 // freeze it only once. Note that there's a trade-off between memory and 64 // speed. 65#if defined(WCHAR_T_IS_UTF16) 66 set.reset(new icu::UnicodeSet(icu::UnicodeString( 67 L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status)); 68#else 69 set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE( 70 "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(), 71 status)); 72#endif 73 DCHECK(U_SUCCESS(status)); 74 // Add non-characters. If this becomes a performance bottleneck by 75 // any chance, do not add these to |set| and change IsFilenameLegal() 76 // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling 77 // containsNone(). 78 set->add(0xFDD0, 0xFDEF); 79 for (int i = 0; i <= 0x10; ++i) { 80 int plane_base = 0x10000 * i; 81 set->add(plane_base + 0xFFFE, plane_base + 0xFFFF); 82 } 83 set->freeze(); 84} 85 86} // namespace 87 88bool IsFilenameLegal(const string16& file_name) { 89 return IllegalCharacters::GetInstance()->containsNone(file_name); 90} 91 92void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name, 93 char replace_char) { 94 DCHECK(file_name); 95 96 DCHECK(!(IllegalCharacters::GetInstance()->contains(replace_char))); 97 98 // Remove leading and trailing whitespace. 99 TrimWhitespace(*file_name, TRIM_ALL, file_name); 100 101 IllegalCharacters* illegal = IllegalCharacters::GetInstance(); 102 int cursor = 0; // The ICU macros expect an int. 103 while (cursor < static_cast<int>(file_name->size())) { 104 int char_begin = cursor; 105 uint32 code_point; 106#if defined(OS_MACOSX) 107 // Mac uses UTF-8 encoding for filenames. 108 U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()), 109 code_point); 110#elif defined(OS_WIN) 111 // Windows uses UTF-16 encoding for filenames. 112 U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()), 113 code_point); 114#elif defined(OS_POSIX) 115 // Linux doesn't actually define an encoding. It basically allows anything 116 // except for a few special ASCII characters. 117 unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]); 118 if (cur_char >= 0x80) 119 continue; 120 code_point = cur_char; 121#else 122 NOTREACHED(); 123#endif 124 125 if (illegal->contains(code_point)) { 126 file_name->replace(char_begin, cursor - char_begin, 1, replace_char); 127 // We just made the potentially multi-byte/word char into one that only 128 // takes one byte/word, so need to adjust the cursor to point to the next 129 // character again. 130 cursor = char_begin + 1; 131 } 132 } 133} 134 135bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) { 136 UErrorCode error_code = U_ZERO_ERROR; 137 // Use the default collator. The default locale should have been properly 138 // set by the time this constructor is called. 139 scoped_ptr<icu::Collator> collator(icu::Collator::createInstance(error_code)); 140 DCHECK(U_SUCCESS(error_code)); 141 // Make it case-sensitive. 142 collator->setStrength(icu::Collator::TERTIARY); 143 144#if defined(OS_WIN) 145 return CompareString16WithCollator(collator.get(), 146 WideToUTF16(a.value()), WideToUTF16(b.value())) == UCOL_LESS; 147 148#elif defined(OS_POSIX) 149 // On linux, the file system encoding is not defined. We assume 150 // SysNativeMBToWide takes care of it. 151 return CompareString16WithCollator( 152 collator.get(), 153 WideToUTF16(SysNativeMBToWide(a.value().c_str())), 154 WideToUTF16(SysNativeMBToWide(b.value().c_str()))) == UCOL_LESS; 155#else 156 #error Not implemented on your system 157#endif 158} 159 160void NormalizeFileNameEncoding(FilePath* file_name) { 161#if defined(OS_CHROMEOS) 162 std::string normalized_str; 163 if (ConvertToUtf8AndNormalize(file_name->BaseName().value(), 164 kCodepageUTF8, 165 &normalized_str)) { 166 *file_name = file_name->DirName().Append(FilePath(normalized_str)); 167 } 168#endif 169} 170 171} // namespace i18n 172} // namespace base 173