file_util_icu.cc revision ca12bfac764ba476d6cd062bf1dde12cc64c3f40
1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5// File utilities that use the ICU library go in this file. 6 7#include "base/i18n/file_util_icu.h" 8 9#include "base/files/file_path.h" 10#include "base/i18n/icu_string_conversions.h" 11#include "base/i18n/string_compare.h" 12#include "base/logging.h" 13#include "base/memory/scoped_ptr.h" 14#include "base/memory/singleton.h" 15#include "base/strings/string_util.h" 16#include "base/strings/sys_string_conversions.h" 17#include "base/strings/utf_string_conversions.h" 18#include "build/build_config.h" 19#include "third_party/icu/source/common/unicode/uniset.h" 20#include "third_party/icu/source/i18n/unicode/coll.h" 21 22namespace { 23 24class IllegalCharacters { 25 public: 26 static IllegalCharacters* GetInstance() { 27 return Singleton<IllegalCharacters>::get(); 28 } 29 30 bool contains(UChar32 ucs4) { 31 return !!set->contains(ucs4); 32 } 33 34 bool containsNone(const string16 &s) { 35 return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size())); 36 } 37 38 private: 39 friend class Singleton<IllegalCharacters>; 40 friend struct DefaultSingletonTraits<IllegalCharacters>; 41 42 IllegalCharacters(); 43 ~IllegalCharacters() { } 44 45 scoped_ptr<icu::UnicodeSet> set; 46 47 DISALLOW_COPY_AND_ASSIGN(IllegalCharacters); 48}; 49 50IllegalCharacters::IllegalCharacters() { 51 UErrorCode status = U_ZERO_ERROR; 52 // Control characters, formatting characters, non-characters, and 53 // some printable ASCII characters regarded as dangerous ('"*/:<>?\\'). 54 // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx 55 // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx 56 // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they 57 // are legitimate in Arabic and some S/SE Asian scripts. However, when used 58 // elsewhere, they can be confusing/problematic. 59 // Also, consider wrapping the set with our Singleton class to create and 60 // freeze it only once. Note that there's a trade-off between memory and 61 // speed. 62#if defined(WCHAR_T_IS_UTF16) 63 set.reset(new icu::UnicodeSet(icu::UnicodeString( 64 L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status)); 65#else 66 set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE( 67 "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(), 68 status)); 69#endif 70 DCHECK(U_SUCCESS(status)); 71 // Add non-characters. If this becomes a performance bottleneck by 72 // any chance, do not add these to |set| and change IsFilenameLegal() 73 // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling 74 // containsNone(). 75 set->add(0xFDD0, 0xFDEF); 76 for (int i = 0; i <= 0x10; ++i) { 77 int plane_base = 0x10000 * i; 78 set->add(plane_base + 0xFFFE, plane_base + 0xFFFF); 79 } 80 set->freeze(); 81} 82 83} // namespace 84 85namespace file_util { 86 87bool IsFilenameLegal(const string16& file_name) { 88 return IllegalCharacters::GetInstance()->containsNone(file_name); 89} 90 91void ReplaceIllegalCharactersInPath(base::FilePath::StringType* file_name, 92 char replace_char) { 93 DCHECK(file_name); 94 95 DCHECK(!(IllegalCharacters::GetInstance()->contains(replace_char))); 96 97 // Remove leading and trailing whitespace. 98 TrimWhitespace(*file_name, TRIM_ALL, file_name); 99 100 IllegalCharacters* illegal = IllegalCharacters::GetInstance(); 101 int cursor = 0; // The ICU macros expect an int. 102 while (cursor < static_cast<int>(file_name->size())) { 103 int char_begin = cursor; 104 uint32 code_point; 105#if defined(OS_MACOSX) 106 // Mac uses UTF-8 encoding for filenames. 107 U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()), 108 code_point); 109#elif defined(OS_WIN) 110 // Windows uses UTF-16 encoding for filenames. 111 U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()), 112 code_point); 113#elif defined(OS_POSIX) 114 // Linux doesn't actually define an encoding. It basically allows anything 115 // except for a few special ASCII characters. 116 unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]); 117 if (cur_char >= 0x80) 118 continue; 119 code_point = cur_char; 120#else 121 NOTREACHED(); 122#endif 123 124 if (illegal->contains(code_point)) { 125 file_name->replace(char_begin, cursor - char_begin, 1, replace_char); 126 // We just made the potentially multi-byte/word char into one that only 127 // takes one byte/word, so need to adjust the cursor to point to the next 128 // character again. 129 cursor = char_begin + 1; 130 } 131 } 132} 133 134bool LocaleAwareCompareFilenames(const base::FilePath& a, 135 const base::FilePath& b) { 136 UErrorCode error_code = U_ZERO_ERROR; 137 // Use the default collator. The default locale should have been properly 138 // set by the time this constructor is called. 139 scoped_ptr<icu::Collator> collator(icu::Collator::createInstance(error_code)); 140 DCHECK(U_SUCCESS(error_code)); 141 // Make it case-sensitive. 142 collator->setStrength(icu::Collator::TERTIARY); 143 144#if defined(OS_WIN) 145 return base::i18n::CompareString16WithCollator(collator.get(), 146 WideToUTF16(a.value()), WideToUTF16(b.value())) == UCOL_LESS; 147 148#elif defined(OS_POSIX) 149 // On linux, the file system encoding is not defined. We assume 150 // SysNativeMBToWide takes care of it. 151 return base::i18n::CompareString16WithCollator(collator.get(), 152 WideToUTF16(base::SysNativeMBToWide(a.value().c_str())), 153 WideToUTF16(base::SysNativeMBToWide(b.value().c_str()))) == UCOL_LESS; 154#else 155 #error Not implemented on your system 156#endif 157} 158 159void NormalizeFileNameEncoding(base::FilePath* file_name) { 160#if defined(OS_CHROMEOS) 161 std::string normalized_str; 162 if (base::ConvertToUtf8AndNormalize(file_name->BaseName().value(), 163 base::kCodepageUTF8, 164 &normalized_str)) { 165 *file_name = file_name->DirName().Append(base::FilePath(normalized_str)); 166 } 167#endif 168} 169 170} // namespace 171