1ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// Copyright (c) 2011 The Chromium Authors. All rights reserved. 2c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Use of this source code is governed by a BSD-style license that can be 3c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// found in the LICENSE file. 4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// File utilities that use the ICU library go in this file. 6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 7c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/i18n/file_util_icu.h" 8c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/file_path.h" 10c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/logging.h" 11ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "base/memory/scoped_ptr.h" 12ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "base/memory/singleton.h" 13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/string_util.h" 14c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/utf_string_conversions.h" 15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/sys_string_conversions.h" 16c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "build/build_config.h" 17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/coll.h" 18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "unicode/uniset.h" 19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottnamespace { 21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 22c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottclass IllegalCharacters { 23c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott public: 2421d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen static IllegalCharacters* GetInstance() { 2521d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen return Singleton<IllegalCharacters>::get(); 2621d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen } 2721d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen 28c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bool contains(UChar32 ucs4) { 29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return !!set->contains(ucs4); 30c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 31c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 32c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott bool containsNone(const string16 &s) { 33c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size())); 34c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 36c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott private: 37c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott friend class Singleton<IllegalCharacters>; 38c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott friend struct DefaultSingletonTraits<IllegalCharacters>; 39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 40c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott IllegalCharacters(); 41c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott ~IllegalCharacters() { } 42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott scoped_ptr<icu::UnicodeSet> set; 44c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DISALLOW_COPY_AND_ASSIGN(IllegalCharacters); 46c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}; 47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick ScottIllegalCharacters::IllegalCharacters() { 49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode status = U_ZERO_ERROR; 50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Control characters, formatting characters, non-characters, and 51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // some printable ASCII characters regarded as dangerous ('"*/:<>?\\'). 52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx 53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx 54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they 55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // are legitimate in Arabic and some S/SE Asian scripts. However, when used 56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // elsewhere, they can be confusing/problematic. 57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Also, consider wrapping the set with our Singleton class to create and 58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // freeze it only once. Note that there's a trade-off between memory and 59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // speed. 60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if defined(WCHAR_T_IS_UTF16) 61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott set.reset(new icu::UnicodeSet(icu::UnicodeString( 62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status)); 63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#else 64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE( 65c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(), 66c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott status)); 67c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif 68c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DCHECK(U_SUCCESS(status)); 69c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Add non-characters. If this becomes a performance bottleneck by 70c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // any chance, do not add these to |set| and change IsFilenameLegal() 71c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling 72c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // containsNone(). 73c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott set->add(0xFDD0, 0xFDEF); 74c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott for (int i = 0; i <= 0x10; ++i) { 75c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int plane_base = 0x10000 * i; 76c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott set->add(plane_base + 0xFFFE, plane_base + 0xFFFF); 77c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 78c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott set->freeze(); 79c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 80c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 81c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottclass LocaleAwareComparator { 82c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott public: 8321d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen static LocaleAwareComparator* GetInstance() { 8421d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen return Singleton<LocaleAwareComparator>::get(); 85c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 86c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 87c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Note: A similar function is available in l10n_util. 88c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // We cannot use it because base should not depend on l10n_util. 89c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // TODO(yuzo): Move some of l10n_util to base. 90c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int Compare(const string16& a, const string16& b) { 91c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // We are not sure if Collator::compare is thread-safe. 92c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Use an AutoLock just in case. 9372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen base::AutoLock auto_lock(lock_); 94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UErrorCode error_code = U_ZERO_ERROR; 96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott UCollationResult result = collator_->compare( 97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott static_cast<const UChar*>(a.c_str()), 98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott static_cast<int>(a.length()), 99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott static_cast<const UChar*>(b.c_str()), 100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott static_cast<int>(b.length()), 101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott error_code); 102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DCHECK(U_SUCCESS(error_code)); 103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott return result; 104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott private: 10721d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen LocaleAwareComparator() { 10821d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen UErrorCode error_code = U_ZERO_ERROR; 10921d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen // Use the default collator. The default locale should have been properly 11021d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen // set by the time this constructor is called. 11121d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen collator_.reset(icu::Collator::createInstance(error_code)); 11221d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen DCHECK(U_SUCCESS(error_code)); 11321d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen // Make it case-sensitive. 11421d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen collator_->setStrength(icu::Collator::TERTIARY); 11521d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen // Note: We do not set UCOL_NORMALIZATION_MODE attribute. In other words, we 11621d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen // do not pay performance penalty to guarantee sort order correctness for 11721d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen // non-FCD (http://unicode.org/notes/tn5/#FCD) file names. This should be a 11821d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen // reasonable tradeoff because such file names should be rare and the sort 11921d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen // order doesn't change much anyway. 12021d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen } 12121d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen 122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott scoped_ptr<icu::Collator> collator_; 12372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen base::Lock lock_; 124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott friend struct DefaultSingletonTraits<LocaleAwareComparator>; 125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DISALLOW_COPY_AND_ASSIGN(LocaleAwareComparator); 127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}; 128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} // namespace 130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottnamespace file_util { 132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool IsFilenameLegal(const string16& file_name) { 13421d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen return IllegalCharacters::GetInstance()->containsNone(file_name); 135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid ReplaceIllegalCharactersInPath(FilePath::StringType* file_name, 138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott char replace_char) { 139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott DCHECK(file_name); 140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 14121d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen DCHECK(!(IllegalCharacters::GetInstance()->contains(replace_char))); 142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Remove leading and trailing whitespace. 144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott TrimWhitespace(*file_name, TRIM_ALL, file_name); 145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 14621d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen IllegalCharacters* illegal = IllegalCharacters::GetInstance(); 147c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int cursor = 0; // The ICU macros expect an int. 148c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott while (cursor < static_cast<int>(file_name->size())) { 149c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott int char_begin = cursor; 150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott uint32 code_point; 151c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if defined(OS_MACOSX) 152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Mac uses UTF-8 encoding for filenames. 153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()), 154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott code_point); 155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#elif defined(OS_WIN) 156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Windows uses UTF-16 encoding for filenames. 157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()), 158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott code_point); 159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#elif defined(OS_POSIX) 160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // Linux doesn't actually define an encoding. It basically allows anything 161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // except for a few special ASCII characters. 162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]); 163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (cur_char >= 0x80) 164c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott continue; 165c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott code_point = cur_char; 166c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#else 167c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott NOTREACHED(); 168c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif 169c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 170c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott if (illegal->contains(code_point)) { 171c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott file_name->replace(char_begin, cursor - char_begin, 1, replace_char); 172c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // We just made the potentially multi-byte/word char into one that only 173c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // takes one byte/word, so need to adjust the cursor to point to the next 174c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // character again. 175c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott cursor = char_begin + 1; 176c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 177c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott } 178c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 179c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 180c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) { 181c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#if defined(OS_WIN) 18221d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen return LocaleAwareComparator::GetInstance()->Compare(a.value().c_str(), 18321d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen b.value().c_str()) < 0; 184c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 185c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#elif defined(OS_POSIX) 186c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // On linux, the file system encoding is not defined. We assume 187c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // SysNativeMBToWide takes care of it. 188c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // 189c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // ICU's collator can take strings in OS native encoding. But we convert the 190c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // strings to UTF-16 ourselves to ensure conversion consistency. 191c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott // TODO(yuzo): Perhaps we should define SysNativeMBToUTF16? 19221d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen return LocaleAwareComparator::GetInstance()->Compare( 193c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott WideToUTF16(base::SysNativeMBToWide(a.value().c_str())), 194c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott WideToUTF16(base::SysNativeMBToWide(b.value().c_str()))) < 0; 195c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#else 196c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott #error Not implemented on your system 197c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#endif 198c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} 199c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott 200c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott} // namespace 201