15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// File utilities that use the ICU library go in this file. 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/file_util_icu.h" 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 92a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "base/files/file_path.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/icu_string_conversions.h" 11c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include "base/i18n/string_compare.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h" 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h" 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/singleton.h" 15868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h" 162a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "base/strings/sys_string_conversions.h" 17868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h" 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "build/build_config.h" 19ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/uniset.h" 20ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/i18n/unicode/coll.h" 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 226e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)namespace base { 236e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)namespace i18n { 24a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace { 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class IllegalCharacters { 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static IllegalCharacters* GetInstance() { 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Singleton<IllegalCharacters>::get(); 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool contains(UChar32 ucs4) { 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return !!set->contains(ucs4); 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool containsNone(const string16 &s) { 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size())); 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) friend class Singleton<IllegalCharacters>; 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) friend struct DefaultSingletonTraits<IllegalCharacters>; 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) IllegalCharacters(); 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~IllegalCharacters() { } 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scoped_ptr<icu::UnicodeSet> set; 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(IllegalCharacters); 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)IllegalCharacters::IllegalCharacters() { 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Control characters, formatting characters, non-characters, and 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // some printable ASCII characters regarded as dangerous ('"*/:<>?\\'). 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // are legitimate in Arabic and some S/SE Asian scripts. However, when used 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // elsewhere, they can be confusing/problematic. 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Also, consider wrapping the set with our Singleton class to create and 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // freeze it only once. Note that there's a trade-off between memory and 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // speed. 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(WCHAR_T_IS_UTF16) 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) set.reset(new icu::UnicodeSet(icu::UnicodeString( 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status)); 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE( 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(), 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status)); 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(U_SUCCESS(status)); 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add non-characters. If this becomes a performance bottleneck by 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // any chance, do not add these to |set| and change IsFilenameLegal() 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // containsNone(). 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) set->add(0xFDD0, 0xFDEF); 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i <= 0x10; ++i) { 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int plane_base = 0x10000 * i; 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) set->add(plane_base + 0xFFFE, plane_base + 0xFFFF); 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) set->freeze(); 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool IsFilenameLegal(const string16& file_name) { 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return IllegalCharacters::GetInstance()->containsNone(file_name); 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 926e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name, 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char replace_char) { 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(file_name); 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(!(IllegalCharacters::GetInstance()->contains(replace_char))); 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Remove leading and trailing whitespace. 996e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) TrimWhitespace(*file_name, TRIM_ALL, file_name); 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) IllegalCharacters* illegal = IllegalCharacters::GetInstance(); 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int cursor = 0; // The ICU macros expect an int. 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (cursor < static_cast<int>(file_name->size())) { 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int char_begin = cursor; 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 code_point; 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(OS_MACOSX) 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Mac uses UTF-8 encoding for filenames. 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()), 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) code_point); 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(OS_WIN) 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Windows uses UTF-16 encoding for filenames. 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()), 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) code_point); 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(OS_POSIX) 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Linux doesn't actually define an encoding. It basically allows anything 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // except for a few special ASCII characters. 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]); 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cur_char >= 0x80) 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) code_point = cur_char; 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NOTREACHED(); 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (illegal->contains(code_point)) { 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) file_name->replace(char_begin, cursor - char_begin, 1, replace_char); 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We just made the potentially multi-byte/word char into one that only 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // takes one byte/word, so need to adjust the cursor to point to the next 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // character again. 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cursor = char_begin + 1; 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1356e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) { 136c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) UErrorCode error_code = U_ZERO_ERROR; 137c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // Use the default collator. The default locale should have been properly 138c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // set by the time this constructor is called. 139c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) scoped_ptr<icu::Collator> collator(icu::Collator::createInstance(error_code)); 140c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) DCHECK(U_SUCCESS(error_code)); 141c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // Make it case-sensitive. 142c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) collator->setStrength(icu::Collator::TERTIARY); 143c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(OS_WIN) 1456e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) return CompareString16WithCollator(collator.get(), 1466e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) WideToUTF16(a.value()), WideToUTF16(b.value())) == UCOL_LESS; 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#elif defined(OS_POSIX) 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // On linux, the file system encoding is not defined. We assume 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // SysNativeMBToWide takes care of it. 1516e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) return CompareString16WithCollator( 1525d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) collator.get(), 1536e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) WideToUTF16(SysNativeMBToWide(a.value().c_str())), 1546e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) WideToUTF16(SysNativeMBToWide(b.value().c_str()))) == UCOL_LESS; 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) #error Not implemented on your system 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1606e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)void NormalizeFileNameEncoding(FilePath* file_name) { 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if defined(OS_CHROMEOS) 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::string normalized_str; 1636e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) if (ConvertToUtf8AndNormalize(file_name->BaseName().value(), 1646e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) kCodepageUTF8, 1656e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) &normalized_str)) { 1666e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles) *file_name = file_name->DirName().Append(FilePath(normalized_str)); 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1716e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)} // namespace i18n 1726e8cce623b6e4fe0c9e4af605d675dd9d0338c38Torne (Richard Coles)} // namespace base 173