1// Copyright (C) 2014 Google Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15#include <libaddressinput/address_formatter.h> 16 17#include <libaddressinput/address_data.h> 18#include <libaddressinput/address_field.h> 19#include <libaddressinput/util/basictypes.h> 20 21#include <algorithm> 22#include <cassert> 23#include <cstddef> 24#include <functional> 25#include <string> 26#include <vector> 27 28#include "format_element.h" 29#include "language.h" 30#include "region_data_constants.h" 31#include "rule.h" 32#include "util/cctype_tolower_equal.h" 33 34namespace i18n { 35namespace addressinput { 36 37namespace { 38 39const char kCommaSeparator[] = ", "; 40const char kSpaceSeparator[] = " "; 41const char kArabicCommaSeparator[] = "\xD8\x8C" " "; /* "، " */ 42 43const char* kLanguagesThatUseSpace[] = { 44 "th", 45 "ko" 46}; 47 48const char* kLanguagesThatHaveNoSeparator[] = { 49 "ja", 50 "zh" // All Chinese variants. 51}; 52 53// This data is based on CLDR, for languages that are in official use in some 54// country, where Arabic is the most likely script tag. 55// TODO: Consider supporting variants such as tr-Arab by detecting the script 56// code. 57const char* kLanguagesThatUseAnArabicComma[] = { 58 "ar", 59 "az", 60 "fa", 61 "kk", 62 "ku", 63 "ky", 64 "ps", 65 "tg", 66 "tk", 67 "ur", 68 "uz" 69}; 70 71std::string GetLineSeparatorForLanguage(const std::string& language_tag) { 72 Language address_language(language_tag); 73 74 // First deal with explicit script tags. 75 if (address_language.has_latin_script) { 76 return kCommaSeparator; 77 } 78 79 // Now guess something appropriate based on the base language. 80 const std::string& base_language = address_language.base; 81 if (std::find_if(kLanguagesThatUseSpace, 82 kLanguagesThatUseSpace + arraysize(kLanguagesThatUseSpace), 83 std::bind2nd(EqualToTolowerString(), base_language)) != 84 kLanguagesThatUseSpace + arraysize(kLanguagesThatUseSpace)) { 85 return kSpaceSeparator; 86 } else if (std::find_if( 87 kLanguagesThatHaveNoSeparator, 88 kLanguagesThatHaveNoSeparator + 89 arraysize(kLanguagesThatHaveNoSeparator), 90 std::bind2nd(EqualToTolowerString(), base_language)) != 91 kLanguagesThatHaveNoSeparator + 92 arraysize(kLanguagesThatHaveNoSeparator)) { 93 return ""; 94 } else if (std::find_if( 95 kLanguagesThatUseAnArabicComma, 96 kLanguagesThatUseAnArabicComma + 97 arraysize(kLanguagesThatUseAnArabicComma), 98 std::bind2nd(EqualToTolowerString(), base_language)) != 99 kLanguagesThatUseAnArabicComma + 100 arraysize(kLanguagesThatUseAnArabicComma)) { 101 return kArabicCommaSeparator; 102 } 103 // Either the language is a Latin-script language, or no language was 104 // specified. In the latter case we still return ", " as the most common 105 // separator in use. In countries that don't use this, e.g. Thailand, 106 // addresses are often written in Latin script where this would still be 107 // appropriate, so this is a reasonable default in the absence of information. 108 return kCommaSeparator; 109} 110 111void CombineLinesForLanguage(const std::vector<std::string>& lines, 112 const std::string& language_tag, 113 std::string* line) { 114 line->clear(); 115 std::string separator = GetLineSeparatorForLanguage(language_tag); 116 for (std::vector<std::string>::const_iterator it = lines.begin(); 117 it != lines.end(); 118 ++it) { 119 if (it != lines.begin()) { 120 line->append(separator); 121 } 122 line->append(*it); 123 } 124} 125 126} // namespace 127 128void GetFormattedNationalAddress( 129 const AddressData& address_data, std::vector<std::string>* lines) { 130 assert(lines != NULL); 131 lines->clear(); 132 133 Rule rule; 134 rule.CopyFrom(Rule::GetDefault()); 135 // TODO: Eventually, we should get the best rule for this country and 136 // language, rather than just for the country. 137 rule.ParseSerializedRule(RegionDataConstants::GetRegionData( 138 address_data.region_code)); 139 140 Language language(address_data.language_code); 141 142 // If Latin-script rules are available and the |language_code| of this address 143 // is explicitly tagged as being Latin, then use the Latin-script formatting 144 // rules. 145 const std::vector<FormatElement>& format = 146 language.has_latin_script && !rule.GetLatinFormat().empty() 147 ? rule.GetLatinFormat() 148 : rule.GetFormat(); 149 150 // Address format without the unnecessary elements (based on which address 151 // fields are empty). We assume all literal strings that are not at the start 152 // or end of a line are separators, and therefore only relevant if the 153 // surrounding fields are filled in. This works with the data we have 154 // currently. 155 std::vector<FormatElement> pruned_format; 156 for (std::vector<FormatElement>::const_iterator 157 element_it = format.begin(); 158 element_it != format.end(); 159 ++element_it) { 160 // Always keep the newlines. 161 if (element_it->IsNewline() || 162 // Always keep the non-empty address fields. 163 (element_it->IsField() && 164 !address_data.IsFieldEmpty(element_it->GetField())) || 165 // Only keep literals that satisfy these 2 conditions: 166 (!element_it->IsField() && 167 // (1) Not preceding an empty field. 168 (element_it + 1 == format.end() || 169 !(element_it + 1)->IsField() || 170 !address_data.IsFieldEmpty((element_it + 1)->GetField())) && 171 // (2) Not following a removed field. 172 (element_it == format.begin() || 173 !(element_it - 1)->IsField() || 174 (!pruned_format.empty() && pruned_format.back().IsField())))) { 175 pruned_format.push_back(*element_it); 176 } 177 } 178 179 std::string line; 180 for (std::vector<FormatElement>::const_iterator 181 element_it = pruned_format.begin(); 182 element_it != pruned_format.end(); 183 ++element_it) { 184 if (element_it->IsNewline()) { 185 if (!line.empty()) { 186 lines->push_back(line); 187 line.clear(); 188 } 189 } else if (element_it->IsField()) { 190 AddressField field = element_it->GetField(); 191 if (field == STREET_ADDRESS) { 192 // The field "street address" represents the street address lines of an 193 // address, so there can be multiple values. 194 if (!address_data.IsFieldEmpty(field)) { 195 line.append(address_data.address_line.front()); 196 if (address_data.address_line.size() > 1U) { 197 lines->push_back(line); 198 line.clear(); 199 lines->insert(lines->end(), 200 address_data.address_line.begin() + 1, 201 address_data.address_line.end()); 202 } 203 } 204 } else { 205 line.append(address_data.GetFieldValue(field)); 206 } 207 } else { 208 line.append(element_it->GetLiteral()); 209 } 210 } 211 if (!line.empty()) { 212 lines->push_back(line); 213 } 214} 215 216void GetFormattedNationalAddressLine( 217 const AddressData& address_data, std::string* line) { 218 std::vector<std::string> address_lines; 219 GetFormattedNationalAddress(address_data, &address_lines); 220 CombineLinesForLanguage(address_lines, address_data.language_code, line); 221} 222 223void GetStreetAddressLinesAsSingleLine( 224 const AddressData& address_data, std::string* line) { 225 CombineLinesForLanguage( 226 address_data.address_line, address_data.language_code, line); 227} 228 229} // namespace addressinput 230} // namespace i18n 231