1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5*   Copyright (C) 1997-2016, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7**********************************************************************
8*
9* File ULOC.CPP
10*
11* Modification History:
12*
13*   Date        Name        Description
14*   04/01/97    aliu        Creation.
15*   08/21/98    stephen     JDK 1.2 sync
16*   12/08/98    rtg         New Locale implementation and C API
17*   03/15/99    damiba      overhaul.
18*   04/06/99    stephen     changed setDefault() to realloc and copy
19*   06/14/99    stephen     Changed calls to ures_open for new params
20*   07/21/99    stephen     Modified setDefault() to propagate to C++
21*   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
22*                           brought canonicalization code into line with spec
23*****************************************************************************/
24
25/*
26   POSIX's locale format, from putil.c: [no spaces]
27
28     ll [ _CC ] [ . MM ] [ @ VV]
29
30     l = lang, C = ctry, M = charmap, V = variant
31*/
32
33#include "unicode/utypes.h"
34#include "unicode/ustring.h"
35#include "unicode/uloc.h"
36
37#include "putilimp.h"
38#include "ustr_imp.h"
39#include "ulocimp.h"
40#include "umutex.h"
41#include "cstring.h"
42#include "cmemory.h"
43#include "locmap.h"
44#include "uarrsort.h"
45#include "uenumimp.h"
46#include "uassert.h"
47
48#include <stdio.h> /* for sprintf */
49
50using namespace icu;
51
52/* ### Declarations **************************************************/
53
54/* Locale stuff from locid.cpp */
55U_CFUNC void locale_set_default(const char *id);
56U_CFUNC const char *locale_get_default(void);
57U_CFUNC int32_t
58locale_getKeywords(const char *localeID,
59            char prev,
60            char *keywords, int32_t keywordCapacity,
61            char *values, int32_t valuesCapacity, int32_t *valLen,
62            UBool valuesToo,
63            UErrorCode *status);
64
65/* ### Data tables **************************************************/
66
67/**
68 * Table of language codes, both 2- and 3-letter, with preference
69 * given to 2-letter codes where possible.  Includes 3-letter codes
70 * that lack a 2-letter equivalent.
71 *
72 * This list must be in sorted order.  This list is returned directly
73 * to the user by some API.
74 *
75 * This list must be kept in sync with LANGUAGES_3, with corresponding
76 * entries matched.
77 *
78 * This table should be terminated with a NULL entry, followed by a
79 * second list, and another NULL entry.  The first list is visible to
80 * user code when this array is returned by API.  The second list
81 * contains codes we support, but do not expose through user API.
82 *
83 * Notes
84 *
85 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
86 * include the revisions up to 2001/7/27 *CWB*
87 *
88 * The 3 character codes are the terminology codes like RFC 3066.  This
89 * is compatible with prior ICU codes
90 *
91 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
92 * table but now at the end of the table because 3 character codes are
93 * duplicates.  This avoids bad searches going from 3 to 2 character
94 * codes.
95 *
96 * The range qaa-qtz is reserved for local use
97 */
98/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
99/* ISO639 table version is 20150505 */
static const char * const LANGUAGES[] = {
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "aeb",
    "af",  "afh", "agq", "ain", "ak",  "akk", "akz", "ale",
    "aln", "alt", "am",  "an",  "ang", "anp", "ar",  "arc",
    "arn", "aro", "arp", "arq", "arw", "ary", "arz", "as",
    "asa", "ase", "ast", "av",  "avk", "awa", "ay",  "az",
    "ba",  "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "be",  "bej", "bem", "bew", "bez", "bfd", "bfq", "bg",
    "bgn", "bho", "bi",  "bik", "bin", "bjn", "bkm", "bla",
    "bm",  "bn",  "bo",  "bpy", "bqi", "br",  "bra", "brh",
    "brx", "bs",  "bss", "bua", "bug", "bum", "byn", "byv",
    "ca",  "cad", "car", "cay", "cch", "ce",  "ceb", "cgg",
    "ch",  "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "co",  "cop", "cps", "cr",  "crh",
    "cs",  "csb", "cu",  "cv",  "cy",
    "da",  "dak", "dar", "dav", "de",  "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "dv",
    "dyo", "dyu", "dz",  "dzg",
    "ebu", "ee",  "efi", "egl", "egy", "eka", "el",  "elx",
    "en",  "enm", "eo",  "es",  "esu", "et",  "eu",  "ewo",
    "ext",
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fit", "fj",
    "fo",  "fon", "fr",  "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fy",
    "ga",  "gaa", "gag", "gan", "gay", "gba", "gbz", "gd",
    "gez", "gil", "gl",  "glk", "gmh", "gn",  "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "guc",
    "gur", "guz", "gv",  "gwi",
    "ha",  "hai", "hak", "haw", "he",  "hi",  "hif", "hil",
    "hit", "hmn", "ho",  "hr",  "hsb", "hsn", "ht",  "hu",
    "hup", "hy",  "hz",
    "ia",  "iba", "ibb", "id",  "ie",  "ig",  "ii",  "ik",
    "ilo", "inh", "io",  "is",  "it",  "iu",  "izh",
    "ja",  "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jv",
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kg",  "kgp",
    "kha", "kho", "khq", "khw", "ki",  "kiu", "kj",  "kk",
    "kkj", "kl",  "kln", "km",  "kmb", "kn",  "ko",  "koi",
    "kok", "kos", "kpe", "kr",  "krc", "kri", "krj", "krl",
    "kru", "ks",  "ksb", "ksf", "ksh", "ku",  "kum", "kut",
    "kv",  "kw",  "ky",
    "la",  "lad", "lag", "lah", "lam", "lb",  "lez", "lfn",
    "lg",  "li",  "lij", "liv", "lkt", "lmo", "ln",  "lo",
    "lol", "loz", "lrc", "lt",  "ltg", "lu",  "lua", "lui",
    "lun", "luo", "lus", "luy", "lv",  "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mg",  "mga",
    "mgh", "mgo", "mh",  "mi",  "mic", "min", "mis", "mk",
    "ml",  "mn",  "mnc", "mni", "moh", "mos", "mr",  "mrj",
    "ms",  "mt",  "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "my",  "mye", "myv", "mzn",
    "na",  "nan", "nap", "naq", "nb",  "nd",  "nds", "ne",
    "new", "ng",  "nia", "niu", "njo", "nl",  "nmg", "nn",
    "nnh", "no",  "nog", "non", "nov", "nqo", "nr",  "nso",
    "nus", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota",
    "pa",  "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pi",  "pl",  "pms", "pnt",
    "pon", "prg", "pro", "ps",  "pt",
    "qu",  "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "rm",  "rn",  "ro",
    "rof", "rom", "rtm", "ru",  "rue", "rug", "rup",
    "rw",  "rwk",
    "sa",  "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "sc",  "scn", "sco", "sd",  "sdc", "sdh",
    "se",  "see", "seh", "sei", "sel", "ses", "sg",  "sga",
    "sgs", "shi", "shn", "shu", "si",  "sid", "sk",
    "sl",  "sli", "sly", "sm",  "sma", "smj", "smn", "sms",
    "sn",  "snk", "so",  "sog", "sq",  "sr",  "srn", "srr",
    "ss",  "ssy", "st",  "stq", "su",  "suk", "sus", "sux",
    "sv",  "sw",  "swb", "swc", "syc", "syr", "szl",
    "ta",  "tcy", "te",  "tem", "teo", "ter", "tet", "tg",
    "th",  "ti",  "tig", "tiv", "tk",  "tkl", "tkr", "tl",
    "tlh", "tli", "tly", "tmh", "tn",  "to",  "tog", "tpi",
    "tr",  "tru", "trv", "ts",  "tsd", "tsi", "tt",  "ttt",
    "tum", "tvl", "tw",  "twq", "ty",  "tyv", "tzm",
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    "vai", "ve",  "vec", "vep", "vi",  "vls", "vmf", "vo",
    "vot", "vro", "vun",
    "wa",  "wae", "wal", "war", "was", "wbp", "wo",  "wuu",
    "xal", "xh",  "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yi",  "yo",  "yrl", "yue",
    "za",  "zap", "zbl", "zea", "zen", "zgh", "zh",  "zu",
    "zun", "zxx", "zza",
NULL,   /* terminates the first list: the sorted codes visible to user code */
    "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
NULL    /* terminates the second list: codes supported but not exposed through user API */
};
189
/*
 * Deprecated 2-letter language codes, terminated by two NULL entries.
 * The entries line up positionally with REPLACEMENT_LANGUAGES
 * ("in"/"id", "iw"/"he", "ji"/"yi", "jw"/"jv"); the code that performs the
 * lookup is not part of this section of the file.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
/*
 * Current codes for the entries of DEPRECATED_LANGUAGES, in the same order,
 * terminated by two NULL entries.
 */
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", NULL, NULL
};
196
197/**
198 * Table of 3-letter language codes.
199 *
200 * This is a lookup table used to convert 3-letter language codes to
201 * their 2-letter equivalent, where possible.  It must be kept in sync
202 * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
203 * same language as LANGUAGES_3[i].  The commented-out lines are
204 * copied from LANGUAGES to make eyeballing this baby easier.
205 *
206 * Where a 3-letter language code has no 2-letter equivalent, the
207 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
208 *
209 * This table should be terminated with a NULL entry, followed by a
210 * second list, and another NULL entry.  The two lists correspond to
211 * the two lists in LANGUAGES.
212 */
213/* Generated using org.unicode.cldr.icu.GenerateISO639LanguageTables */
214/* ISO639 table version is 20150505 */
static const char * const LANGUAGES_3[] = {
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "aeb",
    "afr", "afh", "agq", "ain", "aka", "akk", "akz", "ale",
    "aln", "alt", "amh", "arg", "ang", "anp", "ara", "arc",
    "arn", "aro", "arp", "arq", "arw", "ary", "arz", "asm",
    "asa", "ase", "ast", "ava", "avk", "awa", "aym", "aze",
    "bak", "bal", "ban", "bar", "bas", "bax", "bbc", "bbj",
    "bel", "bej", "bem", "bew", "bez", "bfd", "bfq", "bul",
    "bgn", "bho", "bis", "bik", "bin", "bjn", "bkm", "bla",
    "bam", "ben", "bod", "bpy", "bqi", "bre", "bra", "brh",
    "brx", "bos", "bss", "bua", "bug", "bum", "byn", "byv",
    "cat", "cad", "car", "cay", "cch", "che", "ceb", "cgg",
    "cha", "chb", "chg", "chk", "chm", "chn", "cho", "chp",
    "chr", "chy", "ckb", "cos", "cop", "cps", "cre", "crh",
    "ces", "csb", "chu", "chv", "cym",
    "dan", "dak", "dar", "dav", "deu", "del", "den", "dgr",
    "din", "dje", "doi", "dsb", "dtp", "dua", "dum", "div",
    "dyo", "dyu", "dzo", "dzg",
    "ebu", "ewe", "efi", "egl", "egy", "eka", "ell", "elx",
    "eng", "enm", "epo", "spa", "esu", "est", "eus", "ewo",
    "ext",
    "fas", "fan", "fat", "ful", "fin", "fil", "fit", "fij",
    "fao", "fon", "fra", "frc", "frm", "fro", "frp", "frr",
    "frs", "fur", "fry",
    "gle", "gaa", "gag", "gan", "gay", "gba", "gbz", "gla",
    "gez", "gil", "glg", "glk", "gmh", "grn", "goh", "gom",
    "gon", "gor", "got", "grb", "grc", "gsw", "guj", "guc",
    "gur", "guz", "glv", "gwi",
    "hau", "hai", "hak", "haw", "heb", "hin", "hif", "hil",
    "hit", "hmn", "hmo", "hrv", "hsb", "hsn", "hat", "hun",
    "hup", "hye", "her",
    "ina", "iba", "ibb", "ind", "ile", "ibo", "iii", "ipk",
    "ilo", "inh", "ido", "isl", "ita", "iku", "izh",
    "jpn", "jam", "jbo", "jgo", "jmc", "jpr", "jrb", "jut",
    "jav",
    "kat", "kaa", "kab", "kac", "kaj", "kam", "kaw", "kbd",
    "kbl", "kcg", "kde", "kea", "ken", "kfo", "kon", "kgp",
    "kha", "kho", "khq", "khw", "kik", "kiu", "kua", "kaz",
    "kkj", "kal", "kln", "khm", "kmb", "kan", "kor", "koi",
    "kok", "kos", "kpe", "kau", "krc", "kri", "krj", "krl",
    "kru", "kas", "ksb", "ksf", "ksh", "kur", "kum", "kut",
    "kom", "cor", "kir",
    "lat", "lad", "lag", "lah", "lam", "ltz", "lez", "lfn",
    "lug", "lim", "lij", "liv", "lkt", "lmo", "lin", "lao",
    "lol", "loz", "lrc", "lit", "ltg", "lub", "lua", "lui",
    "lun", "luo", "lus", "luy", "lav", "lzh", "lzz",
    "mad", "maf", "mag", "mai", "mak", "man", "mas", "mde",
    "mdf", "mdh", "mdr", "men", "mer", "mfe", "mlg", "mga",
    "mgh", "mgo", "mah", "mri", "mic", "min", "mis", "mkd",
    "mal", "mon", "mnc", "mni", "moh", "mos", "mar", "mrj",
    "msa", "mlt", "mua", "mul", "mus", "mwl", "mwr", "mwv",
    "mya", "mye", "myv", "mzn",
    "nau", "nan", "nap", "naq", "nob", "nde", "nds", "nep",
    "new", "ndo", "nia", "niu", "njo", "nld", "nmg", "nno",
    "nnh", "nor", "nog", "non", "nov", "nqo", "nbl", "nso",
    "nus", "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi",
    "oci", "oji", "orm", "ori", "oss", "osa", "ota",
    "pan", "pag", "pal", "pam", "pap", "pau", "pcd", "pdc",
    "pdt", "peo", "pfl", "phn", "pli", "pol", "pms", "pnt",
    "pon", "prg", "pro", "pus", "por",
    "que", "quc", "qug",
    "raj", "rap", "rar", "rgn", "rif", "roh", "run", "ron",
    "rof", "rom", "rtm", "rus", "rue", "rug", "rup",
    "kin", "rwk",
    "san", "sad", "sah", "sam", "saq", "sas", "sat", "saz",
    "sba", "sbp", "srd", "scn", "sco", "snd", "sdc", "sdh",
    "sme", "see", "seh", "sei", "sel", "ses", "sag", "sga",
    "sgs", "shi", "shn", "shu", "sin", "sid", "slk",
    "slv", "sli", "sly", "smo", "sma", "smj", "smn", "sms",
    "sna", "snk", "som", "sog", "sqi", "srp", "srn", "srr",
    "ssw", "ssy", "sot", "stq", "sun", "suk", "sus", "sux",
    "swe", "swa", "swb", "swc", "syc", "syr", "szl",
    "tam", "tcy", "tel", "tem", "teo", "ter", "tet", "tgk",
    "tha", "tir", "tig", "tiv", "tuk", "tkl", "tkr", "tgl",
    "tlh", "tli", "tly", "tmh", "tsn", "ton", "tog", "tpi",
    "tur", "tru", "trv", "tso", "tsd", "tsi", "tat", "ttt",
    "tum", "tvl", "twi", "twq", "tah", "tyv", "tzm",
    "udm", "uig", "uga", "ukr", "umb", "und", "urd", "uzb",
    "vai", "ven", "vec", "vep", "vie", "vls", "vmf", "vol",
    "vot", "vro", "vun",
    "wln", "wae", "wal", "war", "was", "wbp", "wol", "wuu",
    "xal", "xho", "xmf", "xog",
    "yao", "yap", "yav", "ybb", "yid", "yor", "yrl", "yue",
    "zha", "zap", "zbl", "zea", "zen", "zgh", "zho", "zul",
    "zun", "zxx", "zza",
NULL,   /* terminates the first list; entries correspond index-by-index to the first list of LANGUAGES */
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    "ind", "heb", "yid", "jaw", "srp",
NULL    /* terminates the second list, matching the obsolete codes at the end of LANGUAGES */
};
305
306/**
307 * Table of 2-letter country codes.
308 *
309 * This list must be in sorted order.  This list is returned directly
310 * to the user by some API.
311 *
312 * This list must be kept in sync with COUNTRIES_3, with corresponding
313 * entries matched.
314 *
315 * This table should be terminated with a NULL entry, followed by a
316 * second list, and another NULL entry.  The first list is visible to
317 * user code when this array is returned by API.  The second list
318 * contains codes we support, but do not expose through user API.
319 *
320 * Notes:
321 *
322 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
323 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
324 * new codes keeping the old ones for compatibility updated to include
325 * 1999/12/03 revisions *CWB*
326 *
327 * RO(ROM) is now RO(ROU) according to
328 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
329 */
static const char * const COUNTRIES[] = {
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
    "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",
    "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
NULL,   /* terminates the first list: the sorted codes visible to user code */
    "AN",  "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR",   /* obsolete country codes */
NULL    /* terminates the second list: codes supported but not exposed through user API */
};
365
/*
 * Deprecated 2-letter country codes, terminated by two NULL entries.
 * The entries line up positionally with REPLACEMENT_COUNTRIES; the code that
 * performs the lookup is not part of this section of the file.
 */
static const char* const DEPRECATED_COUNTRIES[] = {
    "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR", NULL, NULL /* deprecated country list */
};
/*
 * Current codes for the entries of DEPRECATED_COUNTRIES, in the same order,
 * terminated by two NULL entries. The commented-out row repeats the
 * deprecated codes so that each column can be checked by eye.
 */
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "AN", "BU", "CS", "DD", "DY", "FX", "HV", "NH", "RH", "SU", "TP", "UK", "VD", "YD", "YU", "ZR" */
    "CW", "MM", "RS", "DE", "BJ", "FR", "BF", "VU", "ZW", "RU", "TL", "GB", "VN", "YE", "RS", "CD", NULL, NULL  /* replacement country codes */
};
373
374/**
375 * Table of 3-letter country codes.
376 *
377 * This is a lookup table used to convert 3-letter country codes to
378 * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
379 * For all valid i, COUNTRIES[i] must refer to the same country as
380 * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
381 * to make eyeballing this baby easier.
382 *
383 * This table should be terminated with a NULL entry, followed by a
384 * second list, and another NULL entry.  The two lists correspond to
385 * the two lists in COUNTRIES.
386 */
static const char * const COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",      */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BQ",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BES", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
/*  "CU",  "CV",  "CW",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
    "CUB", "CPV", "CUW", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "SS",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "SSD", "STP", "SLV",
/*  "SX",  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SXM", "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL,   /* terminates the first list; entries correspond index-by-index to the first list of COUNTRIES */
/*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
    "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
NULL    /* terminates the second list, matching the obsolete codes at the end of COUNTRIES */
};
453
/*
 * One row of a locale-ID canonicalization table: an input ID, the ID it
 * canonicalizes to, and an optional keyword/value pair that belongs with the
 * canonical ID.
 */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if kw==NULL */
} CanonicalizationMap;
460
/**
 * A map to canonicalize locale IDs.  This handles a variety of
 * different semantic kinds of transformations.
 *
 * Each row is { id, canonicalID, keyword, value } as declared in
 * CanonicalizationMap.  Rows whose last two fields are NULL only rename
 * the ID; the other rows also carry a keyword/value pair (for example the
 * *_PREEURO rows carry "currency" with the pre-euro currency code).
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
    { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
    { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
    { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
    { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
    { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
    { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
    { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
    { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
    { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
    { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
    { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
    { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
    { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
    { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
    { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
    { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
    { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
    { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
    { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
    { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
    { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
    { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
    { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
    { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
    { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
    { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
    { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
    { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
    { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
    { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
    { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
    { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
    { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
    { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
    { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
    { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
    { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
    { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
    { "zh_GAN",         "gan", NULL, NULL }, /* registered name */
    { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
    { "zh_HAKKA",       "hak", NULL, NULL }, /* registered name */
    { "zh_MIN_NAN",     "nan", NULL, NULL }, /* registered name */
    { "zh_WUU",         "wuu", NULL, NULL }, /* registered name */
    { "zh_XIANG",       "hsn", NULL, NULL }, /* registered name */
    { "zh_YUE",         "yue", NULL, NULL }, /* registered name */
};
513
/*
 * One row of a table that associates a variant name with a keyword/value
 * pair.
 */
typedef struct VariantMap {
    const char *variant;          /* input variant name (e.g. "EURO") */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if kw==NULL */
} VariantMap;
519
/*
 * Variant names paired with the keyword and value they stand for.
 * Each row is { variant, keyword, value } as declared in VariantMap.
 */
static const VariantMap VARIANT_MAP[] = {
    { "EURO",   "currency", "EUR" },
    { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
    { "STROKE", "collation", "stroke" }  /* Solaris variant */
};
525
526/* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id looks like a BCP47 tag with an extension: the id is
 * non-NULL, contains no '@', and getShortestSubtagLength() reports a
 * one-character subtag (such as the "u" singleton).
 *
 * The macro now inspects its own argument; previously the body referred to a
 * variable named localeID in the caller's scope regardless of what was passed.
 * Note that the argument is evaluated more than once.
 */
#define _hasBCP47Extension(id) ((id) != NULL && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to a Unicode locale id using uloc_forLanguageTag().
 *
 * On success finalID is set to buffer, which then holds the converted id.
 * If the conversion returns a length <= 0, reports a failure in *err, or
 * fills buffer without room for the terminating NUL
 * (U_STRING_NOT_TERMINATED_WARNING), finalID is set to the original id.
 * In the not-terminated case *err is raised to U_BUFFER_OVERFLOW_ERROR so
 * that the unterminated buffer is never used as a C string.
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement; invoke it with a trailing semicolon.  Arguments are evaluated
 * more than once.
 */
#define _ConvertBCP47(finalID, id, buffer, length, err) \
        do { \
            if (uloc_forLanguageTag((id), (buffer), (length), NULL, (err)) <= 0 || \
                    U_FAILURE(*(err)) || *(err) == U_STRING_NOT_TERMINATED_WARNING) { \
                (finalID) = (id); \
                if (*(err) == U_STRING_NOT_TERMINATED_WARNING) { \
                    *(err) = U_BUFFER_OVERFLOW_ERROR; \
                } \
            } else { \
                (finalID) = (buffer); \
            } \
        } while (0)
/*
 * Gets the size of the shortest subtag in the given localeID.
 *
 * Subtags are separated by '_' or '-'.  Only non-empty subtags that are
 * followed by a separator are measured, so the final subtag never lowers
 * the result.  When no such subtag exists (no separator, or only empty
 * subtags before the separators) the full length of localeID is returned.
 *
 * The string is scanned once up to its terminating NUL, which avoids the
 * separate strlen pass and the implicit size_t -> int32_t narrowing of its
 * result.
 *
 * @param localeID NUL-terminated locale ID; must not be NULL.
 * @return length of the shortest measured subtag, or the length of localeID
 *         if none was measured.
 */
static int32_t getShortestSubtagLength(const char *localeID) {
    int32_t shortest = -1;  /* -1: no separator-terminated subtag seen yet */
    int32_t current = 0;    /* length of the subtag being scanned */
    int32_t i;

    for (i = 0; localeID[i] != 0; i++) {
        if (localeID[i] == '_' || localeID[i] == '-') {
            /* a subtag just ended; empty subtags (adjacent separators) are ignored */
            if (current != 0 && (shortest < 0 || current < shortest)) {
                shortest = current;
            }
            current = 0;
        } else {
            current++;
        }
    }

    /* i now equals the total length of localeID */
    return shortest < 0 ? i : shortest;
}
561
562/* ### Keywords **************************************************/
563
/* Size of the buffer that holds one keyword name, including the terminating NUL. */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Number of KeywordStruct slots in _getKeywords; reaching it is reported as an error there. */
#define ULOC_MAX_NO_KEYWORDS 25
566
567U_CAPI const char * U_EXPORT2
568locale_getKeywordsStart(const char *localeID) {
569    const char *result = NULL;
570    if((result = uprv_strchr(localeID, '@')) != NULL) {
571        return result;
572    }
573#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
574    else {
575        /* We do this because the @ sign is variant, and the @ sign used on one
576        EBCDIC machine won't be compiled the same way on other EBCDIC based
577        machines. */
578        static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
579        const uint8_t *charToFind = ebcdicSigns;
580        while(*charToFind) {
581            if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
582                return result;
583            }
584            charToFind++;
585        }
586    }
587#endif
588    return NULL;
589}
590
591/**
592 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
593 * @param keywordName incoming name to be canonicalized
594 * @param status return status (keyword too long)
595 * @return length of the keyword name
596 */
597static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
598{
599  int32_t i;
600  int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
601
602  if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
603    /* keyword name too long for internal buffer */
604    *status = U_INTERNAL_PROGRAM_ERROR;
605          return 0;
606  }
607
608  /* normalize the keyword name */
609  for(i = 0; i < keywordNameLen; i++) {
610    buf[i] = uprv_tolower(keywordName[i]);
611  }
612  buf[i] = 0;
613
614  return keywordNameLen;
615}
616
/* One keyword/value pair collected by _getKeywords while parsing a locale ID. */
typedef struct {
    /* keyword name, lowercased with spaces removed, NUL-terminated */
    char keyword[ULOC_KEYWORD_BUFFER_LEN];
    /* number of characters stored in keyword, excluding the NUL */
    int32_t keywordLen;
    /* points into the parsed string at the first non-space character after '=' */
    const char *valueStart;
    /* length of the value; NOTE(review): assigned outside the visible part of _getKeywords */
    int32_t valueLen;
} KeywordStruct;
623
624static int32_t U_CALLCONV
625compareKeywordStructs(const void * /*context*/, const void *left, const void *right) {
626    const char* leftString = ((const KeywordStruct *)left)->keyword;
627    const char* rightString = ((const KeywordStruct *)right)->keyword;
628    return uprv_strcmp(leftString, rightString);
629}
630
631/**
632 * Both addKeyword and addValue must already be in canonical form.
633 * Either both addKeyword and addValue are NULL, or neither is NULL.
634 * If they are not NULL they must be zero terminated.
 * If addKeyword is not NULL it must have length small enough to fit in KeywordStruct.keyword.
636 */
637static int32_t
638_getKeywords(const char *localeID,
639             char prev,
640             char *keywords, int32_t keywordCapacity,
641             char *values, int32_t valuesCapacity, int32_t *valLen,
642             UBool valuesToo,
643             const char* addKeyword,
644             const char* addValue,
645             UErrorCode *status)
646{
647    KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
648
649    int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
650    int32_t numKeywords = 0;
651    const char* pos = localeID;
652    const char* equalSign = NULL;
653    const char* semicolon = NULL;
654    int32_t i = 0, j, n;
655    int32_t keywordsLen = 0;
656    int32_t valuesLen = 0;
657
658    if(prev == '@') { /* start of keyword definition */
659        /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
660        do {
661            UBool duplicate = FALSE;
662            /* skip leading spaces */
663            while(*pos == ' ') {
664                pos++;
665            }
666            if (!*pos) { /* handle trailing "; " */
667                break;
668            }
669            if(numKeywords == maxKeywords) {
670                *status = U_INTERNAL_PROGRAM_ERROR;
671                return 0;
672            }
673            equalSign = uprv_strchr(pos, '=');
674            semicolon = uprv_strchr(pos, ';');
675            /* lack of '=' [foo@currency] is illegal */
676            /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
677            if(!equalSign || (semicolon && semicolon<equalSign)) {
678                *status = U_INVALID_FORMAT_ERROR;
679                return 0;
680            }
681            /* need to normalize both keyword and keyword name */
682            if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
683                /* keyword name too long for internal buffer */
684                *status = U_INTERNAL_PROGRAM_ERROR;
685                return 0;
686            }
687            for(i = 0, n = 0; i < equalSign - pos; ++i) {
688                if (pos[i] != ' ') {
689                    keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
690                }
691            }
692
693            /* zero-length keyword is an error. */
694            if (n == 0) {
695                *status = U_INVALID_FORMAT_ERROR;
696                return 0;
697            }
698
699            keywordList[numKeywords].keyword[n] = 0;
700            keywordList[numKeywords].keywordLen = n;
701            /* now grab the value part. First we skip the '=' */
702            equalSign++;
703            /* then we leading spaces */
704            while(*equalSign == ' ') {
705                equalSign++;
706            }
707
708            /* Premature end or zero-length value */
709            if (!*equalSign || equalSign == semicolon) {
710                *status = U_INVALID_FORMAT_ERROR;
711                return 0;
712            }
713
714            keywordList[numKeywords].valueStart = equalSign;
715
716            pos = semicolon;
717            i = 0;
718            if(pos) {
719                while(*(pos - i - 1) == ' ') {
720                    i++;
721                }
722                keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
723                pos++;
724            } else {
725                i = (int32_t)uprv_strlen(equalSign);
726                while(i && equalSign[i-1] == ' ') {
727                    i--;
728                }
729                keywordList[numKeywords].valueLen = i;
730            }
731            /* If this is a duplicate keyword, then ignore it */
732            for (j=0; j<numKeywords; ++j) {
733                if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
734                    duplicate = TRUE;
735                    break;
736                }
737            }
738            if (!duplicate) {
739                ++numKeywords;
740            }
741        } while(pos);
742
743        /* Handle addKeyword/addValue. */
744        if (addKeyword != NULL) {
745            UBool duplicate = FALSE;
746            U_ASSERT(addValue != NULL);
747            /* Search for duplicate; if found, do nothing. Explicit keyword
748               overrides addKeyword. */
749            for (j=0; j<numKeywords; ++j) {
750                if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
751                    duplicate = TRUE;
752                    break;
753                }
754            }
755            if (!duplicate) {
756                if (numKeywords == maxKeywords) {
757                    *status = U_INTERNAL_PROGRAM_ERROR;
758                    return 0;
759                }
760                uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
761                keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
762                keywordList[numKeywords].valueStart = addValue;
763                keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
764                ++numKeywords;
765            }
766        } else {
767            U_ASSERT(addValue == NULL);
768        }
769
770        /* now we have a list of keywords */
771        /* we need to sort it */
772        uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
773
774        /* Now construct the keyword part */
775        for(i = 0; i < numKeywords; i++) {
776            if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
777                uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
778                if(valuesToo) {
779                    keywords[keywordsLen + keywordList[i].keywordLen] = '=';
780                } else {
781                    keywords[keywordsLen + keywordList[i].keywordLen] = 0;
782                }
783            }
784            keywordsLen += keywordList[i].keywordLen + 1;
785            if(valuesToo) {
786                if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
787                    uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
788                }
789                keywordsLen += keywordList[i].valueLen;
790
791                if(i < numKeywords - 1) {
792                    if(keywordsLen < keywordCapacity) {
793                        keywords[keywordsLen] = ';';
794                    }
795                    keywordsLen++;
796                }
797            }
798            if(values) {
799                if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
800                    uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
801                    values[valuesLen + keywordList[i].valueLen] = 0;
802                }
803                valuesLen += keywordList[i].valueLen + 1;
804            }
805        }
806        if(values) {
807            values[valuesLen] = 0;
808            if(valLen) {
809                *valLen = valuesLen;
810            }
811        }
812        return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
813    } else {
814        return 0;
815    }
816}
817
818U_CFUNC int32_t
819locale_getKeywords(const char *localeID,
820                   char prev,
821                   char *keywords, int32_t keywordCapacity,
822                   char *values, int32_t valuesCapacity, int32_t *valLen,
823                   UBool valuesToo,
824                   UErrorCode *status) {
825    return _getKeywords(localeID, prev, keywords, keywordCapacity,
826                        values, valuesCapacity, valLen, valuesToo,
827                        NULL, NULL, status);
828}
829
830U_CAPI int32_t U_EXPORT2
831uloc_getKeywordValue(const char* localeID,
832                     const char* keywordName,
833                     char* buffer, int32_t bufferCapacity,
834                     UErrorCode* status)
835{
836    const char* startSearchHere = NULL;
837    const char* nextSeparator = NULL;
838    char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
839    char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
840    int32_t i = 0;
841    int32_t result = 0;
842
843    if(status && U_SUCCESS(*status) && localeID) {
844      char tempBuffer[ULOC_FULLNAME_CAPACITY];
845      const char* tmpLocaleID;
846
847      if (_hasBCP47Extension(localeID)) {
848          _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
849      } else {
850          tmpLocaleID=localeID;
851      }
852
853      startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
854      if(startSearchHere == NULL) {
855          /* no keywords, return at once */
856          return 0;
857      }
858
859      locale_canonKeywordName(keywordNameBuffer, keywordName, status);
860      if(U_FAILURE(*status)) {
861        return 0;
862      }
863
864      /* find the first keyword */
865      while(startSearchHere) {
866          startSearchHere++;
867          /* skip leading spaces (allowed?) */
868          while(*startSearchHere == ' ') {
869              startSearchHere++;
870          }
871          nextSeparator = uprv_strchr(startSearchHere, '=');
872          /* need to normalize both keyword and keyword name */
873          if(!nextSeparator) {
874              break;
875          }
876          if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
877              /* keyword name too long for internal buffer */
878              *status = U_INTERNAL_PROGRAM_ERROR;
879              return 0;
880          }
881          for(i = 0; i < nextSeparator - startSearchHere; i++) {
882              localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
883          }
884          /* trim trailing spaces */
885          while(startSearchHere[i-1] == ' ') {
886              i--;
887              U_ASSERT(i>=0);
888          }
889          localeKeywordNameBuffer[i] = 0;
890
891          startSearchHere = uprv_strchr(nextSeparator, ';');
892
893          if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
894              nextSeparator++;
895              while(*nextSeparator == ' ') {
896                  nextSeparator++;
897              }
898              /* we actually found the keyword. Copy the value */
899              if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
900                  while(*(startSearchHere-1) == ' ') {
901                      startSearchHere--;
902                  }
903                  uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
904                  result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
905              } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
906                  i = (int32_t)uprv_strlen(nextSeparator);
907                  while(nextSeparator[i - 1] == ' ') {
908                      i--;
909                  }
910                  uprv_strncpy(buffer, nextSeparator, i);
911                  result = u_terminateChars(buffer, bufferCapacity, i, status);
912              } else {
913                  /* give a bigger buffer, please */
914                  *status = U_BUFFER_OVERFLOW_ERROR;
915                  if(startSearchHere) {
916                      result = (int32_t)(startSearchHere - nextSeparator);
917                  } else {
918                      result = (int32_t)uprv_strlen(nextSeparator);
919                  }
920              }
921              return result;
922          }
923      }
924    }
925    return 0;
926}
927
928U_CAPI int32_t U_EXPORT2
929uloc_setKeywordValue(const char* keywordName,
930                     const char* keywordValue,
931                     char* buffer, int32_t bufferCapacity,
932                     UErrorCode* status)
933{
934    /* TODO: sorting. removal. */
935    int32_t keywordNameLen;
936    int32_t keywordValueLen;
937    int32_t bufLen;
938    int32_t needLen = 0;
939    int32_t foundValueLen;
940    int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
941    char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
942    char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
943    int32_t i = 0;
944    int32_t rc;
945    char* nextSeparator = NULL;
946    char* nextEqualsign = NULL;
947    char* startSearchHere = NULL;
948    char* keywordStart = NULL;
949    char *insertHere = NULL;
950    if(U_FAILURE(*status)) {
951        return -1;
952    }
953    if(bufferCapacity>1) {
954        bufLen = (int32_t)uprv_strlen(buffer);
955    } else {
956        *status = U_ILLEGAL_ARGUMENT_ERROR;
957        return 0;
958    }
959    if(bufferCapacity<bufLen) {
960        /* The capacity is less than the length?! Is this NULL terminated? */
961        *status = U_ILLEGAL_ARGUMENT_ERROR;
962        return 0;
963    }
964    if(keywordValue && !*keywordValue) {
965        keywordValue = NULL;
966    }
967    if(keywordValue) {
968        keywordValueLen = (int32_t)uprv_strlen(keywordValue);
969    } else {
970        keywordValueLen = 0;
971    }
972    keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
973    if(U_FAILURE(*status)) {
974        return 0;
975    }
976    startSearchHere = (char*)locale_getKeywordsStart(buffer);
977    if(startSearchHere == NULL || (startSearchHere[1]==0)) {
978        if(!keywordValue) { /* no keywords = nothing to remove */
979            return bufLen;
980        }
981
982        needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
983        if(startSearchHere) { /* had a single @ */
984            needLen--; /* already had the @ */
985            /* startSearchHere points at the @ */
986        } else {
987            startSearchHere=buffer+bufLen;
988        }
989        if(needLen >= bufferCapacity) {
990            *status = U_BUFFER_OVERFLOW_ERROR;
991            return needLen; /* no change */
992        }
993        *startSearchHere = '@';
994        startSearchHere++;
995        uprv_strcpy(startSearchHere, keywordNameBuffer);
996        startSearchHere += keywordNameLen;
997        *startSearchHere = '=';
998        startSearchHere++;
999        uprv_strcpy(startSearchHere, keywordValue);
1000        startSearchHere+=keywordValueLen;
1001        return needLen;
1002    } /* end shortcut - no @ */
1003
1004    keywordStart = startSearchHere;
1005    /* search for keyword */
1006    while(keywordStart) {
1007        keywordStart++;
1008        /* skip leading spaces (allowed?) */
1009        while(*keywordStart == ' ') {
1010            keywordStart++;
1011        }
1012        nextEqualsign = uprv_strchr(keywordStart, '=');
1013        /* need to normalize both keyword and keyword name */
1014        if(!nextEqualsign) {
1015            break;
1016        }
1017        if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1018            /* keyword name too long for internal buffer */
1019            *status = U_INTERNAL_PROGRAM_ERROR;
1020            return 0;
1021        }
1022        for(i = 0; i < nextEqualsign - keywordStart; i++) {
1023            localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1024        }
1025        /* trim trailing spaces */
1026        while(keywordStart[i-1] == ' ') {
1027            i--;
1028        }
1029        U_ASSERT(i>=0 && i<ULOC_KEYWORD_BUFFER_LEN);
1030        localeKeywordNameBuffer[i] = 0;
1031
1032        nextSeparator = uprv_strchr(nextEqualsign, ';');
1033        rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1034        if(rc == 0) {
1035            nextEqualsign++;
1036            while(*nextEqualsign == ' ') {
1037                nextEqualsign++;
1038            }
1039            /* we actually found the keyword. Change the value */
1040            if (nextSeparator) {
1041                keywordAtEnd = 0;
1042                foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1043            } else {
1044                keywordAtEnd = 1;
1045                foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1046            }
1047            if(keywordValue) { /* adding a value - not removing */
1048              if(foundValueLen == keywordValueLen) {
1049                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1050                return bufLen; /* no change in size */
1051              } else if(foundValueLen > keywordValueLen) {
1052                int32_t delta = foundValueLen - keywordValueLen;
1053                if(nextSeparator) { /* RH side */
1054                  uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1055                }
1056                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1057                bufLen -= delta;
1058                buffer[bufLen]=0;
1059                return bufLen;
1060              } else { /* FVL < KVL */
1061                int32_t delta = keywordValueLen - foundValueLen;
1062                if((bufLen+delta) >= bufferCapacity) {
1063                  *status = U_BUFFER_OVERFLOW_ERROR;
1064                  return bufLen+delta;
1065                }
1066                if(nextSeparator) { /* RH side */
1067                  uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1068                }
1069                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1070                bufLen += delta;
1071                buffer[bufLen]=0;
1072                return bufLen;
1073              }
1074            } else { /* removing a keyword */
1075              if(keywordAtEnd) {
1076                /* zero out the ';' or '@' just before startSearchhere */
1077                keywordStart[-1] = 0;
1078                return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1079              } else {
1080                uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1081                keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1082                return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1083              }
1084            }
1085        } else if(rc<0){ /* end match keyword */
1086          /* could insert at this location. */
1087          insertHere = keywordStart;
1088        }
1089        keywordStart = nextSeparator;
1090    } /* end loop searching */
1091
1092    if(!keywordValue) {
1093      return bufLen; /* removal of non-extant keyword - no change */
1094    }
1095
1096    /* we know there is at least one keyword. */
1097    needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1098    if(needLen >= bufferCapacity) {
1099        *status = U_BUFFER_OVERFLOW_ERROR;
1100        return needLen; /* no change */
1101    }
1102
1103    if(insertHere) {
1104      uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1105      keywordStart = insertHere;
1106    } else {
1107      keywordStart = buffer+bufLen;
1108      *keywordStart = ';';
1109      keywordStart++;
1110    }
1111    uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1112    keywordStart += keywordNameLen;
1113    *keywordStart = '=';
1114    keywordStart++;
1115    uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1116    keywordStart+=keywordValueLen;
1117    if(insertHere) {
1118      *keywordStart = ';';
1119      keywordStart++;
1120    }
1121    buffer[needLen]=0;
1122    return needLen;
1123}
1124
1125/* ### ID parsing implementation **************************************************/
1126
/* TRUE if a is one of the letters that may start a special prefix
 * ('x', 'X', 'i' or 'I'). The argument is evaluated more than once, so it
 * must not have side effects. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 *
 * The argument is evaluated more than once, so it must not have side effects.
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1137
1138static char* _strnchr(const char* str, int32_t len, char c) {
1139    U_ASSERT(str != 0 && len >= 0);
1140    while (len-- != 0) {
1141        char d = *str;
1142        if (d == c) {
1143            return (char*) str;
1144        } else if (d == 0) {
1145            break;
1146        }
1147        ++str;
1148    }
1149    return NULL;
1150}
1151
1152/**
1153 * Lookup 'key' in the array 'list'.  The array 'list' should contain
1154 * a NULL entry, followed by more entries, and a second NULL entry.
1155 *
1156 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1157 * COUNTRIES_3.
1158 */
1159static int16_t _findIndex(const char* const* list, const char* key)
1160{
1161    const char* const* anchor = list;
1162    int32_t pass = 0;
1163
1164    /* Make two passes through two NULL-terminated arrays at 'list' */
1165    while (pass++ < 2) {
1166        while (*list) {
1167            if (uprv_strcmp(key, *list) == 0) {
1168                return (int16_t)(list - anchor);
1169            }
1170            list++;
1171        }
1172        ++list;     /* skip final NULL *CWB*/
1173    }
1174    return -1;
1175}
1176
1177/* count the length of src while copying it to dest; return strlen(src) */
1178static inline int32_t
1179_copyCount(char *dest, int32_t destCapacity, const char *src) {
1180    const char *anchor;
1181    char c;
1182
1183    anchor=src;
1184    for(;;) {
1185        if((c=*src)==0) {
1186            return (int32_t)(src-anchor);
1187        }
1188        if(destCapacity<=0) {
1189            return (int32_t)((src-anchor)+uprv_strlen(src));
1190        }
1191        ++src;
1192        *dest++=c;
1193        --destCapacity;
1194    }
1195}
1196
1197U_CFUNC const char*
1198uloc_getCurrentCountryID(const char* oldID){
1199    int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1200    if (offset >= 0) {
1201        return REPLACEMENT_COUNTRIES[offset];
1202    }
1203    return oldID;
1204}
1205U_CFUNC const char*
1206uloc_getCurrentLanguageID(const char* oldID){
1207    int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1208    if (offset >= 0) {
1209        return REPLACEMENT_LANGUAGES[offset];
1210    }
1211    return oldID;
1212}
1213/*
1214 * the internal functions _getLanguage(), _getCountry(), _getVariant()
1215 * avoid duplicating code to handle the earlier locale ID pieces
1216 * in the functions for the later ones by
1217 * setting the *pEnd pointer to where they stopped parsing
1218 *
1219 * TODO try to use this in Locale
1220 */
1221U_CFUNC int32_t
1222ulocimp_getLanguage(const char *localeID,
1223                    char *language, int32_t languageCapacity,
1224                    const char **pEnd) {
1225    int32_t i=0;
1226    int32_t offset;
1227    char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1228
1229    /* if it starts with i- or x- then copy that prefix */
1230    if(_isIDPrefix(localeID)) {
1231        if(i<languageCapacity) {
1232            language[i]=(char)uprv_tolower(*localeID);
1233        }
1234        if(i<languageCapacity) {
1235            language[i+1]='-';
1236        }
1237        i+=2;
1238        localeID+=2;
1239    }
1240
1241    /* copy the language as far as possible and count its length */
1242    while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1243        if(i<languageCapacity) {
1244            language[i]=(char)uprv_tolower(*localeID);
1245        }
1246        if(i<3) {
1247            U_ASSERT(i>=0);
1248            lang[i]=(char)uprv_tolower(*localeID);
1249        }
1250        i++;
1251        localeID++;
1252    }
1253
1254    if(i==3) {
1255        /* convert 3 character code to 2 character code if possible *CWB*/
1256        offset=_findIndex(LANGUAGES_3, lang);
1257        if(offset>=0) {
1258            i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1259        }
1260    }
1261
1262    if(pEnd!=NULL) {
1263        *pEnd=localeID;
1264    }
1265    return i;
1266}
1267
1268U_CFUNC int32_t
1269ulocimp_getScript(const char *localeID,
1270                  char *script, int32_t scriptCapacity,
1271                  const char **pEnd)
1272{
1273    int32_t idLen = 0;
1274
1275    if (pEnd != NULL) {
1276        *pEnd = localeID;
1277    }
1278
1279    /* copy the second item as far as possible and count its length */
1280    while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])
1281            && uprv_isASCIILetter(localeID[idLen])) {
1282        idLen++;
1283    }
1284
1285    /* If it's exactly 4 characters long, then it's a script and not a country. */
1286    if (idLen == 4) {
1287        int32_t i;
1288        if (pEnd != NULL) {
1289            *pEnd = localeID+idLen;
1290        }
1291        if(idLen > scriptCapacity) {
1292            idLen = scriptCapacity;
1293        }
1294        if (idLen >= 1) {
1295            script[0]=(char)uprv_toupper(*(localeID++));
1296        }
1297        for (i = 1; i < idLen; i++) {
1298            script[i]=(char)uprv_tolower(*(localeID++));
1299        }
1300    }
1301    else {
1302        idLen = 0;
1303    }
1304    return idLen;
1305}
1306
1307U_CFUNC int32_t
1308ulocimp_getCountry(const char *localeID,
1309                   char *country, int32_t countryCapacity,
1310                   const char **pEnd)
1311{
1312    int32_t idLen=0;
1313    char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1314    int32_t offset;
1315
1316    /* copy the country as far as possible and count its length */
1317    while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1318        if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1319            cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1320        }
1321        idLen++;
1322    }
1323
1324    /* the country should be either length 2 or 3 */
1325    if (idLen == 2 || idLen == 3) {
1326        UBool gotCountry = FALSE;
1327        /* convert 3 character code to 2 character code if possible *CWB*/
1328        if(idLen==3) {
1329            offset=_findIndex(COUNTRIES_3, cnty);
1330            if(offset>=0) {
1331                idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1332                gotCountry = TRUE;
1333            }
1334        }
1335        if (!gotCountry) {
1336            int32_t i = 0;
1337            for (i = 0; i < idLen; i++) {
1338                if (i < countryCapacity) {
1339                    country[i]=(char)uprv_toupper(localeID[i]);
1340                }
1341            }
1342        }
1343        localeID+=idLen;
1344    } else {
1345        idLen = 0;
1346    }
1347
1348    if(pEnd!=NULL) {
1349        *pEnd=localeID;
1350    }
1351
1352    return idLen;
1353}
1354
1355/**
1356 * @param needSeparator if true, then add leading '_' if any variants
1357 * are added to 'variant'
1358 */
1359static int32_t
1360_getVariantEx(const char *localeID,
1361              char prev,
1362              char *variant, int32_t variantCapacity,
1363              UBool needSeparator) {
1364    int32_t i=0;
1365
1366    /* get one or more variant tags and separate them with '_' */
1367    if(_isIDSeparator(prev)) {
1368        /* get a variant string after a '-' or '_' */
1369        while(!_isTerminator(*localeID)) {
1370            if (needSeparator) {
1371                if (i<variantCapacity) {
1372                    variant[i] = '_';
1373                }
1374                ++i;
1375                needSeparator = FALSE;
1376            }
1377            if(i<variantCapacity) {
1378                variant[i]=(char)uprv_toupper(*localeID);
1379                if(variant[i]=='-') {
1380                    variant[i]='_';
1381                }
1382            }
1383            i++;
1384            localeID++;
1385        }
1386    }
1387
1388    /* if there is no variant tag after a '-' or '_' then look for '@' */
1389    if(i==0) {
1390        if(prev=='@') {
1391            /* keep localeID */
1392        } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1393            ++localeID; /* point after the '@' */
1394        } else {
1395            return 0;
1396        }
1397        while(!_isTerminator(*localeID)) {
1398            if (needSeparator) {
1399                if (i<variantCapacity) {
1400                    variant[i] = '_';
1401                }
1402                ++i;
1403                needSeparator = FALSE;
1404            }
1405            if(i<variantCapacity) {
1406                variant[i]=(char)uprv_toupper(*localeID);
1407                if(variant[i]=='-' || variant[i]==',') {
1408                    variant[i]='_';
1409                }
1410            }
1411            i++;
1412            localeID++;
1413        }
1414    }
1415
1416    return i;
1417}
1418
1419static int32_t
1420_getVariant(const char *localeID,
1421            char prev,
1422            char *variant, int32_t variantCapacity) {
1423    return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1424}
1425
1426/**
1427 * Delete ALL instances of a variant from the given list of one or
1428 * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1429 * @param variants the source string of one or more variants,
1430 * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1431 * terminated; if it is, trailing zero will NOT be maintained.
1432 * @param variantsLen length of variants
1433 * @param toDelete variant to delete, without separators, e.g.  "EURO"
1434 * or "PREEURO"; not zero terminated
1435 * @param toDeleteLen length of toDelete
1436 * @return number of characters deleted from variants
1437 */
1438static int32_t
1439_deleteVariant(char* variants, int32_t variantsLen,
1440               const char* toDelete, int32_t toDeleteLen)
1441{
1442    int32_t delta = 0; /* number of chars deleted */
1443    for (;;) {
1444        UBool flag = FALSE;
1445        if (variantsLen < toDeleteLen) {
1446            return delta;
1447        }
1448        if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1449            (variantsLen == toDeleteLen ||
1450             (flag=(variants[toDeleteLen] == '_'))))
1451        {
1452            int32_t d = toDeleteLen + (flag?1:0);
1453            variantsLen -= d;
1454            delta += d;
1455            if (variantsLen > 0) {
1456                uprv_memmove(variants, variants+d, variantsLen);
1457            }
1458        } else {
1459            char* p = _strnchr(variants, variantsLen, '_');
1460            if (p == NULL) {
1461                return delta;
1462            }
1463            ++p;
1464            variantsLen -= (int32_t)(p - variants);
1465            variants = p;
1466        }
1467    }
1468}
1469
1470/* Keyword enumeration */
1471
/**
 * Iteration state stored in the context pointer of a keyword UEnumeration.
 */
typedef struct UKeywordsContext {
    /* Start of the keyword list: NUL-terminated keyword strings stored back
       to back, ended by an empty string. Released with uprv_free() by
       uloc_kw_closeKeywords(). */
    char* keywords;
    /* Keyword that uloc_kw_nextKeyword() returns next; points into keywords. */
    char* current;
} UKeywordsContext;
1476
1477U_CDECL_BEGIN
1478
1479static void U_CALLCONV
1480uloc_kw_closeKeywords(UEnumeration *enumerator) {
1481    uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1482    uprv_free(enumerator->context);
1483    uprv_free(enumerator);
1484}
1485
1486static int32_t U_CALLCONV
1487uloc_kw_countKeywords(UEnumeration *en, UErrorCode * /*status*/) {
1488    char *kw = ((UKeywordsContext *)en->context)->keywords;
1489    int32_t result = 0;
1490    while(*kw) {
1491        result++;
1492        kw += uprv_strlen(kw)+1;
1493    }
1494    return result;
1495}
1496
1497static const char * U_CALLCONV
1498uloc_kw_nextKeyword(UEnumeration* en,
1499                    int32_t* resultLength,
1500                    UErrorCode* /*status*/) {
1501    const char* result = ((UKeywordsContext *)en->context)->current;
1502    int32_t len = 0;
1503    if(*result) {
1504        len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1505        ((UKeywordsContext *)en->context)->current += len+1;
1506    } else {
1507        result = NULL;
1508    }
1509    if (resultLength) {
1510        *resultLength = len;
1511    }
1512    return result;
1513}
1514
1515static void U_CALLCONV
1516uloc_kw_resetKeywords(UEnumeration* en,
1517                      UErrorCode* /*status*/) {
1518    ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1519}
1520
1521U_CDECL_END
1522
1523
/*
 * Template for keyword enumerations: uloc_openKeywordList() copies it into
 * each new UEnumeration and then sets the context pointer.
 * NOTE(review): the meaning of each slot follows the UEnumeration
 * declaration in uenumimp.h, which is not visible here - confirm the order.
 */
static const UEnumeration gKeywordsEnum = {
    NULL,
    NULL,                   /* presumably the context slot; filled in per instance */
    uloc_kw_closeKeywords,
    uloc_kw_countKeywords,
    uenum_unextDefault,
    uloc_kw_nextKeyword,
    uloc_kw_resetKeywords
};
1533
1534U_CAPI UEnumeration* U_EXPORT2
1535uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1536{
1537    UKeywordsContext *myContext = NULL;
1538    UEnumeration *result = NULL;
1539
1540    if(U_FAILURE(*status)) {
1541        return NULL;
1542    }
1543    result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1544    /* Null pointer test */
1545    if (result == NULL) {
1546        *status = U_MEMORY_ALLOCATION_ERROR;
1547        return NULL;
1548    }
1549    uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1550    myContext = static_cast<UKeywordsContext *>(uprv_malloc(sizeof(UKeywordsContext)));
1551    if (myContext == NULL) {
1552        *status = U_MEMORY_ALLOCATION_ERROR;
1553        uprv_free(result);
1554        return NULL;
1555    }
1556    myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1557    uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1558    myContext->keywords[keywordListSize] = 0;
1559    myContext->current = myContext->keywords;
1560    result->context = myContext;
1561    return result;
1562}
1563
1564U_CAPI UEnumeration* U_EXPORT2
1565uloc_openKeywords(const char* localeID,
1566                        UErrorCode* status)
1567{
1568    int32_t i=0;
1569    char keywords[256];
1570    int32_t keywordsCapacity = 256;
1571    char tempBuffer[ULOC_FULLNAME_CAPACITY];
1572    const char* tmpLocaleID;
1573
1574    if(status==NULL || U_FAILURE(*status)) {
1575        return 0;
1576    }
1577
1578    if (_hasBCP47Extension(localeID)) {
1579        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1580    } else {
1581        if (localeID==NULL) {
1582           localeID=uloc_getDefault();
1583        }
1584        tmpLocaleID=localeID;
1585    }
1586
1587    /* Skip the language */
1588    ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1589    if(_isIDSeparator(*tmpLocaleID)) {
1590        const char *scriptID;
1591        /* Skip the script if available */
1592        ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1593        if(scriptID != tmpLocaleID+1) {
1594            /* Found optional script */
1595            tmpLocaleID = scriptID;
1596        }
1597        /* Skip the Country */
1598        if (_isIDSeparator(*tmpLocaleID)) {
1599            ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1600            if(_isIDSeparator(*tmpLocaleID)) {
1601                _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1602            }
1603        }
1604    }
1605
1606    /* keywords are located after '@' */
1607    if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1608        i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1609    }
1610
1611    if(i) {
1612        return uloc_openKeywordList(keywords, i, status);
1613    } else {
1614        return NULL;
1615    }
1616}
1617
1618
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/*
 * Non-zero if any bit of 'mask' is set in 'options'.
 * Both arguments are parenthesized so that expression arguments
 * (for example "a | b") are evaluated before the '&'.
 */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The language tag "i-default"; deliberately not NUL-terminated, it is only
   compared with an explicit length. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH UPRV_LENGTHOF(i_default)
1627
1628/**
1629 * Canonicalize the given localeID, to level 1 or to level 2,
1630 * depending on the options.  To specify level 1, pass in options=0.
1631 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1632 *
1633 * This is the code underlying uloc_getName and uloc_canonicalize.
1634 */
1635static int32_t
1636_canonicalize(const char* localeID,
1637              char* result,
1638              int32_t resultCapacity,
1639              uint32_t options,
1640              UErrorCode* err) {
1641    int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1642    char localeBuffer[ULOC_FULLNAME_CAPACITY];
1643    char tempBuffer[ULOC_FULLNAME_CAPACITY];
1644    const char* origLocaleID;
1645    const char* tmpLocaleID;
1646    const char* keywordAssign = NULL;
1647    const char* separatorIndicator = NULL;
1648    const char* addKeyword = NULL;
1649    const char* addValue = NULL;
1650    char* name;
1651    char* variant = NULL; /* pointer into name, or NULL */
1652
1653    if (U_FAILURE(*err)) {
1654        return 0;
1655    }
1656
1657    if (_hasBCP47Extension(localeID)) {
1658        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1659    } else {
1660        if (localeID==NULL) {
1661           localeID=uloc_getDefault();
1662        }
1663        tmpLocaleID=localeID;
1664    }
1665
1666    origLocaleID=tmpLocaleID;
1667
1668    /* if we are doing a full canonicalization, then put results in
1669       localeBuffer, if necessary; otherwise send them to result. */
1670    if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1671        (result == NULL || resultCapacity < (int32_t)sizeof(localeBuffer))) {
1672        name = localeBuffer;
1673        nameCapacity = (int32_t)sizeof(localeBuffer);
1674    } else {
1675        name = result;
1676        nameCapacity = resultCapacity;
1677    }
1678
1679    /* get all pieces, one after another, and separate with '_' */
1680    len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1681
1682    if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1683        const char *d = uloc_getDefault();
1684
1685        len = (int32_t)uprv_strlen(d);
1686
1687        if (name != NULL) {
1688            uprv_strncpy(name, d, len);
1689        }
1690    } else if(_isIDSeparator(*tmpLocaleID)) {
1691        const char *scriptID;
1692
1693        ++fieldCount;
1694        if(len<nameCapacity) {
1695            name[len]='_';
1696        }
1697        ++len;
1698
1699        scriptSize=ulocimp_getScript(tmpLocaleID+1,
1700            (len<nameCapacity ? name+len : NULL), nameCapacity-len, &scriptID);
1701        if(scriptSize > 0) {
1702            /* Found optional script */
1703            tmpLocaleID = scriptID;
1704            ++fieldCount;
1705            len+=scriptSize;
1706            if (_isIDSeparator(*tmpLocaleID)) {
1707                /* If there is something else, then we add the _ */
1708                if(len<nameCapacity) {
1709                    name[len]='_';
1710                }
1711                ++len;
1712            }
1713        }
1714
1715        if (_isIDSeparator(*tmpLocaleID)) {
1716            const char *cntryID;
1717            int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1,
1718                (len<nameCapacity ? name+len : NULL), nameCapacity-len, &cntryID);
1719            if (cntrySize > 0) {
1720                /* Found optional country */
1721                tmpLocaleID = cntryID;
1722                len+=cntrySize;
1723            }
1724            if(_isIDSeparator(*tmpLocaleID)) {
1725                /* If there is something else, then we add the _  if we found country before. */
1726                if (cntrySize >= 0 && ! _isIDSeparator(*(tmpLocaleID+1)) ) {
1727                    ++fieldCount;
1728                    if(len<nameCapacity) {
1729                        name[len]='_';
1730                    }
1731                    ++len;
1732                }
1733
1734                variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID,
1735                    (len<nameCapacity ? name+len : NULL), nameCapacity-len);
1736                if (variantSize > 0) {
1737                    variant = len<nameCapacity ? name+len : NULL;
1738                    len += variantSize;
1739                    tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1740                }
1741            }
1742        }
1743    }
1744
1745    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1746    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1747        UBool done = FALSE;
1748        do {
1749            char c = *tmpLocaleID;
1750            switch (c) {
1751            case 0:
1752            case '@':
1753                done = TRUE;
1754                break;
1755            default:
1756                if (len<nameCapacity) {
1757                    name[len] = c;
1758                }
1759                ++len;
1760                ++tmpLocaleID;
1761                break;
1762            }
1763        } while (!done);
1764    }
1765
1766    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1767       After this, tmpLocaleID either points to '@' or is NULL */
1768    if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1769        keywordAssign = uprv_strchr(tmpLocaleID, '=');
1770        separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1771    }
1772
1773    /* Copy POSIX-style variant, if any [mr@FOO] */
1774    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1775        tmpLocaleID != NULL && keywordAssign == NULL) {
1776        for (;;) {
1777            char c = *tmpLocaleID;
1778            if (c == 0) {
1779                break;
1780            }
1781            if (len<nameCapacity) {
1782                name[len] = c;
1783            }
1784            ++len;
1785            ++tmpLocaleID;
1786        }
1787    }
1788
1789    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1790        /* Handle @FOO variant if @ is present and not followed by = */
1791        if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1792            int32_t posixVariantSize;
1793            /* Add missing '_' if needed */
1794            if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1795                do {
1796                    if(len<nameCapacity) {
1797                        name[len]='_';
1798                    }
1799                    ++len;
1800                    ++fieldCount;
1801                } while(fieldCount<2);
1802            }
1803            posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1804                                             (UBool)(variantSize > 0));
1805            if (posixVariantSize > 0) {
1806                if (variant == NULL) {
1807                    variant = name+len;
1808                }
1809                len += posixVariantSize;
1810                variantSize += posixVariantSize;
1811            }
1812        }
1813
1814        /* Handle generic variants first */
1815        if (variant) {
1816            for (j=0; j<UPRV_LENGTHOF(VARIANT_MAP); j++) {
1817                const char* variantToCompare = VARIANT_MAP[j].variant;
1818                int32_t n = (int32_t)uprv_strlen(variantToCompare);
1819                int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1820                len -= variantLen;
1821                if (variantLen > 0) {
1822                    if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1823                        --len;
1824                    }
1825                    addKeyword = VARIANT_MAP[j].keyword;
1826                    addValue = VARIANT_MAP[j].value;
1827                    break;
1828                }
1829            }
1830            if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1831                --len;
1832            }
1833        }
1834
1835        /* Look up the ID in the canonicalization map */
1836        for (j=0; j<UPRV_LENGTHOF(CANONICALIZE_MAP); j++) {
1837            const char* id = CANONICALIZE_MAP[j].id;
1838            int32_t n = (int32_t)uprv_strlen(id);
1839            if (len == n && uprv_strncmp(name, id, n) == 0) {
1840                if (n == 0 && tmpLocaleID != NULL) {
1841                    break; /* Don't remap "" if keywords present */
1842                }
1843                len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1844                if (CANONICALIZE_MAP[j].keyword) {
1845                    addKeyword = CANONICALIZE_MAP[j].keyword;
1846                    addValue = CANONICALIZE_MAP[j].value;
1847                }
1848                break;
1849            }
1850        }
1851    }
1852
1853    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1854        if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1855            (!separatorIndicator || separatorIndicator > keywordAssign)) {
1856            if(len<nameCapacity) {
1857                name[len]='@';
1858            }
1859            ++len;
1860            ++fieldCount;
1861            len += _getKeywords(tmpLocaleID+1, '@', (len<nameCapacity ? name+len : NULL), nameCapacity-len,
1862                                NULL, 0, NULL, TRUE, addKeyword, addValue, err);
1863        } else if (addKeyword != NULL) {
1864            U_ASSERT(addValue != NULL && len < nameCapacity);
1865            /* inelegant but works -- later make _getKeywords do this? */
1866            len += _copyCount(name+len, nameCapacity-len, "@");
1867            len += _copyCount(name+len, nameCapacity-len, addKeyword);
1868            len += _copyCount(name+len, nameCapacity-len, "=");
1869            len += _copyCount(name+len, nameCapacity-len, addValue);
1870        }
1871    }
1872
1873    if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1874        uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1875    }
1876
1877    return u_terminateChars(result, resultCapacity, len, err);
1878}
1879
1880/* ### ID parsing API **************************************************/
1881
1882U_CAPI int32_t  U_EXPORT2
1883uloc_getParent(const char*    localeID,
1884               char* parent,
1885               int32_t parentCapacity,
1886               UErrorCode* err)
1887{
1888    const char *lastUnderscore;
1889    int32_t i;
1890
1891    if (U_FAILURE(*err))
1892        return 0;
1893
1894    if (localeID == NULL)
1895        localeID = uloc_getDefault();
1896
1897    lastUnderscore=uprv_strrchr(localeID, '_');
1898    if(lastUnderscore!=NULL) {
1899        i=(int32_t)(lastUnderscore-localeID);
1900    } else {
1901        i=0;
1902    }
1903
1904    if(i>0 && parent != localeID) {
1905        uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1906    }
1907    return u_terminateChars(parent, parentCapacity, i, err);
1908}
1909
1910U_CAPI int32_t U_EXPORT2
1911uloc_getLanguage(const char*    localeID,
1912         char* language,
1913         int32_t languageCapacity,
1914         UErrorCode* err)
1915{
1916    /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1917    int32_t i=0;
1918
1919    if (err==NULL || U_FAILURE(*err)) {
1920        return 0;
1921    }
1922
1923    if(localeID==NULL) {
1924        localeID=uloc_getDefault();
1925    }
1926
1927    i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1928    return u_terminateChars(language, languageCapacity, i, err);
1929}
1930
1931U_CAPI int32_t U_EXPORT2
1932uloc_getScript(const char*    localeID,
1933         char* script,
1934         int32_t scriptCapacity,
1935         UErrorCode* err)
1936{
1937    int32_t i=0;
1938
1939    if(err==NULL || U_FAILURE(*err)) {
1940        return 0;
1941    }
1942
1943    if(localeID==NULL) {
1944        localeID=uloc_getDefault();
1945    }
1946
1947    /* skip the language */
1948    ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1949    if(_isIDSeparator(*localeID)) {
1950        i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1951    }
1952    return u_terminateChars(script, scriptCapacity, i, err);
1953}
1954
1955U_CAPI int32_t  U_EXPORT2
1956uloc_getCountry(const char* localeID,
1957            char* country,
1958            int32_t countryCapacity,
1959            UErrorCode* err)
1960{
1961    int32_t i=0;
1962
1963    if(err==NULL || U_FAILURE(*err)) {
1964        return 0;
1965    }
1966
1967    if(localeID==NULL) {
1968        localeID=uloc_getDefault();
1969    }
1970
1971    /* Skip the language */
1972    ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1973    if(_isIDSeparator(*localeID)) {
1974        const char *scriptID;
1975        /* Skip the script if available */
1976        ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1977        if(scriptID != localeID+1) {
1978            /* Found optional script */
1979            localeID = scriptID;
1980        }
1981        if(_isIDSeparator(*localeID)) {
1982            i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1983        }
1984    }
1985    return u_terminateChars(country, countryCapacity, i, err);
1986}
1987
1988U_CAPI int32_t  U_EXPORT2
1989uloc_getVariant(const char* localeID,
1990                char* variant,
1991                int32_t variantCapacity,
1992                UErrorCode* err)
1993{
1994    char tempBuffer[ULOC_FULLNAME_CAPACITY];
1995    const char* tmpLocaleID;
1996    int32_t i=0;
1997
1998    if(err==NULL || U_FAILURE(*err)) {
1999        return 0;
2000    }
2001
2002    if (_hasBCP47Extension(localeID)) {
2003        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
2004    } else {
2005        if (localeID==NULL) {
2006           localeID=uloc_getDefault();
2007        }
2008        tmpLocaleID=localeID;
2009    }
2010
2011    /* Skip the language */
2012    ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
2013    if(_isIDSeparator(*tmpLocaleID)) {
2014        const char *scriptID;
2015        /* Skip the script if available */
2016        ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2017        if(scriptID != tmpLocaleID+1) {
2018            /* Found optional script */
2019            tmpLocaleID = scriptID;
2020        }
2021        /* Skip the Country */
2022        if (_isIDSeparator(*tmpLocaleID)) {
2023            const char *cntryID;
2024            ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2025            if (cntryID != tmpLocaleID+1) {
2026                /* Found optional country */
2027                tmpLocaleID = cntryID;
2028            }
2029            if(_isIDSeparator(*tmpLocaleID)) {
2030                /* If there was no country ID, skip a possible extra IDSeparator */
2031                if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2032                    tmpLocaleID++;
2033                }
2034                i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2035            }
2036        }
2037    }
2038
2039    /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2040    /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2041/*
2042    if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2043        i=_getVariant(localeID+1, '@', variant, variantCapacity);
2044    }
2045*/
2046    return u_terminateChars(variant, variantCapacity, i, err);
2047}
2048
2049U_CAPI int32_t  U_EXPORT2
2050uloc_getName(const char* localeID,
2051             char* name,
2052             int32_t nameCapacity,
2053             UErrorCode* err)
2054{
2055    return _canonicalize(localeID, name, nameCapacity, 0, err);
2056}
2057
2058U_CAPI int32_t  U_EXPORT2
2059uloc_getBaseName(const char* localeID,
2060                 char* name,
2061                 int32_t nameCapacity,
2062                 UErrorCode* err)
2063{
2064    return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2065}
2066
2067U_CAPI int32_t  U_EXPORT2
2068uloc_canonicalize(const char* localeID,
2069                  char* name,
2070                  int32_t nameCapacity,
2071                  UErrorCode* err)
2072{
2073    return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2074}
2075
2076U_CAPI const char*  U_EXPORT2
2077uloc_getISO3Language(const char* localeID)
2078{
2079    int16_t offset;
2080    char lang[ULOC_LANG_CAPACITY];
2081    UErrorCode err = U_ZERO_ERROR;
2082
2083    if (localeID == NULL)
2084    {
2085        localeID = uloc_getDefault();
2086    }
2087    uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2088    if (U_FAILURE(err))
2089        return "";
2090    offset = _findIndex(LANGUAGES, lang);
2091    if (offset < 0)
2092        return "";
2093    return LANGUAGES_3[offset];
2094}
2095
2096U_CAPI const char*  U_EXPORT2
2097uloc_getISO3Country(const char* localeID)
2098{
2099    int16_t offset;
2100    char cntry[ULOC_LANG_CAPACITY];
2101    UErrorCode err = U_ZERO_ERROR;
2102
2103    if (localeID == NULL)
2104    {
2105        localeID = uloc_getDefault();
2106    }
2107    uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2108    if (U_FAILURE(err))
2109        return "";
2110    offset = _findIndex(COUNTRIES, cntry);
2111    if (offset < 0)
2112        return "";
2113
2114    return COUNTRIES_3[offset];
2115}
2116
2117U_CAPI uint32_t  U_EXPORT2
2118uloc_getLCID(const char* localeID)
2119{
2120    UErrorCode status = U_ZERO_ERROR;
2121    char       langID[ULOC_FULLNAME_CAPACITY];
2122
2123    uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2124    if (U_FAILURE(status)) {
2125        return 0;
2126    }
2127
2128    if (uprv_strchr(localeID, '@')) {
2129        // uprv_convertToLCID does not support keywords other than collation.
2130        // Remove all keywords except collation.
2131        int32_t len;
2132        char collVal[ULOC_KEYWORDS_CAPACITY];
2133        char tmpLocaleID[ULOC_FULLNAME_CAPACITY];
2134
2135        len = uloc_getKeywordValue(localeID, "collation", collVal,
2136            UPRV_LENGTHOF(collVal) - 1, &status);
2137
2138        if (U_SUCCESS(status) && len > 0) {
2139            collVal[len] = 0;
2140
2141            len = uloc_getBaseName(localeID, tmpLocaleID,
2142                UPRV_LENGTHOF(tmpLocaleID) - 1, &status);
2143
2144            if (U_SUCCESS(status) && len > 0) {
2145                tmpLocaleID[len] = 0;
2146
2147                len = uloc_setKeywordValue("collation", collVal, tmpLocaleID,
2148                    UPRV_LENGTHOF(tmpLocaleID) - len - 1, &status);
2149
2150                if (U_SUCCESS(status) && len > 0) {
2151                    tmpLocaleID[len] = 0;
2152                    return uprv_convertToLCID(langID, tmpLocaleID, &status);
2153                }
2154            }
2155        }
2156
2157        // fall through - all keywords are simply ignored
2158        status = U_ZERO_ERROR;
2159    }
2160
2161    return uprv_convertToLCID(langID, localeID, &status);
2162}
2163
2164U_CAPI int32_t U_EXPORT2
2165uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2166                UErrorCode *status)
2167{
2168    return uprv_convertToPosix(hostid, locale, localeCapacity, status);
2169}
2170
2171/* ### Default locale **************************************************/
2172
2173U_CAPI const char*  U_EXPORT2
2174uloc_getDefault()
2175{
2176    return locale_get_default();
2177}
2178
2179U_CAPI void  U_EXPORT2
2180uloc_setDefault(const char*   newDefaultLocale,
2181             UErrorCode* err)
2182{
2183    if (U_FAILURE(*err))
2184        return;
2185    /* the error code isn't currently used for anything by this function*/
2186
2187    /* propagate change to C++ */
2188    locale_set_default(newDefaultLocale);
2189}
2190
2191/**
2192 * Returns a list of all 2-letter language codes defined in ISO 639.  This is a pointer
2193 * to an array of pointers to arrays of char.  All of these pointers are owned
2194 * by ICU-- do not delete them, and do not write through them.  The array is
2195 * terminated with a null pointer.
2196 */
2197U_CAPI const char* const*  U_EXPORT2
2198uloc_getISOLanguages()
2199{
2200    return LANGUAGES;
2201}
2202
/**
 * Returns a list of all 2-letter country codes defined in ISO 3166.  This is a
 * pointer to an array of pointers to arrays of char.  All of these pointers are
 * owned by ICU-- do not delete them, and do not write through them.  The array is
 * terminated with a null pointer.
 */
2209U_CAPI const char* const*  U_EXPORT2
2210uloc_getISOCountries()
2211{
2212    return COUNTRIES;
2213}
2214
2215
2216/* this function to be moved into cstring.c later */
2217static char gDecimal = 0;
2218
2219static /* U_CAPI */
2220double
2221/* U_EXPORT2 */
2222_uloc_strtod(const char *start, char **end) {
2223    char *decimal;
2224    char *myEnd;
2225    char buf[30];
2226    double rv;
2227    if (!gDecimal) {
2228        char rep[5];
2229        /* For machines that decide to change the decimal on you,
2230        and try to be too smart with localization.
2231        This normally should be just a '.'. */
2232        sprintf(rep, "%+1.1f", 1.0);
2233        gDecimal = rep[2];
2234    }
2235
2236    if(gDecimal == '.') {
2237        return uprv_strtod(start, end); /* fall through to OS */
2238    } else {
2239        uprv_strncpy(buf, start, 29);
2240        buf[29]=0;
2241        decimal = uprv_strchr(buf, '.');
2242        if(decimal) {
2243            *decimal = gDecimal;
2244        } else {
2245            return uprv_strtod(start, end); /* no decimal point */
2246        }
2247        rv = uprv_strtod(buf, &myEnd);
2248        if(end) {
2249            *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
2250        }
2251        return rv;
2252    }
2253}
2254
2255typedef struct {
2256    float q;
2257    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
2258    char locale[ULOC_FULLNAME_CAPACITY+1];
2259} _acceptLangItem;
2260
2261static int32_t U_CALLCONV
2262uloc_acceptLanguageCompare(const void * /*context*/, const void *a, const void *b)
2263{
2264    const _acceptLangItem *aa = (const _acceptLangItem*)a;
2265    const _acceptLangItem *bb = (const _acceptLangItem*)b;
2266
2267    int32_t rc = 0;
2268    if(bb->q < aa->q) {
2269        rc = -1;  /* A > B */
2270    } else if(bb->q > aa->q) {
2271        rc = 1;   /* A < B */
2272    } else {
2273        rc = 0;   /* A = B */
2274    }
2275
2276    if(rc==0) {
2277        rc = uprv_stricmp(aa->locale, bb->locale);
2278    }
2279
2280#if defined(ULOC_DEBUG)
2281    /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2282    aa->locale, aa->q,
2283    bb->locale, bb->q,
2284    rc);*/
2285#endif
2286
2287    return rc;
2288}
2289
2290/*
2291mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2292*/
2293
2294U_CAPI int32_t U_EXPORT2
2295uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2296                            const char *httpAcceptLanguage,
2297                            UEnumeration* availableLocales,
2298                            UErrorCode *status)
2299{
2300  MaybeStackArray<_acceptLangItem, 4> items; // Struct for collecting items.
2301    char tmp[ULOC_FULLNAME_CAPACITY +1];
2302    int32_t n = 0;
2303    const char *itemEnd;
2304    const char *paramEnd;
2305    const char *s;
2306    const char *t;
2307    int32_t res;
2308    int32_t i;
2309    int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2310
2311    if(U_FAILURE(*status)) {
2312        return -1;
2313    }
2314
2315    for(s=httpAcceptLanguage;s&&*s;) {
2316        while(isspace(*s)) /* eat space at the beginning */
2317            s++;
2318        itemEnd=uprv_strchr(s,',');
2319        paramEnd=uprv_strchr(s,';');
2320        if(!itemEnd) {
2321            itemEnd = httpAcceptLanguage+l; /* end of string */
2322        }
2323        if(paramEnd && paramEnd<itemEnd) {
2324            /* semicolon (;) is closer than end (,) */
2325            t = paramEnd+1;
2326            if(*t=='q') {
2327                t++;
2328            }
2329            while(isspace(*t)) {
2330                t++;
2331            }
2332            if(*t=='=') {
2333                t++;
2334            }
2335            while(isspace(*t)) {
2336                t++;
2337            }
2338            items[n].q = (float)_uloc_strtod(t,NULL);
2339        } else {
2340            /* no semicolon - it's 1.0 */
2341            items[n].q = 1.0f;
2342            paramEnd = itemEnd;
2343        }
2344        items[n].dummy=0;
2345        /* eat spaces prior to semi */
2346        for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2347            ;
2348        int32_t slen = ((t+1)-s);
2349        if(slen > ULOC_FULLNAME_CAPACITY) {
2350          *status = U_BUFFER_OVERFLOW_ERROR;
2351          return -1; // too big
2352        }
2353        uprv_strncpy(items[n].locale, s, slen);
2354        items[n].locale[slen]=0; // terminate
2355        int32_t clen = uloc_canonicalize(items[n].locale, tmp, UPRV_LENGTHOF(tmp)-1, status);
2356        if(U_FAILURE(*status)) return -1;
2357        if((clen!=slen) || (uprv_strncmp(items[n].locale, tmp, slen))) {
2358            // canonicalization had an effect- copy back
2359            uprv_strncpy(items[n].locale, tmp, clen);
2360            items[n].locale[clen] = 0; // terminate
2361        }
2362#if defined(ULOC_DEBUG)
2363        /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2364#endif
2365        n++;
2366        s = itemEnd;
2367        while(*s==',') { /* eat duplicate commas */
2368            s++;
2369        }
2370        if(n>=items.getCapacity()) { // If we need more items
2371          if(NULL == items.resize(items.getCapacity()*2, items.getCapacity())) {
2372              *status = U_MEMORY_ALLOCATION_ERROR;
2373              return -1;
2374          }
2375#if defined(ULOC_DEBUG)
2376          fprintf(stderr,"malloced at size %d\n", items.getCapacity());
2377#endif
2378        }
2379    }
2380    uprv_sortArray(items.getAlias(), n, sizeof(items[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2381    if (U_FAILURE(*status)) {
2382        return -1;
2383    }
2384    LocalMemory<const char*> strs(NULL);
2385    if (strs.allocateInsteadAndReset(n) == NULL) {
2386        *status = U_MEMORY_ALLOCATION_ERROR;
2387        return -1;
2388    }
2389    for(i=0;i<n;i++) {
2390#if defined(ULOC_DEBUG)
2391        /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2392#endif
2393        strs[i]=items[i].locale;
2394    }
2395    res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2396                               strs.getAlias(), n, availableLocales, status);
2397    return res;
2398}
2399
2400
2401U_CAPI int32_t U_EXPORT2
2402uloc_acceptLanguage(char *result, int32_t resultAvailable,
2403                    UAcceptResult *outResult, const char **acceptList,
2404                    int32_t acceptListCount,
2405                    UEnumeration* availableLocales,
2406                    UErrorCode *status)
2407{
2408    int32_t i,j;
2409    int32_t len;
2410    int32_t maxLen=0;
2411    char tmp[ULOC_FULLNAME_CAPACITY+1];
2412    const char *l;
2413    char **fallbackList;
2414    if(U_FAILURE(*status)) {
2415        return -1;
2416    }
2417    fallbackList = static_cast<char **>(uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount)));
2418    if(fallbackList==NULL) {
2419        *status = U_MEMORY_ALLOCATION_ERROR;
2420        return -1;
2421    }
2422    for(i=0;i<acceptListCount;i++) {
2423#if defined(ULOC_DEBUG)
2424        fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2425#endif
2426        while((l=uenum_next(availableLocales, NULL, status))) {
2427#if defined(ULOC_DEBUG)
2428            fprintf(stderr,"  %s\n", l);
2429#endif
2430            len = (int32_t)uprv_strlen(l);
2431            if(!uprv_strcmp(acceptList[i], l)) {
2432                if(outResult) {
2433                    *outResult = ULOC_ACCEPT_VALID;
2434                }
2435#if defined(ULOC_DEBUG)
2436                fprintf(stderr, "MATCH! %s\n", l);
2437#endif
2438                if(len>0) {
2439                    uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2440                }
2441                for(j=0;j<i;j++) {
2442                    uprv_free(fallbackList[j]);
2443                }
2444                uprv_free(fallbackList);
2445                return u_terminateChars(result, resultAvailable, len, status);
2446            }
2447            if(len>maxLen) {
2448                maxLen = len;
2449            }
2450        }
2451        uenum_reset(availableLocales, status);
2452        /* save off parent info */
2453        if(uloc_getParent(acceptList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2454            fallbackList[i] = uprv_strdup(tmp);
2455        } else {
2456            fallbackList[i]=0;
2457        }
2458    }
2459
2460    for(maxLen--;maxLen>0;maxLen--) {
2461        for(i=0;i<acceptListCount;i++) {
2462            if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2463#if defined(ULOC_DEBUG)
2464                fprintf(stderr,"Try: [%s]", fallbackList[i]);
2465#endif
2466                while((l=uenum_next(availableLocales, NULL, status))) {
2467#if defined(ULOC_DEBUG)
2468                    fprintf(stderr,"  %s\n", l);
2469#endif
2470                    len = (int32_t)uprv_strlen(l);
2471                    if(!uprv_strcmp(fallbackList[i], l)) {
2472                        if(outResult) {
2473                            *outResult = ULOC_ACCEPT_FALLBACK;
2474                        }
2475#if defined(ULOC_DEBUG)
2476                        fprintf(stderr, "fallback MATCH! %s\n", l);
2477#endif
2478                        if(len>0) {
2479                            uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2480                        }
2481                        for(j=0;j<acceptListCount;j++) {
2482                            uprv_free(fallbackList[j]);
2483                        }
2484                        uprv_free(fallbackList);
2485                        return u_terminateChars(result, resultAvailable, len, status);
2486                    }
2487                }
2488                uenum_reset(availableLocales, status);
2489
2490                if(uloc_getParent(fallbackList[i], tmp, UPRV_LENGTHOF(tmp), status)!=0) {
2491                    uprv_free(fallbackList[i]);
2492                    fallbackList[i] = uprv_strdup(tmp);
2493                } else {
2494                    uprv_free(fallbackList[i]);
2495                    fallbackList[i]=0;
2496                }
2497            }
2498        }
2499        if(outResult) {
2500            *outResult = ULOC_ACCEPT_FAILED;
2501        }
2502    }
2503    for(i=0;i<acceptListCount;i++) {
2504        uprv_free(fallbackList[i]);
2505    }
2506    uprv_free(fallbackList);
2507    return -1;
2508}
2509
2510U_CAPI const char* U_EXPORT2
2511uloc_toUnicodeLocaleKey(const char* keyword)
2512{
2513    const char* bcpKey = ulocimp_toBcpKey(keyword);
2514    if (bcpKey == NULL && ultag_isUnicodeLocaleKey(keyword, -1)) {
2515        // unknown keyword, but syntax is fine..
2516        return keyword;
2517    }
2518    return bcpKey;
2519}
2520
2521U_CAPI const char* U_EXPORT2
2522uloc_toUnicodeLocaleType(const char* keyword, const char* value)
2523{
2524    const char* bcpType = ulocimp_toBcpType(keyword, value, NULL, NULL);
2525    if (bcpType == NULL && ultag_isUnicodeLocaleType(value, -1)) {
2526        // unknown keyword, but syntax is fine..
2527        return value;
2528    }
2529    return bcpType;
2530}
2531
2532#define UPRV_ISDIGIT(c) (((c) >= '0') && ((c) <= '9'))
2533#define UPRV_ISALPHANUM(c) (uprv_isASCIILetter(c) || UPRV_ISDIGIT(c) )
2534
2535static UBool
2536isWellFormedLegacyKey(const char* legacyKey)
2537{
2538    const char* p = legacyKey;
2539    while (*p) {
2540        if (!UPRV_ISALPHANUM(*p)) {
2541            return FALSE;
2542        }
2543        p++;
2544    }
2545    return TRUE;
2546}
2547
2548static UBool
2549isWellFormedLegacyType(const char* legacyType)
2550{
2551    const char* p = legacyType;
2552    int32_t alphaNumLen = 0;
2553    while (*p) {
2554        if (*p == '_' || *p == '/' || *p == '-') {
2555            if (alphaNumLen == 0) {
2556                return FALSE;
2557            }
2558            alphaNumLen = 0;
2559        } else if (UPRV_ISALPHANUM(*p)) {
2560            alphaNumLen++;
2561        } else {
2562            return FALSE;
2563        }
2564        p++;
2565    }
2566    return (alphaNumLen != 0);
2567}
2568
2569U_CAPI const char* U_EXPORT2
2570uloc_toLegacyKey(const char* keyword)
2571{
2572    const char* legacyKey = ulocimp_toLegacyKey(keyword);
2573    if (legacyKey == NULL) {
2574        // Checks if the specified locale key is well-formed with the legacy locale syntax.
2575        //
2576        // Note:
2577        //  Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
2578        //  However, a key should not contain '=' obviously. For now, all existing
2579        //  keys are using ASCII alphabetic letters only. We won't add any new key
2580        //  that is not compatible with the BCP 47 syntax. Therefore, we assume
2581        //  a valid key consist from [0-9a-zA-Z], no symbols.
2582        if (isWellFormedLegacyKey(keyword)) {
2583            return keyword;
2584        }
2585    }
2586    return legacyKey;
2587}
2588
2589U_CAPI const char* U_EXPORT2
2590uloc_toLegacyType(const char* keyword, const char* value)
2591{
2592    const char* legacyType = ulocimp_toLegacyType(keyword, value, NULL, NULL);
2593    if (legacyType == NULL) {
2594        // Checks if the specified locale type is well-formed with the legacy locale syntax.
2595        //
2596        // Note:
2597        //  Neither ICU nor LDML/CLDR provides the definition of keyword syntax.
2598        //  However, a type should not contain '=' obviously. For now, all existing
2599        //  types are using ASCII alphabetic letters with a few symbol letters. We won't
2600        //  add any new type that is not compatible with the BCP 47 syntax except timezone
2601        //  IDs. For now, we assume a valid type start with [0-9a-zA-Z], but may contain
2602        //  '-' '_' '/' in the middle.
2603        if (isWellFormedLegacyType(value)) {
2604            return value;
2605        }
2606    }
2607    return legacyType;
2608}
2609
2610/*eof*/
2611