1/*
2**********************************************************************
3*   Copyright (C) 1997-2010, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*
7* File ULOC.CPP
8*
9* Modification History:
10*
11*   Date        Name        Description
12*   04/01/97    aliu        Creation.
13*   08/21/98    stephen     JDK 1.2 sync
14*   12/08/98    rtg         New Locale implementation and C API
15*   03/15/99    damiba      overhaul.
16*   04/06/99    stephen     changed setDefault() to realloc and copy
17*   06/14/99    stephen     Changed calls to ures_open for new params
18*   07/21/99    stephen     Modified setDefault() to propagate to C++
19*   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
20*                           brought canonicalization code into line with spec
21*****************************************************************************/
22
23/*
24   POSIX's locale format, from putil.c: [no spaces]
25
26     ll [ _CC ] [ . MM ] [ @ VV]
27
28     l = lang, C = ctry, M = charmap, V = variant
29*/
30
31#include "unicode/utypes.h"
32#include "unicode/ustring.h"
33#include "unicode/uloc.h"
34
35#include "putilimp.h"
36#include "ustr_imp.h"
37#include "ulocimp.h"
38#include "umutex.h"
39#include "cstring.h"
40#include "cmemory.h"
41#include "ucln_cmn.h"
42#include "locmap.h"
43#include "uarrsort.h"
44#include "uenumimp.h"
45#include "uassert.h"
46
47#include <stdio.h> /* for sprintf */
48
49/* ### Declarations **************************************************/
50
51/* Locale stuff from locid.cpp */
52U_CFUNC void locale_set_default(const char *id);
53U_CFUNC const char *locale_get_default(void);
54U_CFUNC int32_t
55locale_getKeywords(const char *localeID,
56            char prev,
57            char *keywords, int32_t keywordCapacity,
58            char *values, int32_t valuesCapacity, int32_t *valLen,
59            UBool valuesToo,
60            UErrorCode *status);
61
62/* ### Data tables **************************************************/
63
64/**
65 * Table of language codes, both 2- and 3-letter, with preference
66 * given to 2-letter codes where possible.  Includes 3-letter codes
67 * that lack a 2-letter equivalent.
68 *
69 * This list must be in sorted order.  This list is returned directly
70 * to the user by some API.
71 *
72 * This list must be kept in sync with LANGUAGES_3, with corresponding
73 * entries matched.
74 *
75 * This table should be terminated with a NULL entry, followed by a
76 * second list, and another NULL entry.  The first list is visible to
77 * user code when this array is returned by API.  The second list
78 * contains codes we support, but do not expose through user API.
79 *
80 * Notes
81 *
82 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
83 * include the revisions up to 2001/7/27 *CWB*
84 *
85 * The 3 character codes are the terminology codes like RFC 3066.  This
86 * is compatible with prior ICU codes
87 *
 * "in" "iw" "ji" "jw" & "sh" have been withdrawn.  They are still in
 * the table, but have been moved to the second list at the end,
 * because their 3 character codes are duplicates.  This avoids bad
 * searches going from 3 to 2 character codes.
92 *
93 * The range qaa-qtz is reserved for local use
94 */
static const char * const LANGUAGES[] = {
    /* ---- first list: sorted, grouped here by initial letter ---- */
    /* a */
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa", "afh",
    "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",  "ang", "anp",
    "apa", "ar",  "arc", "arn", "arp", "art", "arw", "as",  "ast", "ath",
    "aus", "av",  "awa", "ay",  "az",
    /* b */
    "ba",  "bad", "bai", "bal", "ban", "bas", "bat", "be",  "bej", "bem",
    "ber", "bg",  "bh",  "bho", "bi",  "bik", "bin", "bla", "bm",  "bn",
    "bnt", "bo",  "br",  "bra", "bs",  "btk", "bua", "bug", "byn",
    /* c */
    "ca",  "cad", "cai", "car", "cau", "cch", "ce",  "ceb", "cel", "ch",
    "chb", "chg", "chk", "chm", "chn", "cho", "chp", "chr", "chy", "cmc",
    "co",  "cop", "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb",
    "cu",  "cus", "cv",  "cy",
    /* d */
    "da",  "dak", "dar", "day", "de",  "del", "den", "dgr", "din", "doi",
    "dra", "dsb", "dua", "dum", "dv",  "dyu", "dz",
    /* e */
    "ee",  "efi", "egy", "eka", "el",  "elx", "en",  "enm", "eo",  "es",
    "et",  "eu",  "ewo",
    /* f */
    "fa",  "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",
    "fr",  "frm", "fro", "frr", "frs", "fur", "fy",
    /* g */
    "ga",  "gaa", "gay", "gba", "gd",  "gem", "gez", "gil", "gl",  "gmh",
    "gn",  "goh", "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "gv",
    "gwi",
    /* h */
    "ha",  "hai", "haw", "he",  "hi",  "hil", "him", "hit", "hmn", "ho",
    "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",
    /* i */
    "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",  "ilo", "inc",
    "ine", "inh", "io",  "ira", "iro", "is",  "it",  "iu",
    /* j */
    "ja",  "jbo", "jpr", "jrb", "jv",
    /* k */
    "ka",  "kaa", "kab", "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg",
    "kfo", "kg",  "kha", "khi", "kho", "ki",  "kj",  "kk",  "kl",  "km",
    "kmb", "kn",  "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro",
    "kru", "ks",  "ku",  "kum", "kut", "kv",  "kw",  "ky",
    /* l */
    "la",  "lad", "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",
    "lol", "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus", "lv",
    /* m */
    "mad", "mag", "mai", "mak", "man", "map", "mas", "mdf", "mdr", "men",
    "mfe", "mg",  "mga", "mh",  "mi",  "mic", "min", "mis", "mk",  "mkh",
    "ml",  "mn",  "mnc", "mni", "mno", "mo",  "moh", "mos", "mr",  "ms",
    "mt",  "mul", "mun", "mus", "mwl", "mwr", "my",  "myn", "myv",
    /* n */
    "na",  "nah", "nai", "nap", "nb",  "nd",  "nds", "ne",  "new", "ng",
    "nia", "nic", "niu", "nl",  "nn",  "no",  "nog", "non", "nqo", "nr",
    "nso", "nub", "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi",
    /* o */
    "oc",  "oj",  "om",  "or",  "os",  "osa", "ota", "oto",
    /* p */
    "pa",  "paa", "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
    "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",
    /* q */
    "qu",
    /* r */
    "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rom", "ru",  "rup",
    "rw",
    /* s */
    "sa",  "sad", "sah", "sai", "sal", "sam", "sas", "sat", "sc",  "scn",
    "sco", "sd",  "se",  "sel", "sem", "sg",  "sga", "sgn", "shn", "si",
    "sid", "sio", "sit", "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj",
    "smn", "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",  "srn",
    "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux", "sv",  "sw",
    "syc", "syr",
    /* t */
    "ta",  "tai", "te",  "tem", "ter", "tet", "tg",  "th",  "ti",  "tig",
    "tiv", "tk",  "tkl", "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog",
    "tpi", "tr",  "trv", "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl",
    "tw",  "ty",  "tyv",
    /* u */
    "udm", "ug",  "uga", "uk",  "umb", "und", "ur",  "uz",
    /* v */
    "vai", "ve",  "vi",  "vo",  "vot",
    /* w */
    "wa",  "wak", "wal", "war", "was", "wen", "wo",
    /* x */
    "xal", "xh",
    /* y */
    "yao", "yap", "yi",  "yo",  "ypk",
    /* z */
    "za",  "zap", "zbl", "zen", "zh",  "znd", "zu",  "zun", "zxx", "zza",
    NULL,   /* end of first list */

    /* ---- second list: obsolete language codes ---- */
    "in",  "iw",  "ji",  "jw",  "sh",
    NULL    /* end of second list */
};
/*
 * Parallel arrays: DEPRECATED_LANGUAGES[i] is a deprecated language
 * code and REPLACEMENT_LANGUAGES[i] is the code listed in its place.
 * Each array ends with two NULL entries.
 */
static const char* const DEPRECATED_LANGUAGES[]={
    "in",
    "iw",
    "ji",
    "jw",
    NULL,
    NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
    "id",   /* in */
    "he",   /* iw */
    "yi",   /* ji */
    "jv",   /* jw */
    NULL,
    NULL
};
165
166/**
167 * Table of 3-letter language codes.
168 *
169 * This is a lookup table used to convert 3-letter language codes to
170 * their 2-letter equivalent, where possible.  It must be kept in sync
171 * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
172 * same language as LANGUAGES_3[i].  The commented-out lines are
173 * copied from LANGUAGES to make eyeballing this baby easier.
174 *
175 * Where a 3-letter language code has no 2-letter equivalent, the
176 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
177 *
178 * This table should be terminated with a NULL entry, followed by a
179 * second list, and another NULL entry.  The two lists correspond to
180 * the two lists in LANGUAGES.
181 */
/*
 * 3-letter codes, index-for-index parallel to LANGUAGES.  Each comment
 * row shows the LANGUAGES entries that the data row beneath it pairs
 * with.  Every entry in the first list is exactly three letters long.
 */
static const char * const LANGUAGES_3[] = {
/*  "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",    */
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "afr", "afa",
/*  "afh", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",  "ang", "anp", "apa",    */
    "afh", "ain", "aka", "akk", "ale", "alg", "alt", "amh", "arg", "ang", "anp", "apa",
/*  "ar",  "arc", "arn", "arp", "art", "arw", "as",  "ast",    */
    "ara", "arc", "arn", "arp", "art", "arw", "asm", "ast",
/*  "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",    */
    "ath", "aus", "ava", "awa", "aym", "aze", "bak", "bad",
/*  "bai", "bal", "ban", "bas", "bat", "be",  "bej",    */
    "bai", "bal", "ban", "bas", "bat", "bel", "bej",
/*  "bem", "ber", "bg",  "bh",  "bho", "bi",  "bik", "bin",    */
    "bem", "ber", "bul", "bih", "bho", "bis", "bik", "bin",
/*  "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "bs",     */
    "bla", "bam", "ben", "bnt", "bod", "bre", "bra", "bos",
/*  "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",    */
    "btk", "bua", "bug", "byn", "cat", "cad", "cai", "car", "cau",
/*  "cch", "ce",  "ceb", "cel", "ch",  "chb", "chg", "chk", "chm",    */
    "cch", "che", "ceb", "cel", "cha", "chb", "chg", "chk", "chm",
/*  "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",    */
    "chn", "cho", "chp", "chr", "chy", "cmc", "cos", "cop",
/*  "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",    */
    "cpe", "cpf", "cpp", "cre", "crh", "crp", "ces", "csb", "chu", "cus",
/*  "cv",  "cy",  "da",  "dak", "dar", "day", "de",  "del", "den",    */
    "chv", "cym", "dan", "dak", "dar", "day", "deu", "del", "den",
/*  "dgr", "din", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyu",    */
    "dgr", "din", "doi", "dra", "dsb", "dua", "dum", "div", "dyu",
/*  "dz",  "ee",  "efi", "egy", "eka", "el",  "elx", "en",     */
    "dzo", "ewe", "efi", "egy", "eka", "ell", "elx", "eng",
/*  "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",     */
    "enm", "epo", "spa", "est", "eus", "ewo", "fas",
/*  "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",    */
    "fan", "fat", "ful", "fin", "fil", "fiu", "fij", "fao", "fon",
/*  "fr",  "frm", "fro", "frr", "frs", "fur", "fy",  "ga",  "gaa", "gay",    */
    "fra", "frm", "fro", "frr", "frs", "fur", "fry", "gle", "gaa", "gay",
/*  "gba", "gd",  "gem", "gez", "gil", "gl",  "gmh", "gn",     */
    "gba", "gla", "gem", "gez", "gil", "glg", "gmh", "grn",
/*  "goh", "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "gv",     */
    "goh", "gon", "gor", "got", "grb", "grc", "gsw", "guj", "glv",
/*  "gwi", "ha",  "hai", "haw", "he",  "hi",  "hil", "him",    */
    "gwi", "hau", "hai", "haw", "heb", "hin", "hil", "him",
/*  "hit", "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",     */
    "hit", "hmn", "hmo", "hrv", "hsb", "hat", "hun", "hup", "hye", "her",
/*  "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",     */
    "ina", "iba", "ind", "ile", "ibo", "iii", "ijo", "ipk",
/*  "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",      */
    "ilo", "inc", "ine", "inh", "ido", "ira", "iro", "isl", "ita",
/*  "iu",  "ja",  "jbo", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",   */
    "iku", "jpn", "jbo", "jpr", "jrb", "jav", "kat", "kaa", "kab",
/*  "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kfo", "kg",  "kha", "khi",*/
    /* "kg" (Kongo) pairs with ISO 639-2 "kon"; the 2-letter code must not appear in this table. */
    "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kfo", "kon", "kha", "khi",
/*  "kho", "ki",  "kj",  "kk",  "kl",  "km",  "kmb", "kn",     */
    "kho", "kik", "kua", "kaz", "kal", "khm", "kmb", "kan",
/*  "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks",     */
    "kor", "kok", "kos", "kpe", "kau", "krc", "krl", "kro", "kru", "kas",
/*  "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad",    */
    "kur", "kum", "kut", "kom", "cor", "kir", "lat", "lad",
/*  "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",    */
    "lah", "lam", "ltz", "lez", "lug", "lim", "lin", "lao", "lol",
/*  "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus",    */
    "loz", "lit", "lub", "lua", "lui", "lun", "luo", "lus",
/*  "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",    */
    "lav", "mad", "mag", "mai", "mak", "man", "map", "mas",
/*  "mdf", "mdr", "men", "mfe", "mg",  "mga", "mh",  "mi",  "mic", "min",    */
    "mdf", "mdr", "men", "mfe", "mlg", "mga", "mah", "mri", "mic", "min",
/*  "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",    */
    "mis", "mkd", "mkh", "mal", "mon", "mnc", "mni", "mno",
/*  "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mul", "mun",    */
    "mol", "moh", "mos", "mar", "msa", "mlt", "mul", "mun",
/*  "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nap",    */
    "mus", "mwl", "mwr", "mya", "myn", "myv", "nau", "nah", "nai", "nap",
/*  "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",    */
    "nob", "nde", "nds", "nep", "new", "ndo", "nia", "nic",
/*  "niu", "nl",  "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub",    */
    "niu", "nld", "nno", "nor", "nog", "non", "nqo", "nbl", "nso", "nub",
/*  "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",     */
    "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi", "oci", "oji",
/*  "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",    */
    "orm", "ori", "oss", "osa", "ota", "oto", "pan", "paa",
/*  "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",    */
    "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
/*  "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",     */
    "pli", "pol", "pon", "pra", "pro", "pus", "por", "que",
/*  "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rom",    */
    "raj", "rap", "rar", "roh", "run", "ron", "roa", "rom",
/*  "ru",  "rup", "rw",  "sa",  "sad", "sah", "sai", "sal", "sam",    */
    "rus", "rup", "kin", "san", "sad", "sah", "sai", "sal", "sam",
/*  "sas", "sat", "sc",  "scn", "sco", "sd",  "se",  "sel", "sem",    */
    "sas", "sat", "srd", "scn", "sco", "snd", "sme", "sel", "sem",
/*  "sg",  "sga", "sgn", "shn", "si",  "sid", "sio", "sit",    */
    "sag", "sga", "sgn", "shn", "sin", "sid", "sio", "sit",
/*  "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",    */
    "slk", "slv", "sla", "smo", "sma", "smi", "smj", "smn",
/*  "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",     */
    "sms", "sna", "snk", "som", "sog", "son", "sqi", "srp",
/*  "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",    */
    "srn", "srr", "ssw", "ssa", "sot", "sun", "suk", "sus", "sux",
/*  "sv",  "sw",  "syc", "syr", "ta",  "tai", "te",  "tem", "ter",    */
    "swe", "swa", "syc", "syr", "tam", "tai", "tel", "tem", "ter",
/*  "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",    */
    "tet", "tgk", "tha", "tir", "tig", "tiv", "tuk", "tkl",
/*  "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",    */
    "tgl", "tlh", "tli", "tmh", "tsn", "ton", "tog", "tpi", "tur", "trv",
/*  "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",     */
    "tso", "tsi", "tat", "tum", "tup", "tut", "tvl", "twi",
/*  "ty",  "tyv", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",     */
    "tah", "tyv", "udm", "uig", "uga", "ukr", "umb", "und", "urd",
/*  "uz",  "vai", "ve",  "vi",  "vo",  "vot", "wa",  "wak",    */
    "uzb", "vai", "ven", "vie", "vol", "vot", "wln", "wak",
/*  "wal", "war", "was", "wen", "wo",  "xal", "xh",  "yao", "yap",    */
    "wal", "war", "was", "wen", "wol", "xal", "xho", "yao", "yap",
/*  "yi",  "yo",  "ypk", "za",  "zap", "zbl", "zen", "zh",  "znd",    */
    "yid", "yor", "ypk", "zha", "zap", "zbl", "zen", "zho", "znd",
/*  "zu",  "zun", "zxx", "zza",                                         */
    "zul", "zun", "zxx", "zza",
NULL,
/* Second list: 3-letter codes for the obsolete 2-letter codes. */
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    "ind", "heb", "yid", "jaw", "srp",
NULL
};
302
303/**
304 * Table of 2-letter country codes.
305 *
306 * This list must be in sorted order.  This list is returned directly
307 * to the user by some API.
308 *
309 * This list must be kept in sync with COUNTRIES_3, with corresponding
310 * entries matched.
311 *
312 * This table should be terminated with a NULL entry, followed by a
313 * second list, and another NULL entry.  The first list is visible to
314 * user code when this array is returned by API.  The second list
315 * contains codes we support, but do not expose through user API.
316 *
317 * Notes:
318 *
319 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
320 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html added
321 * new codes keeping the old ones for compatibility updated to include
322 * 1999/12/03 revisions *CWB*
323 *
324 * RO(ROM) is now RO(ROU) according to
325 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
326 */
static const char * const COUNTRIES[] = {
    /* ---- first list: sorted, one letter group per paragraph ---- */
    /* A */
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",  "AO",  "AQ",
    "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    /* B */
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",  "BJ",  "BL",
    "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",  "BW",  "BY",  "BZ",
    /* C */
    "CA",  "CC",  "CD",  "CF",  "CG",  "CH",  "CI",  "CK",  "CL",  "CM",
    "CN",  "CO",  "CR",  "CU",  "CV",  "CX",  "CY",  "CZ",
    /* D */
    "DE",  "DJ",  "DK",  "DM",  "DO",  "DZ",
    /* E */
    "EC",  "EE",  "EG",  "EH",  "ER",  "ES",  "ET",
    /* F */
    "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    /* G */
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",  "GM",
    "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",  "GW",  "GY",
    /* H */
    "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    /* I */
    "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",  "IT",
    /* J */
    "JE",  "JM",  "JO",  "JP",
    /* K */
    "KE",  "KG",  "KH",  "KI",  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",
    "KZ",
    /* L */
    "LA",  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",  "LV",
    "LY",
    /* M */
    "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",  "ML",  "MM",
    "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",  "MT",  "MU",  "MV",  "MW",
    "MX",  "MY",  "MZ",
    /* N */
    "NA",  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",  "NR",
    "NU",  "NZ",
    /* O */
    "OM",
    /* P */
    "PA",  "PE",  "PF",  "PG",  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",
    "PS",  "PT",  "PW",  "PY",
    /* Q */
    "QA",
    /* R */
    "RE",  "RO",  "RS",  "RU",  "RW",
    /* S */
    "SA",  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",  "SK",
    "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",  "SY",  "SZ",
    /* T */
    "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",  "TK",  "TL",  "TM",  "TN",
    "TO",  "TR",  "TT",  "TV",  "TW",  "TZ",
    /* U */
    "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    /* V */
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",
    /* W */
    "WF",  "WS",
    /* Y */
    "YE",  "YT",
    /* Z */
    "ZA",  "ZM",  "ZW",
    NULL,   /* end of first list */

    /* ---- second list: obsolete country codes ---- */
    "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",
    NULL    /* end of second list */
};
362
/*
 * Parallel arrays: DEPRECATED_COUNTRIES[i] is a deprecated country code
 * and REPLACEMENT_COUNTRIES[i] is the code listed in its place.  Each
 * array ends with two NULL entries.
 */
static const char* const DEPRECATED_COUNTRIES[] ={
    "BU", "CS", "DY", "FX", "HV",
    "NH", "RH", "TP", "YU", "ZR",
    NULL, NULL
};
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "BU", "CS", "DY", "FX", "HV", */
    "MM", "RS", "BJ", "FR", "BF",
/*  "NH", "RH", "TP", "YU", "ZR"  */
    "VU", "ZW", "TL", "RS", "CD",
    NULL, NULL
};
370
371/**
372 * Table of 3-letter country codes.
373 *
374 * This is a lookup table used to convert 3-letter country codes to
375 * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
376 * For all valid i, COUNTRIES[i] must refer to the same country as
377 * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
378 * to make eyeballing this baby easier.
379 *
380 * This table should be terminated with a NULL entry, followed by a
381 * second list, and another NULL entry.  The two lists correspond to
382 * the two lists in COUNTRIES.
383 */
/*
 * 3-letter codes, index-for-index parallel to COUNTRIES.  Each comment
 * row repeats the COUNTRIES entries that the data row beneath it pairs
 * with; rows are grouped by the initial letter of the 2-letter code.
 */
static const char * const COUNTRIES_3[] = {
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",  "AO",  "AQ",  */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM", "ANT", "AGO", "ATA",
/*  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",  */
    "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",

/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",  "BJ",  "BL",  */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI", "BEN", "BLM",
/*  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",  "BW",  "BY",  "BZ",  */
    "BMU", "BRN", "BOL", "BRA", "BHS", "BTN", "BVT", "BWA", "BLR", "BLZ",

/*  "CA",  "CC",  "CD",  "CF",  "CG",  "CH",  "CI",  "CK",  "CL",  "CM",  */
    "CAN", "CCK", "COD", "CAF", "COG", "CHE", "CIV", "COK", "CHL", "CMR",
/*  "CN",  "CO",  "CR",  "CU",  "CV",  "CX",  "CY",  "CZ",  */
    "CHN", "COL", "CRI", "CUB", "CPV", "CXR", "CYP", "CZE",

/*  "DE",  "DJ",  "DK",  "DM",  "DO",  "DZ",  */
    "DEU", "DJI", "DNK", "DMA", "DOM", "DZA",

/*  "EC",  "EE",  "EG",  "EH",  "ER",  "ES",  "ET",  */
    "ECU", "EST", "EGY", "ESH", "ERI", "ESP", "ETH",

/*  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",  */
    "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",

/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",  "GM",  */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL", "GMB",
/*  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",  "GW",  "GY",  */
    "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM", "GNB", "GUY",

/*  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",  */
    "HKG", "HMD", "HND", "HRV", "HTI", "HUN",

/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",  "IT",  */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL", "ITA",

/*  "JE",  "JM",  "JO",  "JP",  */
    "JEY", "JAM", "JOR", "JPN",

/*  "KE",  "KG",  "KH",  "KI",  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  */
    "KEN", "KGZ", "KHM", "KIR", "COM", "KNA", "PRK", "KOR", "KWT", "CYM",
/*  "KZ",  */
    "KAZ",

/*  "LA",  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",  "LV",  */
    "LAO", "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX", "LVA",
/*  "LY",  */
    "LBY",

/*  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",  "ML",  "MM",  */
    "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD", "MLI", "MMR",
/*  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",  "MT",  "MU",  "MV",  "MW",  */
    "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR", "MLT", "MUS", "MDV", "MWI",
/*  "MX",  "MY",  "MZ",  */
    "MEX", "MYS", "MOZ",

/*  "NA",  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",  "NR",  */
    "NAM", "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL", "NRU",
/*  "NU",  "NZ",  */
    "NIU", "NZL",

/*  "OM",  */
    "OMN",

/*  "PA",  "PE",  "PF",  "PG",  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  */
    "PAN", "PER", "PYF", "PNG", "PHL", "PAK", "POL", "SPM", "PCN", "PRI",
/*  "PS",  "PT",  "PW",  "PY",  */
    "PSE", "PRT", "PLW", "PRY",

/*  "QA",  */
    "QAT",

/*  "RE",  "RO",  "RS",  "RU",  "RW",  */
    "REU", "ROU", "SRB", "RUS", "RWA",

/*  "SA",  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",  "SK",  */
    "SAU", "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM", "SVK",
/*  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",  "SY",  "SZ",  */
    "SLE", "SMR", "SEN", "SOM", "SUR", "STP", "SLV", "SYR", "SWZ",

/*  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",  "TK",  "TL",  "TM",  "TN",  */
    "TCA", "TCD", "ATF", "TGO", "THA", "TJK", "TKL", "TLS", "TKM", "TUN",
/*  "TO",  "TR",  "TT",  "TV",  "TW",  "TZ",  */
    "TON", "TUR", "TTO", "TUV", "TWN", "TZA",

/*  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",  */
    "UKR", "UGA", "UMI", "USA", "URY", "UZB",

/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT",

/*  "WF",  "WS",  */
    "WLF", "WSM",

/*  "YE",  "YT",  */
    "YEM", "MYT",

/*  "ZA",  "ZM",  "ZW",  */
    "ZAF", "ZMB", "ZWE",
NULL,   /* end of first list */

/* Second list: 3-letter codes for the obsolete 2-letter codes. */
/*  "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   */
    "FXX", "SCG", "ROM", "TMP", "YUG", "ZAR",
NULL    /* end of second list */
};
450
/* One row of the locale-ID canonicalization table. */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if keyword is NULL */
} CanonicalizationMap;

/* Row builders, local to the table below. */
#define ULOC_CANON_ID(from, to)          { from, to, NULL, NULL }
#define ULOC_CANON_KW(from, to, kw, val) { from, to, kw, val }

/**
 * Canonicalization table for locale IDs.  Rows either replace one ID
 * with another (ULOC_CANON_ID) or replace it and also supply a
 * keyword/value pair (ULOC_CANON_KW).  The entries cover several
 * different semantic kinds of transformations, noted per row.
 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    ULOC_CANON_ID("",                  "en_US_POSIX"),                    /* .NET name */
    ULOC_CANON_ID("c",                 "en_US_POSIX"),                    /* POSIX name */
    ULOC_CANON_ID("posix",             "en_US_POSIX"),                    /* POSIX name (alias of C) */
    ULOC_CANON_ID("art_LOJBAN",        "jbo"),                            /* registered name */
    ULOC_CANON_ID("az_AZ_CYRL",        "az_Cyrl_AZ"),                     /* .NET name */
    ULOC_CANON_ID("az_AZ_LATN",        "az_Latn_AZ"),                     /* .NET name */
    ULOC_CANON_KW("ca_ES_PREEURO",     "ca_ES", "currency", "ESP"),
    ULOC_CANON_ID("cel_GAULISH",       "cel__GAULISH"),                   /* registered name */
    ULOC_CANON_ID("de_1901",           "de__1901"),                       /* registered name */
    ULOC_CANON_ID("de_1906",           "de__1906"),                       /* registered name */
    ULOC_CANON_KW("de__PHONEBOOK",     "de", "collation", "phonebook"),   /* Old ICU name */
    ULOC_CANON_KW("de_AT_PREEURO",     "de_AT", "currency", "ATS"),
    ULOC_CANON_KW("de_DE_PREEURO",     "de_DE", "currency", "DEM"),
    ULOC_CANON_KW("de_LU_PREEURO",     "de_LU", "currency", "LUF"),
    ULOC_CANON_KW("el_GR_PREEURO",     "el_GR", "currency", "GRD"),
    ULOC_CANON_ID("en_BOONT",          "en__BOONT"),                      /* registered name */
    ULOC_CANON_ID("en_SCOUSE",         "en__SCOUSE"),                     /* registered name */
    ULOC_CANON_KW("en_BE_PREEURO",     "en_BE", "currency", "BEF"),
    ULOC_CANON_KW("en_IE_PREEURO",     "en_IE", "currency", "IEP"),
    ULOC_CANON_KW("es__TRADITIONAL",   "es", "collation", "traditional"), /* Old ICU name */
    ULOC_CANON_KW("es_ES_PREEURO",     "es_ES", "currency", "ESP"),
    ULOC_CANON_KW("eu_ES_PREEURO",     "eu_ES", "currency", "ESP"),
    ULOC_CANON_KW("fi_FI_PREEURO",     "fi_FI", "currency", "FIM"),
    ULOC_CANON_KW("fr_BE_PREEURO",     "fr_BE", "currency", "BEF"),
    ULOC_CANON_KW("fr_FR_PREEURO",     "fr_FR", "currency", "FRF"),
    ULOC_CANON_KW("fr_LU_PREEURO",     "fr_LU", "currency", "LUF"),
    ULOC_CANON_KW("ga_IE_PREEURO",     "ga_IE", "currency", "IEP"),
    ULOC_CANON_KW("gl_ES_PREEURO",     "gl_ES", "currency", "ESP"),
    ULOC_CANON_KW("hi__DIRECT",        "hi", "collation", "direct"),      /* Old ICU name */
    ULOC_CANON_KW("it_IT_PREEURO",     "it_IT", "currency", "ITL"),
    ULOC_CANON_KW("ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese"),  /* Old ICU name */
    ULOC_CANON_ID("nb_NO_NY",          "nn_NO"),                          /* "markus said this was ok" :-) */
    ULOC_CANON_KW("nl_BE_PREEURO",     "nl_BE", "currency", "BEF"),
    ULOC_CANON_KW("nl_NL_PREEURO",     "nl_NL", "currency", "NLG"),
    ULOC_CANON_KW("pt_PT_PREEURO",     "pt_PT", "currency", "PTE"),
    ULOC_CANON_ID("sl_ROZAJ",          "sl__ROZAJ"),                      /* registered name */
    ULOC_CANON_ID("sr_SP_CYRL",        "sr_Cyrl_RS"),                     /* .NET name */
    ULOC_CANON_ID("sr_SP_LATN",        "sr_Latn_RS"),                     /* .NET name */
    ULOC_CANON_ID("sr_YU_CYRILLIC",    "sr_Cyrl_RS"),                     /* Linux name */
    ULOC_CANON_KW("th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist"),  /* Old ICU name */
    ULOC_CANON_ID("uz_UZ_CYRILLIC",    "uz_Cyrl_UZ"),                     /* Linux name */
    ULOC_CANON_ID("uz_UZ_CYRL",        "uz_Cyrl_UZ"),                     /* .NET name */
    ULOC_CANON_ID("uz_UZ_LATN",        "uz_Latn_UZ"),                     /* .NET name */
    ULOC_CANON_ID("zh_CHS",            "zh_Hans"),                        /* .NET name */
    ULOC_CANON_ID("zh_CHT",            "zh_Hant"),                        /* .NET name */
    ULOC_CANON_ID("zh_GAN",            "zh__GAN"),                        /* registered name */
    ULOC_CANON_ID("zh_GUOYU",          "zh"),                             /* registered name */
    ULOC_CANON_ID("zh_HAKKA",          "zh__HAKKA"),                      /* registered name */
    ULOC_CANON_ID("zh_MIN",            "zh__MIN"),                        /* registered name */
    ULOC_CANON_ID("zh_MIN_NAN",        "zh__MINNAN"),                     /* registered name */
    ULOC_CANON_ID("zh_WUU",            "zh__WUU"),                        /* registered name */
    ULOC_CANON_ID("zh_XIANG",          "zh__XIANG"),                      /* registered name */
    ULOC_CANON_ID("zh_YUE",            "zh__YUE")                         /* registered name */
};

#undef ULOC_CANON_ID
#undef ULOC_CANON_KW
517
/* One row of the variant-to-keyword table. */
typedef struct VariantMap {
    const char *variant;     /* input variant */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if keyword is NULL */
} VariantMap;

/* Variants that are expressed as a keyword/value pair instead. */
static const VariantMap VARIANT_MAP[] = {
    /* variant     keyword       value    */
    { "EURO",
                   "currency",   "EUR"    },
    { "PINYIN",                             /* Solaris variant */
                   "collation",  "pinyin" },
    { "STROKE",                             /* Solaris variant */
                   "collation",  "stroke" }
};
529
530/* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id has a BCP47 extension and does not have '@'.
 * Evaluates to non-zero when id is non-NULL, contains no '@', and
 * getShortestSubtagLength(id) is 1 (i.e. it contains a single-character
 * subtag).  The macro inspects its own argument only; it does not
 * depend on a variable named localeID existing at the call site.
 * Note: id is evaluated more than once.
 */
#define _hasBCP47Extension(id) \
    ((id) != NULL && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to Unicode id.
 *
 * Calls uloc_forLanguageTag(id, buffer, length, NULL, err).  If that
 * call returns a value <= 0 or leaves a failure code in *err, finalID is
 * set to the untouched id; otherwise finalID is set to buffer.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement and
 * can be used safely in an unbraced if/else.  Invoke it with a trailing
 * semicolon.  Note: id is evaluated twice.
 */
#define _ConvertBCP47(finalID, id, buffer, length, err) \
        do { \
            if (uloc_forLanguageTag((id), (buffer), (length), NULL, (err)) <= 0 || U_FAILURE(*(err))) { \
                (finalID) = (id); \
            } else { \
                (finalID) = (buffer); \
            } \
        } while (0)
/*
 * Gets the size of the shortest subtag in the given localeID.
 *
 * Subtags are runs of characters delimited by '_' or '-'.  Only
 * non-empty subtags that are followed by a separator are measured; the
 * trailing subtag after the last separator is not.  When no subtag
 * qualifies, the length of the whole string is returned.
 */
static int32_t getShortestSubtagLength(const char *localeID) {
    int32_t shortest = -1;   /* -1 until a separator-terminated subtag is seen */
    int32_t runLength = 0;   /* characters in the subtag currently being read */
    int32_t total = 0;       /* characters consumed so far */
    const char *p;

    for (p = localeID; *p != 0; ++p, ++total) {
        if (*p == '_' || *p == '-') {
            /* A separator closes the current subtag, if there is one. */
            if (runLength > 0 && (shortest < 0 || runLength < shortest)) {
                shortest = runLength;
            }
            runLength = 0;
        } else {
            ++runLength;
        }
    }

    return (shortest < 0) ? total : shortest;
}
565
566/* ### Keywords **************************************************/
567
/* Size of the buffer that holds one canonicalized keyword name, terminator included. */
#define ULOC_KEYWORD_BUFFER_LEN     25
/* Number of entries in the keyword list built by _getKeywords(). */
#define ULOC_MAX_NO_KEYWORDS        25
570
571U_CAPI const char * U_EXPORT2
572locale_getKeywordsStart(const char *localeID) {
573    const char *result = NULL;
574    if((result = uprv_strchr(localeID, '@')) != NULL) {
575        return result;
576    }
577#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
578    else {
579        /* We do this because the @ sign is variant, and the @ sign used on one
580        EBCDIC machine won't be compiled the same way on other EBCDIC based
581        machines. */
582        static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
583        const uint8_t *charToFind = ebcdicSigns;
584        while(*charToFind) {
585            if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
586                return result;
587            }
588            charToFind++;
589        }
590    }
591#endif
592    return NULL;
593}
594
595/**
596 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
597 * @param keywordName incoming name to be canonicalized
598 * @param status return status (keyword too long)
599 * @return length of the keyword name
600 */
601static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
602{
603  int32_t i;
604  int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
605
606  if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
607    /* keyword name too long for internal buffer */
608    *status = U_INTERNAL_PROGRAM_ERROR;
609          return 0;
610  }
611
612  /* normalize the keyword name */
613  for(i = 0; i < keywordNameLen; i++) {
614    buf[i] = uprv_tolower(keywordName[i]);
615  }
616  buf[i] = 0;
617
618  return keywordNameLen;
619}
620
/* One parsed keyword=value pair, as collected by _getKeywords(). */
typedef struct {
    char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* zero-terminated, lowercased keyword name with spaces removed */
    int32_t keywordLen;                    /* length of keyword, excluding the terminator */
    const char *valueStart;                /* first character of the value; points into the caller's string and is NOT terminated at the end of the value */
    int32_t valueLen;                      /* number of value characters starting at valueStart */
} KeywordStruct;
627
628static int32_t U_CALLCONV
629compareKeywordStructs(const void *context, const void *left, const void *right) {
630    const char* leftString = ((const KeywordStruct *)left)->keyword;
631    const char* rightString = ((const KeywordStruct *)right)->keyword;
632    return uprv_strcmp(leftString, rightString);
633}
634
635/**
636 * Both addKeyword and addValue must already be in canonical form.
637 * Either both addKeyword and addValue are NULL, or neither is NULL.
638 * If they are not NULL they must be zero terminated.
639 * If addKeyword is not NULL is must have length small enough to fit in KeywordStruct.keyword.
640 */
641static int32_t
642_getKeywords(const char *localeID,
643             char prev,
644             char *keywords, int32_t keywordCapacity,
645             char *values, int32_t valuesCapacity, int32_t *valLen,
646             UBool valuesToo,
647             const char* addKeyword,
648             const char* addValue,
649             UErrorCode *status)
650{
651    KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
652
653    int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
654    int32_t numKeywords = 0;
655    const char* pos = localeID;
656    const char* equalSign = NULL;
657    const char* semicolon = NULL;
658    int32_t i = 0, j, n;
659    int32_t keywordsLen = 0;
660    int32_t valuesLen = 0;
661
662    if(prev == '@') { /* start of keyword definition */
663        /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
664        do {
665            UBool duplicate = FALSE;
666            /* skip leading spaces */
667            while(*pos == ' ') {
668                pos++;
669            }
670            if (!*pos) { /* handle trailing "; " */
671                break;
672            }
673            if(numKeywords == maxKeywords) {
674                *status = U_INTERNAL_PROGRAM_ERROR;
675                return 0;
676            }
677            equalSign = uprv_strchr(pos, '=');
678            semicolon = uprv_strchr(pos, ';');
679            /* lack of '=' [foo@currency] is illegal */
680            /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
681            if(!equalSign || (semicolon && semicolon<equalSign)) {
682                *status = U_INVALID_FORMAT_ERROR;
683                return 0;
684            }
685            /* need to normalize both keyword and keyword name */
686            if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
687                /* keyword name too long for internal buffer */
688                *status = U_INTERNAL_PROGRAM_ERROR;
689                return 0;
690            }
691            for(i = 0, n = 0; i < equalSign - pos; ++i) {
692                if (pos[i] != ' ') {
693                    keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
694                }
695            }
696            keywordList[numKeywords].keyword[n] = 0;
697            keywordList[numKeywords].keywordLen = n;
698            /* now grab the value part. First we skip the '=' */
699            equalSign++;
700            /* then we leading spaces */
701            while(*equalSign == ' ') {
702                equalSign++;
703            }
704            keywordList[numKeywords].valueStart = equalSign;
705
706            pos = semicolon;
707            i = 0;
708            if(pos) {
709                while(*(pos - i - 1) == ' ') {
710                    i++;
711                }
712                keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
713                pos++;
714            } else {
715                i = (int32_t)uprv_strlen(equalSign);
716                while(equalSign[i-1] == ' ') {
717                    i--;
718                }
719                keywordList[numKeywords].valueLen = i;
720            }
721            /* If this is a duplicate keyword, then ignore it */
722            for (j=0; j<numKeywords; ++j) {
723                if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
724                    duplicate = TRUE;
725                    break;
726                }
727            }
728            if (!duplicate) {
729                ++numKeywords;
730            }
731        } while(pos);
732
733        /* Handle addKeyword/addValue. */
734        if (addKeyword != NULL) {
735            UBool duplicate = FALSE;
736            U_ASSERT(addValue != NULL);
737            /* Search for duplicate; if found, do nothing. Explicit keyword
738               overrides addKeyword. */
739            for (j=0; j<numKeywords; ++j) {
740                if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
741                    duplicate = TRUE;
742                    break;
743                }
744            }
745            if (!duplicate) {
746                if (numKeywords == maxKeywords) {
747                    *status = U_INTERNAL_PROGRAM_ERROR;
748                    return 0;
749                }
750                uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
751                keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
752                keywordList[numKeywords].valueStart = addValue;
753                keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
754                ++numKeywords;
755            }
756        } else {
757            U_ASSERT(addValue == NULL);
758        }
759
760        /* now we have a list of keywords */
761        /* we need to sort it */
762        uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
763
764        /* Now construct the keyword part */
765        for(i = 0; i < numKeywords; i++) {
766            if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
767                uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
768                if(valuesToo) {
769                    keywords[keywordsLen + keywordList[i].keywordLen] = '=';
770                } else {
771                    keywords[keywordsLen + keywordList[i].keywordLen] = 0;
772                }
773            }
774            keywordsLen += keywordList[i].keywordLen + 1;
775            if(valuesToo) {
776                if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
777                    uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
778                }
779                keywordsLen += keywordList[i].valueLen;
780
781                if(i < numKeywords - 1) {
782                    if(keywordsLen < keywordCapacity) {
783                        keywords[keywordsLen] = ';';
784                    }
785                    keywordsLen++;
786                }
787            }
788            if(values) {
789                if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
790                    uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
791                    values[valuesLen + keywordList[i].valueLen] = 0;
792                }
793                valuesLen += keywordList[i].valueLen + 1;
794            }
795        }
796        if(values) {
797            values[valuesLen] = 0;
798            if(valLen) {
799                *valLen = valuesLen;
800            }
801        }
802        return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
803    } else {
804        return 0;
805    }
806}
807
808U_CFUNC int32_t
809locale_getKeywords(const char *localeID,
810                   char prev,
811                   char *keywords, int32_t keywordCapacity,
812                   char *values, int32_t valuesCapacity, int32_t *valLen,
813                   UBool valuesToo,
814                   UErrorCode *status) {
815    return _getKeywords(localeID, prev, keywords, keywordCapacity,
816                        values, valuesCapacity, valLen, valuesToo,
817                        NULL, NULL, status);
818}
819
820U_CAPI int32_t U_EXPORT2
821uloc_getKeywordValue(const char* localeID,
822                     const char* keywordName,
823                     char* buffer, int32_t bufferCapacity,
824                     UErrorCode* status)
825{
826    const char* startSearchHere = NULL;
827    const char* nextSeparator = NULL;
828    char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
829    char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
830    int32_t i = 0;
831    int32_t result = 0;
832
833    if(status && U_SUCCESS(*status) && localeID) {
834      char tempBuffer[ULOC_FULLNAME_CAPACITY];
835      const char* tmpLocaleID;
836
837      if (_hasBCP47Extension(localeID)) {
838          _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
839      } else {
840          tmpLocaleID=localeID;
841      }
842
843      startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
844      if(startSearchHere == NULL) {
845          /* no keywords, return at once */
846          return 0;
847      }
848
849      locale_canonKeywordName(keywordNameBuffer, keywordName, status);
850      if(U_FAILURE(*status)) {
851        return 0;
852      }
853
854      /* find the first keyword */
855      while(startSearchHere) {
856          startSearchHere++;
857          /* skip leading spaces (allowed?) */
858          while(*startSearchHere == ' ') {
859              startSearchHere++;
860          }
861          nextSeparator = uprv_strchr(startSearchHere, '=');
862          /* need to normalize both keyword and keyword name */
863          if(!nextSeparator) {
864              break;
865          }
866          if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
867              /* keyword name too long for internal buffer */
868              *status = U_INTERNAL_PROGRAM_ERROR;
869              return 0;
870          }
871          for(i = 0; i < nextSeparator - startSearchHere; i++) {
872              localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
873          }
874          /* trim trailing spaces */
875          while(startSearchHere[i-1] == ' ') {
876              i--;
877          }
878          localeKeywordNameBuffer[i] = 0;
879
880          startSearchHere = uprv_strchr(nextSeparator, ';');
881
882          if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
883              nextSeparator++;
884              while(*nextSeparator == ' ') {
885                  nextSeparator++;
886              }
887              /* we actually found the keyword. Copy the value */
888              if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
889                  while(*(startSearchHere-1) == ' ') {
890                      startSearchHere--;
891                  }
892                  uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
893                  result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
894              } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
895                  i = (int32_t)uprv_strlen(nextSeparator);
896                  while(nextSeparator[i - 1] == ' ') {
897                      i--;
898                  }
899                  uprv_strncpy(buffer, nextSeparator, i);
900                  result = u_terminateChars(buffer, bufferCapacity, i, status);
901              } else {
902                  /* give a bigger buffer, please */
903                  *status = U_BUFFER_OVERFLOW_ERROR;
904                  if(startSearchHere) {
905                      result = (int32_t)(startSearchHere - nextSeparator);
906                  } else {
907                      result = (int32_t)uprv_strlen(nextSeparator);
908                  }
909              }
910              return result;
911          }
912      }
913    }
914    return 0;
915}
916
917U_CAPI int32_t U_EXPORT2
918uloc_setKeywordValue(const char* keywordName,
919                     const char* keywordValue,
920                     char* buffer, int32_t bufferCapacity,
921                     UErrorCode* status)
922{
923    /* TODO: sorting. removal. */
924    int32_t keywordNameLen;
925    int32_t keywordValueLen;
926    int32_t bufLen;
927    int32_t needLen = 0;
928    int32_t foundValueLen;
929    int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
930    char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
931    char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
932    int32_t i = 0;
933    int32_t rc;
934    char* nextSeparator = NULL;
935    char* nextEqualsign = NULL;
936    char* startSearchHere = NULL;
937    char* keywordStart = NULL;
938    char *insertHere = NULL;
939    if(U_FAILURE(*status)) {
940        return -1;
941    }
942    if(bufferCapacity>1) {
943        bufLen = (int32_t)uprv_strlen(buffer);
944    } else {
945        *status = U_ILLEGAL_ARGUMENT_ERROR;
946        return 0;
947    }
948    if(bufferCapacity<bufLen) {
949        /* The capacity is less than the length?! Is this NULL terminated? */
950        *status = U_ILLEGAL_ARGUMENT_ERROR;
951        return 0;
952    }
953    if(keywordValue && !*keywordValue) {
954        keywordValue = NULL;
955    }
956    if(keywordValue) {
957        keywordValueLen = (int32_t)uprv_strlen(keywordValue);
958    } else {
959        keywordValueLen = 0;
960    }
961    keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
962    if(U_FAILURE(*status)) {
963        return 0;
964    }
965    startSearchHere = (char*)locale_getKeywordsStart(buffer);
966    if(startSearchHere == NULL || (startSearchHere[1]==0)) {
967        if(!keywordValue) { /* no keywords = nothing to remove */
968            return bufLen;
969        }
970
971        needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
972        if(startSearchHere) { /* had a single @ */
973            needLen--; /* already had the @ */
974            /* startSearchHere points at the @ */
975        } else {
976            startSearchHere=buffer+bufLen;
977        }
978        if(needLen >= bufferCapacity) {
979            *status = U_BUFFER_OVERFLOW_ERROR;
980            return needLen; /* no change */
981        }
982        *startSearchHere = '@';
983        startSearchHere++;
984        uprv_strcpy(startSearchHere, keywordNameBuffer);
985        startSearchHere += keywordNameLen;
986        *startSearchHere = '=';
987        startSearchHere++;
988        uprv_strcpy(startSearchHere, keywordValue);
989        startSearchHere+=keywordValueLen;
990        return needLen;
991    } /* end shortcut - no @ */
992
993    keywordStart = startSearchHere;
994    /* search for keyword */
995    while(keywordStart) {
996        keywordStart++;
997        /* skip leading spaces (allowed?) */
998        while(*keywordStart == ' ') {
999            keywordStart++;
1000        }
1001        nextEqualsign = uprv_strchr(keywordStart, '=');
1002        /* need to normalize both keyword and keyword name */
1003        if(!nextEqualsign) {
1004            break;
1005        }
1006        if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1007            /* keyword name too long for internal buffer */
1008            *status = U_INTERNAL_PROGRAM_ERROR;
1009            return 0;
1010        }
1011        for(i = 0; i < nextEqualsign - keywordStart; i++) {
1012            localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1013        }
1014        /* trim trailing spaces */
1015        while(keywordStart[i-1] == ' ') {
1016            i--;
1017        }
1018        localeKeywordNameBuffer[i] = 0;
1019
1020        nextSeparator = uprv_strchr(nextEqualsign, ';');
1021        rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1022        if(rc == 0) {
1023            nextEqualsign++;
1024            while(*nextEqualsign == ' ') {
1025                nextEqualsign++;
1026            }
1027            /* we actually found the keyword. Change the value */
1028            if (nextSeparator) {
1029                keywordAtEnd = 0;
1030                foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1031            } else {
1032                keywordAtEnd = 1;
1033                foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1034            }
1035            if(keywordValue) { /* adding a value - not removing */
1036              if(foundValueLen == keywordValueLen) {
1037                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1038                return bufLen; /* no change in size */
1039              } else if(foundValueLen > keywordValueLen) {
1040                int32_t delta = foundValueLen - keywordValueLen;
1041                if(nextSeparator) { /* RH side */
1042                  uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1043                }
1044                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1045                bufLen -= delta;
1046                buffer[bufLen]=0;
1047                return bufLen;
1048              } else { /* FVL < KVL */
1049                int32_t delta = keywordValueLen - foundValueLen;
1050                if((bufLen+delta) >= bufferCapacity) {
1051                  *status = U_BUFFER_OVERFLOW_ERROR;
1052                  return bufLen+delta;
1053                }
1054                if(nextSeparator) { /* RH side */
1055                  uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1056                }
1057                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1058                bufLen += delta;
1059                buffer[bufLen]=0;
1060                return bufLen;
1061              }
1062            } else { /* removing a keyword */
1063              if(keywordAtEnd) {
1064                /* zero out the ';' or '@' just before startSearchhere */
1065                keywordStart[-1] = 0;
1066                return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1067              } else {
1068                uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1069                keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1070                return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1071              }
1072            }
1073        } else if(rc<0){ /* end match keyword */
1074          /* could insert at this location. */
1075          insertHere = keywordStart;
1076        }
1077        keywordStart = nextSeparator;
1078    } /* end loop searching */
1079
1080    if(!keywordValue) {
1081      return bufLen; /* removal of non-extant keyword - no change */
1082    }
1083
1084    /* we know there is at least one keyword. */
1085    needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1086    if(needLen >= bufferCapacity) {
1087        *status = U_BUFFER_OVERFLOW_ERROR;
1088        return needLen; /* no change */
1089    }
1090
1091    if(insertHere) {
1092      uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1093      keywordStart = insertHere;
1094    } else {
1095      keywordStart = buffer+bufLen;
1096      *keywordStart = ';';
1097      keywordStart++;
1098    }
1099    uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1100    keywordStart += keywordNameLen;
1101    *keywordStart = '=';
1102    keywordStart++;
1103    uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1104    keywordStart+=keywordValueLen;
1105    if(insertHere) {
1106      *keywordStart = ';';
1107      keywordStart++;
1108    }
1109    buffer[needLen]=0;
1110    return needLen;
1111}
1112
1113/* ### ID parsing implementation **************************************************/
1114
/* TRUE if a is one of the letters that can start a special prefix: x, X, i or I.
   The argument is parenthesized so that any expression may be passed safely. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1125
1126static char* _strnchr(const char* str, int32_t len, char c) {
1127    U_ASSERT(str != 0 && len >= 0);
1128    while (len-- != 0) {
1129        char d = *str;
1130        if (d == c) {
1131            return (char*) str;
1132        } else if (d == 0) {
1133            break;
1134        }
1135        ++str;
1136    }
1137    return NULL;
1138}
1139
1140/**
1141 * Lookup 'key' in the array 'list'.  The array 'list' should contain
1142 * a NULL entry, followed by more entries, and a second NULL entry.
1143 *
1144 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
1145 * COUNTRIES_3.
1146 */
1147static int16_t _findIndex(const char* const* list, const char* key)
1148{
1149    const char* const* anchor = list;
1150    int32_t pass = 0;
1151
1152    /* Make two passes through two NULL-terminated arrays at 'list' */
1153    while (pass++ < 2) {
1154        while (*list) {
1155            if (uprv_strcmp(key, *list) == 0) {
1156                return (int16_t)(list - anchor);
1157            }
1158            list++;
1159        }
1160        ++list;     /* skip final NULL *CWB*/
1161    }
1162    return -1;
1163}
1164
1165/* count the length of src while copying it to dest; return strlen(src) */
1166static U_INLINE int32_t
1167_copyCount(char *dest, int32_t destCapacity, const char *src) {
1168    const char *anchor;
1169    char c;
1170
1171    anchor=src;
1172    for(;;) {
1173        if((c=*src)==0) {
1174            return (int32_t)(src-anchor);
1175        }
1176        if(destCapacity<=0) {
1177            return (int32_t)((src-anchor)+uprv_strlen(src));
1178        }
1179        ++src;
1180        *dest++=c;
1181        --destCapacity;
1182    }
1183}
1184
1185U_CFUNC const char*
1186uloc_getCurrentCountryID(const char* oldID){
1187    int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1188    if (offset >= 0) {
1189        return REPLACEMENT_COUNTRIES[offset];
1190    }
1191    return oldID;
1192}
1193U_CFUNC const char*
1194uloc_getCurrentLanguageID(const char* oldID){
1195    int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1196    if (offset >= 0) {
1197        return REPLACEMENT_LANGUAGES[offset];
1198    }
1199    return oldID;
1200}
1201/*
1202 * the internal functions _getLanguage(), _getCountry(), _getVariant()
1203 * avoid duplicating code to handle the earlier locale ID pieces
1204 * in the functions for the later ones by
1205 * setting the *pEnd pointer to where they stopped parsing
1206 *
1207 * TODO try to use this in Locale
1208 */
1209U_CFUNC int32_t
1210ulocimp_getLanguage(const char *localeID,
1211                    char *language, int32_t languageCapacity,
1212                    const char **pEnd) {
1213    int32_t i=0;
1214    int32_t offset;
1215    char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1216
1217    /* if it starts with i- or x- then copy that prefix */
1218    if(_isIDPrefix(localeID)) {
1219        if(i<languageCapacity) {
1220            language[i]=(char)uprv_tolower(*localeID);
1221        }
1222        if(i<languageCapacity) {
1223            language[i+1]='-';
1224        }
1225        i+=2;
1226        localeID+=2;
1227    }
1228
1229    /* copy the language as far as possible and count its length */
1230    while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1231        if(i<languageCapacity) {
1232            language[i]=(char)uprv_tolower(*localeID);
1233        }
1234        if(i<3) {
1235            lang[i]=(char)uprv_tolower(*localeID);
1236        }
1237        i++;
1238        localeID++;
1239    }
1240
1241    if(i==3) {
1242        /* convert 3 character code to 2 character code if possible *CWB*/
1243        offset=_findIndex(LANGUAGES_3, lang);
1244        if(offset>=0) {
1245            i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1246        }
1247    }
1248
1249    if(pEnd!=NULL) {
1250        *pEnd=localeID;
1251    }
1252    return i;
1253}
1254
1255U_CFUNC int32_t
1256ulocimp_getScript(const char *localeID,
1257                  char *script, int32_t scriptCapacity,
1258                  const char **pEnd)
1259{
1260    int32_t idLen = 0;
1261
1262    if (pEnd != NULL) {
1263        *pEnd = localeID;
1264    }
1265
1266    /* copy the second item as far as possible and count its length */
1267    while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1268        idLen++;
1269    }
1270
1271    /* If it's exactly 4 characters long, then it's a script and not a country. */
1272    if (idLen == 4) {
1273        int32_t i;
1274        if (pEnd != NULL) {
1275            *pEnd = localeID+idLen;
1276        }
1277        if(idLen > scriptCapacity) {
1278            idLen = scriptCapacity;
1279        }
1280        if (idLen >= 1) {
1281            script[0]=(char)uprv_toupper(*(localeID++));
1282        }
1283        for (i = 1; i < idLen; i++) {
1284            script[i]=(char)uprv_tolower(*(localeID++));
1285        }
1286    }
1287    else {
1288        idLen = 0;
1289    }
1290    return idLen;
1291}
1292
1293U_CFUNC int32_t
1294ulocimp_getCountry(const char *localeID,
1295                   char *country, int32_t countryCapacity,
1296                   const char **pEnd)
1297{
1298    int32_t idLen=0;
1299    char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1300    int32_t offset;
1301
1302    /* copy the country as far as possible and count its length */
1303    while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1304        if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1305            cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1306        }
1307        idLen++;
1308    }
1309
1310    /* the country should be either length 2 or 3 */
1311    if (idLen == 2 || idLen == 3) {
1312        UBool gotCountry = FALSE;
1313        /* convert 3 character code to 2 character code if possible *CWB*/
1314        if(idLen==3) {
1315            offset=_findIndex(COUNTRIES_3, cnty);
1316            if(offset>=0) {
1317                idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1318                gotCountry = TRUE;
1319            }
1320        }
1321        if (!gotCountry) {
1322            int32_t i = 0;
1323            for (i = 0; i < idLen; i++) {
1324                if (i < countryCapacity) {
1325                    country[i]=(char)uprv_toupper(localeID[i]);
1326                }
1327            }
1328        }
1329        localeID+=idLen;
1330    } else {
1331        idLen = 0;
1332    }
1333
1334    if(pEnd!=NULL) {
1335        *pEnd=localeID;
1336    }
1337
1338    return idLen;
1339}
1340
1341/**
1342 * @param needSeparator if true, then add leading '_' if any variants
1343 * are added to 'variant'
1344 */
1345static int32_t
1346_getVariantEx(const char *localeID,
1347              char prev,
1348              char *variant, int32_t variantCapacity,
1349              UBool needSeparator) {
1350    int32_t i=0;
1351
1352    /* get one or more variant tags and separate them with '_' */
1353    if(_isIDSeparator(prev)) {
1354        /* get a variant string after a '-' or '_' */
1355        while(!_isTerminator(*localeID)) {
1356            if (needSeparator) {
1357                if (i<variantCapacity) {
1358                    variant[i] = '_';
1359                }
1360                ++i;
1361                needSeparator = FALSE;
1362            }
1363            if(i<variantCapacity) {
1364                variant[i]=(char)uprv_toupper(*localeID);
1365                if(variant[i]=='-') {
1366                    variant[i]='_';
1367                }
1368            }
1369            i++;
1370            localeID++;
1371        }
1372    }
1373
1374    /* if there is no variant tag after a '-' or '_' then look for '@' */
1375    if(i==0) {
1376        if(prev=='@') {
1377            /* keep localeID */
1378        } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1379            ++localeID; /* point after the '@' */
1380        } else {
1381            return 0;
1382        }
1383        while(!_isTerminator(*localeID)) {
1384            if (needSeparator) {
1385                if (i<variantCapacity) {
1386                    variant[i] = '_';
1387                }
1388                ++i;
1389                needSeparator = FALSE;
1390            }
1391            if(i<variantCapacity) {
1392                variant[i]=(char)uprv_toupper(*localeID);
1393                if(variant[i]=='-' || variant[i]==',') {
1394                    variant[i]='_';
1395                }
1396            }
1397            i++;
1398            localeID++;
1399        }
1400    }
1401
1402    return i;
1403}
1404
1405static int32_t
1406_getVariant(const char *localeID,
1407            char prev,
1408            char *variant, int32_t variantCapacity) {
1409    return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1410}
1411
1412/**
1413 * Delete ALL instances of a variant from the given list of one or
1414 * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1415 * @param variants the source string of one or more variants,
1416 * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1417 * terminated; if it is, trailing zero will NOT be maintained.
1418 * @param variantsLen length of variants
1419 * @param toDelete variant to delete, without separators, e.g.  "EURO"
1420 * or "PREEURO"; not zero terminated
1421 * @param toDeleteLen length of toDelete
1422 * @return number of characters deleted from variants
1423 */
1424static int32_t
1425_deleteVariant(char* variants, int32_t variantsLen,
1426               const char* toDelete, int32_t toDeleteLen)
1427{
1428    int32_t delta = 0; /* number of chars deleted */
1429    for (;;) {
1430        UBool flag = FALSE;
1431        if (variantsLen < toDeleteLen) {
1432            return delta;
1433        }
1434        if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1435            (variantsLen == toDeleteLen ||
1436             (flag=(variants[toDeleteLen] == '_'))))
1437        {
1438            int32_t d = toDeleteLen + (flag?1:0);
1439            variantsLen -= d;
1440            delta += d;
1441            if (variantsLen > 0) {
1442                uprv_memmove(variants, variants+d, variantsLen);
1443            }
1444        } else {
1445            char* p = _strnchr(variants, variantsLen, '_');
1446            if (p == NULL) {
1447                return delta;
1448            }
1449            ++p;
1450            variantsLen -= (int32_t)(p - variants);
1451            variants = p;
1452        }
1453    }
1454}
1455
1456/* Keyword enumeration */
1457
/* State of one keyword enumeration, stored as the UEnumeration's context. */
typedef struct UKeywordsContext {
    char* keywords; /* the keyword list: consecutive zero-terminated names, ended by an empty string; released with uprv_free() when the enumeration is closed */
    char* current;  /* iteration cursor into keywords: the next name to return */
} UKeywordsContext;
1462
1463static void U_CALLCONV
1464uloc_kw_closeKeywords(UEnumeration *enumerator) {
1465    uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1466    uprv_free(enumerator->context);
1467    uprv_free(enumerator);
1468}
1469
1470static int32_t U_CALLCONV
1471uloc_kw_countKeywords(UEnumeration *en, UErrorCode *status) {
1472    char *kw = ((UKeywordsContext *)en->context)->keywords;
1473    int32_t result = 0;
1474    while(*kw) {
1475        result++;
1476        kw += uprv_strlen(kw)+1;
1477    }
1478    return result;
1479}
1480
1481static const char* U_CALLCONV
1482uloc_kw_nextKeyword(UEnumeration* en,
1483                    int32_t* resultLength,
1484                    UErrorCode* status) {
1485    const char* result = ((UKeywordsContext *)en->context)->current;
1486    int32_t len = 0;
1487    if(*result) {
1488        len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1489        ((UKeywordsContext *)en->context)->current += len+1;
1490    } else {
1491        result = NULL;
1492    }
1493    if (resultLength) {
1494        *resultLength = len;
1495    }
1496    return result;
1497}
1498
1499static void U_CALLCONV
1500uloc_kw_resetKeywords(UEnumeration* en,
1501                      UErrorCode* status) {
1502    ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1503}
1504
/*
 * Template for keyword enumerations.  uloc_openKeywordList copies this
 * structure byte-for-byte into each new UEnumeration and then sets the
 * per-instance context pointer.
 * NOTE(review): the initializer is positional; the slot meanings below are
 * inferred from the callback names -- confirm against the UEnumeration
 * declaration.
 */
static const UEnumeration gKeywordsEnum = {
    NULL,                   /* left empty in the template */
    NULL,                   /* left empty in the template */
    uloc_kw_closeKeywords,  /* frees buffer, context and enumeration */
    uloc_kw_countKeywords,  /* counts all keywords in the list */
    uenum_unextDefault,     /* shared default (defined outside this section) */
    uloc_kw_nextKeyword,    /* returns the next keyword as char* */
    uloc_kw_resetKeywords   /* rewinds to the first keyword */
};
1514
1515U_CAPI UEnumeration* U_EXPORT2
1516uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1517{
1518    UKeywordsContext *myContext = NULL;
1519    UEnumeration *result = NULL;
1520
1521    if(U_FAILURE(*status)) {
1522        return NULL;
1523    }
1524    result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1525    /* Null pointer test */
1526    if (result == NULL) {
1527        *status = U_MEMORY_ALLOCATION_ERROR;
1528        return NULL;
1529    }
1530    uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1531    myContext = uprv_malloc(sizeof(UKeywordsContext));
1532    if (myContext == NULL) {
1533        *status = U_MEMORY_ALLOCATION_ERROR;
1534        uprv_free(result);
1535        return NULL;
1536    }
1537    myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1538    uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1539    myContext->keywords[keywordListSize] = 0;
1540    myContext->current = myContext->keywords;
1541    result->context = myContext;
1542    return result;
1543}
1544
1545U_CAPI UEnumeration* U_EXPORT2
1546uloc_openKeywords(const char* localeID,
1547                        UErrorCode* status)
1548{
1549    int32_t i=0;
1550    char keywords[256];
1551    int32_t keywordsCapacity = 256;
1552    char tempBuffer[ULOC_FULLNAME_CAPACITY];
1553    const char* tmpLocaleID;
1554
1555    if(status==NULL || U_FAILURE(*status)) {
1556        return 0;
1557    }
1558
1559    if (_hasBCP47Extension(localeID)) {
1560        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1561    } else {
1562        if (localeID==NULL) {
1563           localeID=uloc_getDefault();
1564        }
1565        tmpLocaleID=localeID;
1566    }
1567
1568    /* Skip the language */
1569    ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1570    if(_isIDSeparator(*tmpLocaleID)) {
1571        const char *scriptID;
1572        /* Skip the script if available */
1573        ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1574        if(scriptID != tmpLocaleID+1) {
1575            /* Found optional script */
1576            tmpLocaleID = scriptID;
1577        }
1578        /* Skip the Country */
1579        if (_isIDSeparator(*tmpLocaleID)) {
1580            ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1581            if(_isIDSeparator(*tmpLocaleID)) {
1582                _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1583            }
1584        }
1585    }
1586
1587    /* keywords are located after '@' */
1588    if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1589        i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1590    }
1591
1592    if(i) {
1593        return uloc_openKeywordList(keywords, i, status);
1594    } else {
1595        return NULL;
1596    }
1597}
1598
1599
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/* TRUE if any bit of mask is set in options.  Both arguments are
   parenthesized so that compound expressions (e.g. "a | b") expand safely. */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* "i-default" spelled out without a terminating NUL; compare with
   I_DEFAULT_LENGTH. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
/* Length as int32_t so comparisons with int32_t lengths stay signed. */
#define I_DEFAULT_LENGTH ((int32_t)(sizeof i_default / sizeof i_default[0]))
1608
1609/**
1610 * Canonicalize the given localeID, to level 1 or to level 2,
1611 * depending on the options.  To specify level 1, pass in options=0.
1612 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1613 *
1614 * This is the code underlying uloc_getName and uloc_canonicalize.
1615 */
1616static int32_t
1617_canonicalize(const char* localeID,
1618              char* result,
1619              int32_t resultCapacity,
1620              uint32_t options,
1621              UErrorCode* err) {
1622    int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1623    char localeBuffer[ULOC_FULLNAME_CAPACITY];
1624    char tempBuffer[ULOC_FULLNAME_CAPACITY];
1625    const char* origLocaleID;
1626    const char* tmpLocaleID;
1627    const char* keywordAssign = NULL;
1628    const char* separatorIndicator = NULL;
1629    const char* addKeyword = NULL;
1630    const char* addValue = NULL;
1631    char* name;
1632    char* variant = NULL; /* pointer into name, or NULL */
1633
1634    if (U_FAILURE(*err)) {
1635        return 0;
1636    }
1637
1638    if (_hasBCP47Extension(localeID)) {
1639        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1640    } else {
1641        if (localeID==NULL) {
1642           localeID=uloc_getDefault();
1643        }
1644        tmpLocaleID=localeID;
1645    }
1646
1647    origLocaleID=tmpLocaleID;
1648
1649    /* if we are doing a full canonicalization, then put results in
1650       localeBuffer, if necessary; otherwise send them to result. */
1651    if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1652        (result == NULL || resultCapacity <  sizeof(localeBuffer))) {
1653        name = localeBuffer;
1654        nameCapacity = sizeof(localeBuffer);
1655    } else {
1656        name = result;
1657        nameCapacity = resultCapacity;
1658    }
1659
1660    /* get all pieces, one after another, and separate with '_' */
1661    len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1662
1663    if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1664        const char *d = uloc_getDefault();
1665
1666        len = (int32_t)uprv_strlen(d);
1667
1668        if (name != NULL) {
1669            uprv_strncpy(name, d, len);
1670        }
1671    } else if(_isIDSeparator(*tmpLocaleID)) {
1672        const char *scriptID;
1673
1674        ++fieldCount;
1675        if(len<nameCapacity) {
1676            name[len]='_';
1677        }
1678        ++len;
1679
1680        scriptSize=ulocimp_getScript(tmpLocaleID+1, name+len, nameCapacity-len, &scriptID);
1681        if(scriptSize > 0) {
1682            /* Found optional script */
1683            tmpLocaleID = scriptID;
1684            ++fieldCount;
1685            len+=scriptSize;
1686            if (_isIDSeparator(*tmpLocaleID)) {
1687                /* If there is something else, then we add the _ */
1688                if(len<nameCapacity) {
1689                    name[len]='_';
1690                }
1691                ++len;
1692            }
1693        }
1694
1695        if (_isIDSeparator(*tmpLocaleID)) {
1696            const char *cntryID;
1697            int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1, name+len, nameCapacity-len, &cntryID);
1698            if (cntrySize > 0) {
1699                /* Found optional country */
1700                tmpLocaleID = cntryID;
1701                len+=cntrySize;
1702            }
1703            if(_isIDSeparator(*tmpLocaleID)) {
1704                /* If there is something else, then we add the _  if we found country before.*/
1705                if (cntrySize > 0) {
1706                    ++fieldCount;
1707                    if(len<nameCapacity) {
1708                        name[len]='_';
1709                    }
1710                    ++len;
1711                }
1712
1713                variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID, name+len, nameCapacity-len);
1714                if (variantSize > 0) {
1715                    variant = name+len;
1716                    len += variantSize;
1717                    tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1718                }
1719            }
1720        }
1721    }
1722
1723    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1724    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1725        UBool done = FALSE;
1726        do {
1727            char c = *tmpLocaleID;
1728            switch (c) {
1729            case 0:
1730            case '@':
1731                done = TRUE;
1732                break;
1733            default:
1734                if (len<nameCapacity) {
1735                    name[len] = c;
1736                }
1737                ++len;
1738                ++tmpLocaleID;
1739                break;
1740            }
1741        } while (!done);
1742    }
1743
1744    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1745       After this, tmpLocaleID either points to '@' or is NULL */
1746    if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1747        keywordAssign = uprv_strchr(tmpLocaleID, '=');
1748        separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1749    }
1750
1751    /* Copy POSIX-style variant, if any [mr@FOO] */
1752    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1753        tmpLocaleID != NULL && keywordAssign == NULL) {
1754        for (;;) {
1755            char c = *tmpLocaleID;
1756            if (c == 0) {
1757                break;
1758            }
1759            if (len<nameCapacity) {
1760                name[len] = c;
1761            }
1762            ++len;
1763            ++tmpLocaleID;
1764        }
1765    }
1766
1767    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1768        /* Handle @FOO variant if @ is present and not followed by = */
1769        if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1770            int32_t posixVariantSize;
1771            /* Add missing '_' if needed */
1772            if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1773                do {
1774                    if(len<nameCapacity) {
1775                        name[len]='_';
1776                    }
1777                    ++len;
1778                    ++fieldCount;
1779                } while(fieldCount<2);
1780            }
1781            posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1782                                             (UBool)(variantSize > 0));
1783            if (posixVariantSize > 0) {
1784                if (variant == NULL) {
1785                    variant = name+len;
1786                }
1787                len += posixVariantSize;
1788                variantSize += posixVariantSize;
1789            }
1790        }
1791
1792        /* Handle generic variants first */
1793        if (variant) {
1794            for (j=0; j<(int32_t)(sizeof(VARIANT_MAP)/sizeof(VARIANT_MAP[0])); j++) {
1795                const char* variantToCompare = VARIANT_MAP[j].variant;
1796                int32_t n = (int32_t)uprv_strlen(variantToCompare);
1797                int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1798                len -= variantLen;
1799                if (variantLen > 0) {
1800                    if (name[len-1] == '_') { /* delete trailing '_' */
1801                        --len;
1802                    }
1803                    addKeyword = VARIANT_MAP[j].keyword;
1804                    addValue = VARIANT_MAP[j].value;
1805                    break;
1806                }
1807            }
1808            if (name[len-1] == '_') { /* delete trailing '_' */
1809                --len;
1810            }
1811        }
1812
1813        /* Look up the ID in the canonicalization map */
1814        for (j=0; j<(int32_t)(sizeof(CANONICALIZE_MAP)/sizeof(CANONICALIZE_MAP[0])); j++) {
1815            const char* id = CANONICALIZE_MAP[j].id;
1816            int32_t n = (int32_t)uprv_strlen(id);
1817            if (len == n && uprv_strncmp(name, id, n) == 0) {
1818                if (n == 0 && tmpLocaleID != NULL) {
1819                    break; /* Don't remap "" if keywords present */
1820                }
1821                len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1822                if (CANONICALIZE_MAP[j].keyword) {
1823                    addKeyword = CANONICALIZE_MAP[j].keyword;
1824                    addValue = CANONICALIZE_MAP[j].value;
1825                }
1826                break;
1827            }
1828        }
1829    }
1830
1831    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1832        if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1833            (!separatorIndicator || separatorIndicator > keywordAssign)) {
1834            if(len<nameCapacity) {
1835                name[len]='@';
1836            }
1837            ++len;
1838            ++fieldCount;
1839            len += _getKeywords(tmpLocaleID+1, '@', name+len, nameCapacity-len, NULL, 0, NULL, TRUE,
1840                                addKeyword, addValue, err);
1841        } else if (addKeyword != NULL) {
1842            U_ASSERT(addValue != NULL);
1843            /* inelegant but works -- later make _getKeywords do this? */
1844            len += _copyCount(name+len, nameCapacity-len, "@");
1845            len += _copyCount(name+len, nameCapacity-len, addKeyword);
1846            len += _copyCount(name+len, nameCapacity-len, "=");
1847            len += _copyCount(name+len, nameCapacity-len, addValue);
1848        }
1849    }
1850
1851    if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1852        uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1853    }
1854
1855    return u_terminateChars(result, resultCapacity, len, err);
1856}
1857
1858/* ### ID parsing API **************************************************/
1859
1860U_CAPI int32_t  U_EXPORT2
1861uloc_getParent(const char*    localeID,
1862               char* parent,
1863               int32_t parentCapacity,
1864               UErrorCode* err)
1865{
1866    const char *lastUnderscore;
1867    int32_t i;
1868
1869    if (U_FAILURE(*err))
1870        return 0;
1871
1872    if (localeID == NULL)
1873        localeID = uloc_getDefault();
1874
1875    lastUnderscore=uprv_strrchr(localeID, '_');
1876    if(lastUnderscore!=NULL) {
1877        i=(int32_t)(lastUnderscore-localeID);
1878    } else {
1879        i=0;
1880    }
1881
1882    if(i>0 && parent != localeID) {
1883        uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1884    }
1885    return u_terminateChars(parent, parentCapacity, i, err);
1886}
1887
1888U_CAPI int32_t U_EXPORT2
1889uloc_getLanguage(const char*    localeID,
1890         char* language,
1891         int32_t languageCapacity,
1892         UErrorCode* err)
1893{
1894    /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1895    int32_t i=0;
1896
1897    if (err==NULL || U_FAILURE(*err)) {
1898        return 0;
1899    }
1900
1901    if(localeID==NULL) {
1902        localeID=uloc_getDefault();
1903    }
1904
1905    i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1906    return u_terminateChars(language, languageCapacity, i, err);
1907}
1908
1909U_CAPI int32_t U_EXPORT2
1910uloc_getScript(const char*    localeID,
1911         char* script,
1912         int32_t scriptCapacity,
1913         UErrorCode* err)
1914{
1915    int32_t i=0;
1916
1917    if(err==NULL || U_FAILURE(*err)) {
1918        return 0;
1919    }
1920
1921    if(localeID==NULL) {
1922        localeID=uloc_getDefault();
1923    }
1924
1925    /* skip the language */
1926    ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1927    if(_isIDSeparator(*localeID)) {
1928        i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1929    }
1930    return u_terminateChars(script, scriptCapacity, i, err);
1931}
1932
1933U_CAPI int32_t  U_EXPORT2
1934uloc_getCountry(const char* localeID,
1935            char* country,
1936            int32_t countryCapacity,
1937            UErrorCode* err)
1938{
1939    int32_t i=0;
1940
1941    if(err==NULL || U_FAILURE(*err)) {
1942        return 0;
1943    }
1944
1945    if(localeID==NULL) {
1946        localeID=uloc_getDefault();
1947    }
1948
1949    /* Skip the language */
1950    ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1951    if(_isIDSeparator(*localeID)) {
1952        const char *scriptID;
1953        /* Skip the script if available */
1954        ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1955        if(scriptID != localeID+1) {
1956            /* Found optional script */
1957            localeID = scriptID;
1958        }
1959        if(_isIDSeparator(*localeID)) {
1960            i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1961        }
1962    }
1963    return u_terminateChars(country, countryCapacity, i, err);
1964}
1965
1966U_CAPI int32_t  U_EXPORT2
1967uloc_getVariant(const char* localeID,
1968                char* variant,
1969                int32_t variantCapacity,
1970                UErrorCode* err)
1971{
1972    char tempBuffer[ULOC_FULLNAME_CAPACITY];
1973    const char* tmpLocaleID;
1974    int32_t i=0;
1975
1976    if(err==NULL || U_FAILURE(*err)) {
1977        return 0;
1978    }
1979
1980    if (_hasBCP47Extension(localeID)) {
1981        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1982    } else {
1983        if (localeID==NULL) {
1984           localeID=uloc_getDefault();
1985        }
1986        tmpLocaleID=localeID;
1987    }
1988
1989    /* Skip the language */
1990    ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1991    if(_isIDSeparator(*tmpLocaleID)) {
1992        const char *scriptID;
1993        /* Skip the script if available */
1994        ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1995        if(scriptID != tmpLocaleID+1) {
1996            /* Found optional script */
1997            tmpLocaleID = scriptID;
1998        }
1999        /* Skip the Country */
2000        if (_isIDSeparator(*tmpLocaleID)) {
2001            const char *cntryID;
2002            ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2003            if (cntryID != tmpLocaleID+1) {
2004                /* Found optional country */
2005                tmpLocaleID = cntryID;
2006            }
2007            if(_isIDSeparator(*tmpLocaleID)) {
2008                /* If there was no country ID, skip a possible extra IDSeparator */
2009                if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2010                    tmpLocaleID++;
2011                }
2012                i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2013            }
2014        }
2015    }
2016
2017    /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2018    /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2019/*
2020    if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2021        i=_getVariant(localeID+1, '@', variant, variantCapacity);
2022    }
2023*/
2024    return u_terminateChars(variant, variantCapacity, i, err);
2025}
2026
2027U_CAPI int32_t  U_EXPORT2
2028uloc_getName(const char* localeID,
2029             char* name,
2030             int32_t nameCapacity,
2031             UErrorCode* err)
2032{
2033    return _canonicalize(localeID, name, nameCapacity, 0, err);
2034}
2035
2036U_CAPI int32_t  U_EXPORT2
2037uloc_getBaseName(const char* localeID,
2038                 char* name,
2039                 int32_t nameCapacity,
2040                 UErrorCode* err)
2041{
2042    return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2043}
2044
2045U_CAPI int32_t  U_EXPORT2
2046uloc_canonicalize(const char* localeID,
2047                  char* name,
2048                  int32_t nameCapacity,
2049                  UErrorCode* err)
2050{
2051    return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2052}
2053
2054U_CAPI const char*  U_EXPORT2
2055uloc_getISO3Language(const char* localeID)
2056{
2057    int16_t offset;
2058    char lang[ULOC_LANG_CAPACITY];
2059    UErrorCode err = U_ZERO_ERROR;
2060
2061    if (localeID == NULL)
2062    {
2063        localeID = uloc_getDefault();
2064    }
2065    uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2066    if (U_FAILURE(err))
2067        return "";
2068    offset = _findIndex(LANGUAGES, lang);
2069    if (offset < 0)
2070        return "";
2071    return LANGUAGES_3[offset];
2072}
2073
2074U_CAPI const char*  U_EXPORT2
2075uloc_getISO3Country(const char* localeID)
2076{
2077    int16_t offset;
2078    char cntry[ULOC_LANG_CAPACITY];
2079    UErrorCode err = U_ZERO_ERROR;
2080
2081    if (localeID == NULL)
2082    {
2083        localeID = uloc_getDefault();
2084    }
2085    uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2086    if (U_FAILURE(err))
2087        return "";
2088    offset = _findIndex(COUNTRIES, cntry);
2089    if (offset < 0)
2090        return "";
2091
2092    return COUNTRIES_3[offset];
2093}
2094
2095U_CAPI uint32_t  U_EXPORT2
2096uloc_getLCID(const char* localeID)
2097{
2098    UErrorCode status = U_ZERO_ERROR;
2099    char       langID[ULOC_FULLNAME_CAPACITY];
2100
2101    uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2102    if (U_FAILURE(status)) {
2103        return 0;
2104    }
2105
2106    return uprv_convertToLCID(langID, localeID, &status);
2107}
2108
2109U_CAPI int32_t U_EXPORT2
2110uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2111                UErrorCode *status)
2112{
2113    int32_t length;
2114    const char *posix = uprv_convertToPosix(hostid, status);
2115    if (U_FAILURE(*status) || posix == NULL) {
2116        return 0;
2117    }
2118    length = (int32_t)uprv_strlen(posix);
2119    if (length+1 > localeCapacity) {
2120        *status = U_BUFFER_OVERFLOW_ERROR;
2121    }
2122    else {
2123        uprv_strcpy(locale, posix);
2124    }
2125    return length;
2126}
2127
2128/* ### Default locale **************************************************/
2129
2130U_CAPI const char*  U_EXPORT2
2131uloc_getDefault()
2132{
2133    return locale_get_default();
2134}
2135
2136U_CAPI void  U_EXPORT2
2137uloc_setDefault(const char*   newDefaultLocale,
2138             UErrorCode* err)
2139{
2140    if (U_FAILURE(*err))
2141        return;
2142    /* the error code isn't currently used for anything by this function*/
2143
2144    /* propagate change to C++ */
2145    locale_set_default(newDefaultLocale);
2146}
2147
2148/**
2149 * Returns a list of all language codes defined in ISO 639.  This is a pointer
2150 * to an array of pointers to arrays of char.  All of these pointers are owned
2151 * by ICU-- do not delete them, and do not write through them.  The array is
2152 * terminated with a null pointer.
2153 */
2154U_CAPI const char* const*  U_EXPORT2
2155uloc_getISOLanguages()
2156{
2157    return LANGUAGES;
2158}
2159
2160/**
2161 * Returns a list of all 2-letter country codes defined in ISO 639.  This is a
2162 * pointer to an array of pointers to arrays of char.  All of these pointers are
2163 * owned by ICU-- do not delete them, and do not write through them.  The array is
2164 * terminated with a null pointer.
2165 */
2166U_CAPI const char* const*  U_EXPORT2
2167uloc_getISOCountries()
2168{
2169    return COUNTRIES;
2170}
2171
2172
/* this function to be moved into cstring.c later */

/* Decimal separator produced by the C library's number formatting;
   0 until the first call of _uloc_strtod determines it. */
static char gDecimal = 0;

/**
 * strtod wrapper for input that always uses '.' as the decimal point, even
 * when the C library currently formats and parses with another character.
 *
 * @param start text to parse
 * @param end   if non-NULL, receives the position in start where parsing stopped
 * @return the parsed value
 */
static double
_uloc_strtod(const char *start, char **end) {
    char buf[30];
    char *decimal;
    char *parsedEnd;
    double value;

    if (!gDecimal) {
        char rep[5];
        /* For machines that decide to change the decimal on you,
        and try to be too smart with localization.
        This normally should be just a '.'. */
        sprintf(rep, "%+1.1f", 1.0);
        gDecimal = rep[2];
    }

    if (gDecimal == '.') {
        return uprv_strtod(start, end); /* nothing to translate */
    }

    /* Work on a bounded local copy so the separator can be rewritten. */
    uprv_strncpy(buf, start, 29);
    buf[29]=0;
    decimal = uprv_strchr(buf, '.');
    if (!decimal) {
        return uprv_strtod(start, end); /* no decimal point */
    }
    *decimal = gDecimal;

    value = uprv_strtod(buf, &parsedEnd);
    if (end) {
        /* map the stop position in the copy back onto the caller's string;
           cast away const (to follow uprv_strtod API.) */
        *end = (char*)(start+(parsedEnd-buf));
    }
    return value;
}
2211
/* One entry parsed from an HTTP Accept-Language header; an array of these is
   sorted with uloc_acceptLanguageCompare. */
typedef struct {
    float q;        /* weight: the value parsed after ';', or 1.0 when the
                       entry has no parameter */
    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
    char *locale;   /* heap-allocated locale ID for this entry */
} _acceptLangItem;
2217
2218static int32_t U_CALLCONV
2219uloc_acceptLanguageCompare(const void *context, const void *a, const void *b)
2220{
2221    const _acceptLangItem *aa = (const _acceptLangItem*)a;
2222    const _acceptLangItem *bb = (const _acceptLangItem*)b;
2223
2224    int32_t rc = 0;
2225    if(bb->q < aa->q) {
2226        rc = -1;  /* A > B */
2227    } else if(bb->q > aa->q) {
2228        rc = 1;   /* A < B */
2229    } else {
2230        rc = 0;   /* A = B */
2231    }
2232
2233    if(rc==0) {
2234        rc = uprv_stricmp(aa->locale, bb->locale);
2235    }
2236
2237#if defined(ULOC_DEBUG)
2238    /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2239    aa->locale, aa->q,
2240    bb->locale, bb->q,
2241    rc);*/
2242#endif
2243
2244    return rc;
2245}
2246
2247/*
2248mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2249*/
2250
2251U_CAPI int32_t U_EXPORT2
2252uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2253                            const char *httpAcceptLanguage,
2254                            UEnumeration* availableLocales,
2255                            UErrorCode *status)
2256{
2257    _acceptLangItem *j;
2258    _acceptLangItem smallBuffer[30];
2259    char **strs;
2260    char tmp[ULOC_FULLNAME_CAPACITY +1];
2261    int32_t n = 0;
2262    const char *itemEnd;
2263    const char *paramEnd;
2264    const char *s;
2265    const char *t;
2266    int32_t res;
2267    int32_t i;
2268    int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2269    int32_t jSize;
2270    char *tempstr; /* Use for null pointer check */
2271
2272    j = smallBuffer;
2273    jSize = sizeof(smallBuffer)/sizeof(smallBuffer[0]);
2274    if(U_FAILURE(*status)) {
2275        return -1;
2276    }
2277
2278    for(s=httpAcceptLanguage;s&&*s;) {
2279        while(isspace(*s)) /* eat space at the beginning */
2280            s++;
2281        itemEnd=uprv_strchr(s,',');
2282        paramEnd=uprv_strchr(s,';');
2283        if(!itemEnd) {
2284            itemEnd = httpAcceptLanguage+l; /* end of string */
2285        }
2286        if(paramEnd && paramEnd<itemEnd) {
2287            /* semicolon (;) is closer than end (,) */
2288            t = paramEnd+1;
2289            if(*t=='q') {
2290                t++;
2291            }
2292            while(isspace(*t)) {
2293                t++;
2294            }
2295            if(*t=='=') {
2296                t++;
2297            }
2298            while(isspace(*t)) {
2299                t++;
2300            }
2301            j[n].q = (float)_uloc_strtod(t,NULL);
2302        } else {
2303            /* no semicolon - it's 1.0 */
2304            j[n].q = 1.0f;
2305            paramEnd = itemEnd;
2306        }
2307        j[n].dummy=0;
2308        /* eat spaces prior to semi */
2309        for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2310            ;
2311        /* Check for null pointer from uprv_strndup */
2312        tempstr = uprv_strndup(s,(int32_t)((t+1)-s));
2313        if (tempstr == NULL) {
2314            *status = U_MEMORY_ALLOCATION_ERROR;
2315            return -1;
2316        }
2317        j[n].locale = tempstr;
2318        uloc_canonicalize(j[n].locale,tmp,sizeof(tmp)/sizeof(tmp[0]),status);
2319        if(strcmp(j[n].locale,tmp)) {
2320            uprv_free(j[n].locale);
2321            j[n].locale=uprv_strdup(tmp);
2322        }
2323#if defined(ULOC_DEBUG)
2324        /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2325#endif
2326        n++;
2327        s = itemEnd;
2328        while(*s==',') { /* eat duplicate commas */
2329            s++;
2330        }
2331        if(n>=jSize) {
2332            if(j==smallBuffer) {  /* overflowed the small buffer. */
2333                j = uprv_malloc(sizeof(j[0])*(jSize*2));
2334                if(j!=NULL) {
2335                    uprv_memcpy(j,smallBuffer,sizeof(j[0])*jSize);
2336                }
2337#if defined(ULOC_DEBUG)
2338                fprintf(stderr,"malloced at size %d\n", jSize);
2339#endif
2340            } else {
2341                j = uprv_realloc(j, sizeof(j[0])*jSize*2);
2342#if defined(ULOC_DEBUG)
2343                fprintf(stderr,"re-alloced at size %d\n", jSize);
2344#endif
2345            }
2346            jSize *= 2;
2347            if(j==NULL) {
2348                *status = U_MEMORY_ALLOCATION_ERROR;
2349                return -1;
2350            }
2351        }
2352    }
2353    uprv_sortArray(j, n, sizeof(j[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2354    if(U_FAILURE(*status)) {
2355        if(j != smallBuffer) {
2356#if defined(ULOC_DEBUG)
2357            fprintf(stderr,"freeing j %p\n", j);
2358#endif
2359            uprv_free(j);
2360        }
2361        return -1;
2362    }
2363    strs = uprv_malloc((size_t)(sizeof(strs[0])*n));
2364    /* Check for null pointer */
2365    if (strs == NULL) {
2366        uprv_free(j); /* Free to avoid memory leak */
2367        *status = U_MEMORY_ALLOCATION_ERROR;
2368        return -1;
2369    }
2370    for(i=0;i<n;i++) {
2371#if defined(ULOC_DEBUG)
2372        /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2373#endif
2374        strs[i]=j[i].locale;
2375    }
2376    res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2377        (const char**)strs, n, availableLocales, status);
2378    for(i=0;i<n;i++) {
2379        uprv_free(strs[i]);
2380    }
2381    uprv_free(strs);
2382    if(j != smallBuffer) {
2383#if defined(ULOC_DEBUG)
2384        fprintf(stderr,"freeing j %p\n", j);
2385#endif
2386        uprv_free(j);
2387    }
2388    return res;
2389}
2390
2391
2392U_CAPI int32_t U_EXPORT2
2393uloc_acceptLanguage(char *result, int32_t resultAvailable,
2394                    UAcceptResult *outResult, const char **acceptList,
2395                    int32_t acceptListCount,
2396                    UEnumeration* availableLocales,
2397                    UErrorCode *status)
2398{
2399    int32_t i,j;
2400    int32_t len;
2401    int32_t maxLen=0;
2402    char tmp[ULOC_FULLNAME_CAPACITY+1];
2403    const char *l;
2404    char **fallbackList;
2405    if(U_FAILURE(*status)) {
2406        return -1;
2407    }
2408    fallbackList = uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount));
2409    if(fallbackList==NULL) {
2410        *status = U_MEMORY_ALLOCATION_ERROR;
2411        return -1;
2412    }
2413    for(i=0;i<acceptListCount;i++) {
2414#if defined(ULOC_DEBUG)
2415        fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2416#endif
2417        while((l=uenum_next(availableLocales, NULL, status))) {
2418#if defined(ULOC_DEBUG)
2419            fprintf(stderr,"  %s\n", l);
2420#endif
2421            len = (int32_t)uprv_strlen(l);
2422            if(!uprv_strcmp(acceptList[i], l)) {
2423                if(outResult) {
2424                    *outResult = ULOC_ACCEPT_VALID;
2425                }
2426#if defined(ULOC_DEBUG)
2427                fprintf(stderr, "MATCH! %s\n", l);
2428#endif
2429                if(len>0) {
2430                    uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2431                }
2432                for(j=0;j<i;j++) {
2433                    uprv_free(fallbackList[j]);
2434                }
2435                uprv_free(fallbackList);
2436                return u_terminateChars(result, resultAvailable, len, status);
2437            }
2438            if(len>maxLen) {
2439                maxLen = len;
2440            }
2441        }
2442        uenum_reset(availableLocales, status);
2443        /* save off parent info */
2444        if(uloc_getParent(acceptList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2445            fallbackList[i] = uprv_strdup(tmp);
2446        } else {
2447            fallbackList[i]=0;
2448        }
2449    }
2450
2451    for(maxLen--;maxLen>0;maxLen--) {
2452        for(i=0;i<acceptListCount;i++) {
2453            if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2454#if defined(ULOC_DEBUG)
2455                fprintf(stderr,"Try: [%s]", fallbackList[i]);
2456#endif
2457                while((l=uenum_next(availableLocales, NULL, status))) {
2458#if defined(ULOC_DEBUG)
2459                    fprintf(stderr,"  %s\n", l);
2460#endif
2461                    len = (int32_t)uprv_strlen(l);
2462                    if(!uprv_strcmp(fallbackList[i], l)) {
2463                        if(outResult) {
2464                            *outResult = ULOC_ACCEPT_FALLBACK;
2465                        }
2466#if defined(ULOC_DEBUG)
2467                        fprintf(stderr, "fallback MATCH! %s\n", l);
2468#endif
2469                        if(len>0) {
2470                            uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2471                        }
2472                        for(j=0;j<acceptListCount;j++) {
2473                            uprv_free(fallbackList[j]);
2474                        }
2475                        uprv_free(fallbackList);
2476                        return u_terminateChars(result, resultAvailable, len, status);
2477                    }
2478                }
2479                uenum_reset(availableLocales, status);
2480
2481                if(uloc_getParent(fallbackList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2482                    uprv_free(fallbackList[i]);
2483                    fallbackList[i] = uprv_strdup(tmp);
2484                } else {
2485                    uprv_free(fallbackList[i]);
2486                    fallbackList[i]=0;
2487                }
2488            }
2489        }
2490        if(outResult) {
2491            *outResult = ULOC_ACCEPT_FAILED;
2492        }
2493    }
2494    for(i=0;i<acceptListCount;i++) {
2495        uprv_free(fallbackList[i]);
2496    }
2497    uprv_free(fallbackList);
2498    return -1;
2499}
2500
2501/*eof*/
2502