1/*
2**********************************************************************
3*   Copyright (C) 1997-2010, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*
7* File ULOC.CPP
8*
9* Modification History:
10*
11*   Date        Name        Description
12*   04/01/97    aliu        Creation.
13*   08/21/98    stephen     JDK 1.2 sync
14*   12/08/98    rtg         New Locale implementation and C API
15*   03/15/99    damiba      overhaul.
16*   04/06/99    stephen     changed setDefault() to realloc and copy
17*   06/14/99    stephen     Changed calls to ures_open for new params
18*   07/21/99    stephen     Modified setDefault() to propagate to C++
19*   05/14/04    alan        7 years later: refactored, cleaned up, fixed bugs,
20*                           brought canonicalization code into line with spec
21*****************************************************************************/
22
23/*
24   POSIX's locale format, from putil.c: [no spaces]
25
26     ll [ _CC ] [ . MM ] [ @ VV]
27
28     l = lang, C = ctry, M = charmap, V = variant
29*/
30
31#include "unicode/utypes.h"
32#include "unicode/ustring.h"
33#include "unicode/uloc.h"
34
35#include "putilimp.h"
36#include "ustr_imp.h"
37#include "ulocimp.h"
38#include "umutex.h"
39#include "cstring.h"
40#include "cmemory.h"
41#include "ucln_cmn.h"
42#include "locmap.h"
43#include "uarrsort.h"
44#include "uenumimp.h"
45#include "uassert.h"
46
47#include <stdio.h> /* for sprintf */
48
49/* ### Declarations **************************************************/
50
/* Locale stuff from locid.cpp */
/* Declarations only: setter and getter for the default locale ID.  Semantics are
   inferred from the names; the definitions are not in this part of the file. */
U_CFUNC void locale_set_default(const char *id);
U_CFUNC const char *locale_get_default(void);
/* Forward declaration.  The parameter list mirrors that of the file-local
   _getKeywords() further down, minus its addKeyword/addValue arguments. */
U_CFUNC int32_t
locale_getKeywords(const char *localeID,
            char prev,
            char *keywords, int32_t keywordCapacity,
            char *values, int32_t valuesCapacity, int32_t *valLen,
            UBool valuesToo,
            UErrorCode *status);
61
62/* ### Data tables **************************************************/
63
64/**
65 * Table of language codes, both 2- and 3-letter, with preference
66 * given to 2-letter codes where possible.  Includes 3-letter codes
67 * that lack a 2-letter equivalent.
68 *
69 * This list must be in sorted order.  This list is returned directly
70 * to the user by some API.
71 *
72 * This list must be kept in sync with LANGUAGES_3, with corresponding
73 * entries matched.
74 *
75 * This table should be terminated with a NULL entry, followed by a
76 * second list, and another NULL entry.  The first list is visible to
77 * user code when this array is returned by API.  The second list
78 * contains codes we support, but do not expose through user API.
79 *
80 * Notes
81 *
82 * Tables updated per http://lcweb.loc.gov/standards/iso639-2/ to
83 * include the revisions up to 2001/7/27 *CWB*
84 *
85 * The 3 character codes are the terminology codes like RFC 3066.  This
86 * is compatible with prior ICU codes
87 *
88 * "in" "iw" "ji" "jw" & "sh" have been withdrawn but are still in the
89 * table but now at the end of the table because 3 character codes are
90 * duplicates.  This avoids bad searches going from 3 to 2 character
91 * codes.
92 *
93 * The range qaa-qtz is reserved for local use
94 */
static const char * const LANGUAGES[] = {
    /* First list: codes visible to user code; must stay sorted and index-aligned with LANGUAGES_3. */
    "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",
    "afh", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",
    "ang", "anp", "apa",
    "ar",  "arc", "arn", "arp", "art", "arw", "as",  "ast",
    "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",
    "bai", "bal", "ban", "bas", "bat", "be",  "bej",
    "bem", "ber", "bg",  "bh",  "bho", "bi",  "bik", "bin",
    "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "bs",
    "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",
    "cch", "ce",  "ceb", "cel", "ch",  "chb", "chg", "chk", "chm",
    "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",
    "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",
    "cv",  "cy",  "da",  "dak", "dar", "day", "de",  "del", "den",
    "dgr", "din", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyu",
    "dz",  "ee",  "efi", "egy", "eka", "el",  "elx", "en",
    "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",
    "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",
    "fr",  "frm", "fro", "frr", "frs", "fur", "fy",
    "ga",  "gaa", "gay", "gba", "gd",  "gem", "gez", "gil",
    "gl",  "gmh", "gn",  "goh", "gon", "gor", "got", "grb",
    "grc", "gsw", "gu",  "gv", "gwi",
    "ha",  "hai", "haw", "he",  "hi",  "hil", "him",
    "hit", "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",
    "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",
    "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",
    "iu",  "ja",  "jbo", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",
    "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kfo", "kg",  "kha", "khi",
    "kho", "ki",  "kj",  "kk",  "kl",  "km",  "kmb", "kn",
    "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks",
    "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad",
    "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",
    "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus",
    "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",
    "mdf", "mdr", "men", "mfe", "mg",  "mga", "mh",  "mi",  "mic", "min",
    "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",
    "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mul", "mun",
    "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nap",
    "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",
    "niu", "nl",  "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub",
    "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",
    "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",
    "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
    "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",
    "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rom",
    "ru",  "rup", "rw",  "sa",  "sad", "sah", "sai", "sal", "sam",
    "sas", "sat", "sc",  "scn", "sco", "sd",  "se",  "sel", "sem",
    "sg",  "sga", "sgn", "shn", "si",  "sid", "sio", "sit",
    "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",
    "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",
    "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",
    "sv",  "sw",  "syc", "syr", "ta",  "tai", "te",  "tem", "ter",
    "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",
    "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",
    "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",
    "ty",  "tyv", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",
    "uz",  "vai", "ve",  "vi",  "vo",  "vot", "wa",  "wak",
    "wal", "war", "was", "wen", "wo",  "xal", "xh",  "yao", "yap",
    "yi",  "yo",  "ypk", "za",  "zap", "zbl", "zen", "zh",  "znd",
    "zu",  "zun", "zxx", "zza",
NULL, /* terminates the first (user-visible) list */
    /* Second list: codes that are supported but not exposed through user API. */
    "in",  "iw",  "ji",  "jw",  "sh",    /* obsolete language codes */
NULL  /* terminates the second list */
};
/* Deprecated 2-letter language codes.  Entries line up by index with
   REPLACEMENT_LANGUAGES below (presumably looked up as a pair -- the lookup code
   is not in this part of the file).  The list ends with two NULL entries. */
static const char* const DEPRECATED_LANGUAGES[]={
    "in", "iw", "ji", "jw", NULL, NULL
};
/* Current code for each entry of DEPRECATED_LANGUAGES, at the same index. */
static const char* const REPLACEMENT_LANGUAGES[]={
    "id", "he", "yi", "jv", NULL, NULL
};
165
166/**
167 * Table of 3-letter language codes.
168 *
169 * This is a lookup table used to convert 3-letter language codes to
170 * their 2-letter equivalent, where possible.  It must be kept in sync
171 * with LANGUAGES.  For all valid i, LANGUAGES[i] must refer to the
172 * same language as LANGUAGES_3[i].  The commented-out lines are
173 * copied from LANGUAGES to make eyeballing this baby easier.
174 *
175 * Where a 3-letter language code has no 2-letter equivalent, the
176 * 3-letter code occupies both LANGUAGES[i] and LANGUAGES_3[i].
177 *
178 * This table should be terminated with a NULL entry, followed by a
179 * second list, and another NULL entry.  The two lists correspond to
180 * the two lists in LANGUAGES.
181 */
static const char * const LANGUAGES_3[] = {
    /* First list: LANGUAGES_3[i] is the 3-letter code for LANGUAGES[i].  Every
       entry here must be exactly three letters long. */
/*  "aa",  "ab",  "ace", "ach", "ada", "ady", "ae",  "af",  "afa",    */
    "aar", "abk", "ace", "ach", "ada", "ady", "ave", "afr", "afa",
/*  "afh", "ain", "ak",  "akk", "ale", "alg", "alt", "am",  "an",  "ang", "anp", "apa",    */
    "afh", "ain", "aka", "akk", "ale", "alg", "alt", "amh", "arg", "ang", "anp", "apa",
/*  "ar",  "arc", "arn", "arp", "art", "arw", "as",  "ast",    */
    "ara", "arc", "arn", "arp", "art", "arw", "asm", "ast",
/*  "ath", "aus", "av",  "awa", "ay",  "az",  "ba",  "bad",    */
    "ath", "aus", "ava", "awa", "aym", "aze", "bak", "bad",
/*  "bai", "bal", "ban", "bas", "bat", "be",  "bej",    */
    "bai", "bal", "ban", "bas", "bat", "bel", "bej",
/*  "bem", "ber", "bg",  "bh",  "bho", "bi",  "bik", "bin",    */
    "bem", "ber", "bul", "bih", "bho", "bis", "bik", "bin",
/*  "bla", "bm",  "bn",  "bnt", "bo",  "br",  "bra", "bs",     */
    "bla", "bam", "ben", "bnt", "bod", "bre", "bra", "bos",
/*  "btk", "bua", "bug", "byn", "ca",  "cad", "cai", "car", "cau",    */
    "btk", "bua", "bug", "byn", "cat", "cad", "cai", "car", "cau",
/*  "cch", "ce",  "ceb", "cel", "ch",  "chb", "chg", "chk", "chm",    */
    "cch", "che", "ceb", "cel", "cha", "chb", "chg", "chk", "chm",
/*  "chn", "cho", "chp", "chr", "chy", "cmc", "co",  "cop",    */
    "chn", "cho", "chp", "chr", "chy", "cmc", "cos", "cop",
/*  "cpe", "cpf", "cpp", "cr",  "crh", "crp", "cs",  "csb", "cu",  "cus",    */
    "cpe", "cpf", "cpp", "cre", "crh", "crp", "ces", "csb", "chu", "cus",
/*  "cv",  "cy",  "da",  "dak", "dar", "day", "de",  "del", "den",    */
    "chv", "cym", "dan", "dak", "dar", "day", "deu", "del", "den",
/*  "dgr", "din", "doi", "dra", "dsb", "dua", "dum", "dv",  "dyu",    */
    "dgr", "din", "doi", "dra", "dsb", "dua", "dum", "div", "dyu",
/*  "dz",  "ee",  "efi", "egy", "eka", "el",  "elx", "en",     */
    "dzo", "ewe", "efi", "egy", "eka", "ell", "elx", "eng",
/*  "enm", "eo",  "es",  "et",  "eu",  "ewo", "fa",     */
    "enm", "epo", "spa", "est", "eus", "ewo", "fas",
/*  "fan", "fat", "ff",  "fi",  "fil", "fiu", "fj",  "fo",  "fon",    */
    "fan", "fat", "ful", "fin", "fil", "fiu", "fij", "fao", "fon",
/*  "fr",  "frm", "fro", "frr", "frs", "fur", "fy",  "ga",  "gaa", "gay",    */
    "fra", "frm", "fro", "frr", "frs", "fur", "fry", "gle", "gaa", "gay",
/*  "gba", "gd",  "gem", "gez", "gil", "gl",  "gmh", "gn",     */
    "gba", "gla", "gem", "gez", "gil", "glg", "gmh", "grn",
/*  "goh", "gon", "gor", "got", "grb", "grc", "gsw", "gu",  "gv",     */
    "goh", "gon", "gor", "got", "grb", "grc", "gsw", "guj", "glv",
/*  "gwi", "ha",  "hai", "haw", "he",  "hi",  "hil", "him",    */
    "gwi", "hau", "hai", "haw", "heb", "hin", "hil", "him",
/*  "hit", "hmn", "ho",  "hr",  "hsb", "ht",  "hu",  "hup", "hy",  "hz",     */
    "hit", "hmn", "hmo", "hrv", "hsb", "hat", "hun", "hup", "hye", "her",
/*  "ia",  "iba", "id",  "ie",  "ig",  "ii",  "ijo", "ik",     */
    "ina", "iba", "ind", "ile", "ibo", "iii", "ijo", "ipk",
/*  "ilo", "inc", "ine", "inh", "io",  "ira", "iro", "is",  "it",      */
    "ilo", "inc", "ine", "inh", "ido", "ira", "iro", "isl", "ita",
/*  "iu",  "ja",  "jbo", "jpr", "jrb", "jv",  "ka",  "kaa", "kab",   */
    "iku", "jpn", "jbo", "jpr", "jrb", "jav", "kat", "kaa", "kab",
/*  "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kfo", "kg",  "kha", "khi",*/
    /* "kg" (Kongo) maps to ISO 639-2 "kon"; the 2-letter code must not be repeated here. */
    "kac", "kaj", "kam", "kar", "kaw", "kbd", "kcg", "kfo", "kon", "kha", "khi",
/*  "kho", "ki",  "kj",  "kk",  "kl",  "km",  "kmb", "kn",     */
    "kho", "kik", "kua", "kaz", "kal", "khm", "kmb", "kan",
/*  "ko",  "kok", "kos", "kpe", "kr",  "krc", "krl", "kro", "kru", "ks",     */
    "kor", "kok", "kos", "kpe", "kau", "krc", "krl", "kro", "kru", "kas",
/*  "ku",  "kum", "kut", "kv",  "kw",  "ky",  "la",  "lad",    */
    "kur", "kum", "kut", "kom", "cor", "kir", "lat", "lad",
/*  "lah", "lam", "lb",  "lez", "lg",  "li",  "ln",  "lo",  "lol",    */
    "lah", "lam", "ltz", "lez", "lug", "lim", "lin", "lao", "lol",
/*  "loz", "lt",  "lu",  "lua", "lui", "lun", "luo", "lus",    */
    "loz", "lit", "lub", "lua", "lui", "lun", "luo", "lus",
/*  "lv",  "mad", "mag", "mai", "mak", "man", "map", "mas",    */
    "lav", "mad", "mag", "mai", "mak", "man", "map", "mas",
/*  "mdf", "mdr", "men", "mfe", "mg",  "mga", "mh",  "mi",  "mic", "min",    */
    "mdf", "mdr", "men", "mfe", "mlg", "mga", "mah", "mri", "mic", "min",
/*  "mis", "mk",  "mkh", "ml",  "mn",  "mnc", "mni", "mno",    */
    "mis", "mkd", "mkh", "mal", "mon", "mnc", "mni", "mno",
/*  "mo",  "moh", "mos", "mr",  "ms",  "mt",  "mul", "mun",    */
    "mol", "moh", "mos", "mar", "msa", "mlt", "mul", "mun",
/*  "mus", "mwl", "mwr", "my",  "myn", "myv", "na",  "nah", "nai", "nap",    */
    "mus", "mwl", "mwr", "mya", "myn", "myv", "nau", "nah", "nai", "nap",
/*  "nb",  "nd",  "nds", "ne",  "new", "ng",  "nia", "nic",    */
    "nob", "nde", "nds", "nep", "new", "ndo", "nia", "nic",
/*  "niu", "nl",  "nn",  "no",  "nog", "non", "nqo", "nr",  "nso", "nub",    */
    "niu", "nld", "nno", "nor", "nog", "non", "nqo", "nbl", "nso", "nub",
/*  "nv",  "nwc", "ny",  "nym", "nyn", "nyo", "nzi", "oc",  "oj",     */
    "nav", "nwc", "nya", "nym", "nyn", "nyo", "nzi", "oci", "oji",
/*  "om",  "or",  "os",  "osa", "ota", "oto", "pa",  "paa",    */
    "orm", "ori", "oss", "osa", "ota", "oto", "pan", "paa",
/*  "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",    */
    "pag", "pal", "pam", "pap", "pau", "peo", "phi", "phn",
/*  "pi",  "pl",  "pon", "pra", "pro", "ps",  "pt",  "qu",     */
    "pli", "pol", "pon", "pra", "pro", "pus", "por", "que",
/*  "raj", "rap", "rar", "rm",  "rn",  "ro",  "roa", "rom",    */
    "raj", "rap", "rar", "roh", "run", "ron", "roa", "rom",
/*  "ru",  "rup", "rw",  "sa",  "sad", "sah", "sai", "sal", "sam",    */
    "rus", "rup", "kin", "san", "sad", "sah", "sai", "sal", "sam",
/*  "sas", "sat", "sc",  "scn", "sco", "sd",  "se",  "sel", "sem",    */
    "sas", "sat", "srd", "scn", "sco", "snd", "sme", "sel", "sem",
/*  "sg",  "sga", "sgn", "shn", "si",  "sid", "sio", "sit",    */
    "sag", "sga", "sgn", "shn", "sin", "sid", "sio", "sit",
/*  "sk",  "sl",  "sla", "sm",  "sma", "smi", "smj", "smn",    */
    "slk", "slv", "sla", "smo", "sma", "smi", "smj", "smn",
/*  "sms", "sn",  "snk", "so",  "sog", "son", "sq",  "sr",     */
    "sms", "sna", "snk", "som", "sog", "son", "sqi", "srp",
/*  "srn", "srr", "ss",  "ssa", "st",  "su",  "suk", "sus", "sux",    */
    "srn", "srr", "ssw", "ssa", "sot", "sun", "suk", "sus", "sux",
/*  "sv",  "sw",  "syc", "syr", "ta",  "tai", "te",  "tem", "ter",    */
    "swe", "swa", "syc", "syr", "tam", "tai", "tel", "tem", "ter",
/*  "tet", "tg",  "th",  "ti",  "tig", "tiv", "tk",  "tkl",    */
    "tet", "tgk", "tha", "tir", "tig", "tiv", "tuk", "tkl",
/*  "tl",  "tlh", "tli", "tmh", "tn",  "to",  "tog", "tpi", "tr", "trv",    */
    "tgl", "tlh", "tli", "tmh", "tsn", "ton", "tog", "tpi", "tur", "trv",
/*  "ts",  "tsi", "tt",  "tum", "tup", "tut", "tvl", "tw",     */
    "tso", "tsi", "tat", "tum", "tup", "tut", "tvl", "twi",
/*  "ty",  "tyv", "udm", "ug",  "uga", "uk",  "umb", "und", "ur",     */
    "tah", "tyv", "udm", "uig", "uga", "ukr", "umb", "und", "urd",
/*  "uz",  "vai", "ve",  "vi",  "vo",  "vot", "wa",  "wak",    */
    "uzb", "vai", "ven", "vie", "vol", "vot", "wln", "wak",
/*  "wal", "war", "was", "wen", "wo",  "xal", "xh",  "yao", "yap",    */
    "wal", "war", "was", "wen", "wol", "xal", "xho", "yao", "yap",
/*  "yi",  "yo",  "ypk", "za",  "zap", "zbl", "zen", "zh",  "znd",    */
    "yid", "yor", "ypk", "zha", "zap", "zbl", "zen", "zho", "znd",
/*  "zu",  "zun", "zxx", "zza",                                         */
    "zul", "zun", "zxx", "zza",
NULL, /* terminates the first list */
    /* Second list: 3-letter codes for the obsolete entries of LANGUAGES. */
/*  "in",  "iw",  "ji",  "jw",  "sh",                          */
    "ind", "heb", "yid", "jaw", "srp",
NULL  /* terminates the second list */
};
302
303/**
304 * Table of 2-letter country codes.
305 *
306 * This list must be in sorted order.  This list is returned directly
307 * to the user by some API.
308 *
309 * This list must be kept in sync with COUNTRIES_3, with corresponding
310 * entries matched.
311 *
312 * This table should be terminated with a NULL entry, followed by a
313 * second list, and another NULL entry.  The first list is visible to
314 * user code when this array is returned by API.  The second list
315 * contains codes we support, but do not expose through user API.
316 *
317 * Notes:
318 *
 * ZR(ZAR) is now CD(COD) and FX(FXX) is PS(PSE) as per
 * http://www.evertype.com/standards/iso3166/iso3166-1-en.html; added
 * new codes, keeping the old ones for compatibility.  Updated to include
 * the 1999/12/03 revisions *CWB*
323 *
324 * RO(ROM) is now RO(ROU) according to
325 * http://www.iso.org/iso/en/prods-services/iso3166ma/03updates-on-iso-3166/nlv3e-rou.html
326 */
static const char * const COUNTRIES[] = {
    /* First list: codes visible to user code; must stay sorted and index-aligned with COUNTRIES_3. */
    "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",
    "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",
    "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",
    "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",
    "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",
    "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",
    "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",
    "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",
    "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",
    "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",
    "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",
    "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",
    "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS",
    "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",
    "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",
    "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",
    "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",
    "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",
    "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",
    "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",
    "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",
    "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",
    "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",
    "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",
    "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",
    "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",
    "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",
    "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",
    "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",
    "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",
NULL, /* terminates the first (user-visible) list */
    /* Second list: supported but not exposed through user API.  "RO" appears
       again here so that it can pair with the older 3-letter code in COUNTRIES_3. */
    "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   /* obsolete country codes */
NULL  /* terminates the second list */
};
362
/* Deprecated 2-letter country codes.  Entries line up by index with
   REPLACEMENT_COUNTRIES below; the list ends with two NULL entries. */
static const char* const DEPRECATED_COUNTRIES[] ={
    "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR", NULL, NULL /* deprecated country list */
};
/* Current code for each entry of DEPRECATED_COUNTRIES, at the same index
   (the commented-out row repeats the deprecated codes for easy comparison). */
static const char* const REPLACEMENT_COUNTRIES[] = {
/*  "BU", "CS", "DY", "FX", "HV", "NH", "RH", "TP", "YU", "ZR" */
    "MM", "RS", "BJ", "FR", "BF", "VU", "ZW", "TL", "RS", "CD", NULL, NULL  /* replacement country codes */
};
370
371/**
372 * Table of 3-letter country codes.
373 *
374 * This is a lookup table used to convert 3-letter country codes to
375 * their 2-letter equivalent.  It must be kept in sync with COUNTRIES.
376 * For all valid i, COUNTRIES[i] must refer to the same country as
377 * COUNTRIES_3[i].  The commented-out lines are copied from COUNTRIES
378 * to make eyeballing this baby easier.
379 *
380 * This table should be terminated with a NULL entry, followed by a
381 * second list, and another NULL entry.  The two lists correspond to
382 * the two lists in COUNTRIES.
383 */
static const char * const COUNTRIES_3[] = {
    /* First list: COUNTRIES_3[i] is the 3-letter code for COUNTRIES[i]. */
/*  "AD",  "AE",  "AF",  "AG",  "AI",  "AL",  "AM",  "AN",     */
    "AND", "ARE", "AFG", "ATG", "AIA", "ALB", "ARM", "ANT",
/*  "AO",  "AQ",  "AR",  "AS",  "AT",  "AU",  "AW",  "AX",  "AZ",     */
    "AGO", "ATA", "ARG", "ASM", "AUT", "AUS", "ABW", "ALA", "AZE",
/*  "BA",  "BB",  "BD",  "BE",  "BF",  "BG",  "BH",  "BI",     */
    "BIH", "BRB", "BGD", "BEL", "BFA", "BGR", "BHR", "BDI",
/*  "BJ",  "BL",  "BM",  "BN",  "BO",  "BR",  "BS",  "BT",  "BV",     */
    "BEN", "BLM", "BMU", "BRN", "BOL", "BRA", "BHS", "BTN", "BVT",
/*  "BW",  "BY",  "BZ",  "CA",  "CC",  "CD",  "CF",  "CG",     */
    "BWA", "BLR", "BLZ", "CAN", "CCK", "COD", "CAF", "COG",
/*  "CH",  "CI",  "CK",  "CL",  "CM",  "CN",  "CO",  "CR",     */
    "CHE", "CIV", "COK", "CHL", "CMR", "CHN", "COL", "CRI",
/*  "CU",  "CV",  "CX",  "CY",  "CZ",  "DE",  "DJ",  "DK",     */
    "CUB", "CPV", "CXR", "CYP", "CZE", "DEU", "DJI", "DNK",
/*  "DM",  "DO",  "DZ",  "EC",  "EE",  "EG",  "EH",  "ER",     */
    "DMA", "DOM", "DZA", "ECU", "EST", "EGY", "ESH", "ERI",
/*  "ES",  "ET",  "FI",  "FJ",  "FK",  "FM",  "FO",  "FR",     */
    "ESP", "ETH", "FIN", "FJI", "FLK", "FSM", "FRO", "FRA",
/*  "GA",  "GB",  "GD",  "GE",  "GF",  "GG",  "GH",  "GI",  "GL",     */
    "GAB", "GBR", "GRD", "GEO", "GUF", "GGY", "GHA", "GIB", "GRL",
/*  "GM",  "GN",  "GP",  "GQ",  "GR",  "GS",  "GT",  "GU",     */
    "GMB", "GIN", "GLP", "GNQ", "GRC", "SGS", "GTM", "GUM",
/*  "GW",  "GY",  "HK",  "HM",  "HN",  "HR",  "HT",  "HU",     */
    "GNB", "GUY", "HKG", "HMD", "HND", "HRV", "HTI", "HUN",
/*  "ID",  "IE",  "IL",  "IM",  "IN",  "IO",  "IQ",  "IR",  "IS" */
    "IDN", "IRL", "ISR", "IMN", "IND", "IOT", "IRQ", "IRN", "ISL",
/*  "IT",  "JE",  "JM",  "JO",  "JP",  "KE",  "KG",  "KH",  "KI",     */
    "ITA", "JEY", "JAM", "JOR", "JPN", "KEN", "KGZ", "KHM", "KIR",
/*  "KM",  "KN",  "KP",  "KR",  "KW",  "KY",  "KZ",  "LA",     */
    "COM", "KNA", "PRK", "KOR", "KWT", "CYM", "KAZ", "LAO",
/*  "LB",  "LC",  "LI",  "LK",  "LR",  "LS",  "LT",  "LU",     */
    "LBN", "LCA", "LIE", "LKA", "LBR", "LSO", "LTU", "LUX",
/*  "LV",  "LY",  "MA",  "MC",  "MD",  "ME",  "MF",  "MG",  "MH",  "MK",     */
    "LVA", "LBY", "MAR", "MCO", "MDA", "MNE", "MAF", "MDG", "MHL", "MKD",
/*  "ML",  "MM",  "MN",  "MO",  "MP",  "MQ",  "MR",  "MS",     */
    "MLI", "MMR", "MNG", "MAC", "MNP", "MTQ", "MRT", "MSR",
/*  "MT",  "MU",  "MV",  "MW",  "MX",  "MY",  "MZ",  "NA",     */
    "MLT", "MUS", "MDV", "MWI", "MEX", "MYS", "MOZ", "NAM",
/*  "NC",  "NE",  "NF",  "NG",  "NI",  "NL",  "NO",  "NP",     */
    "NCL", "NER", "NFK", "NGA", "NIC", "NLD", "NOR", "NPL",
/*  "NR",  "NU",  "NZ",  "OM",  "PA",  "PE",  "PF",  "PG",     */
    "NRU", "NIU", "NZL", "OMN", "PAN", "PER", "PYF", "PNG",
/*  "PH",  "PK",  "PL",  "PM",  "PN",  "PR",  "PS",  "PT",     */
    "PHL", "PAK", "POL", "SPM", "PCN", "PRI", "PSE", "PRT",
/*  "PW",  "PY",  "QA",  "RE",  "RO",  "RS",  "RU",  "RW",  "SA",     */
    "PLW", "PRY", "QAT", "REU", "ROU", "SRB", "RUS", "RWA", "SAU",
/*  "SB",  "SC",  "SD",  "SE",  "SG",  "SH",  "SI",  "SJ",     */
    "SLB", "SYC", "SDN", "SWE", "SGP", "SHN", "SVN", "SJM",
/*  "SK",  "SL",  "SM",  "SN",  "SO",  "SR",  "ST",  "SV",     */
    "SVK", "SLE", "SMR", "SEN", "SOM", "SUR", "STP", "SLV",
/*  "SY",  "SZ",  "TC",  "TD",  "TF",  "TG",  "TH",  "TJ",     */
    "SYR", "SWZ", "TCA", "TCD", "ATF", "TGO", "THA", "TJK",
/*  "TK",  "TL",  "TM",  "TN",  "TO",  "TR",  "TT",  "TV",     */
    "TKL", "TLS", "TKM", "TUN", "TON", "TUR", "TTO", "TUV",
/*  "TW",  "TZ",  "UA",  "UG",  "UM",  "US",  "UY",  "UZ",     */
    "TWN", "TZA", "UKR", "UGA", "UMI", "USA", "URY", "UZB",
/*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
    "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/*  "WS",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
    "WSM", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL, /* terminates the first list */
    /* Second list: 3-letter codes for the obsolete entries of COUNTRIES. */
/*  "FX",  "CS",  "RO",  "TP",  "YU",  "ZR",   */
    "FXX", "SCG", "ROM", "TMP", "YUG", "ZAR",
NULL  /* terminates the second list */
};
450
/* One row of the locale-ID canonicalization table: maps a legacy or alternate
   locale ID to its canonical form, optionally with one keyword/value pair. */
typedef struct CanonicalizationMap {
    const char *id;          /* input ID */
    const char *canonicalID; /* canonicalized output ID */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if kw==NULL */
} CanonicalizationMap;
457
458/**
459 * A map to canonicalize locale IDs.  This handles a variety of
460 * different semantic kinds of transformations.
461 */
static const CanonicalizationMap CANONICALIZE_MAP[] = {
    /* Columns: input ID, canonical ID, keyword (or NULL), keyword value (or NULL).
       The *_PREEURO rows drop the variant and supply a "currency" keyword instead. */
    { "",               "en_US_POSIX", NULL, NULL }, /* .NET name */
    { "c",              "en_US_POSIX", NULL, NULL }, /* POSIX name */
    { "posix",          "en_US_POSIX", NULL, NULL }, /* POSIX name (alias of C) */
    { "art_LOJBAN",     "jbo", NULL, NULL }, /* registered name */
    { "az_AZ_CYRL",     "az_Cyrl_AZ", NULL, NULL }, /* .NET name */
    { "az_AZ_LATN",     "az_Latn_AZ", NULL, NULL }, /* .NET name */
    { "ca_ES_PREEURO",  "ca_ES", "currency", "ESP" },
    { "cel_GAULISH",    "cel__GAULISH", NULL, NULL }, /* registered name */
    { "de_1901",        "de__1901", NULL, NULL }, /* registered name */
    { "de_1906",        "de__1906", NULL, NULL }, /* registered name */
    { "de__PHONEBOOK",  "de", "collation", "phonebook" }, /* Old ICU name */
    { "de_AT_PREEURO",  "de_AT", "currency", "ATS" },
    { "de_DE_PREEURO",  "de_DE", "currency", "DEM" },
    { "de_LU_PREEURO",  "de_LU", "currency", "LUF" },
    { "el_GR_PREEURO",  "el_GR", "currency", "GRD" },
    { "en_BOONT",       "en__BOONT", NULL, NULL }, /* registered name */
    { "en_SCOUSE",      "en__SCOUSE", NULL, NULL }, /* registered name */
    { "en_BE_PREEURO",  "en_BE", "currency", "BEF" },
    { "en_IE_PREEURO",  "en_IE", "currency", "IEP" },
    { "es__TRADITIONAL", "es", "collation", "traditional" }, /* Old ICU name */
    { "es_ES_PREEURO",  "es_ES", "currency", "ESP" },
    { "eu_ES_PREEURO",  "eu_ES", "currency", "ESP" },
    { "fi_FI_PREEURO",  "fi_FI", "currency", "FIM" },
    { "fr_BE_PREEURO",  "fr_BE", "currency", "BEF" },
    { "fr_FR_PREEURO",  "fr_FR", "currency", "FRF" },
    { "fr_LU_PREEURO",  "fr_LU", "currency", "LUF" },
    { "ga_IE_PREEURO",  "ga_IE", "currency", "IEP" },
    { "gl_ES_PREEURO",  "gl_ES", "currency", "ESP" },
    { "hi__DIRECT",     "hi", "collation", "direct" }, /* Old ICU name */
    { "it_IT_PREEURO",  "it_IT", "currency", "ITL" },
    { "ja_JP_TRADITIONAL", "ja_JP", "calendar", "japanese" }, /* Old ICU name */
    { "nb_NO_NY",       "nn_NO", NULL, NULL },  /* "markus said this was ok" :-) */
    { "nl_BE_PREEURO",  "nl_BE", "currency", "BEF" },
    { "nl_NL_PREEURO",  "nl_NL", "currency", "NLG" },
    { "pt_PT_PREEURO",  "pt_PT", "currency", "PTE" },
    { "sl_ROZAJ",       "sl__ROZAJ", NULL, NULL }, /* registered name */
    { "sr_SP_CYRL",     "sr_Cyrl_RS", NULL, NULL }, /* .NET name */
    { "sr_SP_LATN",     "sr_Latn_RS", NULL, NULL }, /* .NET name */
    { "sr_YU_CYRILLIC", "sr_Cyrl_RS", NULL, NULL }, /* Linux name */
    { "th_TH_TRADITIONAL", "th_TH", "calendar", "buddhist" }, /* Old ICU name */
    { "uz_UZ_CYRILLIC", "uz_Cyrl_UZ", NULL, NULL }, /* Linux name */
    { "uz_UZ_CYRL",     "uz_Cyrl_UZ", NULL, NULL }, /* .NET name */
    { "uz_UZ_LATN",     "uz_Latn_UZ", NULL, NULL }, /* .NET name */
    { "zh_CHS",         "zh_Hans", NULL, NULL }, /* .NET name */
    { "zh_CHT",         "zh_Hant", NULL, NULL }, /* .NET name */
    { "zh_GAN",         "zh__GAN", NULL, NULL }, /* registered name */
    { "zh_GUOYU",       "zh", NULL, NULL }, /* registered name */
    { "zh_HAKKA",       "zh__HAKKA", NULL, NULL }, /* registered name */
    { "zh_MIN",         "zh__MIN", NULL, NULL }, /* registered name */
    { "zh_MIN_NAN",     "zh__MINNAN", NULL, NULL }, /* registered name */
    { "zh_WUU",         "zh__WUU", NULL, NULL }, /* registered name */
    { "zh_XIANG",       "zh__XIANG", NULL, NULL }, /* registered name */
    { "zh_YUE",         "zh__YUE", NULL, NULL }, /* registered name */
};
517
/* One row of the variant table: a variant name and the keyword/value pair
   associated with it. */
typedef struct VariantMap {
    const char *variant;     /* variant name to match */
    const char *keyword;     /* keyword, or NULL if none */
    const char *value;       /* keyword value, or NULL if kw==NULL */
} VariantMap;

/* Variants that have a keyword equivalent.  How the table is consulted is not
   visible in this part of the file. */
static const VariantMap VARIANT_MAP[] = {
    { "EURO",   "currency", "EUR" },
    { "PINYIN", "collation", "pinyin" }, /* Solaris variant */
    { "STROKE", "collation", "stroke" }  /* Solaris variant */
};
529
530/* ### BCP47 Conversion *******************************************/
/*
 * Test if the locale id has BCP47 u extension and does not have '@'.
 * True when id is non-NULL, contains no '@', and the shortest subtag reported
 * by getShortestSubtagLength() is exactly one character long.
 * The macro works on its own argument, so the caller's variable may have any name.
 * Note: id is evaluated more than once; do not pass an expression with side effects.
 */
#define _hasBCP47Extension(id) ((id) && uprv_strstr((id), "@") == NULL && getShortestSubtagLength(id) == 1)
/*
 * Converts the BCP47 id to Unicode id. Does nothing to id if conversion fails:
 * finalID is set to buffer when uloc_forLanguageTag() succeeds with a positive
 * length, and to the original id otherwise.
 * Note: this expands to a bare if/else statement, so use it only where a full
 * statement is allowed (inside braces), never as the unbraced body of another if.
 */
#define _ConvertBCP47(finalID, id, buffer, length,err) \
        if (uloc_forLanguageTag(id, buffer, length, NULL, err) <= 0 || U_FAILURE(*err)) { \
            finalID=id; \
        } else { \
            finalID=buffer; \
        }
540/* Gets the size of the shortest subtag in the given localeID. */
541static int32_t getShortestSubtagLength(const char *localeID) {
542    int32_t localeIDLength = uprv_strlen(localeID);
543    int32_t length = localeIDLength;
544    int32_t tmpLength = 0;
545    int32_t i;
546    UBool reset = TRUE;
547
548    for (i = 0; i < localeIDLength; i++) {
549        if (localeID[i] != '_' && localeID[i] != '-') {
550            if (reset) {
551                tmpLength = 0;
552                reset = FALSE;
553            }
554            tmpLength++;
555        } else {
556            if (tmpLength != 0 && tmpLength < length) {
557                length = tmpLength;
558            }
559            reset = TRUE;
560        }
561    }
562
563    return length;
564}
565
566/* ### Keywords **************************************************/
567
/* Size in bytes of the buffer that holds one canonicalized keyword name,
   terminating NUL included (see KeywordStruct.keyword and the length check in
   locale_canonKeywordName). */
#define ULOC_KEYWORD_BUFFER_LEN 25
/* Number of KeywordStruct slots in the keywordList array of _getKeywords. */
#define ULOC_MAX_NO_KEYWORDS 25
570
571U_CAPI const char * U_EXPORT2
572locale_getKeywordsStart(const char *localeID) {
573    const char *result = NULL;
574    if((result = uprv_strchr(localeID, '@')) != NULL) {
575        return result;
576    }
577#if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
578    else {
579        /* We do this because the @ sign is variant, and the @ sign used on one
580        EBCDIC machine won't be compiled the same way on other EBCDIC based
581        machines. */
582        static const uint8_t ebcdicSigns[] = { 0x7C, 0x44, 0x66, 0x80, 0xAC, 0xAE, 0xAF, 0xB5, 0xEC, 0xEF, 0x00 };
583        const uint8_t *charToFind = ebcdicSigns;
584        while(*charToFind) {
585            if((result = uprv_strchr(localeID, *charToFind)) != NULL) {
586                return result;
587            }
588            charToFind++;
589        }
590    }
591#endif
592    return NULL;
593}
594
595/**
596 * @param buf buffer of size [ULOC_KEYWORD_BUFFER_LEN]
597 * @param keywordName incoming name to be canonicalized
598 * @param status return status (keyword too long)
599 * @return length of the keyword name
600 */
601static int32_t locale_canonKeywordName(char *buf, const char *keywordName, UErrorCode *status)
602{
603  int32_t i;
604  int32_t keywordNameLen = (int32_t)uprv_strlen(keywordName);
605
606  if(keywordNameLen >= ULOC_KEYWORD_BUFFER_LEN) {
607    /* keyword name too long for internal buffer */
608    *status = U_INTERNAL_PROGRAM_ERROR;
609          return 0;
610  }
611
612  /* normalize the keyword name */
613  for(i = 0; i < keywordNameLen; i++) {
614    buf[i] = uprv_tolower(keywordName[i]);
615  }
616  buf[i] = 0;
617
618  return keywordNameLen;
619}
620
/* One keyword=value pair collected while parsing the keyword section of a
   locale ID. */
typedef struct {
    char keyword[ULOC_KEYWORD_BUFFER_LEN]; /* canonical (lowercased) keyword name, zero terminated */
    int32_t keywordLen;                    /* length of keyword, not counting the terminator */
    const char *valueStart;                /* first character of the value; points into the caller's
                                              string and is NOT terminated at valueLen */
    int32_t valueLen;                      /* length of the value with surrounding spaces trimmed */
} KeywordStruct;
627
628static int32_t U_CALLCONV
629compareKeywordStructs(const void *context, const void *left, const void *right) {
630    const char* leftString = ((const KeywordStruct *)left)->keyword;
631    const char* rightString = ((const KeywordStruct *)right)->keyword;
632    return uprv_strcmp(leftString, rightString);
633}
634
635/**
636 * Both addKeyword and addValue must already be in canonical form.
637 * Either both addKeyword and addValue are NULL, or neither is NULL.
638 * If they are not NULL they must be zero terminated.
639 * If addKeyword is not NULL is must have length small enough to fit in KeywordStruct.keyword.
640 */
641static int32_t
642_getKeywords(const char *localeID,
643             char prev,
644             char *keywords, int32_t keywordCapacity,
645             char *values, int32_t valuesCapacity, int32_t *valLen,
646             UBool valuesToo,
647             const char* addKeyword,
648             const char* addValue,
649             UErrorCode *status)
650{
651    KeywordStruct keywordList[ULOC_MAX_NO_KEYWORDS];
652
653    int32_t maxKeywords = ULOC_MAX_NO_KEYWORDS;
654    int32_t numKeywords = 0;
655    const char* pos = localeID;
656    const char* equalSign = NULL;
657    const char* semicolon = NULL;
658    int32_t i = 0, j, n;
659    int32_t keywordsLen = 0;
660    int32_t valuesLen = 0;
661
662    if(prev == '@') { /* start of keyword definition */
663        /* we will grab pairs, trim spaces, lowercase keywords, sort and return */
664        do {
665            UBool duplicate = FALSE;
666            /* skip leading spaces */
667            while(*pos == ' ') {
668                pos++;
669            }
670            if (!*pos) { /* handle trailing "; " */
671                break;
672            }
673            if(numKeywords == maxKeywords) {
674                *status = U_INTERNAL_PROGRAM_ERROR;
675                return 0;
676            }
677            equalSign = uprv_strchr(pos, '=');
678            semicolon = uprv_strchr(pos, ';');
679            /* lack of '=' [foo@currency] is illegal */
680            /* ';' before '=' [foo@currency;collation=pinyin] is illegal */
681            if(!equalSign || (semicolon && semicolon<equalSign)) {
682                *status = U_INVALID_FORMAT_ERROR;
683                return 0;
684            }
685            /* need to normalize both keyword and keyword name */
686            if(equalSign - pos >= ULOC_KEYWORD_BUFFER_LEN) {
687                /* keyword name too long for internal buffer */
688                *status = U_INTERNAL_PROGRAM_ERROR;
689                return 0;
690            }
691            for(i = 0, n = 0; i < equalSign - pos; ++i) {
692                if (pos[i] != ' ') {
693                    keywordList[numKeywords].keyword[n++] = uprv_tolower(pos[i]);
694                }
695            }
696
697            /* zero-length keyword is an error. */
698            if (n == 0) {
699                *status = U_INVALID_FORMAT_ERROR;
700                return 0;
701            }
702
703            keywordList[numKeywords].keyword[n] = 0;
704            keywordList[numKeywords].keywordLen = n;
705            /* now grab the value part. First we skip the '=' */
706            equalSign++;
707            /* then we leading spaces */
708            while(*equalSign == ' ') {
709                equalSign++;
710            }
711
712            /* Premature end or zero-length value */
713            if (!equalSign || equalSign == semicolon) {
714                *status = U_INVALID_FORMAT_ERROR;
715                return 0;
716            }
717
718            keywordList[numKeywords].valueStart = equalSign;
719
720            pos = semicolon;
721            i = 0;
722            if(pos) {
723                while(*(pos - i - 1) == ' ') {
724                    i++;
725                }
726                keywordList[numKeywords].valueLen = (int32_t)(pos - equalSign - i);
727                pos++;
728            } else {
729                i = (int32_t)uprv_strlen(equalSign);
730                while(i && equalSign[i-1] == ' ') {
731                    i--;
732                }
733                keywordList[numKeywords].valueLen = i;
734            }
735            /* If this is a duplicate keyword, then ignore it */
736            for (j=0; j<numKeywords; ++j) {
737                if (uprv_strcmp(keywordList[j].keyword, keywordList[numKeywords].keyword) == 0) {
738                    duplicate = TRUE;
739                    break;
740                }
741            }
742            if (!duplicate) {
743                ++numKeywords;
744            }
745        } while(pos);
746
747        /* Handle addKeyword/addValue. */
748        if (addKeyword != NULL) {
749            UBool duplicate = FALSE;
750            U_ASSERT(addValue != NULL);
751            /* Search for duplicate; if found, do nothing. Explicit keyword
752               overrides addKeyword. */
753            for (j=0; j<numKeywords; ++j) {
754                if (uprv_strcmp(keywordList[j].keyword, addKeyword) == 0) {
755                    duplicate = TRUE;
756                    break;
757                }
758            }
759            if (!duplicate) {
760                if (numKeywords == maxKeywords) {
761                    *status = U_INTERNAL_PROGRAM_ERROR;
762                    return 0;
763                }
764                uprv_strcpy(keywordList[numKeywords].keyword, addKeyword);
765                keywordList[numKeywords].keywordLen = (int32_t)uprv_strlen(addKeyword);
766                keywordList[numKeywords].valueStart = addValue;
767                keywordList[numKeywords].valueLen = (int32_t)uprv_strlen(addValue);
768                ++numKeywords;
769            }
770        } else {
771            U_ASSERT(addValue == NULL);
772        }
773
774        /* now we have a list of keywords */
775        /* we need to sort it */
776        uprv_sortArray(keywordList, numKeywords, sizeof(KeywordStruct), compareKeywordStructs, NULL, FALSE, status);
777
778        /* Now construct the keyword part */
779        for(i = 0; i < numKeywords; i++) {
780            if(keywordsLen + keywordList[i].keywordLen + 1< keywordCapacity) {
781                uprv_strcpy(keywords+keywordsLen, keywordList[i].keyword);
782                if(valuesToo) {
783                    keywords[keywordsLen + keywordList[i].keywordLen] = '=';
784                } else {
785                    keywords[keywordsLen + keywordList[i].keywordLen] = 0;
786                }
787            }
788            keywordsLen += keywordList[i].keywordLen + 1;
789            if(valuesToo) {
790                if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
791                    uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
792                }
793                keywordsLen += keywordList[i].valueLen;
794
795                if(i < numKeywords - 1) {
796                    if(keywordsLen < keywordCapacity) {
797                        keywords[keywordsLen] = ';';
798                    }
799                    keywordsLen++;
800                }
801            }
802            if(values) {
803                if(valuesLen + keywordList[i].valueLen + 1< valuesCapacity) {
804                    uprv_strcpy(values+valuesLen, keywordList[i].valueStart);
805                    values[valuesLen + keywordList[i].valueLen] = 0;
806                }
807                valuesLen += keywordList[i].valueLen + 1;
808            }
809        }
810        if(values) {
811            values[valuesLen] = 0;
812            if(valLen) {
813                *valLen = valuesLen;
814            }
815        }
816        return u_terminateChars(keywords, keywordCapacity, keywordsLen, status);
817    } else {
818        return 0;
819    }
820}
821
822U_CFUNC int32_t
823locale_getKeywords(const char *localeID,
824                   char prev,
825                   char *keywords, int32_t keywordCapacity,
826                   char *values, int32_t valuesCapacity, int32_t *valLen,
827                   UBool valuesToo,
828                   UErrorCode *status) {
829    return _getKeywords(localeID, prev, keywords, keywordCapacity,
830                        values, valuesCapacity, valLen, valuesToo,
831                        NULL, NULL, status);
832}
833
834U_CAPI int32_t U_EXPORT2
835uloc_getKeywordValue(const char* localeID,
836                     const char* keywordName,
837                     char* buffer, int32_t bufferCapacity,
838                     UErrorCode* status)
839{
840    const char* startSearchHere = NULL;
841    const char* nextSeparator = NULL;
842    char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
843    char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
844    int32_t i = 0;
845    int32_t result = 0;
846
847    if(status && U_SUCCESS(*status) && localeID) {
848      char tempBuffer[ULOC_FULLNAME_CAPACITY];
849      const char* tmpLocaleID;
850
851      if (_hasBCP47Extension(localeID)) {
852          _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
853      } else {
854          tmpLocaleID=localeID;
855      }
856
857      startSearchHere = uprv_strchr(tmpLocaleID, '@'); /* TODO: REVISIT: shouldn't this be locale_getKeywordsStart ? */
858      if(startSearchHere == NULL) {
859          /* no keywords, return at once */
860          return 0;
861      }
862
863      locale_canonKeywordName(keywordNameBuffer, keywordName, status);
864      if(U_FAILURE(*status)) {
865        return 0;
866      }
867
868      /* find the first keyword */
869      while(startSearchHere) {
870          startSearchHere++;
871          /* skip leading spaces (allowed?) */
872          while(*startSearchHere == ' ') {
873              startSearchHere++;
874          }
875          nextSeparator = uprv_strchr(startSearchHere, '=');
876          /* need to normalize both keyword and keyword name */
877          if(!nextSeparator) {
878              break;
879          }
880          if(nextSeparator - startSearchHere >= ULOC_KEYWORD_BUFFER_LEN) {
881              /* keyword name too long for internal buffer */
882              *status = U_INTERNAL_PROGRAM_ERROR;
883              return 0;
884          }
885          for(i = 0; i < nextSeparator - startSearchHere; i++) {
886              localeKeywordNameBuffer[i] = uprv_tolower(startSearchHere[i]);
887          }
888          /* trim trailing spaces */
889          while(startSearchHere[i-1] == ' ') {
890              i--;
891          }
892          localeKeywordNameBuffer[i] = 0;
893
894          startSearchHere = uprv_strchr(nextSeparator, ';');
895
896          if(uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer) == 0) {
897              nextSeparator++;
898              while(*nextSeparator == ' ') {
899                  nextSeparator++;
900              }
901              /* we actually found the keyword. Copy the value */
902              if(startSearchHere && startSearchHere - nextSeparator < bufferCapacity) {
903                  while(*(startSearchHere-1) == ' ') {
904                      startSearchHere--;
905                  }
906                  uprv_strncpy(buffer, nextSeparator, startSearchHere - nextSeparator);
907                  result = u_terminateChars(buffer, bufferCapacity, (int32_t)(startSearchHere - nextSeparator), status);
908              } else if(!startSearchHere && (int32_t)uprv_strlen(nextSeparator) < bufferCapacity) { /* last item in string */
909                  i = (int32_t)uprv_strlen(nextSeparator);
910                  while(nextSeparator[i - 1] == ' ') {
911                      i--;
912                  }
913                  uprv_strncpy(buffer, nextSeparator, i);
914                  result = u_terminateChars(buffer, bufferCapacity, i, status);
915              } else {
916                  /* give a bigger buffer, please */
917                  *status = U_BUFFER_OVERFLOW_ERROR;
918                  if(startSearchHere) {
919                      result = (int32_t)(startSearchHere - nextSeparator);
920                  } else {
921                      result = (int32_t)uprv_strlen(nextSeparator);
922                  }
923              }
924              return result;
925          }
926      }
927    }
928    return 0;
929}
930
931U_CAPI int32_t U_EXPORT2
932uloc_setKeywordValue(const char* keywordName,
933                     const char* keywordValue,
934                     char* buffer, int32_t bufferCapacity,
935                     UErrorCode* status)
936{
937    /* TODO: sorting. removal. */
938    int32_t keywordNameLen;
939    int32_t keywordValueLen;
940    int32_t bufLen;
941    int32_t needLen = 0;
942    int32_t foundValueLen;
943    int32_t keywordAtEnd = 0; /* is the keyword at the end of the string? */
944    char keywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
945    char localeKeywordNameBuffer[ULOC_KEYWORD_BUFFER_LEN];
946    int32_t i = 0;
947    int32_t rc;
948    char* nextSeparator = NULL;
949    char* nextEqualsign = NULL;
950    char* startSearchHere = NULL;
951    char* keywordStart = NULL;
952    char *insertHere = NULL;
953    if(U_FAILURE(*status)) {
954        return -1;
955    }
956    if(bufferCapacity>1) {
957        bufLen = (int32_t)uprv_strlen(buffer);
958    } else {
959        *status = U_ILLEGAL_ARGUMENT_ERROR;
960        return 0;
961    }
962    if(bufferCapacity<bufLen) {
963        /* The capacity is less than the length?! Is this NULL terminated? */
964        *status = U_ILLEGAL_ARGUMENT_ERROR;
965        return 0;
966    }
967    if(keywordValue && !*keywordValue) {
968        keywordValue = NULL;
969    }
970    if(keywordValue) {
971        keywordValueLen = (int32_t)uprv_strlen(keywordValue);
972    } else {
973        keywordValueLen = 0;
974    }
975    keywordNameLen = locale_canonKeywordName(keywordNameBuffer, keywordName, status);
976    if(U_FAILURE(*status)) {
977        return 0;
978    }
979    startSearchHere = (char*)locale_getKeywordsStart(buffer);
980    if(startSearchHere == NULL || (startSearchHere[1]==0)) {
981        if(!keywordValue) { /* no keywords = nothing to remove */
982            return bufLen;
983        }
984
985        needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
986        if(startSearchHere) { /* had a single @ */
987            needLen--; /* already had the @ */
988            /* startSearchHere points at the @ */
989        } else {
990            startSearchHere=buffer+bufLen;
991        }
992        if(needLen >= bufferCapacity) {
993            *status = U_BUFFER_OVERFLOW_ERROR;
994            return needLen; /* no change */
995        }
996        *startSearchHere = '@';
997        startSearchHere++;
998        uprv_strcpy(startSearchHere, keywordNameBuffer);
999        startSearchHere += keywordNameLen;
1000        *startSearchHere = '=';
1001        startSearchHere++;
1002        uprv_strcpy(startSearchHere, keywordValue);
1003        startSearchHere+=keywordValueLen;
1004        return needLen;
1005    } /* end shortcut - no @ */
1006
1007    keywordStart = startSearchHere;
1008    /* search for keyword */
1009    while(keywordStart) {
1010        keywordStart++;
1011        /* skip leading spaces (allowed?) */
1012        while(*keywordStart == ' ') {
1013            keywordStart++;
1014        }
1015        nextEqualsign = uprv_strchr(keywordStart, '=');
1016        /* need to normalize both keyword and keyword name */
1017        if(!nextEqualsign) {
1018            break;
1019        }
1020        if(nextEqualsign - keywordStart >= ULOC_KEYWORD_BUFFER_LEN) {
1021            /* keyword name too long for internal buffer */
1022            *status = U_INTERNAL_PROGRAM_ERROR;
1023            return 0;
1024        }
1025        for(i = 0; i < nextEqualsign - keywordStart; i++) {
1026            localeKeywordNameBuffer[i] = uprv_tolower(keywordStart[i]);
1027        }
1028        /* trim trailing spaces */
1029        while(keywordStart[i-1] == ' ') {
1030            i--;
1031        }
1032        localeKeywordNameBuffer[i] = 0;
1033
1034        nextSeparator = uprv_strchr(nextEqualsign, ';');
1035        rc = uprv_strcmp(keywordNameBuffer, localeKeywordNameBuffer);
1036        if(rc == 0) {
1037            nextEqualsign++;
1038            while(*nextEqualsign == ' ') {
1039                nextEqualsign++;
1040            }
1041            /* we actually found the keyword. Change the value */
1042            if (nextSeparator) {
1043                keywordAtEnd = 0;
1044                foundValueLen = (int32_t)(nextSeparator - nextEqualsign);
1045            } else {
1046                keywordAtEnd = 1;
1047                foundValueLen = (int32_t)uprv_strlen(nextEqualsign);
1048            }
1049            if(keywordValue) { /* adding a value - not removing */
1050              if(foundValueLen == keywordValueLen) {
1051                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1052                return bufLen; /* no change in size */
1053              } else if(foundValueLen > keywordValueLen) {
1054                int32_t delta = foundValueLen - keywordValueLen;
1055                if(nextSeparator) { /* RH side */
1056                  uprv_memmove(nextSeparator - delta, nextSeparator, bufLen-(nextSeparator-buffer));
1057                }
1058                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1059                bufLen -= delta;
1060                buffer[bufLen]=0;
1061                return bufLen;
1062              } else { /* FVL < KVL */
1063                int32_t delta = keywordValueLen - foundValueLen;
1064                if((bufLen+delta) >= bufferCapacity) {
1065                  *status = U_BUFFER_OVERFLOW_ERROR;
1066                  return bufLen+delta;
1067                }
1068                if(nextSeparator) { /* RH side */
1069                  uprv_memmove(nextSeparator+delta,nextSeparator, bufLen-(nextSeparator-buffer));
1070                }
1071                uprv_strncpy(nextEqualsign, keywordValue, keywordValueLen);
1072                bufLen += delta;
1073                buffer[bufLen]=0;
1074                return bufLen;
1075              }
1076            } else { /* removing a keyword */
1077              if(keywordAtEnd) {
1078                /* zero out the ';' or '@' just before startSearchhere */
1079                keywordStart[-1] = 0;
1080                return (int32_t)((keywordStart-buffer)-1); /* (string length without keyword) minus separator */
1081              } else {
1082                uprv_memmove(keywordStart, nextSeparator+1, bufLen-((nextSeparator+1)-buffer));
1083                keywordStart[bufLen-((nextSeparator+1)-buffer)]=0;
1084                return (int32_t)(bufLen-((nextSeparator+1)-keywordStart));
1085              }
1086            }
1087        } else if(rc<0){ /* end match keyword */
1088          /* could insert at this location. */
1089          insertHere = keywordStart;
1090        }
1091        keywordStart = nextSeparator;
1092    } /* end loop searching */
1093
1094    if(!keywordValue) {
1095      return bufLen; /* removal of non-extant keyword - no change */
1096    }
1097
1098    /* we know there is at least one keyword. */
1099    needLen = bufLen+1+keywordNameLen+1+keywordValueLen;
1100    if(needLen >= bufferCapacity) {
1101        *status = U_BUFFER_OVERFLOW_ERROR;
1102        return needLen; /* no change */
1103    }
1104
1105    if(insertHere) {
1106      uprv_memmove(insertHere+(1+keywordNameLen+1+keywordValueLen), insertHere, bufLen-(insertHere-buffer));
1107      keywordStart = insertHere;
1108    } else {
1109      keywordStart = buffer+bufLen;
1110      *keywordStart = ';';
1111      keywordStart++;
1112    }
1113    uprv_strncpy(keywordStart, keywordNameBuffer, keywordNameLen);
1114    keywordStart += keywordNameLen;
1115    *keywordStart = '=';
1116    keywordStart++;
1117    uprv_strncpy(keywordStart, keywordValue, keywordValueLen); /* terminates. */
1118    keywordStart+=keywordValueLen;
1119    if(insertHere) {
1120      *keywordStart = ';';
1121      keywordStart++;
1122    }
1123    buffer[needLen]=0;
1124    return needLen;
1125}
1126
1127/* ### ID parsing implementation **************************************************/
1128
/* TRUE if a is one of the letters that can start a special prefix:
   'x', 'X', 'i' or 'I'.  The argument is evaluated more than once. */
#define _isPrefixLetter(a) (((a)=='x')||((a)=='X')||((a)=='i')||((a)=='I'))

/*returns TRUE if one of the special prefixes is here (s=string)
  'x-' or 'i-' */
#define _isIDPrefix(s) (_isPrefixLetter((s)[0])&&_isIDSeparator((s)[1]))

/* Dot terminates it because of POSIX form  where dot precedes the codepage
 * except for variant.
 * TRUE if a is NUL, '.' or '@'.  The argument is evaluated more than once.
 */
#define _isTerminator(a)  (((a)==0)||((a)=='.')||((a)=='@'))
1139
/*
 * Search at most len characters of str for c, stopping early at a NUL.
 * The match test comes first, so searching for NUL finds the terminator.
 * Returns a pointer to the first match, or NULL if there is none.
 */
static char* _strnchr(const char* str, int32_t len, char c) {
    const char *cursor;

    U_ASSERT(str != 0 && len >= 0);
    for (cursor = str; len != 0; --len, ++cursor) {
        if (*cursor == c) {
            return (char*) cursor;
        }
        if (*cursor == 0) {
            break;
        }
    }
    return NULL;
}
1153
/**
 * Lookup 'key' in the array 'list'.  The array is scanned as two
 * consecutive NULL-terminated sections; the NULL that ends the first
 * section is stepped over to reach the second.
 *
 * The 'list' param should be LANGUAGES, LANGUAGES_3, COUNTRIES, or
 * COUNTRIES_3.
 *
 * @return the offset of the matching entry from the start of 'list'
 *         (the NULL slot between the sections counts as one position),
 *         or -1 if 'key' is in neither section
 */
static int16_t _findIndex(const char* const* list, const char* key)
{
    const char* const* entry = list;
    int32_t section;

    for (section = 0; section < 2; ++section) {
        for (; *entry != NULL; ++entry) {
            if (uprv_strcmp(key, *entry) == 0) {
                return (int16_t)(entry - list);
            }
        }
        ++entry;     /* skip the NULL that ends this section */
    }
    return -1;
}
1178
1179/* count the length of src while copying it to dest; return strlen(src) */
1180static U_INLINE int32_t
1181_copyCount(char *dest, int32_t destCapacity, const char *src) {
1182    const char *anchor;
1183    char c;
1184
1185    anchor=src;
1186    for(;;) {
1187        if((c=*src)==0) {
1188            return (int32_t)(src-anchor);
1189        }
1190        if(destCapacity<=0) {
1191            return (int32_t)((src-anchor)+uprv_strlen(src));
1192        }
1193        ++src;
1194        *dest++=c;
1195        --destCapacity;
1196    }
1197}
1198
1199U_CFUNC const char*
1200uloc_getCurrentCountryID(const char* oldID){
1201    int32_t offset = _findIndex(DEPRECATED_COUNTRIES, oldID);
1202    if (offset >= 0) {
1203        return REPLACEMENT_COUNTRIES[offset];
1204    }
1205    return oldID;
1206}
1207U_CFUNC const char*
1208uloc_getCurrentLanguageID(const char* oldID){
1209    int32_t offset = _findIndex(DEPRECATED_LANGUAGES, oldID);
1210    if (offset >= 0) {
1211        return REPLACEMENT_LANGUAGES[offset];
1212    }
1213    return oldID;
1214}
1215/*
1216 * the internal functions _getLanguage(), _getCountry(), _getVariant()
1217 * avoid duplicating code to handle the earlier locale ID pieces
1218 * in the functions for the later ones by
1219 * setting the *pEnd pointer to where they stopped parsing
1220 *
1221 * TODO try to use this in Locale
1222 */
1223U_CFUNC int32_t
1224ulocimp_getLanguage(const char *localeID,
1225                    char *language, int32_t languageCapacity,
1226                    const char **pEnd) {
1227    int32_t i=0;
1228    int32_t offset;
1229    char lang[4]={ 0, 0, 0, 0 }; /* temporary buffer to hold language code for searching */
1230
1231    /* if it starts with i- or x- then copy that prefix */
1232    if(_isIDPrefix(localeID)) {
1233        if(i<languageCapacity) {
1234            language[i]=(char)uprv_tolower(*localeID);
1235        }
1236        if(i<languageCapacity) {
1237            language[i+1]='-';
1238        }
1239        i+=2;
1240        localeID+=2;
1241    }
1242
1243    /* copy the language as far as possible and count its length */
1244    while(!_isTerminator(*localeID) && !_isIDSeparator(*localeID)) {
1245        if(i<languageCapacity) {
1246            language[i]=(char)uprv_tolower(*localeID);
1247        }
1248        if(i<3) {
1249            lang[i]=(char)uprv_tolower(*localeID);
1250        }
1251        i++;
1252        localeID++;
1253    }
1254
1255    if(i==3) {
1256        /* convert 3 character code to 2 character code if possible *CWB*/
1257        offset=_findIndex(LANGUAGES_3, lang);
1258        if(offset>=0) {
1259            i=_copyCount(language, languageCapacity, LANGUAGES[offset]);
1260        }
1261    }
1262
1263    if(pEnd!=NULL) {
1264        *pEnd=localeID;
1265    }
1266    return i;
1267}
1268
1269U_CFUNC int32_t
1270ulocimp_getScript(const char *localeID,
1271                  char *script, int32_t scriptCapacity,
1272                  const char **pEnd)
1273{
1274    int32_t idLen = 0;
1275
1276    if (pEnd != NULL) {
1277        *pEnd = localeID;
1278    }
1279
1280    /* copy the second item as far as possible and count its length */
1281    while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1282        idLen++;
1283    }
1284
1285    /* If it's exactly 4 characters long, then it's a script and not a country. */
1286    if (idLen == 4) {
1287        int32_t i;
1288        if (pEnd != NULL) {
1289            *pEnd = localeID+idLen;
1290        }
1291        if(idLen > scriptCapacity) {
1292            idLen = scriptCapacity;
1293        }
1294        if (idLen >= 1) {
1295            script[0]=(char)uprv_toupper(*(localeID++));
1296        }
1297        for (i = 1; i < idLen; i++) {
1298            script[i]=(char)uprv_tolower(*(localeID++));
1299        }
1300    }
1301    else {
1302        idLen = 0;
1303    }
1304    return idLen;
1305}
1306
1307U_CFUNC int32_t
1308ulocimp_getCountry(const char *localeID,
1309                   char *country, int32_t countryCapacity,
1310                   const char **pEnd)
1311{
1312    int32_t idLen=0;
1313    char cnty[ULOC_COUNTRY_CAPACITY]={ 0, 0, 0, 0 };
1314    int32_t offset;
1315
1316    /* copy the country as far as possible and count its length */
1317    while(!_isTerminator(localeID[idLen]) && !_isIDSeparator(localeID[idLen])) {
1318        if(idLen<(ULOC_COUNTRY_CAPACITY-1)) {   /*CWB*/
1319            cnty[idLen]=(char)uprv_toupper(localeID[idLen]);
1320        }
1321        idLen++;
1322    }
1323
1324    /* the country should be either length 2 or 3 */
1325    if (idLen == 2 || idLen == 3) {
1326        UBool gotCountry = FALSE;
1327        /* convert 3 character code to 2 character code if possible *CWB*/
1328        if(idLen==3) {
1329            offset=_findIndex(COUNTRIES_3, cnty);
1330            if(offset>=0) {
1331                idLen=_copyCount(country, countryCapacity, COUNTRIES[offset]);
1332                gotCountry = TRUE;
1333            }
1334        }
1335        if (!gotCountry) {
1336            int32_t i = 0;
1337            for (i = 0; i < idLen; i++) {
1338                if (i < countryCapacity) {
1339                    country[i]=(char)uprv_toupper(localeID[i]);
1340                }
1341            }
1342        }
1343        localeID+=idLen;
1344    } else {
1345        idLen = 0;
1346    }
1347
1348    if(pEnd!=NULL) {
1349        *pEnd=localeID;
1350    }
1351
1352    return idLen;
1353}
1354
1355/**
1356 * @param needSeparator if true, then add leading '_' if any variants
1357 * are added to 'variant'
1358 */
1359static int32_t
1360_getVariantEx(const char *localeID,
1361              char prev,
1362              char *variant, int32_t variantCapacity,
1363              UBool needSeparator) {
1364    int32_t i=0;
1365
1366    /* get one or more variant tags and separate them with '_' */
1367    if(_isIDSeparator(prev)) {
1368        /* get a variant string after a '-' or '_' */
1369        while(!_isTerminator(*localeID)) {
1370            if (needSeparator) {
1371                if (i<variantCapacity) {
1372                    variant[i] = '_';
1373                }
1374                ++i;
1375                needSeparator = FALSE;
1376            }
1377            if(i<variantCapacity) {
1378                variant[i]=(char)uprv_toupper(*localeID);
1379                if(variant[i]=='-') {
1380                    variant[i]='_';
1381                }
1382            }
1383            i++;
1384            localeID++;
1385        }
1386    }
1387
1388    /* if there is no variant tag after a '-' or '_' then look for '@' */
1389    if(i==0) {
1390        if(prev=='@') {
1391            /* keep localeID */
1392        } else if((localeID=locale_getKeywordsStart(localeID))!=NULL) {
1393            ++localeID; /* point after the '@' */
1394        } else {
1395            return 0;
1396        }
1397        while(!_isTerminator(*localeID)) {
1398            if (needSeparator) {
1399                if (i<variantCapacity) {
1400                    variant[i] = '_';
1401                }
1402                ++i;
1403                needSeparator = FALSE;
1404            }
1405            if(i<variantCapacity) {
1406                variant[i]=(char)uprv_toupper(*localeID);
1407                if(variant[i]=='-' || variant[i]==',') {
1408                    variant[i]='_';
1409                }
1410            }
1411            i++;
1412            localeID++;
1413        }
1414    }
1415
1416    return i;
1417}
1418
1419static int32_t
1420_getVariant(const char *localeID,
1421            char prev,
1422            char *variant, int32_t variantCapacity) {
1423    return _getVariantEx(localeID, prev, variant, variantCapacity, FALSE);
1424}
1425
1426/**
1427 * Delete ALL instances of a variant from the given list of one or
1428 * more variants.  Example: "FOO_EURO_BAR_EURO" => "FOO_BAR".
1429 * @param variants the source string of one or more variants,
1430 * separated by '_'.  This will be MODIFIED IN PLACE.  Not zero
1431 * terminated; if it is, trailing zero will NOT be maintained.
1432 * @param variantsLen length of variants
1433 * @param toDelete variant to delete, without separators, e.g.  "EURO"
1434 * or "PREEURO"; not zero terminated
1435 * @param toDeleteLen length of toDelete
1436 * @return number of characters deleted from variants
1437 */
1438static int32_t
1439_deleteVariant(char* variants, int32_t variantsLen,
1440               const char* toDelete, int32_t toDeleteLen)
1441{
1442    int32_t delta = 0; /* number of chars deleted */
1443    for (;;) {
1444        UBool flag = FALSE;
1445        if (variantsLen < toDeleteLen) {
1446            return delta;
1447        }
1448        if (uprv_strncmp(variants, toDelete, toDeleteLen) == 0 &&
1449            (variantsLen == toDeleteLen ||
1450             (flag=(variants[toDeleteLen] == '_'))))
1451        {
1452            int32_t d = toDeleteLen + (flag?1:0);
1453            variantsLen -= d;
1454            delta += d;
1455            if (variantsLen > 0) {
1456                uprv_memmove(variants, variants+d, variantsLen);
1457            }
1458        } else {
1459            char* p = _strnchr(variants, variantsLen, '_');
1460            if (p == NULL) {
1461                return delta;
1462            }
1463            ++p;
1464            variantsLen -= (int32_t)(p - variants);
1465            variants = p;
1466        }
1467    }
1468}
1469
1470/* Keyword enumeration */
1471
/*
 * Per-enumeration state for a keyword list.  The list is a sequence of
 * zero-terminated strings laid end to end, finished by an empty string.
 */
typedef struct UKeywordsContext {
    char *keywords; /* start of the list; released by uloc_kw_closeKeywords */
    char *current;  /* entry that uloc_kw_nextKeyword will return next */
} UKeywordsContext;
1476
1477static void U_CALLCONV
1478uloc_kw_closeKeywords(UEnumeration *enumerator) {
1479    uprv_free(((UKeywordsContext *)enumerator->context)->keywords);
1480    uprv_free(enumerator->context);
1481    uprv_free(enumerator);
1482}
1483
1484static int32_t U_CALLCONV
1485uloc_kw_countKeywords(UEnumeration *en, UErrorCode *status) {
1486    char *kw = ((UKeywordsContext *)en->context)->keywords;
1487    int32_t result = 0;
1488    while(*kw) {
1489        result++;
1490        kw += uprv_strlen(kw)+1;
1491    }
1492    return result;
1493}
1494
1495static const char* U_CALLCONV
1496uloc_kw_nextKeyword(UEnumeration* en,
1497                    int32_t* resultLength,
1498                    UErrorCode* status) {
1499    const char* result = ((UKeywordsContext *)en->context)->current;
1500    int32_t len = 0;
1501    if(*result) {
1502        len = (int32_t)uprv_strlen(((UKeywordsContext *)en->context)->current);
1503        ((UKeywordsContext *)en->context)->current += len+1;
1504    } else {
1505        result = NULL;
1506    }
1507    if (resultLength) {
1508        *resultLength = len;
1509    }
1510    return result;
1511}
1512
1513static void U_CALLCONV
1514uloc_kw_resetKeywords(UEnumeration* en,
1515                      UErrorCode* status) {
1516    ((UKeywordsContext *)en->context)->current = ((UKeywordsContext *)en->context)->keywords;
1517}
1518
1519static const UEnumeration gKeywordsEnum = {
1520    NULL,
1521    NULL,
1522    uloc_kw_closeKeywords,
1523    uloc_kw_countKeywords,
1524    uenum_unextDefault,
1525    uloc_kw_nextKeyword,
1526    uloc_kw_resetKeywords
1527};
1528
1529U_CAPI UEnumeration* U_EXPORT2
1530uloc_openKeywordList(const char *keywordList, int32_t keywordListSize, UErrorCode* status)
1531{
1532    UKeywordsContext *myContext = NULL;
1533    UEnumeration *result = NULL;
1534
1535    if(U_FAILURE(*status)) {
1536        return NULL;
1537    }
1538    result = (UEnumeration *)uprv_malloc(sizeof(UEnumeration));
1539    /* Null pointer test */
1540    if (result == NULL) {
1541        *status = U_MEMORY_ALLOCATION_ERROR;
1542        return NULL;
1543    }
1544    uprv_memcpy(result, &gKeywordsEnum, sizeof(UEnumeration));
1545    myContext = uprv_malloc(sizeof(UKeywordsContext));
1546    if (myContext == NULL) {
1547        *status = U_MEMORY_ALLOCATION_ERROR;
1548        uprv_free(result);
1549        return NULL;
1550    }
1551    myContext->keywords = (char *)uprv_malloc(keywordListSize+1);
1552    uprv_memcpy(myContext->keywords, keywordList, keywordListSize);
1553    myContext->keywords[keywordListSize] = 0;
1554    myContext->current = myContext->keywords;
1555    result->context = myContext;
1556    return result;
1557}
1558
1559U_CAPI UEnumeration* U_EXPORT2
1560uloc_openKeywords(const char* localeID,
1561                        UErrorCode* status)
1562{
1563    int32_t i=0;
1564    char keywords[256];
1565    int32_t keywordsCapacity = 256;
1566    char tempBuffer[ULOC_FULLNAME_CAPACITY];
1567    const char* tmpLocaleID;
1568
1569    if(status==NULL || U_FAILURE(*status)) {
1570        return 0;
1571    }
1572
1573    if (_hasBCP47Extension(localeID)) {
1574        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), status);
1575    } else {
1576        if (localeID==NULL) {
1577           localeID=uloc_getDefault();
1578        }
1579        tmpLocaleID=localeID;
1580    }
1581
1582    /* Skip the language */
1583    ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
1584    if(_isIDSeparator(*tmpLocaleID)) {
1585        const char *scriptID;
1586        /* Skip the script if available */
1587        ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
1588        if(scriptID != tmpLocaleID+1) {
1589            /* Found optional script */
1590            tmpLocaleID = scriptID;
1591        }
1592        /* Skip the Country */
1593        if (_isIDSeparator(*tmpLocaleID)) {
1594            ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &tmpLocaleID);
1595            if(_isIDSeparator(*tmpLocaleID)) {
1596                _getVariant(tmpLocaleID+1, *tmpLocaleID, NULL, 0);
1597            }
1598        }
1599    }
1600
1601    /* keywords are located after '@' */
1602    if((tmpLocaleID = locale_getKeywordsStart(tmpLocaleID)) != NULL) {
1603        i=locale_getKeywords(tmpLocaleID+1, '@', keywords, keywordsCapacity, NULL, 0, NULL, FALSE, status);
1604    }
1605
1606    if(i) {
1607        return uloc_openKeywordList(keywords, i, status);
1608    } else {
1609        return NULL;
1610    }
1611}
1612
1613
/* bit-flags for 'options' parameter of _canonicalize */
#define _ULOC_STRIP_KEYWORDS 0x2
#define _ULOC_CANONICALIZE   0x1

/*
 * Non-zero if any bit of mask is set in options.  Both arguments are
 * parenthesized so that expression arguments such as (A | B) are combined
 * before the '&' is applied.
 */
#define OPTION_SET(options, mask) (((options) & (mask)) != 0)

/* The language tag "i-default", stored without a terminating zero. */
static const char i_default[] = {'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'};
#define I_DEFAULT_LENGTH (sizeof i_default / sizeof i_default[0])
1622
1623/**
1624 * Canonicalize the given localeID, to level 1 or to level 2,
1625 * depending on the options.  To specify level 1, pass in options=0.
1626 * To specify level 2, pass in options=_ULOC_CANONICALIZE.
1627 *
1628 * This is the code underlying uloc_getName and uloc_canonicalize.
1629 */
1630static int32_t
1631_canonicalize(const char* localeID,
1632              char* result,
1633              int32_t resultCapacity,
1634              uint32_t options,
1635              UErrorCode* err) {
1636    int32_t j, len, fieldCount=0, scriptSize=0, variantSize=0, nameCapacity;
1637    char localeBuffer[ULOC_FULLNAME_CAPACITY];
1638    char tempBuffer[ULOC_FULLNAME_CAPACITY];
1639    const char* origLocaleID;
1640    const char* tmpLocaleID;
1641    const char* keywordAssign = NULL;
1642    const char* separatorIndicator = NULL;
1643    const char* addKeyword = NULL;
1644    const char* addValue = NULL;
1645    char* name;
1646    char* variant = NULL; /* pointer into name, or NULL */
1647
1648    if (U_FAILURE(*err)) {
1649        return 0;
1650    }
1651
1652    if (_hasBCP47Extension(localeID)) {
1653        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1654    } else {
1655        if (localeID==NULL) {
1656           localeID=uloc_getDefault();
1657        }
1658        tmpLocaleID=localeID;
1659    }
1660
1661    origLocaleID=tmpLocaleID;
1662
1663    /* if we are doing a full canonicalization, then put results in
1664       localeBuffer, if necessary; otherwise send them to result. */
1665    if (/*OPTION_SET(options, _ULOC_CANONICALIZE) &&*/
1666        (result == NULL || resultCapacity <  sizeof(localeBuffer))) {
1667        name = localeBuffer;
1668        nameCapacity = sizeof(localeBuffer);
1669    } else {
1670        name = result;
1671        nameCapacity = resultCapacity;
1672    }
1673
1674    /* get all pieces, one after another, and separate with '_' */
1675    len=ulocimp_getLanguage(tmpLocaleID, name, nameCapacity, &tmpLocaleID);
1676
1677    if(len == I_DEFAULT_LENGTH && uprv_strncmp(origLocaleID, i_default, len) == 0) {
1678        const char *d = uloc_getDefault();
1679
1680        len = (int32_t)uprv_strlen(d);
1681
1682        if (name != NULL) {
1683            uprv_strncpy(name, d, len);
1684        }
1685    } else if(_isIDSeparator(*tmpLocaleID)) {
1686        const char *scriptID;
1687
1688        ++fieldCount;
1689        if(len<nameCapacity) {
1690            name[len]='_';
1691        }
1692        ++len;
1693
1694        scriptSize=ulocimp_getScript(tmpLocaleID+1, name+len, nameCapacity-len, &scriptID);
1695        if(scriptSize > 0) {
1696            /* Found optional script */
1697            tmpLocaleID = scriptID;
1698            ++fieldCount;
1699            len+=scriptSize;
1700            if (_isIDSeparator(*tmpLocaleID)) {
1701                /* If there is something else, then we add the _ */
1702                if(len<nameCapacity) {
1703                    name[len]='_';
1704                }
1705                ++len;
1706            }
1707        }
1708
1709        if (_isIDSeparator(*tmpLocaleID)) {
1710            const char *cntryID;
1711            int32_t cntrySize = ulocimp_getCountry(tmpLocaleID+1, name+len, nameCapacity-len, &cntryID);
1712            if (cntrySize > 0) {
1713                /* Found optional country */
1714                tmpLocaleID = cntryID;
1715                len+=cntrySize;
1716            }
1717            if(_isIDSeparator(*tmpLocaleID)) {
1718                /* If there is something else, then we add the _  if we found country before.*/
1719                if (cntrySize > 0) {
1720                    ++fieldCount;
1721                    if(len<nameCapacity) {
1722                        name[len]='_';
1723                    }
1724                    ++len;
1725                }
1726
1727                variantSize = _getVariant(tmpLocaleID+1, *tmpLocaleID, name+len, nameCapacity-len);
1728                if (variantSize > 0) {
1729                    variant = name+len;
1730                    len += variantSize;
1731                    tmpLocaleID += variantSize + 1; /* skip '_' and variant */
1732                }
1733            }
1734        }
1735    }
1736
1737    /* Copy POSIX-style charset specifier, if any [mr.utf8] */
1738    if (!OPTION_SET(options, _ULOC_CANONICALIZE) && *tmpLocaleID == '.') {
1739        UBool done = FALSE;
1740        do {
1741            char c = *tmpLocaleID;
1742            switch (c) {
1743            case 0:
1744            case '@':
1745                done = TRUE;
1746                break;
1747            default:
1748                if (len<nameCapacity) {
1749                    name[len] = c;
1750                }
1751                ++len;
1752                ++tmpLocaleID;
1753                break;
1754            }
1755        } while (!done);
1756    }
1757
1758    /* Scan ahead to next '@' and determine if it is followed by '=' and/or ';'
1759       After this, tmpLocaleID either points to '@' or is NULL */
1760    if ((tmpLocaleID=locale_getKeywordsStart(tmpLocaleID))!=NULL) {
1761        keywordAssign = uprv_strchr(tmpLocaleID, '=');
1762        separatorIndicator = uprv_strchr(tmpLocaleID, ';');
1763    }
1764
1765    /* Copy POSIX-style variant, if any [mr@FOO] */
1766    if (!OPTION_SET(options, _ULOC_CANONICALIZE) &&
1767        tmpLocaleID != NULL && keywordAssign == NULL) {
1768        for (;;) {
1769            char c = *tmpLocaleID;
1770            if (c == 0) {
1771                break;
1772            }
1773            if (len<nameCapacity) {
1774                name[len] = c;
1775            }
1776            ++len;
1777            ++tmpLocaleID;
1778        }
1779    }
1780
1781    if (OPTION_SET(options, _ULOC_CANONICALIZE)) {
1782        /* Handle @FOO variant if @ is present and not followed by = */
1783        if (tmpLocaleID!=NULL && keywordAssign==NULL) {
1784            int32_t posixVariantSize;
1785            /* Add missing '_' if needed */
1786            if (fieldCount < 2 || (fieldCount < 3 && scriptSize > 0)) {
1787                do {
1788                    if(len<nameCapacity) {
1789                        name[len]='_';
1790                    }
1791                    ++len;
1792                    ++fieldCount;
1793                } while(fieldCount<2);
1794            }
1795            posixVariantSize = _getVariantEx(tmpLocaleID+1, '@', name+len, nameCapacity-len,
1796                                             (UBool)(variantSize > 0));
1797            if (posixVariantSize > 0) {
1798                if (variant == NULL) {
1799                    variant = name+len;
1800                }
1801                len += posixVariantSize;
1802                variantSize += posixVariantSize;
1803            }
1804        }
1805
1806        /* Handle generic variants first */
1807        if (variant) {
1808            for (j=0; j<(int32_t)(sizeof(VARIANT_MAP)/sizeof(VARIANT_MAP[0])); j++) {
1809                const char* variantToCompare = VARIANT_MAP[j].variant;
1810                int32_t n = (int32_t)uprv_strlen(variantToCompare);
1811                int32_t variantLen = _deleteVariant(variant, uprv_min(variantSize, (nameCapacity-len)), variantToCompare, n);
1812                len -= variantLen;
1813                if (variantLen > 0) {
1814                    if (len > 0 && name[len-1] == '_') { /* delete trailing '_' */
1815                        --len;
1816                    }
1817                    addKeyword = VARIANT_MAP[j].keyword;
1818                    addValue = VARIANT_MAP[j].value;
1819                    break;
1820                }
1821            }
1822            if (len > 0 && len <= nameCapacity && name[len-1] == '_') { /* delete trailing '_' */
1823                --len;
1824            }
1825        }
1826
1827        /* Look up the ID in the canonicalization map */
1828        for (j=0; j<(int32_t)(sizeof(CANONICALIZE_MAP)/sizeof(CANONICALIZE_MAP[0])); j++) {
1829            const char* id = CANONICALIZE_MAP[j].id;
1830            int32_t n = (int32_t)uprv_strlen(id);
1831            if (len == n && uprv_strncmp(name, id, n) == 0) {
1832                if (n == 0 && tmpLocaleID != NULL) {
1833                    break; /* Don't remap "" if keywords present */
1834                }
1835                len = _copyCount(name, nameCapacity, CANONICALIZE_MAP[j].canonicalID);
1836                if (CANONICALIZE_MAP[j].keyword) {
1837                    addKeyword = CANONICALIZE_MAP[j].keyword;
1838                    addValue = CANONICALIZE_MAP[j].value;
1839                }
1840                break;
1841            }
1842        }
1843    }
1844
1845    if (!OPTION_SET(options, _ULOC_STRIP_KEYWORDS)) {
1846        if (tmpLocaleID!=NULL && keywordAssign!=NULL &&
1847            (!separatorIndicator || separatorIndicator > keywordAssign)) {
1848            if(len<nameCapacity) {
1849                name[len]='@';
1850            }
1851            ++len;
1852            ++fieldCount;
1853            len += _getKeywords(tmpLocaleID+1, '@', name+len, nameCapacity-len, NULL, 0, NULL, TRUE,
1854                                addKeyword, addValue, err);
1855        } else if (addKeyword != NULL) {
1856            U_ASSERT(addValue != NULL);
1857            /* inelegant but works -- later make _getKeywords do this? */
1858            len += _copyCount(name+len, nameCapacity-len, "@");
1859            len += _copyCount(name+len, nameCapacity-len, addKeyword);
1860            len += _copyCount(name+len, nameCapacity-len, "=");
1861            len += _copyCount(name+len, nameCapacity-len, addValue);
1862        }
1863    }
1864
1865    if (U_SUCCESS(*err) && result != NULL && name == localeBuffer) {
1866        uprv_strncpy(result, localeBuffer, (len > resultCapacity) ? resultCapacity : len);
1867    }
1868
1869    return u_terminateChars(result, resultCapacity, len, err);
1870}
1871
1872/* ### ID parsing API **************************************************/
1873
1874U_CAPI int32_t  U_EXPORT2
1875uloc_getParent(const char*    localeID,
1876               char* parent,
1877               int32_t parentCapacity,
1878               UErrorCode* err)
1879{
1880    const char *lastUnderscore;
1881    int32_t i;
1882
1883    if (U_FAILURE(*err))
1884        return 0;
1885
1886    if (localeID == NULL)
1887        localeID = uloc_getDefault();
1888
1889    lastUnderscore=uprv_strrchr(localeID, '_');
1890    if(lastUnderscore!=NULL) {
1891        i=(int32_t)(lastUnderscore-localeID);
1892    } else {
1893        i=0;
1894    }
1895
1896    if(i>0 && parent != localeID) {
1897        uprv_memcpy(parent, localeID, uprv_min(i, parentCapacity));
1898    }
1899    return u_terminateChars(parent, parentCapacity, i, err);
1900}
1901
1902U_CAPI int32_t U_EXPORT2
1903uloc_getLanguage(const char*    localeID,
1904         char* language,
1905         int32_t languageCapacity,
1906         UErrorCode* err)
1907{
1908    /* uloc_getLanguage will return a 2 character iso-639 code if one exists. *CWB*/
1909    int32_t i=0;
1910
1911    if (err==NULL || U_FAILURE(*err)) {
1912        return 0;
1913    }
1914
1915    if(localeID==NULL) {
1916        localeID=uloc_getDefault();
1917    }
1918
1919    i=ulocimp_getLanguage(localeID, language, languageCapacity, NULL);
1920    return u_terminateChars(language, languageCapacity, i, err);
1921}
1922
1923U_CAPI int32_t U_EXPORT2
1924uloc_getScript(const char*    localeID,
1925         char* script,
1926         int32_t scriptCapacity,
1927         UErrorCode* err)
1928{
1929    int32_t i=0;
1930
1931    if(err==NULL || U_FAILURE(*err)) {
1932        return 0;
1933    }
1934
1935    if(localeID==NULL) {
1936        localeID=uloc_getDefault();
1937    }
1938
1939    /* skip the language */
1940    ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1941    if(_isIDSeparator(*localeID)) {
1942        i=ulocimp_getScript(localeID+1, script, scriptCapacity, NULL);
1943    }
1944    return u_terminateChars(script, scriptCapacity, i, err);
1945}
1946
1947U_CAPI int32_t  U_EXPORT2
1948uloc_getCountry(const char* localeID,
1949            char* country,
1950            int32_t countryCapacity,
1951            UErrorCode* err)
1952{
1953    int32_t i=0;
1954
1955    if(err==NULL || U_FAILURE(*err)) {
1956        return 0;
1957    }
1958
1959    if(localeID==NULL) {
1960        localeID=uloc_getDefault();
1961    }
1962
1963    /* Skip the language */
1964    ulocimp_getLanguage(localeID, NULL, 0, &localeID);
1965    if(_isIDSeparator(*localeID)) {
1966        const char *scriptID;
1967        /* Skip the script if available */
1968        ulocimp_getScript(localeID+1, NULL, 0, &scriptID);
1969        if(scriptID != localeID+1) {
1970            /* Found optional script */
1971            localeID = scriptID;
1972        }
1973        if(_isIDSeparator(*localeID)) {
1974            i=ulocimp_getCountry(localeID+1, country, countryCapacity, NULL);
1975        }
1976    }
1977    return u_terminateChars(country, countryCapacity, i, err);
1978}
1979
1980U_CAPI int32_t  U_EXPORT2
1981uloc_getVariant(const char* localeID,
1982                char* variant,
1983                int32_t variantCapacity,
1984                UErrorCode* err)
1985{
1986    char tempBuffer[ULOC_FULLNAME_CAPACITY];
1987    const char* tmpLocaleID;
1988    int32_t i=0;
1989
1990    if(err==NULL || U_FAILURE(*err)) {
1991        return 0;
1992    }
1993
1994    if (_hasBCP47Extension(localeID)) {
1995        _ConvertBCP47(tmpLocaleID, localeID, tempBuffer, sizeof(tempBuffer), err);
1996    } else {
1997        if (localeID==NULL) {
1998           localeID=uloc_getDefault();
1999        }
2000        tmpLocaleID=localeID;
2001    }
2002
2003    /* Skip the language */
2004    ulocimp_getLanguage(tmpLocaleID, NULL, 0, &tmpLocaleID);
2005    if(_isIDSeparator(*tmpLocaleID)) {
2006        const char *scriptID;
2007        /* Skip the script if available */
2008        ulocimp_getScript(tmpLocaleID+1, NULL, 0, &scriptID);
2009        if(scriptID != tmpLocaleID+1) {
2010            /* Found optional script */
2011            tmpLocaleID = scriptID;
2012        }
2013        /* Skip the Country */
2014        if (_isIDSeparator(*tmpLocaleID)) {
2015            const char *cntryID;
2016            ulocimp_getCountry(tmpLocaleID+1, NULL, 0, &cntryID);
2017            if (cntryID != tmpLocaleID+1) {
2018                /* Found optional country */
2019                tmpLocaleID = cntryID;
2020            }
2021            if(_isIDSeparator(*tmpLocaleID)) {
2022                /* If there was no country ID, skip a possible extra IDSeparator */
2023                if (tmpLocaleID != cntryID && _isIDSeparator(tmpLocaleID[1])) {
2024                    tmpLocaleID++;
2025                }
2026                i=_getVariant(tmpLocaleID+1, *tmpLocaleID, variant, variantCapacity);
2027            }
2028        }
2029    }
2030
2031    /* removed by weiv. We don't want to handle POSIX variants anymore. Use canonicalization function */
2032    /* if we do not have a variant tag yet then try a POSIX variant after '@' */
2033/*
2034    if(!haveVariant && (localeID=uprv_strrchr(localeID, '@'))!=NULL) {
2035        i=_getVariant(localeID+1, '@', variant, variantCapacity);
2036    }
2037*/
2038    return u_terminateChars(variant, variantCapacity, i, err);
2039}
2040
2041U_CAPI int32_t  U_EXPORT2
2042uloc_getName(const char* localeID,
2043             char* name,
2044             int32_t nameCapacity,
2045             UErrorCode* err)
2046{
2047    return _canonicalize(localeID, name, nameCapacity, 0, err);
2048}
2049
2050U_CAPI int32_t  U_EXPORT2
2051uloc_getBaseName(const char* localeID,
2052                 char* name,
2053                 int32_t nameCapacity,
2054                 UErrorCode* err)
2055{
2056    return _canonicalize(localeID, name, nameCapacity, _ULOC_STRIP_KEYWORDS, err);
2057}
2058
2059U_CAPI int32_t  U_EXPORT2
2060uloc_canonicalize(const char* localeID,
2061                  char* name,
2062                  int32_t nameCapacity,
2063                  UErrorCode* err)
2064{
2065    return _canonicalize(localeID, name, nameCapacity, _ULOC_CANONICALIZE, err);
2066}
2067
2068U_CAPI const char*  U_EXPORT2
2069uloc_getISO3Language(const char* localeID)
2070{
2071    int16_t offset;
2072    char lang[ULOC_LANG_CAPACITY];
2073    UErrorCode err = U_ZERO_ERROR;
2074
2075    if (localeID == NULL)
2076    {
2077        localeID = uloc_getDefault();
2078    }
2079    uloc_getLanguage(localeID, lang, ULOC_LANG_CAPACITY, &err);
2080    if (U_FAILURE(err))
2081        return "";
2082    offset = _findIndex(LANGUAGES, lang);
2083    if (offset < 0)
2084        return "";
2085    return LANGUAGES_3[offset];
2086}
2087
2088U_CAPI const char*  U_EXPORT2
2089uloc_getISO3Country(const char* localeID)
2090{
2091    int16_t offset;
2092    char cntry[ULOC_LANG_CAPACITY];
2093    UErrorCode err = U_ZERO_ERROR;
2094
2095    if (localeID == NULL)
2096    {
2097        localeID = uloc_getDefault();
2098    }
2099    uloc_getCountry(localeID, cntry, ULOC_LANG_CAPACITY, &err);
2100    if (U_FAILURE(err))
2101        return "";
2102    offset = _findIndex(COUNTRIES, cntry);
2103    if (offset < 0)
2104        return "";
2105
2106    return COUNTRIES_3[offset];
2107}
2108
2109U_CAPI uint32_t  U_EXPORT2
2110uloc_getLCID(const char* localeID)
2111{
2112    UErrorCode status = U_ZERO_ERROR;
2113    char       langID[ULOC_FULLNAME_CAPACITY];
2114
2115    uloc_getLanguage(localeID, langID, sizeof(langID), &status);
2116    if (U_FAILURE(status)) {
2117        return 0;
2118    }
2119
2120    return uprv_convertToLCID(langID, localeID, &status);
2121}
2122
2123U_CAPI int32_t U_EXPORT2
2124uloc_getLocaleForLCID(uint32_t hostid, char *locale, int32_t localeCapacity,
2125                UErrorCode *status)
2126{
2127    int32_t length;
2128    const char *posix = uprv_convertToPosix(hostid, status);
2129    if (U_FAILURE(*status) || posix == NULL) {
2130        return 0;
2131    }
2132    length = (int32_t)uprv_strlen(posix);
2133    if (length+1 > localeCapacity) {
2134        *status = U_BUFFER_OVERFLOW_ERROR;
2135    }
2136    else {
2137        uprv_strcpy(locale, posix);
2138    }
2139    return length;
2140}
2141
2142/* ### Default locale **************************************************/
2143
2144U_CAPI const char*  U_EXPORT2
2145uloc_getDefault()
2146{
2147    return locale_get_default();
2148}
2149
2150U_CAPI void  U_EXPORT2
2151uloc_setDefault(const char*   newDefaultLocale,
2152             UErrorCode* err)
2153{
2154    if (U_FAILURE(*err))
2155        return;
2156    /* the error code isn't currently used for anything by this function*/
2157
2158    /* propagate change to C++ */
2159    locale_set_default(newDefaultLocale);
2160}
2161
2162/**
2163 * Returns a list of all language codes defined in ISO 639.  This is a pointer
2164 * to an array of pointers to arrays of char.  All of these pointers are owned
2165 * by ICU-- do not delete them, and do not write through them.  The array is
2166 * terminated with a null pointer.
2167 */
2168U_CAPI const char* const*  U_EXPORT2
2169uloc_getISOLanguages()
2170{
2171    return LANGUAGES;
2172}
2173
/**
 * Returns a list of all 2-letter country codes defined in ISO 3166.  This is a
 * pointer to an array of pointers to arrays of char.  All of these pointers are
 * owned by ICU-- do not delete them, and do not write through them.  The array is
 * terminated with a null pointer.
 */
2180U_CAPI const char* const*  U_EXPORT2
2181uloc_getISOCountries()
2182{
2183    return COUNTRIES;
2184}
2185
2186
/* this function to be moved into cstring.c later */

/* Decimal separator char that the C runtime prints; 0 until first probed. */
static char gDecimal = 0;

/*
 * strtod() replacement for input that always uses '.' as decimal separator.
 *
 * When the C runtime formats numbers with a different separator, the first
 * 29 chars of start are copied, the first '.' in the copy is replaced by
 * that separator, and the copy is parsed instead; *end is then mapped back
 * to the corresponding position in start.  Input without a '.' in its first
 * 29 chars is passed to uprv_strtod() unchanged.
 *
 * NOTE(review): gDecimal is a single char, so a runtime separator that
 * takes more than one char is only represented by its first char.
 *
 * @param start text to parse
 * @param end   if not NULL, receives the position after the parsed number
 * @return the parsed value
 */
static /* U_CAPI */
double
/* U_EXPORT2 */
_uloc_strtod(const char *start, char **end) {
    char *decimal;
    char *myEnd;
    char buf[30];
    double rv;
    if (!gDecimal) {
        /* "+1", the separator, "0" and the terminating zero only fit into
           five chars if the separator is a single char; leave room for a
           longer one so that sprintf cannot write past the buffer. */
        char rep[32];
        /* For machines that decide to change the decimal on you,
        and try to be too smart with localization.
        This normally should be just a '.'. */
        sprintf(rep, "%+1.1f", 1.0);
        gDecimal = rep[2];
    }

    if(gDecimal == '.') {
        return uprv_strtod(start, end); /* fall through to OS */
    } else {
        uprv_strncpy(buf, start, 29);
        buf[29]=0;
        decimal = uprv_strchr(buf, '.');
        if(decimal) {
            *decimal = gDecimal;
        } else {
            return uprv_strtod(start, end); /* no decimal point */
        }
        rv = uprv_strtod(buf, &myEnd);
        if(end) {
            *end = (char*)(start+(myEnd-buf)); /* cast away const (to follow uprv_strtod API.) */
        }
        return rv;
    }
}
2225
/* One parsed entry of an Accept-Language list. */
typedef struct {
    float q;        /* quality value; 1.0 when the entry has no parameter */
    int32_t dummy;  /* to avoid uninitialized memory copy from qsort */
    char *locale;   /* heap-allocated locale ID of the entry */
} _acceptLangItem;
2231
2232static int32_t U_CALLCONV
2233uloc_acceptLanguageCompare(const void *context, const void *a, const void *b)
2234{
2235    const _acceptLangItem *aa = (const _acceptLangItem*)a;
2236    const _acceptLangItem *bb = (const _acceptLangItem*)b;
2237
2238    int32_t rc = 0;
2239    if(bb->q < aa->q) {
2240        rc = -1;  /* A > B */
2241    } else if(bb->q > aa->q) {
2242        rc = 1;   /* A < B */
2243    } else {
2244        rc = 0;   /* A = B */
2245    }
2246
2247    if(rc==0) {
2248        rc = uprv_stricmp(aa->locale, bb->locale);
2249    }
2250
2251#if defined(ULOC_DEBUG)
2252    /*  fprintf(stderr, "a:[%s:%g], b:[%s:%g] -> %d\n",
2253    aa->locale, aa->q,
2254    bb->locale, bb->q,
2255    rc);*/
2256#endif
2257
2258    return rc;
2259}
2260
2261/*
2262mt-mt, ja;q=0.76, en-us;q=0.95, en;q=0.92, en-gb;q=0.89, fr;q=0.87, iu-ca;q=0.84, iu;q=0.82, ja-jp;q=0.79, mt;q=0.97, de-de;q=0.74, de;q=0.71, es;q=0.68, it-it;q=0.66, it;q=0.63, vi-vn;q=0.61, vi;q=0.58, nl-nl;q=0.55, nl;q=0.53
2263*/
2264
2265U_CAPI int32_t U_EXPORT2
2266uloc_acceptLanguageFromHTTP(char *result, int32_t resultAvailable, UAcceptResult *outResult,
2267                            const char *httpAcceptLanguage,
2268                            UEnumeration* availableLocales,
2269                            UErrorCode *status)
2270{
2271    _acceptLangItem *j;
2272    _acceptLangItem smallBuffer[30];
2273    char **strs;
2274    char tmp[ULOC_FULLNAME_CAPACITY +1];
2275    int32_t n = 0;
2276    const char *itemEnd;
2277    const char *paramEnd;
2278    const char *s;
2279    const char *t;
2280    int32_t res;
2281    int32_t i;
2282    int32_t l = (int32_t)uprv_strlen(httpAcceptLanguage);
2283    int32_t jSize;
2284    char *tempstr; /* Use for null pointer check */
2285
2286    j = smallBuffer;
2287    jSize = sizeof(smallBuffer)/sizeof(smallBuffer[0]);
2288    if(U_FAILURE(*status)) {
2289        return -1;
2290    }
2291
2292    for(s=httpAcceptLanguage;s&&*s;) {
2293        while(isspace(*s)) /* eat space at the beginning */
2294            s++;
2295        itemEnd=uprv_strchr(s,',');
2296        paramEnd=uprv_strchr(s,';');
2297        if(!itemEnd) {
2298            itemEnd = httpAcceptLanguage+l; /* end of string */
2299        }
2300        if(paramEnd && paramEnd<itemEnd) {
2301            /* semicolon (;) is closer than end (,) */
2302            t = paramEnd+1;
2303            if(*t=='q') {
2304                t++;
2305            }
2306            while(isspace(*t)) {
2307                t++;
2308            }
2309            if(*t=='=') {
2310                t++;
2311            }
2312            while(isspace(*t)) {
2313                t++;
2314            }
2315            j[n].q = (float)_uloc_strtod(t,NULL);
2316        } else {
2317            /* no semicolon - it's 1.0 */
2318            j[n].q = 1.0f;
2319            paramEnd = itemEnd;
2320        }
2321        j[n].dummy=0;
2322        /* eat spaces prior to semi */
2323        for(t=(paramEnd-1);(paramEnd>s)&&isspace(*t);t--)
2324            ;
2325        /* Check for null pointer from uprv_strndup */
2326        tempstr = uprv_strndup(s,(int32_t)((t+1)-s));
2327        if (tempstr == NULL) {
2328            *status = U_MEMORY_ALLOCATION_ERROR;
2329            return -1;
2330        }
2331        j[n].locale = tempstr;
2332        uloc_canonicalize(j[n].locale,tmp,sizeof(tmp)/sizeof(tmp[0]),status);
2333        if(strcmp(j[n].locale,tmp)) {
2334            uprv_free(j[n].locale);
2335            j[n].locale=uprv_strdup(tmp);
2336        }
2337#if defined(ULOC_DEBUG)
2338        /*fprintf(stderr,"%d: s <%s> q <%g>\n", n, j[n].locale, j[n].q);*/
2339#endif
2340        n++;
2341        s = itemEnd;
2342        while(*s==',') { /* eat duplicate commas */
2343            s++;
2344        }
2345        if(n>=jSize) {
2346            if(j==smallBuffer) {  /* overflowed the small buffer. */
2347                j = uprv_malloc(sizeof(j[0])*(jSize*2));
2348                if(j!=NULL) {
2349                    uprv_memcpy(j,smallBuffer,sizeof(j[0])*jSize);
2350                }
2351#if defined(ULOC_DEBUG)
2352                fprintf(stderr,"malloced at size %d\n", jSize);
2353#endif
2354            } else {
2355                j = uprv_realloc(j, sizeof(j[0])*jSize*2);
2356#if defined(ULOC_DEBUG)
2357                fprintf(stderr,"re-alloced at size %d\n", jSize);
2358#endif
2359            }
2360            jSize *= 2;
2361            if(j==NULL) {
2362                *status = U_MEMORY_ALLOCATION_ERROR;
2363                return -1;
2364            }
2365        }
2366    }
2367    uprv_sortArray(j, n, sizeof(j[0]), uloc_acceptLanguageCompare, NULL, TRUE, status);
2368    if(U_FAILURE(*status)) {
2369        if(j != smallBuffer) {
2370#if defined(ULOC_DEBUG)
2371            fprintf(stderr,"freeing j %p\n", j);
2372#endif
2373            uprv_free(j);
2374        }
2375        return -1;
2376    }
2377    strs = uprv_malloc((size_t)(sizeof(strs[0])*n));
2378    /* Check for null pointer */
2379    if (strs == NULL) {
2380        uprv_free(j); /* Free to avoid memory leak */
2381        *status = U_MEMORY_ALLOCATION_ERROR;
2382        return -1;
2383    }
2384    for(i=0;i<n;i++) {
2385#if defined(ULOC_DEBUG)
2386        /*fprintf(stderr,"%d: s <%s> q <%g>\n", i, j[i].locale, j[i].q);*/
2387#endif
2388        strs[i]=j[i].locale;
2389    }
2390    res =  uloc_acceptLanguage(result, resultAvailable, outResult,
2391        (const char**)strs, n, availableLocales, status);
2392    for(i=0;i<n;i++) {
2393        uprv_free(strs[i]);
2394    }
2395    uprv_free(strs);
2396    if(j != smallBuffer) {
2397#if defined(ULOC_DEBUG)
2398        fprintf(stderr,"freeing j %p\n", j);
2399#endif
2400        uprv_free(j);
2401    }
2402    return res;
2403}
2404
2405
2406U_CAPI int32_t U_EXPORT2
2407uloc_acceptLanguage(char *result, int32_t resultAvailable,
2408                    UAcceptResult *outResult, const char **acceptList,
2409                    int32_t acceptListCount,
2410                    UEnumeration* availableLocales,
2411                    UErrorCode *status)
2412{
2413    int32_t i,j;
2414    int32_t len;
2415    int32_t maxLen=0;
2416    char tmp[ULOC_FULLNAME_CAPACITY+1];
2417    const char *l;
2418    char **fallbackList;
2419    if(U_FAILURE(*status)) {
2420        return -1;
2421    }
2422    fallbackList = uprv_malloc((size_t)(sizeof(fallbackList[0])*acceptListCount));
2423    if(fallbackList==NULL) {
2424        *status = U_MEMORY_ALLOCATION_ERROR;
2425        return -1;
2426    }
2427    for(i=0;i<acceptListCount;i++) {
2428#if defined(ULOC_DEBUG)
2429        fprintf(stderr,"%02d: %s\n", i, acceptList[i]);
2430#endif
2431        while((l=uenum_next(availableLocales, NULL, status))) {
2432#if defined(ULOC_DEBUG)
2433            fprintf(stderr,"  %s\n", l);
2434#endif
2435            len = (int32_t)uprv_strlen(l);
2436            if(!uprv_strcmp(acceptList[i], l)) {
2437                if(outResult) {
2438                    *outResult = ULOC_ACCEPT_VALID;
2439                }
2440#if defined(ULOC_DEBUG)
2441                fprintf(stderr, "MATCH! %s\n", l);
2442#endif
2443                if(len>0) {
2444                    uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2445                }
2446                for(j=0;j<i;j++) {
2447                    uprv_free(fallbackList[j]);
2448                }
2449                uprv_free(fallbackList);
2450                return u_terminateChars(result, resultAvailable, len, status);
2451            }
2452            if(len>maxLen) {
2453                maxLen = len;
2454            }
2455        }
2456        uenum_reset(availableLocales, status);
2457        /* save off parent info */
2458        if(uloc_getParent(acceptList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2459            fallbackList[i] = uprv_strdup(tmp);
2460        } else {
2461            fallbackList[i]=0;
2462        }
2463    }
2464
2465    for(maxLen--;maxLen>0;maxLen--) {
2466        for(i=0;i<acceptListCount;i++) {
2467            if(fallbackList[i] && ((int32_t)uprv_strlen(fallbackList[i])==maxLen)) {
2468#if defined(ULOC_DEBUG)
2469                fprintf(stderr,"Try: [%s]", fallbackList[i]);
2470#endif
2471                while((l=uenum_next(availableLocales, NULL, status))) {
2472#if defined(ULOC_DEBUG)
2473                    fprintf(stderr,"  %s\n", l);
2474#endif
2475                    len = (int32_t)uprv_strlen(l);
2476                    if(!uprv_strcmp(fallbackList[i], l)) {
2477                        if(outResult) {
2478                            *outResult = ULOC_ACCEPT_FALLBACK;
2479                        }
2480#if defined(ULOC_DEBUG)
2481                        fprintf(stderr, "fallback MATCH! %s\n", l);
2482#endif
2483                        if(len>0) {
2484                            uprv_strncpy(result, l, uprv_min(len, resultAvailable));
2485                        }
2486                        for(j=0;j<acceptListCount;j++) {
2487                            uprv_free(fallbackList[j]);
2488                        }
2489                        uprv_free(fallbackList);
2490                        return u_terminateChars(result, resultAvailable, len, status);
2491                    }
2492                }
2493                uenum_reset(availableLocales, status);
2494
2495                if(uloc_getParent(fallbackList[i], tmp, sizeof(tmp)/sizeof(tmp[0]), status)!=0) {
2496                    uprv_free(fallbackList[i]);
2497                    fallbackList[i] = uprv_strdup(tmp);
2498                } else {
2499                    uprv_free(fallbackList[i]);
2500                    fallbackList[i]=0;
2501                }
2502            }
2503        }
2504        if(outResult) {
2505            *outResult = ULOC_ACCEPT_FAILED;
2506        }
2507    }
2508    for(i=0;i<acceptListCount;i++) {
2509        uprv_free(fallbackList[i]);
2510    }
2511    uprv_free(fallbackList);
2512    return -1;
2513}
2514
2515/*eof*/
2516