1/*
2 * Copyright (C) 2007 Apple Computer, Inc.
3 *
4 * Portions are Copyright (C) 1998 Netscape Communications Corporation.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
19 *
20 * Alternatively, the contents of this file may be used under the terms
21 * of either the Mozilla Public License Version 1.1, found at
22 * http://www.mozilla.org/MPL/ (the "MPL") or the GNU General Public
23 * License Version 2.0, found at http://www.fsf.org/copyleft/gpl.html
24 * (the "GPL"), in which case the provisions of the MPL or the GPL are
25 * applicable instead of those above.  If you wish to allow use of your
26 * version of this file only under the terms of one of those two
27 * licenses (the MPL or the GPL) and not to allow others to use your
28 * version of this file under the LGPL, indicate your decision by
29 * deletingthe provisions above and replace them with the notice and
30 * other provisions required by the MPL or the GPL, as the case may be.
31 * If you do not delete the provisions above, a recipient may use your
32 * version of this file under any of the LGPL, the MPL or the GPL.
33 */
34
35#include "config.h"
36#include "UnicodeRange.h"
37
38namespace WebCore {
39
// Maps a unicode range value to the name of its language group.
// The table is indexed directly by range value, so the order of the entries
// must track the unicode range definitions,
// e.g. "x-cyrillic" == gUnicodeRangeToLangGroupTable[cRangeCyrillic].
// Both the strings and the array slots are const: this is a read-only
// lookup table and nothing should ever re-point an entry.
static const char* const gUnicodeRangeToLangGroupTable[] =
{
    "x-cyrillic",   //  0
    "el",           //  1
    "tr",           //  2
    "he",           //  3
    "ar",           //  4
    "x-baltic",     //  5
    "th",           //  6
    "ko",           //  7
    "ja",           //  8
    "zh-CN",        //  9
    "zh-TW",        // 10
    "x-devanagari", // 11
    "x-tamil",      // 12
    "x-armn",       // 13
    "x-beng",       // 14
    "x-cans",       // 15
    "x-ethi",       // 16
    "x-geor",       // 17
    "x-gujr",       // 18
    "x-guru",       // 19
    "x-khmr",       // 20
    "x-mlym"        // 21
};
68
69/**********************************************************************
70 * Unicode subranges as defined in unicode 3.0
71 * x-western, x-central-euro, tr, x-baltic  -> latin
72 *  0000 - 036f
73 *  1e00 - 1eff
74 *  2000 - 206f  (general punctuation)
75 *  20a0 - 20cf  (currency symbols)
76 *  2100 - 214f  (letterlike symbols)
77 *  2150 - 218f  (Number Forms)
78 * el         -> greek
79 *  0370 - 03ff
80 *  1f00 - 1fff
81 * x-cyrillic -> cyrillic
82 *  0400 - 04ff
83 * he         -> hebrew
84 *  0590 - 05ff
85 * ar         -> arabic
86 *  0600 - 06ff
87 *  fb50 - fdff (arabic presentation forms)
88 *  fe70 - feff (arabic presentation forms b)
89 * th - thai
90 *  0e00 - 0e7f
91 * ko        -> korean
92 *  ac00 - d7af  (hangul Syllables)
93 *  1100 - 11ff    (jamo)
94 *  3130 - 318f (hangul compatibility jamo)
95 * ja
96 *  3040 - 309f (hiragana)
97 *  30a0 - 30ff (katakana)
98 * zh-CN
99 * zh-TW
100 *
101 * CJK
102 *  3100 - 312f (bopomofo)
103 *  31a0 - 31bf (bopomofo extended)
104 *  3000 - 303f (CJK Symbols and Punctuation)
105 *  2e80 - 2eff (CJK radicals supplement)
106 *  2f00 - 2fdf (Kangxi Radicals)
107 *  2ff0 - 2fff (Ideographic Description Characters)
108 *  3190 - 319f (kanbun)
109 *  3200 - 32ff (Enclosed CJK letters and Months)
110 *  3300 - 33ff (CJK compatibility)
111 *  3400 - 4dbf (CJK Unified Ideographs Extension A)
112 *  4e00 - 9faf (CJK Unified Ideographs)
113 *  f900 - fa5f (CJK Compatibility Ideographs)
114 *  fe30 - fe4f (CJK compatibility Forms)
115 *  ff00 - ffef (halfwidth and fullwidth forms)
116 *
117 * Armenian
118 *  0530 - 058f
 * Syriac
120 *  0700 - 074f
121 * Thaana
122 *  0780 - 07bf
123 * Devanagari
124 *  0900 - 097f
125 * Bengali
126 *  0980 - 09ff
127 * Gurmukhi
128 *  0a00 - 0a7f
129 * Gujarati
130 *  0a80 - 0aff
131 * Oriya
132 *  0b00 - 0b7f
133 * Tamil
134 *  0b80 - 0bff
135 * Telugu
136 *  0c00 - 0c7f
137 * Kannada
138 *  0c80 - 0cff
139 * Malayalam
140 *  0d00 - 0d7f
141 * Sinhala
142 *  0d80 - 0def
143 * Lao
144 *  0e80 - 0eff
145 * Tibetan
146 *  0f00 - 0fbf
147 * Myanmar
148 *  1000 - 109f
149 * Georgian
150 *  10a0 - 10ff
151 * Ethiopic
152 *  1200 - 137f
153 * Cherokee
154 *  13a0 - 13ff
155 * Canadian Aboriginal Syllabics
156 *  1400 - 167f
157 * Ogham
158 *  1680 - 169f
159 * Runic
160 *  16a0 - 16ff
161 * Khmer
162 *  1780 - 17ff
163 * Mongolian
164 *  1800 - 18af
165 * Misc - superscripts and subscripts
166 *  2070 - 209f
167 * Misc - Combining Diacritical Marks for Symbols
168 *  20d0 - 20ff
169 * Misc - Arrows
170 *  2190 - 21ff
171 * Misc - Mathematical Operators
172 *  2200 - 22ff
173 * Misc - Miscellaneous Technical
174 *  2300 - 23ff
175 * Misc - Control picture
176 *  2400 - 243f
177 * Misc - Optical character recognition
 *  2440 - 245f
179 * Misc - Enclose Alphanumerics
180 *  2460 - 24ff
181 * Misc - Box Drawing
182 *  2500 - 257f
183 * Misc - Block Elements
184 *  2580 - 259f
185 * Misc - Geometric Shapes
186 *  25a0 - 25ff
187 * Misc - Miscellaneous Symbols
188 *  2600 - 267f
189 * Misc - Dingbats
190 *  2700 - 27bf
191 * Misc - Braille Patterns
192 *  2800 - 28ff
193 * Yi Syllables
194 *  a000 - a48f
195 * Yi radicals
196 *  a490 - a4cf
197 * Alphabetic Presentation Forms
198 *  fb00 - fb4f
199 * Misc - Combining half Marks
200 *  fe20 - fe2f
201 * Misc - small form variants
202 *  fe50 - fe6f
203 * Misc - Specials
204 *  fff0 - ffff
205 *********************************************************************/
206
207static const unsigned cNumSubTables = 9;
208static const unsigned cSubTableSize = 16;
209
210static const unsigned char gUnicodeSubrangeTable[cNumSubTables][cSubTableSize] =
211{
212  { // table for X---
213    cRangeTableBase+1,  //u0xxx
214    cRangeTableBase+2,  //u1xxx
215    cRangeTableBase+3,  //u2xxx
216    cRangeSetCJK,       //u3xxx
217    cRangeSetCJK,       //u4xxx
218    cRangeSetCJK,       //u5xxx
219    cRangeSetCJK,       //u6xxx
220    cRangeSetCJK,       //u7xxx
221    cRangeSetCJK,       //u8xxx
222    cRangeSetCJK,       //u9xxx
223    cRangeTableBase+4,  //uaxxx
224    cRangeKorean,       //ubxxx
225    cRangeKorean,       //ucxxx
226    cRangeTableBase+5,  //udxxx
227    cRangePrivate,      //uexxx
228    cRangeTableBase+6   //ufxxx
229  },
230  { //table for 0X--
231    cRangeSetLatin,          //u00xx
232    cRangeSetLatin,          //u01xx
233    cRangeSetLatin,          //u02xx
234    cRangeGreek,             //u03xx     XXX 0300-036f is in fact cRangeCombiningDiacriticalMarks
235    cRangeCyrillic,          //u04xx
236    cRangeTableBase+7,       //u05xx, includes Cyrillic supplement, Hebrew, and Armenian
237    cRangeArabic,            //u06xx
238    cRangeTertiaryTable,     //u07xx
239    cRangeUnassigned,        //u08xx
240    cRangeTertiaryTable,     //u09xx
241    cRangeTertiaryTable,     //u0axx
242    cRangeTertiaryTable,     //u0bxx
243    cRangeTertiaryTable,     //u0cxx
244    cRangeTertiaryTable,     //u0dxx
245    cRangeTertiaryTable,     //u0exx
246    cRangeTibetan,           //u0fxx
247  },
248  { //table for 1x--
249    cRangeTertiaryTable,     //u10xx
250    cRangeKorean,            //u11xx
251    cRangeEthiopic,          //u12xx
252    cRangeTertiaryTable,     //u13xx
253    cRangeCanadian,          //u14xx
254    cRangeCanadian,          //u15xx
255    cRangeTertiaryTable,     //u16xx
256    cRangeKhmer,             //u17xx
257    cRangeMongolian,         //u18xx
258    cRangeUnassigned,        //u19xx
259    cRangeUnassigned,        //u1axx
260    cRangeUnassigned,        //u1bxx
261    cRangeUnassigned,        //u1cxx
262    cRangeUnassigned,        //u1dxx
263    cRangeSetLatin,          //u1exx
264    cRangeGreek,             //u1fxx
265  },
266  { //table for 2x--
267    cRangeSetLatin,          //u20xx
268    cRangeSetLatin,          //u21xx
269    cRangeMathOperators,     //u22xx
270    cRangeMiscTechnical,     //u23xx
271    cRangeControlOpticalEnclose, //u24xx
272    cRangeBoxBlockGeometrics, //u25xx
273    cRangeMiscSymbols,       //u26xx
274    cRangeDingbats,          //u27xx
275    cRangeBraillePattern,    //u28xx
276    cRangeUnassigned,        //u29xx
277    cRangeUnassigned,        //u2axx
278    cRangeUnassigned,        //u2bxx
279    cRangeUnassigned,        //u2cxx
280    cRangeUnassigned,        //u2dxx
281    cRangeSetCJK,            //u2exx
282    cRangeSetCJK,            //u2fxx
283  },
284  {  //table for ax--
285    cRangeYi,                //ua0xx
286    cRangeYi,                //ua1xx
287    cRangeYi,                //ua2xx
288    cRangeYi,                //ua3xx
289    cRangeYi,                //ua4xx
290    cRangeUnassigned,        //ua5xx
291    cRangeUnassigned,        //ua6xx
292    cRangeUnassigned,        //ua7xx
293    cRangeUnassigned,        //ua8xx
294    cRangeUnassigned,        //ua9xx
295    cRangeUnassigned,        //uaaxx
296    cRangeUnassigned,        //uabxx
297    cRangeKorean,            //uacxx
298    cRangeKorean,            //uadxx
299    cRangeKorean,            //uaexx
300    cRangeKorean,            //uafxx
301  },
302  {  //table for dx--
303    cRangeKorean,            //ud0xx
304    cRangeKorean,            //ud1xx
305    cRangeKorean,            //ud2xx
306    cRangeKorean,            //ud3xx
307    cRangeKorean,            //ud4xx
308    cRangeKorean,            //ud5xx
309    cRangeKorean,            //ud6xx
310    cRangeKorean,            //ud7xx
311    cRangeSurrogate,         //ud8xx
312    cRangeSurrogate,         //ud9xx
313    cRangeSurrogate,         //udaxx
314    cRangeSurrogate,         //udbxx
315    cRangeSurrogate,         //udcxx
316    cRangeSurrogate,         //uddxx
317    cRangeSurrogate,         //udexx
318    cRangeSurrogate,         //udfxx
319  },
320  { // table for fx--
321    cRangePrivate,           //uf0xx
322    cRangePrivate,           //uf1xx
323    cRangePrivate,           //uf2xx
324    cRangePrivate,           //uf3xx
325    cRangePrivate,           //uf4xx
326    cRangePrivate,           //uf5xx
327    cRangePrivate,           //uf6xx
328    cRangePrivate,           //uf7xx
329    cRangePrivate,           //uf8xx
330    cRangeSetCJK,            //uf9xx
331    cRangeSetCJK,            //ufaxx
332    cRangeArabic,            //ufbxx, includes alphabic presentation form
333    cRangeArabic,            //ufcxx
334    cRangeArabic,            //ufdxx
335    cRangeArabic,            //ufexx, includes Combining half marks,
336                             //                CJK compatibility forms,
337                             //                CJK compatibility forms,
338                             //                small form variants
339    cRangeTableBase+8,       //uffxx, halfwidth and fullwidth forms, includes Specials
340  },
341  { //table for 0x0500 - 0x05ff
342    cRangeCyrillic,          //u050x
343    cRangeCyrillic,          //u051x
344    cRangeCyrillic,          //u052x
345    cRangeArmenian,          //u053x
346    cRangeArmenian,          //u054x
347    cRangeArmenian,          //u055x
348    cRangeArmenian,          //u056x
349    cRangeArmenian,          //u057x
350    cRangeArmenian,          //u058x
351    cRangeHebrew,            //u059x
352    cRangeHebrew,            //u05ax
353    cRangeHebrew,            //u05bx
354    cRangeHebrew,            //u05cx
355    cRangeHebrew,            //u05dx
356    cRangeHebrew,            //u05ex
357    cRangeHebrew,            //u05fx
358  },
359  { //table for 0xff00 - 0xffff
360    cRangeSetCJK,            //uff0x, fullwidth latin
361    cRangeSetCJK,            //uff1x, fullwidth latin
362    cRangeSetCJK,            //uff2x, fullwidth latin
363    cRangeSetCJK,            //uff3x, fullwidth latin
364    cRangeSetCJK,            //uff4x, fullwidth latin
365    cRangeSetCJK,            //uff5x, fullwidth latin
366    cRangeSetCJK,            //uff6x, halfwidth katakana
367    cRangeSetCJK,            //uff7x, halfwidth katakana
368    cRangeSetCJK,            //uff8x, halfwidth katakana
369    cRangeSetCJK,            //uff9x, halfwidth katakana
370    cRangeSetCJK,            //uffax, halfwidth hangul jamo
371    cRangeSetCJK,            //uffbx, halfwidth hangul jamo
372    cRangeSetCJK,            //uffcx, halfwidth hangul jamo
373    cRangeSetCJK,            //uffdx, halfwidth hangul jamo
374    cRangeSetCJK,            //uffex, fullwidth symbols
375    cRangeSpecials,          //ufffx, Specials
376  },
377};
378
// Number of entries in the tertiary range table.
// Between U+0700 and U+16FF most scripts occupy one 128 (0x80) code point
// chunk, so the table has one entry per chunk: (0x1700 - 0x0700) / 0x80.
// The exceptions are Ethiopic, Tibetan, Hangul Jamo and Canadian aboriginal
// syllabics, which span several chunks, and Ogham and Runic, which share one.
static const unsigned cTertiaryTableSize = (0x1700 - 0x0700) / 0x80;
385
386static const unsigned char gUnicodeTertiaryRangeTable[cTertiaryTableSize] =
387{ //table for 0x0700 - 0x1600
388    cRangeSyriac,            //u070x
389    cRangeThaana,            //u078x
390    cRangeUnassigned,        //u080x  place holder(resolved in the 2ndary tab.)
391    cRangeUnassigned,        //u088x  place holder(resolved in the 2ndary tab.)
392    cRangeDevanagari,        //u090x
393    cRangeBengali,           //u098x
394    cRangeGurmukhi,          //u0a0x
395    cRangeGujarati,          //u0a8x
396    cRangeOriya,             //u0b0x
397    cRangeTamil,             //u0b8x
398    cRangeTelugu,            //u0c0x
399    cRangeKannada,           //u0c8x
400    cRangeMalayalam,         //u0d0x
401    cRangeSinhala,           //u0d8x
402    cRangeThai,              //u0e0x
403    cRangeLao,               //u0e8x
404    cRangeTibetan,           //u0f0x  place holder(resolved in the 2ndary tab.)
405    cRangeTibetan,           //u0f8x  place holder(resolved in the 2ndary tab.)
406    cRangeMyanmar,           //u100x
407    cRangeGeorgian,          //u108x
408    cRangeKorean,            //u110x  place holder(resolved in the 2ndary tab.)
409    cRangeKorean,            //u118x  place holder(resolved in the 2ndary tab.)
410    cRangeEthiopic,          //u120x  place holder(resolved in the 2ndary tab.)
411    cRangeEthiopic,          //u128x  place holder(resolved in the 2ndary tab.)
412    cRangeEthiopic,          //u130x
413    cRangeCherokee,          //u138x
414    cRangeCanadian,          //u140x  place holder(resolved in the 2ndary tab.)
415    cRangeCanadian,          //u148x  place holder(resolved in the 2ndary tab.)
416    cRangeCanadian,          //u150x  place holder(resolved in the 2ndary tab.)
417    cRangeCanadian,          //u158x  place holder(resolved in the 2ndary tab.)
418    cRangeCanadian,          //u160x
419    cRangeOghamRunic,        //u168x  this contains two scripts, Ogham & Runic
420};
421
422// A two level index is almost enough for locating a range, with the
423// exception of u03xx and u05xx. Since we don't really care about range for
424// combining diacritical marks in our font application, they are
425// not discriminated further.  Future adoption of this method for other use
426// should be aware of this limitation. The implementation can be extended if
427// there is such a need.
428// For Indic, Southeast Asian scripts and some other scripts between
429// U+0700 and U+16FF, it's extended to the third level.
430unsigned int findCharUnicodeRange(UChar32 ch)
431{
432    if (ch >= 0xFFFF)
433        return 0;
434
435    unsigned int range;
436
437    //search the first table
438    range = gUnicodeSubrangeTable[0][ch >> 12];
439
440    if (range < cRangeTableBase)
441        // we try to get a specific range
442        return range;
443
444    // otherwise, we have one more table to look at
445    range = gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x0f00) >> 8];
446    if (range < cRangeTableBase)
447        return range;
448    if (range < cRangeTertiaryTable)
449        return gUnicodeSubrangeTable[range - cRangeTableBase][(ch & 0x00f0) >> 4];
450
451    // Yet another table to look at : U+0700 - U+16FF : 128 code point blocks
452    return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
453}
454
455const char* langGroupFromUnicodeRange(unsigned char unicodeRange)
456{
457    if (cRangeSpecificItemNum > unicodeRange)
458        return gUnicodeRangeToLangGroupTable[unicodeRange];
459    return 0;
460}
461
462}
463