1/*
2*******************************************************************************
3*
4*   Copyright (C) 1999-2010, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  uinvchar.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:2
12*
13*   created on: 2004sep14
14*   created by: Markus W. Scherer
15*
16*   Functions for handling invariant characters, moved here from putil.c
17*   for better modularization.
18*/
19
20#include "unicode/utypes.h"
21#include "unicode/ustring.h"
22#include "udataswp.h"
23#include "cstring.h"
24#include "cmemory.h"
25#include "uassert.h"
26#include "uinvchar.h"
27
28/* invariant-character handling --------------------------------------------- */
29
30/*
31 * These maps for ASCII to/from EBCDIC map invariant characters (see utypes.h)
32 * appropriately for most EBCDIC codepages.
33 *
34 * They currently also map most other ASCII graphic characters,
35 * appropriately for codepages 37 and 1047.
36 * Exceptions: The characters for []^ have different codes in 37 & 1047.
37 * Both versions are mapped to ASCII.
38 *
39 *    ASCII 37 1047
40 * [     5B BA   AD
41 * ]     5D BB   BD
42 * ^     5E B0   5F
43 *
44 * There are no mappings for variant characters from Unicode to EBCDIC.
45 *
46 * Currently, C0 control codes are also included in these maps.
47 * Exceptions: S/390 Open Edition swaps LF and NEL codes compared with other
48 * EBCDIC platforms; both codes (15 and 25) are mapped to ASCII LF (0A),
49 * but there is no mapping for ASCII LF back to EBCDIC.
50 *
51 *    ASCII EBCDIC S/390-OE
52 * LF    0A     25       15
53 * NEL   85     15       25
54 *
55 * The maps below explicitly exclude the variant
56 * control and graphical characters that are in ASCII-based
57 * codepages at 0x80 and above.
58 * "No mapping" is expressed by mapping to a 00 byte.
59 *
60 * These tables do not establish a converter or a codepage.
61 */
62
/*
 * EBCDIC-to-ASCII byte map, indexed by the EBCDIC code.
 * An entry of 0x00 at any index other than 0 means "no mapping".
 * Each row holds 16 consecutive codes; the covered index range is noted
 * at the end of the row.
 */
static const uint8_t asciiFromEbcdic[256]={
    0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 00..0f */
    0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f, /* 10..1f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07, /* 20..2f */
    0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a, /* 30..3f */

    0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c, /* 40..4f */
    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e, /* 50..5f */
    0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f, /* 60..6f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22, /* 70..7f */

    0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 80..8f */
    0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 90..9f */
    0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, /* a0..af */
    0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00, /* b0..bf */

    0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* c0..cf */
    0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* d0..df */
    0x5c, 0x00, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* e0..ef */
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* f0..ff */
};
84
/*
 * ASCII-to-EBCDIC byte map, indexed by the ASCII code.
 * An entry of 0x00 at any index other than 0 means "no mapping".
 * Only indexes 00..7f carry mappings; every entry for 80..ff is 0x00.
 * Each row holds 16 consecutive codes; the covered index range is noted
 * at the end of the row.
 */
static const uint8_t ebcdicFromAscii[256]={
    0x00, 0x01, 0x02, 0x03, 0x37, 0x2d, 0x2e, 0x2f, 0x16, 0x05, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 00..0f */
    0x10, 0x11, 0x12, 0x13, 0x3c, 0x3d, 0x32, 0x26, 0x18, 0x19, 0x3f, 0x27, 0x1c, 0x1d, 0x1e, 0x1f, /* 10..1f */
    0x40, 0x00, 0x7f, 0x00, 0x00, 0x6c, 0x50, 0x7d, 0x4d, 0x5d, 0x5c, 0x4e, 0x6b, 0x60, 0x4b, 0x61, /* 20..2f */
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0x7a, 0x5e, 0x4c, 0x7e, 0x6e, 0x6f, /* 30..3f */

    0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, /* 40..4f */
    0xd7, 0xd8, 0xd9, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0x00, 0x00, 0x00, 0x00, 0x6d, /* 50..5f */
    0x00, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, /* 60..6f */
    0x97, 0x98, 0x99, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0x00, 0x00, 0x00, 0x00, 0x07, /* 70..7f */

    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 80..8f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 90..9f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* a0..af */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* b0..bf */

    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* c0..cf */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* d0..df */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* e0..ef */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* f0..ff */
};
106
107/*
108 * Bit sets indicating which characters of the ASCII repertoire
109 * (by ASCII/Unicode code) are "invariant".
110 * See utypes.h for more details.
111 *
 * All characters of the ASCII repertoire are considered invariant except
 * for the following:
114 * 21  '!' <exclamation mark>
115 * 23  '#' <number sign>
116 * 24  '$' <dollar sign>
117 *
118 * 40  '@' <commercial at>
119 *
120 * 5b  '[' <left bracket>
121 * 5c  '\' <backslash>
122 * 5d  ']' <right bracket>
123 * 5e  '^' <circumflex>
124 *
125 * 60  '`' <grave accent>
126 *
127 * 7b  '{' <left brace>
128 * 7c  '|' <vertical line>
129 * 7d  '}' <right brace>
130 * 7e  '~' <tilde>
131 */
/*
 * One 32-bit word per block of 32 ASCII codes; bit (c & 0x1f) of word (c >> 5)
 * is set when code c is invariant.
 */
static const uint32_t invariantChars[4]={
    0xfffffbff, /* codes 00..1f: all set except 0a */
    0xffffffe5, /* codes 20..3f: all set except 21, 23 and 24 */
    0x87fffffe, /* codes 40..5f: all set except 40 and 5b..5e */
    0x87fffffe  /* codes 60..7f: all set except 60 and 7b..7e */
};
138
/*
 * Invariant-character test for unsigned values (or values known to be
 * non-negative) that hold ASCII-family character codes.
 * Codes above 0x7f are never invariant; otherwise the result is bit
 * (c & 0x1f) of invariantChars[c >> 5].
 * Note: the argument is evaluated more than once.
 */
#define UCHAR_IS_INVARIANT(c) ((c)<=0x7f && ((invariantChars[(c)>>5]>>((c)&0x1f))&1)!=0)

/* Same test for signed values: negative values are rejected before the table lookup. */
#define SCHAR_IS_INVARIANT(c) ((c)>=0 && UCHAR_IS_INVARIANT(c))
147
/*
 * Single-code conversion between the platform charset and ASCII/Unicode.
 * On ASCII-family platforms both macros yield their argument unchanged;
 * on EBCDIC-family platforms they look the code up in asciiFromEbcdic /
 * ebcdicFromAscii. The argument and the whole expansion are parenthesized
 * so that the macros are safe inside larger expressions.
 */
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
#define CHAR_TO_UCHAR(c) (c)
#define UCHAR_TO_CHAR(c) (c)
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
#define CHAR_TO_UCHAR(u) (asciiFromEbcdic[(u)])
#define UCHAR_TO_CHAR(u) (ebcdicFromAscii[(u)])
#else
#   error U_CHARSET_FAMILY is not valid
#endif
157
158
159U_CAPI void U_EXPORT2
160u_charsToUChars(const char *cs, UChar *us, int32_t length) {
161    UChar u;
162    uint8_t c;
163
164    /*
165     * Allow the entire ASCII repertoire to be mapped _to_ Unicode.
166     * For EBCDIC systems, this works for characters with codes from
167     * codepages 37 and 1047 or compatible.
168     */
169    while(length>0) {
170        c=(uint8_t)(*cs++);
171        u=(UChar)CHAR_TO_UCHAR(c);
172        U_ASSERT((u!=0 || c==0)); /* only invariant chars converted? */
173        *us++=u;
174        --length;
175    }
176}
177
178U_CAPI void U_EXPORT2
179u_UCharsToChars(const UChar *us, char *cs, int32_t length) {
180    UChar u;
181
182    while(length>0) {
183        u=*us++;
184        if(!UCHAR_IS_INVARIANT(u)) {
185            U_ASSERT(FALSE); /* Variant characters were used. These are not portable in ICU. */
186            u=0;
187        }
188        *cs++=(char)UCHAR_TO_CHAR(u);
189        --length;
190    }
191}
192
193U_CAPI UBool U_EXPORT2
194uprv_isInvariantString(const char *s, int32_t length) {
195    uint8_t c;
196
197    for(;;) {
198        if(length<0) {
199            /* NUL-terminated */
200            c=(uint8_t)*s++;
201            if(c==0) {
202                break;
203            }
204        } else {
205            /* count length */
206            if(length==0) {
207                break;
208            }
209            --length;
210            c=(uint8_t)*s++;
211            if(c==0) {
212                continue; /* NUL is invariant */
213            }
214        }
215        /* c!=0 now, one branch below checks c==0 for variant characters */
216
217        /*
218         * no assertions here because these functions are legitimately called
219         * for strings with variant characters
220         */
221#if U_CHARSET_FAMILY==U_ASCII_FAMILY
222        if(!UCHAR_IS_INVARIANT(c)) {
223            return FALSE; /* found a variant char */
224        }
225#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
226        c=CHAR_TO_UCHAR(c);
227        if(c==0 || !UCHAR_IS_INVARIANT(c)) {
228            return FALSE; /* found a variant char */
229        }
230#else
231#   error U_CHARSET_FAMILY is not valid
232#endif
233    }
234    return TRUE;
235}
236
237U_CAPI UBool U_EXPORT2
238uprv_isInvariantUString(const UChar *s, int32_t length) {
239    UChar c;
240
241    for(;;) {
242        if(length<0) {
243            /* NUL-terminated */
244            c=*s++;
245            if(c==0) {
246                break;
247            }
248        } else {
249            /* count length */
250            if(length==0) {
251                break;
252            }
253            --length;
254            c=*s++;
255        }
256
257        /*
258         * no assertions here because these functions are legitimately called
259         * for strings with variant characters
260         */
261        if(!UCHAR_IS_INVARIANT(c)) {
262            return FALSE; /* found a variant char */
263        }
264    }
265    return TRUE;
266}
267
268/* UDataSwapFn implementations used in udataswp.c ------- */
269
270/* convert ASCII to EBCDIC and verify that all characters are invariant */
271U_CAPI int32_t U_EXPORT2
272uprv_ebcdicFromAscii(const UDataSwapper *ds,
273                     const void *inData, int32_t length, void *outData,
274                     UErrorCode *pErrorCode) {
275    const uint8_t *s;
276    uint8_t *t;
277    uint8_t c;
278
279    int32_t count;
280
281    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
282        return 0;
283    }
284    if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
285        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
286        return 0;
287    }
288
289    /* setup and swapping */
290    s=(const uint8_t *)inData;
291    t=(uint8_t *)outData;
292    count=length;
293    while(count>0) {
294        c=*s++;
295        if(!UCHAR_IS_INVARIANT(c)) {
296            udata_printError(ds, "uprv_ebcdicFromAscii() string[%d] contains a variant character in position %d\n",
297                             length, length-count);
298            *pErrorCode=U_INVALID_CHAR_FOUND;
299            return 0;
300        }
301        *t++=ebcdicFromAscii[c];
302        --count;
303    }
304
305    return length;
306}
307
308/* this function only checks and copies ASCII strings without conversion */
309U_CFUNC int32_t
310uprv_copyAscii(const UDataSwapper *ds,
311               const void *inData, int32_t length, void *outData,
312               UErrorCode *pErrorCode) {
313    const uint8_t *s;
314    uint8_t c;
315
316    int32_t count;
317
318    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
319        return 0;
320    }
321    if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
322        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
323        return 0;
324    }
325
326    /* setup and checking */
327    s=(const uint8_t *)inData;
328    count=length;
329    while(count>0) {
330        c=*s++;
331        if(!UCHAR_IS_INVARIANT(c)) {
332            udata_printError(ds, "uprv_copyFromAscii() string[%d] contains a variant character in position %d\n",
333                             length, length-count);
334            *pErrorCode=U_INVALID_CHAR_FOUND;
335            return 0;
336        }
337        --count;
338    }
339
340    if(length>0 && inData!=outData) {
341        uprv_memcpy(outData, inData, length);
342    }
343
344    return length;
345}
346
347/* convert EBCDIC to ASCII and verify that all characters are invariant */
348U_CFUNC int32_t
349uprv_asciiFromEbcdic(const UDataSwapper *ds,
350                     const void *inData, int32_t length, void *outData,
351                     UErrorCode *pErrorCode) {
352    const uint8_t *s;
353    uint8_t *t;
354    uint8_t c;
355
356    int32_t count;
357
358    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
359        return 0;
360    }
361    if(ds==NULL || inData==NULL || length<0 ||  (length>0 && outData==NULL)) {
362        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
363        return 0;
364    }
365
366    /* setup and swapping */
367    s=(const uint8_t *)inData;
368    t=(uint8_t *)outData;
369    count=length;
370    while(count>0) {
371        c=*s++;
372        if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) {
373            udata_printError(ds, "uprv_asciiFromEbcdic() string[%d] contains a variant character in position %d\n",
374                             length, length-count);
375            *pErrorCode=U_INVALID_CHAR_FOUND;
376            return 0;
377        }
378        *t++=c;
379        --count;
380    }
381
382    return length;
383}
384
385/* this function only checks and copies EBCDIC strings without conversion */
386U_CFUNC int32_t
387uprv_copyEbcdic(const UDataSwapper *ds,
388                const void *inData, int32_t length, void *outData,
389                UErrorCode *pErrorCode) {
390    const uint8_t *s;
391    uint8_t c;
392
393    int32_t count;
394
395    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
396        return 0;
397    }
398    if(ds==NULL || inData==NULL || length<0 || (length>0 && outData==NULL)) {
399        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
400        return 0;
401    }
402
403    /* setup and checking */
404    s=(const uint8_t *)inData;
405    count=length;
406    while(count>0) {
407        c=*s++;
408        if(c!=0 && ((c=asciiFromEbcdic[c])==0 || !UCHAR_IS_INVARIANT(c))) {
409            udata_printError(ds, "uprv_copyEbcdic() string[%] contains a variant character in position %d\n",
410                             length, length-count);
411            *pErrorCode=U_INVALID_CHAR_FOUND;
412            return 0;
413        }
414        --count;
415    }
416
417    if(length>0 && inData!=outData) {
418        uprv_memcpy(outData, inData, length);
419    }
420
421    return length;
422}
423
424/* compare invariant strings; variant characters compare less than others and unlike each other */
425U_CFUNC int32_t
426uprv_compareInvAscii(const UDataSwapper *ds,
427                     const char *outString, int32_t outLength,
428                     const UChar *localString, int32_t localLength) {
429    int32_t minLength;
430    UChar32 c1, c2;
431    uint8_t c;
432
433    if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
434        return 0;
435    }
436
437    if(outLength<0) {
438        outLength=(int32_t)uprv_strlen(outString);
439    }
440    if(localLength<0) {
441        localLength=u_strlen(localString);
442    }
443
444    minLength= outLength<localLength ? outLength : localLength;
445
446    while(minLength>0) {
447        c=(uint8_t)*outString++;
448        if(UCHAR_IS_INVARIANT(c)) {
449            c1=c;
450        } else {
451            c1=-1;
452        }
453
454        c2=*localString++;
455        if(!UCHAR_IS_INVARIANT(c2)) {
456            c2=-2;
457        }
458
459        if((c1-=c2)!=0) {
460            return c1;
461        }
462
463        --minLength;
464    }
465
466    /* strings start with same prefix, compare lengths */
467    return outLength-localLength;
468}
469
470U_CFUNC int32_t
471uprv_compareInvEbcdic(const UDataSwapper *ds,
472                      const char *outString, int32_t outLength,
473                      const UChar *localString, int32_t localLength) {
474    int32_t minLength;
475    UChar32 c1, c2;
476    uint8_t c;
477
478    if(outString==NULL || outLength<-1 || localString==NULL || localLength<-1) {
479        return 0;
480    }
481
482    if(outLength<0) {
483        outLength=(int32_t)uprv_strlen(outString);
484    }
485    if(localLength<0) {
486        localLength=u_strlen(localString);
487    }
488
489    minLength= outLength<localLength ? outLength : localLength;
490
491    while(minLength>0) {
492        c=(uint8_t)*outString++;
493        if(c==0) {
494            c1=0;
495        } else if((c1=asciiFromEbcdic[c])!=0 && UCHAR_IS_INVARIANT(c1)) {
496            /* c1 is set */
497        } else {
498            c1=-1;
499        }
500
501        c2=*localString++;
502        if(!UCHAR_IS_INVARIANT(c2)) {
503            c2=-2;
504        }
505
506        if((c1-=c2)!=0) {
507            return c1;
508        }
509
510        --minLength;
511    }
512
513    /* strings start with same prefix, compare lengths */
514    return outLength-localLength;
515}
516
517U_CAPI int32_t U_EXPORT2
518uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2) {
519    int32_t c1, c2;
520
521    for(;; ++s1, ++s2) {
522        c1=(uint8_t)*s1;
523        c2=(uint8_t)*s2;
524        if(c1!=c2) {
525            if(c1!=0 && ((c1=asciiFromEbcdic[c1])==0 || !UCHAR_IS_INVARIANT(c1))) {
526                c1=-(int32_t)(uint8_t)*s1;
527            }
528            if(c2!=0 && ((c2=asciiFromEbcdic[c2])==0 || !UCHAR_IS_INVARIANT(c2))) {
529                c2=-(int32_t)(uint8_t)*s2;
530            }
531            return c1-c2;
532        } else if(c1==0) {
533            return 0;
534        }
535    }
536}
537
538
539U_INTERNAL uint8_t* U_EXPORT2
540uprv_aestrncpy(uint8_t *dst, const uint8_t *src, int32_t n)
541{
542  uint8_t *orig_dst = dst;
543
544  if(n==-1) {
545    n = uprv_strlen((const char*)src)+1; /* copy NUL */
546  }
547  /* copy non-null */
548  while(*src && n>0) {
549    *(dst++) = asciiFromEbcdic[*(src++)];
550    n--;
551  }
552  /* pad */
553  while(n>0) {
554    *(dst++) = 0;
555    n--;
556  }
557  return orig_dst;
558}
559
560U_INTERNAL uint8_t* U_EXPORT2
561uprv_eastrncpy(uint8_t *dst, const uint8_t *src, int32_t n)
562{
563  uint8_t *orig_dst = dst;
564
565  if(n==-1) {
566    n = uprv_strlen((const char*)src)+1; /* copy NUL */
567  }
568  /* copy non-null */
569  while(*src && n>0) {
570    char ch = ebcdicFromAscii[*(src++)];
571    if(ch == 0) {
572      ch = ebcdicFromAscii[0x3f]; /* questionmark (subchar) */
573    }
574    *(dst++) = ch;
575    n--;
576  }
577  /* pad */
578  while(n>0) {
579    *(dst++) = 0;
580    n--;
581  }
582  return orig_dst;
583}
584
585