1/*
2******************************************************************************
3*
4*   Copyright (C) 2000-2007, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  ubidiwrt.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 1999aug06
14*   created by: Markus W. Scherer, updated by Matitiahu Allouche
15*
16* This file contains implementations for BiDi functions that use
17* the core algorithm and core API to write reordered text.
18*/
19
20/* set import/export definitions */
21#ifndef U_COMMON_IMPLEMENTATION
22#   define U_COMMON_IMPLEMENTATION
23#endif
24
25#include "unicode/utypes.h"
26#include "unicode/ustring.h"
27#include "unicode/uchar.h"
28#include "unicode/ubidi.h"
29#include "cmemory.h"
30#include "ustr_imp.h"
31#include "ubidiimp.h"
32
33/*
34 * The function implementations in this file are designed
35 * for UTF-16 and UTF-32, not for UTF-8.
36 *
37 * Assumptions that are not true for UTF-8:
38 * - Any code point always needs the same number of code units
39 *   ("minimum-length-problem" of UTF-8)
40 * - The BiDi control characters need only one code unit each
41 *
42 * Further assumptions for all UTFs:
43 * - u_charMirror(c) needs the same number of code units as c
44 */
45#if UTF_SIZE==8
46# error reimplement ubidi_writeReordered() for UTF-8, see comment above
47#endif
48
/*
 * Nonzero if the general category value `type` (this file passes the result
 * of u_charType()) is one of the three mark categories:
 * non-spacing, combining spacing, or enclosing.
 * The result is the matching bit of the mask, not a normalized 0/1.
 */
#define IS_COMBINING(type) \
    ((1UL<<(type))& \
     ((1UL<<U_NON_SPACING_MARK)| \
      (1UL<<U_COMBINING_SPACING_MARK)| \
      (1UL<<U_ENCLOSING_MARK)))
50
51/*
52 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
53 * semantically write RTL runs in reverse and later reverse them again.
54 * Instead, we actually write them in forward order to begin with.
55 * However, if the RTL run was to be mirrored, we need to mirror here now
56 * since the implicit second reversal must not do it.
57 * It looks strange to do mirroring in LTR output, but it is only because
58 * we are writing RTL output in reverse.
59 */
60static int32_t
61doWriteForward(const UChar *src, int32_t srcLength,
62               UChar *dest, int32_t destSize,
63               uint16_t options,
64               UErrorCode *pErrorCode) {
65    /* optimize for several combinations of options */
66    switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
67    case 0: {
68        /* simply copy the LTR run to the destination */
69        int32_t length=srcLength;
70        if(destSize<length) {
71            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
72            return srcLength;
73        }
74        do {
75            *dest++=*src++;
76        } while(--length>0);
77        return srcLength;
78    }
79    case UBIDI_DO_MIRRORING: {
80        /* do mirroring */
81        int32_t i=0, j=0;
82        UChar32 c;
83
84        if(destSize<srcLength) {
85            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
86            return srcLength;
87        }
88        do {
89            UTF_NEXT_CHAR(src, i, srcLength, c);
90            c=u_charMirror(c);
91            UTF_APPEND_CHAR_UNSAFE(dest, j, c);
92        } while(i<srcLength);
93        return srcLength;
94    }
95    case UBIDI_REMOVE_BIDI_CONTROLS: {
96        /* copy the LTR run and remove any BiDi control characters */
97        int32_t remaining=destSize;
98        UChar c;
99        do {
100            c=*src++;
101            if(!IS_BIDI_CONTROL_CHAR(c)) {
102                if(--remaining<0) {
103                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
104
105                    /* preflight the length */
106                    while(--srcLength>0) {
107                        c=*src++;
108                        if(!IS_BIDI_CONTROL_CHAR(c)) {
109                            --remaining;
110                        }
111                    }
112                    return destSize-remaining;
113                }
114                *dest++=c;
115            }
116        } while(--srcLength>0);
117        return destSize-remaining;
118    }
119    default: {
120        /* remove BiDi control characters and do mirroring */
121        int32_t remaining=destSize;
122        int32_t i, j=0;
123        UChar32 c;
124        do {
125            i=0;
126            UTF_NEXT_CHAR(src, i, srcLength, c);
127            src+=i;
128            srcLength-=i;
129            if(!IS_BIDI_CONTROL_CHAR(c)) {
130                remaining-=i;
131                if(remaining<0) {
132                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
133
134                    /* preflight the length */
135                    while(srcLength>0) {
136                        c=*src++;
137                        if(!IS_BIDI_CONTROL_CHAR(c)) {
138                            --remaining;
139                        }
140                        --srcLength;
141                    }
142                    return destSize-remaining;
143                }
144                c=u_charMirror(c);
145                UTF_APPEND_CHAR_UNSAFE(dest, j, c);
146            }
147        } while(srcLength>0);
148        return j;
149    }
150    } /* end of switch */
151}
152
153static int32_t
154doWriteReverse(const UChar *src, int32_t srcLength,
155               UChar *dest, int32_t destSize,
156               uint16_t options,
157               UErrorCode *pErrorCode) {
158    /*
159     * RTL run -
160     *
161     * RTL runs need to be copied to the destination in reverse order
162     * of code points, not code units, to keep Unicode characters intact.
163     *
164     * The general strategy for this is to read the source text
165     * in backward order, collect all code units for a code point
166     * (and optionally following combining characters, see below),
167     * and copy all these code units in ascending order
168     * to the destination for this run.
169     *
170     * Several options request whether combining characters
171     * should be kept after their base characters,
172     * whether BiDi control characters should be removed, and
173     * whether characters should be replaced by their mirror-image
174     * equivalent Unicode characters.
175     */
176    int32_t i, j;
177    UChar32 c;
178
179    /* optimize for several combinations of options */
180    switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
181    case 0:
182        /*
183         * With none of the "complicated" options set, the destination
184         * run will have the same length as the source run,
185         * and there is no mirroring and no keeping combining characters
186         * with their base characters.
187         */
188        if(destSize<srcLength) {
189            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
190            return srcLength;
191        }
192        destSize=srcLength;
193
194        /* preserve character integrity */
195        do {
196            /* i is always after the last code unit known to need to be kept in this segment */
197            i=srcLength;
198
199            /* collect code units for one base character */
200            UTF_BACK_1(src, 0, srcLength);
201
202            /* copy this base character */
203            j=srcLength;
204            do {
205                *dest++=src[j++];
206            } while(j<i);
207        } while(srcLength>0);
208        break;
209    case UBIDI_KEEP_BASE_COMBINING:
210        /*
211         * Here, too, the destination
212         * run will have the same length as the source run,
213         * and there is no mirroring.
214         * We do need to keep combining characters with their base characters.
215         */
216        if(destSize<srcLength) {
217            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
218            return srcLength;
219        }
220        destSize=srcLength;
221
222        /* preserve character integrity */
223        do {
224            /* i is always after the last code unit known to need to be kept in this segment */
225            i=srcLength;
226
227            /* collect code units and modifier letters for one base character */
228            do {
229                UTF_PREV_CHAR(src, 0, srcLength, c);
230            } while(srcLength>0 && IS_COMBINING(u_charType(c)));
231
232            /* copy this "user character" */
233            j=srcLength;
234            do {
235                *dest++=src[j++];
236            } while(j<i);
237        } while(srcLength>0);
238        break;
239    default:
240        /*
241         * With several "complicated" options set, this is the most
242         * general and the slowest copying of an RTL run.
243         * We will do mirroring, remove BiDi controls, and
244         * keep combining characters with their base characters
245         * as requested.
246         */
247        if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
248            i=srcLength;
249        } else {
250            /* we need to find out the destination length of the run,
251               which will not include the BiDi control characters */
252            int32_t length=srcLength;
253            UChar ch;
254
255            i=0;
256            do {
257                ch=*src++;
258                if(!IS_BIDI_CONTROL_CHAR(ch)) {
259                    ++i;
260                }
261            } while(--length>0);
262            src-=srcLength;
263        }
264
265        if(destSize<i) {
266            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
267            return i;
268        }
269        destSize=i;
270
271        /* preserve character integrity */
272        do {
273            /* i is always after the last code unit known to need to be kept in this segment */
274            i=srcLength;
275
276            /* collect code units for one base character */
277            UTF_PREV_CHAR(src, 0, srcLength, c);
278            if(options&UBIDI_KEEP_BASE_COMBINING) {
279                /* collect modifier letters for this base character */
280                while(srcLength>0 && IS_COMBINING(u_charType(c))) {
281                    UTF_PREV_CHAR(src, 0, srcLength, c);
282                }
283            }
284
285            if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
286                /* do not copy this BiDi control character */
287                continue;
288            }
289
290            /* copy this "user character" */
291            j=srcLength;
292            if(options&UBIDI_DO_MIRRORING) {
293                /* mirror only the base character */
294                int32_t k=0;
295                c=u_charMirror(c);
296                UTF_APPEND_CHAR_UNSAFE(dest, k, c);
297                dest+=k;
298                j+=k;
299            }
300            while(j<i) {
301                *dest++=src[j++];
302            }
303        } while(srcLength>0);
304        break;
305    } /* end of switch */
306
307    return destSize;
308}
309
310U_CAPI int32_t U_EXPORT2
311ubidi_writeReverse(const UChar *src, int32_t srcLength,
312                   UChar *dest, int32_t destSize,
313                   uint16_t options,
314                   UErrorCode *pErrorCode) {
315    int32_t destLength;
316
317    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
318        return 0;
319    }
320
321    /* more error checking */
322    if( src==NULL || srcLength<-1 ||
323        destSize<0 || (destSize>0 && dest==NULL))
324    {
325        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
326        return 0;
327    }
328
329    /* do input and output overlap? */
330    if( dest!=NULL &&
331        ((src>=dest && src<dest+destSize) ||
332         (dest>=src && dest<src+srcLength)))
333    {
334        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
335        return 0;
336    }
337
338    if(srcLength==-1) {
339        srcLength=u_strlen(src);
340    }
341    if(srcLength>0) {
342        destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
343    } else {
344        /* nothing to do */
345        destLength=0;
346    }
347
348    return u_terminateUChars(dest, destSize, destLength, pErrorCode);
349}
350
351U_CAPI int32_t U_EXPORT2
352ubidi_writeReordered(UBiDi *pBiDi,
353                     UChar *dest, int32_t destSize,
354                     uint16_t options,
355                     UErrorCode *pErrorCode) {
356    const UChar *text;
357    UChar *saveDest;
358    int32_t length, destCapacity;
359    int32_t run, runCount, logicalStart, runLength;
360
361    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
362        return 0;
363    }
364
365    /* more error checking */
366    if( pBiDi==NULL ||
367        (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 ||
368        destSize<0 || (destSize>0 && dest==NULL))
369    {
370        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
371        return 0;
372    }
373
374    /* do input and output overlap? */
375    if( dest!=NULL &&
376        ((text>=dest && text<dest+destSize) ||
377         (dest>=text && dest<text+pBiDi->originalLength)))
378    {
379        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
380        return 0;
381    }
382
383    if(length==0) {
384        /* nothing to do */
385        return u_terminateUChars(dest, destSize, 0, pErrorCode);
386    }
387
388    runCount=ubidi_countRuns(pBiDi, pErrorCode);
389    if(U_FAILURE(*pErrorCode)) {
390        return 0;
391    }
392
393    /* destSize shrinks, later destination length=destCapacity-destSize */
394    saveDest=dest;
395    destCapacity=destSize;
396
397    /*
398     * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the
399     * reordering mode (checked below) is appropriate.
400     */
401    if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
402        options|=UBIDI_INSERT_LRM_FOR_NUMERIC;
403        options&=~UBIDI_REMOVE_BIDI_CONTROLS;
404    }
405    /*
406     * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS
407     * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC.
408     */
409    if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
410        options|=UBIDI_REMOVE_BIDI_CONTROLS;
411        options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
412    }
413    /*
414     * If we do not perform the "inverse BiDi" algorithm, then we
415     * don't need to insert any LRMs, and don't need to test for it.
416     */
417    if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) &&
418       (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT)  &&
419       (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
420       (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) {
421        options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
422    }
423    /*
424     * Iterate through all visual runs and copy the run text segments to
425     * the destination, according to the options.
426     *
427     * The tests for where to insert LRMs ignore the fact that there may be
428     * BN codes or non-BMP code points at the beginning and end of a run;
429     * they may insert LRMs unnecessarily but the tests are faster this way
430     * (this would have to be improved for UTF-8).
431     *
432     * Note that the only errors that are set by doWriteXY() are buffer overflow
433     * errors. Ignore them until the end, and continue for preflighting.
434     */
435    if(!(options&UBIDI_OUTPUT_REVERSE)) {
436        /* forward output */
437        if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
438            /* do not insert BiDi controls */
439            for(run=0; run<runCount; ++run) {
440                if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
441                    runLength=doWriteForward(text+logicalStart, runLength,
442                                             dest, destSize,
443                                             (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
444                } else {
445                    runLength=doWriteReverse(text+logicalStart, runLength,
446                                             dest, destSize,
447                                             options, pErrorCode);
448                }
449                dest+=runLength;
450                destSize-=runLength;
451            }
452        } else {
453            /* insert BiDi controls for "inverse BiDi" */
454            const DirProp *dirProps=pBiDi->dirProps;
455            const UChar *src;
456            UChar uc;
457            UBiDiDirection dir;
458            int32_t markFlag;
459
460            for(run=0; run<runCount; ++run) {
461                dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
462                src=text+logicalStart;
463                /* check if something relevant in insertPoints */
464                markFlag=pBiDi->runs[run].insertRemove;
465                if(markFlag<0) {        /* BiDi controls count */
466                    markFlag=0;
467                }
468
469                if(UBIDI_LTR==dir) {
470                    if((pBiDi->isInverse) &&
471                       (/*run>0 &&*/ dirProps[logicalStart]!=L)) {
472                        markFlag |= LRM_BEFORE;
473                    }
474                    if (markFlag & LRM_BEFORE) {
475                        uc=LRM_CHAR;
476                    }
477                    else if (markFlag & RLM_BEFORE) {
478                        uc=RLM_CHAR;
479                    }
480                    else  uc=0;
481                    if(uc) {
482                        if(destSize>0) {
483                            *dest++=uc;
484                        }
485                        --destSize;
486                    }
487
488                    runLength=doWriteForward(src, runLength,
489                                             dest, destSize,
490                                             (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
491                    dest+=runLength;
492                    destSize-=runLength;
493
494                    if((pBiDi->isInverse) &&
495                       (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) {
496                        markFlag |= LRM_AFTER;
497                    }
498                    if (markFlag & LRM_AFTER) {
499                        uc=LRM_CHAR;
500                    }
501                    else if (markFlag & RLM_AFTER) {
502                        uc=RLM_CHAR;
503                    }
504                    else  uc=0;
505                    if(uc) {
506                        if(destSize>0) {
507                            *dest++=uc;
508                        }
509                        --destSize;
510                    }
511                } else {                /* RTL run */
512                    if((pBiDi->isInverse) &&
513                       (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) {
514                        markFlag |= RLM_BEFORE;
515                    }
516                    if (markFlag & LRM_BEFORE) {
517                        uc=LRM_CHAR;
518                    }
519                    else if (markFlag & RLM_BEFORE) {
520                        uc=RLM_CHAR;
521                    }
522                    else  uc=0;
523                    if(uc) {
524                        if(destSize>0) {
525                            *dest++=uc;
526                        }
527                        --destSize;
528                    }
529
530                    runLength=doWriteReverse(src, runLength,
531                                             dest, destSize,
532                                             options, pErrorCode);
533                    dest+=runLength;
534                    destSize-=runLength;
535
536                    if((pBiDi->isInverse) &&
537                       (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) {
538                        markFlag |= RLM_AFTER;
539                    }
540                    if (markFlag & LRM_AFTER) {
541                        uc=LRM_CHAR;
542                    }
543                    else if (markFlag & RLM_AFTER) {
544                        uc=RLM_CHAR;
545                    }
546                    else  uc=0;
547                    if(uc) {
548                        if(destSize>0) {
549                            *dest++=uc;
550                        }
551                        --destSize;
552                    }
553                }
554            }
555        }
556    } else {
557        /* reverse output */
558        if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
559            /* do not insert BiDi controls */
560            for(run=runCount; --run>=0;) {
561                if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
562                    runLength=doWriteReverse(text+logicalStart, runLength,
563                                             dest, destSize,
564                                             (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
565                } else {
566                    runLength=doWriteForward(text+logicalStart, runLength,
567                                             dest, destSize,
568                                             options, pErrorCode);
569                }
570                dest+=runLength;
571                destSize-=runLength;
572            }
573        } else {
574            /* insert BiDi controls for "inverse BiDi" */
575            const DirProp *dirProps=pBiDi->dirProps;
576            const UChar *src;
577            UBiDiDirection dir;
578
579            for(run=runCount; --run>=0;) {
580                /* reverse output */
581                dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
582                src=text+logicalStart;
583
584                if(UBIDI_LTR==dir) {
585                    if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) {
586                        if(destSize>0) {
587                            *dest++=LRM_CHAR;
588                        }
589                        --destSize;
590                    }
591
592                    runLength=doWriteReverse(src, runLength,
593                                             dest, destSize,
594                                             (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
595                    dest+=runLength;
596                    destSize-=runLength;
597
598                    if(/*run>0 &&*/ dirProps[logicalStart]!=L) {
599                        if(destSize>0) {
600                            *dest++=LRM_CHAR;
601                        }
602                        --destSize;
603                    }
604                } else {
605                    if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) {
606                        if(destSize>0) {
607                            *dest++=RLM_CHAR;
608                        }
609                        --destSize;
610                    }
611
612                    runLength=doWriteForward(src, runLength,
613                                             dest, destSize,
614                                             options, pErrorCode);
615                    dest+=runLength;
616                    destSize-=runLength;
617
618                    if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) {
619                        if(destSize>0) {
620                            *dest++=RLM_CHAR;
621                        }
622                        --destSize;
623                    }
624                }
625            }
626        }
627    }
628
629    return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode);
630}
631