1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*
6*   Copyright (C) 2000-2015, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9******************************************************************************
10*   file name:  ubidiwrt.c
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 1999aug06
16*   created by: Markus W. Scherer, updated by Matitiahu Allouche
17*
18* This file contains implementations for BiDi functions that use
19* the core algorithm and core API to write reordered text.
20*/
21
22#include "unicode/utypes.h"
23#include "unicode/ustring.h"
24#include "unicode/uchar.h"
25#include "unicode/ubidi.h"
26#include "unicode/utf16.h"
27#include "cmemory.h"
28#include "ustr_imp.h"
29#include "ubidiimp.h"
30
31/*
32 * The function implementations in this file are designed
33 * for UTF-16 and UTF-32, not for UTF-8.
34 *
35 * Assumptions that are not true for UTF-8:
36 * - Any code point always needs the same number of code units
37 *   ("minimum-length-problem" of UTF-8)
38 * - The BiDi control characters need only one code unit each
39 *
40 * Further assumptions for all UTFs:
41 * - u_charMirror(c) needs the same number of code units as c
42 */
43#if UTF_SIZE==8
44# error reimplement ubidi_writeReordered() for UTF-8, see comment above
45#endif
46
47#define IS_COMBINING(type) ((1UL<<(type))&(1UL<<U_NON_SPACING_MARK|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_ENCLOSING_MARK))
48
49/*
50 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
51 * semantically write RTL runs in reverse and later reverse them again.
52 * Instead, we actually write them in forward order to begin with.
53 * However, if the RTL run was to be mirrored, we need to mirror here now
54 * since the implicit second reversal must not do it.
55 * It looks strange to do mirroring in LTR output, but it is only because
56 * we are writing RTL output in reverse.
57 */
58static int32_t
59doWriteForward(const UChar *src, int32_t srcLength,
60               UChar *dest, int32_t destSize,
61               uint16_t options,
62               UErrorCode *pErrorCode) {
63    /* optimize for several combinations of options */
64    switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
65    case 0: {
66        /* simply copy the LTR run to the destination */
67        int32_t length=srcLength;
68        if(destSize<length) {
69            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
70            return srcLength;
71        }
72        do {
73            *dest++=*src++;
74        } while(--length>0);
75        return srcLength;
76    }
77    case UBIDI_DO_MIRRORING: {
78        /* do mirroring */
79        int32_t i=0, j=0;
80        UChar32 c;
81
82        if(destSize<srcLength) {
83            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
84            return srcLength;
85        }
86        do {
87            U16_NEXT(src, i, srcLength, c);
88            c=u_charMirror(c);
89            U16_APPEND_UNSAFE(dest, j, c);
90        } while(i<srcLength);
91        return srcLength;
92    }
93    case UBIDI_REMOVE_BIDI_CONTROLS: {
94        /* copy the LTR run and remove any BiDi control characters */
95        int32_t remaining=destSize;
96        UChar c;
97        do {
98            c=*src++;
99            if(!IS_BIDI_CONTROL_CHAR(c)) {
100                if(--remaining<0) {
101                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
102
103                    /* preflight the length */
104                    while(--srcLength>0) {
105                        c=*src++;
106                        if(!IS_BIDI_CONTROL_CHAR(c)) {
107                            --remaining;
108                        }
109                    }
110                    return destSize-remaining;
111                }
112                *dest++=c;
113            }
114        } while(--srcLength>0);
115        return destSize-remaining;
116    }
117    default: {
118        /* remove BiDi control characters and do mirroring */
119        int32_t remaining=destSize;
120        int32_t i, j=0;
121        UChar32 c;
122        do {
123            i=0;
124            U16_NEXT(src, i, srcLength, c);
125            src+=i;
126            srcLength-=i;
127            if(!IS_BIDI_CONTROL_CHAR(c)) {
128                remaining-=i;
129                if(remaining<0) {
130                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
131
132                    /* preflight the length */
133                    while(srcLength>0) {
134                        c=*src++;
135                        if(!IS_BIDI_CONTROL_CHAR(c)) {
136                            --remaining;
137                        }
138                        --srcLength;
139                    }
140                    return destSize-remaining;
141                }
142                c=u_charMirror(c);
143                U16_APPEND_UNSAFE(dest, j, c);
144            }
145        } while(srcLength>0);
146        return j;
147    }
148    } /* end of switch */
149}
150
151static int32_t
152doWriteReverse(const UChar *src, int32_t srcLength,
153               UChar *dest, int32_t destSize,
154               uint16_t options,
155               UErrorCode *pErrorCode) {
156    /*
157     * RTL run -
158     *
159     * RTL runs need to be copied to the destination in reverse order
160     * of code points, not code units, to keep Unicode characters intact.
161     *
162     * The general strategy for this is to read the source text
163     * in backward order, collect all code units for a code point
164     * (and optionally following combining characters, see below),
165     * and copy all these code units in ascending order
166     * to the destination for this run.
167     *
168     * Several options request whether combining characters
169     * should be kept after their base characters,
170     * whether BiDi control characters should be removed, and
171     * whether characters should be replaced by their mirror-image
172     * equivalent Unicode characters.
173     */
174    int32_t i, j;
175    UChar32 c;
176
177    /* optimize for several combinations of options */
178    switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
179    case 0:
180        /*
181         * With none of the "complicated" options set, the destination
182         * run will have the same length as the source run,
183         * and there is no mirroring and no keeping combining characters
184         * with their base characters.
185         */
186        if(destSize<srcLength) {
187            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
188            return srcLength;
189        }
190        destSize=srcLength;
191
192        /* preserve character integrity */
193        do {
194            /* i is always after the last code unit known to need to be kept in this segment */
195            i=srcLength;
196
197            /* collect code units for one base character */
198            U16_BACK_1(src, 0, srcLength);
199
200            /* copy this base character */
201            j=srcLength;
202            do {
203                *dest++=src[j++];
204            } while(j<i);
205        } while(srcLength>0);
206        break;
207    case UBIDI_KEEP_BASE_COMBINING:
208        /*
209         * Here, too, the destination
210         * run will have the same length as the source run,
211         * and there is no mirroring.
212         * We do need to keep combining characters with their base characters.
213         */
214        if(destSize<srcLength) {
215            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
216            return srcLength;
217        }
218        destSize=srcLength;
219
220        /* preserve character integrity */
221        do {
222            /* i is always after the last code unit known to need to be kept in this segment */
223            i=srcLength;
224
225            /* collect code units and modifier letters for one base character */
226            do {
227                U16_PREV(src, 0, srcLength, c);
228            } while(srcLength>0 && IS_COMBINING(u_charType(c)));
229
230            /* copy this "user character" */
231            j=srcLength;
232            do {
233                *dest++=src[j++];
234            } while(j<i);
235        } while(srcLength>0);
236        break;
237    default:
238        /*
239         * With several "complicated" options set, this is the most
240         * general and the slowest copying of an RTL run.
241         * We will do mirroring, remove BiDi controls, and
242         * keep combining characters with their base characters
243         * as requested.
244         */
245        if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
246            i=srcLength;
247        } else {
248            /* we need to find out the destination length of the run,
249               which will not include the BiDi control characters */
250            int32_t length=srcLength;
251            UChar ch;
252
253            i=0;
254            do {
255                ch=*src++;
256                if(!IS_BIDI_CONTROL_CHAR(ch)) {
257                    ++i;
258                }
259            } while(--length>0);
260            src-=srcLength;
261        }
262
263        if(destSize<i) {
264            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
265            return i;
266        }
267        destSize=i;
268
269        /* preserve character integrity */
270        do {
271            /* i is always after the last code unit known to need to be kept in this segment */
272            i=srcLength;
273
274            /* collect code units for one base character */
275            U16_PREV(src, 0, srcLength, c);
276            if(options&UBIDI_KEEP_BASE_COMBINING) {
277                /* collect modifier letters for this base character */
278                while(srcLength>0 && IS_COMBINING(u_charType(c))) {
279                    U16_PREV(src, 0, srcLength, c);
280                }
281            }
282
283            if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
284                /* do not copy this BiDi control character */
285                continue;
286            }
287
288            /* copy this "user character" */
289            j=srcLength;
290            if(options&UBIDI_DO_MIRRORING) {
291                /* mirror only the base character */
292                int32_t k=0;
293                c=u_charMirror(c);
294                U16_APPEND_UNSAFE(dest, k, c);
295                dest+=k;
296                j+=k;
297            }
298            while(j<i) {
299                *dest++=src[j++];
300            }
301        } while(srcLength>0);
302        break;
303    } /* end of switch */
304
305    return destSize;
306}
307
308U_CAPI int32_t U_EXPORT2
309ubidi_writeReverse(const UChar *src, int32_t srcLength,
310                   UChar *dest, int32_t destSize,
311                   uint16_t options,
312                   UErrorCode *pErrorCode) {
313    int32_t destLength;
314
315    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
316        return 0;
317    }
318
319    /* more error checking */
320    if( src==NULL || srcLength<-1 ||
321        destSize<0 || (destSize>0 && dest==NULL))
322    {
323        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
324        return 0;
325    }
326
327    /* do input and output overlap? */
328    if( dest!=NULL &&
329        ((src>=dest && src<dest+destSize) ||
330         (dest>=src && dest<src+srcLength)))
331    {
332        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
333        return 0;
334    }
335
336    if(srcLength==-1) {
337        srcLength=u_strlen(src);
338    }
339    if(srcLength>0) {
340        destLength=doWriteReverse(src, srcLength, dest, destSize, options, pErrorCode);
341    } else {
342        /* nothing to do */
343        destLength=0;
344    }
345
346    return u_terminateUChars(dest, destSize, destLength, pErrorCode);
347}
348
349U_CAPI int32_t U_EXPORT2
350ubidi_writeReordered(UBiDi *pBiDi,
351                     UChar *dest, int32_t destSize,
352                     uint16_t options,
353                     UErrorCode *pErrorCode) {
354    const UChar *text;
355    UChar *saveDest;
356    int32_t length, destCapacity;
357    int32_t run, runCount, logicalStart, runLength;
358
359    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
360        return 0;
361    }
362
363    /* more error checking */
364    if( pBiDi==NULL ||
365        (text=pBiDi->text)==NULL || (length=pBiDi->length)<0 ||
366        destSize<0 || (destSize>0 && dest==NULL))
367    {
368        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
369        return 0;
370    }
371
372    /* do input and output overlap? */
373    if( dest!=NULL &&
374        ((text>=dest && text<dest+destSize) ||
375         (dest>=text && dest<text+pBiDi->originalLength)))
376    {
377        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
378        return 0;
379    }
380
381    if(length==0) {
382        /* nothing to do */
383        return u_terminateUChars(dest, destSize, 0, pErrorCode);
384    }
385
386    runCount=ubidi_countRuns(pBiDi, pErrorCode);
387    if(U_FAILURE(*pErrorCode)) {
388        return 0;
389    }
390
391    /* destSize shrinks, later destination length=destCapacity-destSize */
392    saveDest=dest;
393    destCapacity=destSize;
394
395    /*
396     * Option "insert marks" implies UBIDI_INSERT_LRM_FOR_NUMERIC if the
397     * reordering mode (checked below) is appropriate.
398     */
399    if(pBiDi->reorderingOptions & UBIDI_OPTION_INSERT_MARKS) {
400        options|=UBIDI_INSERT_LRM_FOR_NUMERIC;
401        options&=~UBIDI_REMOVE_BIDI_CONTROLS;
402    }
403    /*
404     * Option "remove controls" implies UBIDI_REMOVE_BIDI_CONTROLS
405     * and cancels UBIDI_INSERT_LRM_FOR_NUMERIC.
406     */
407    if(pBiDi->reorderingOptions & UBIDI_OPTION_REMOVE_CONTROLS) {
408        options|=UBIDI_REMOVE_BIDI_CONTROLS;
409        options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
410    }
411    /*
412     * If we do not perform the "inverse BiDi" algorithm, then we
413     * don't need to insert any LRMs, and don't need to test for it.
414     */
415    if((pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_NUMBERS_AS_L) &&
416       (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_LIKE_DIRECT)  &&
417       (pBiDi->reorderingMode != UBIDI_REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
418       (pBiDi->reorderingMode != UBIDI_REORDER_RUNS_ONLY)) {
419        options&=~UBIDI_INSERT_LRM_FOR_NUMERIC;
420    }
421    /*
422     * Iterate through all visual runs and copy the run text segments to
423     * the destination, according to the options.
424     *
425     * The tests for where to insert LRMs ignore the fact that there may be
426     * BN codes or non-BMP code points at the beginning and end of a run;
427     * they may insert LRMs unnecessarily but the tests are faster this way
428     * (this would have to be improved for UTF-8).
429     *
430     * Note that the only errors that are set by doWriteXY() are buffer overflow
431     * errors. Ignore them until the end, and continue for preflighting.
432     */
433    if(!(options&UBIDI_OUTPUT_REVERSE)) {
434        /* forward output */
435        if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
436            /* do not insert BiDi controls */
437            for(run=0; run<runCount; ++run) {
438                if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
439                    runLength=doWriteForward(text+logicalStart, runLength,
440                                             dest, destSize,
441                                             (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
442                } else {
443                    runLength=doWriteReverse(text+logicalStart, runLength,
444                                             dest, destSize,
445                                             options, pErrorCode);
446                }
447                if(dest!=NULL) {
448                  dest+=runLength;
449                }
450                destSize-=runLength;
451            }
452        } else {
453            /* insert BiDi controls for "inverse BiDi" */
454            const DirProp *dirProps=pBiDi->dirProps;
455            const UChar *src;
456            UChar uc;
457            UBiDiDirection dir;
458            int32_t markFlag;
459
460            for(run=0; run<runCount; ++run) {
461                dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
462                src=text+logicalStart;
463                /* check if something relevant in insertPoints */
464                markFlag=pBiDi->runs[run].insertRemove;
465                if(markFlag<0) {        /* BiDi controls count */
466                    markFlag=0;
467                }
468
469                if(UBIDI_LTR==dir) {
470                    if((pBiDi->isInverse) &&
471                       (/*run>0 &&*/ dirProps[logicalStart]!=L)) {
472                        markFlag |= LRM_BEFORE;
473                    }
474                    if (markFlag & LRM_BEFORE) {
475                        uc=LRM_CHAR;
476                    }
477                    else if (markFlag & RLM_BEFORE) {
478                        uc=RLM_CHAR;
479                    }
480                    else  uc=0;
481                    if(uc) {
482                        if(destSize>0) {
483                            *dest++=uc;
484                        }
485                        --destSize;
486                    }
487
488                    runLength=doWriteForward(src, runLength,
489                                             dest, destSize,
490                                             (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
491                    if(dest!=NULL) {
492                      dest+=runLength;
493                    }
494                    destSize-=runLength;
495
496                    if((pBiDi->isInverse) &&
497                       (/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L)) {
498                        markFlag |= LRM_AFTER;
499                    }
500                    if (markFlag & LRM_AFTER) {
501                        uc=LRM_CHAR;
502                    }
503                    else if (markFlag & RLM_AFTER) {
504                        uc=RLM_CHAR;
505                    }
506                    else  uc=0;
507                    if(uc) {
508                        if(destSize>0) {
509                            *dest++=uc;
510                        }
511                        --destSize;
512                    }
513                } else {                /* RTL run */
514                    if((pBiDi->isInverse) &&
515                       (/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1])))) {
516                        markFlag |= RLM_BEFORE;
517                    }
518                    if (markFlag & LRM_BEFORE) {
519                        uc=LRM_CHAR;
520                    }
521                    else if (markFlag & RLM_BEFORE) {
522                        uc=RLM_CHAR;
523                    }
524                    else  uc=0;
525                    if(uc) {
526                        if(destSize>0) {
527                            *dest++=uc;
528                        }
529                        --destSize;
530                    }
531
532                    runLength=doWriteReverse(src, runLength,
533                                             dest, destSize,
534                                             options, pErrorCode);
535                    if(dest!=NULL) {
536                      dest+=runLength;
537                    }
538                    destSize-=runLength;
539
540                    if((pBiDi->isInverse) &&
541                       (/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart])))) {
542                        markFlag |= RLM_AFTER;
543                    }
544                    if (markFlag & LRM_AFTER) {
545                        uc=LRM_CHAR;
546                    }
547                    else if (markFlag & RLM_AFTER) {
548                        uc=RLM_CHAR;
549                    }
550                    else  uc=0;
551                    if(uc) {
552                        if(destSize>0) {
553                            *dest++=uc;
554                        }
555                        --destSize;
556                    }
557                }
558            }
559        }
560    } else {
561        /* reverse output */
562        if(!(options&UBIDI_INSERT_LRM_FOR_NUMERIC)) {
563            /* do not insert BiDi controls */
564            for(run=runCount; --run>=0;) {
565                if(UBIDI_LTR==ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength)) {
566                    runLength=doWriteReverse(text+logicalStart, runLength,
567                                             dest, destSize,
568                                             (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
569                } else {
570                    runLength=doWriteForward(text+logicalStart, runLength,
571                                             dest, destSize,
572                                             options, pErrorCode);
573                }
574                if(dest!=NULL) {
575                  dest+=runLength;
576                }
577                destSize-=runLength;
578            }
579        } else {
580            /* insert BiDi controls for "inverse BiDi" */
581            const DirProp *dirProps=pBiDi->dirProps;
582            const UChar *src;
583            UBiDiDirection dir;
584
585            for(run=runCount; --run>=0;) {
586                /* reverse output */
587                dir=ubidi_getVisualRun(pBiDi, run, &logicalStart, &runLength);
588                src=text+logicalStart;
589
590                if(UBIDI_LTR==dir) {
591                    if(/*run<runCount-1 &&*/ dirProps[logicalStart+runLength-1]!=L) {
592                        if(destSize>0) {
593                            *dest++=LRM_CHAR;
594                        }
595                        --destSize;
596                    }
597
598                    runLength=doWriteReverse(src, runLength,
599                                             dest, destSize,
600                                             (uint16_t)(options&~UBIDI_DO_MIRRORING), pErrorCode);
601                    if(dest!=NULL) {
602                      dest+=runLength;
603                    }
604                    destSize-=runLength;
605
606                    if(/*run>0 &&*/ dirProps[logicalStart]!=L) {
607                        if(destSize>0) {
608                            *dest++=LRM_CHAR;
609                        }
610                        --destSize;
611                    }
612                } else {
613                    if(/*run<runCount-1 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart]))) {
614                        if(destSize>0) {
615                            *dest++=RLM_CHAR;
616                        }
617                        --destSize;
618                    }
619
620                    runLength=doWriteForward(src, runLength,
621                                             dest, destSize,
622                                             options, pErrorCode);
623                    if(dest!=NULL) {
624                      dest+=runLength;
625                    }
626                    destSize-=runLength;
627
628                    if(/*run>0 &&*/ !(MASK_R_AL&DIRPROP_FLAG(dirProps[logicalStart+runLength-1]))) {
629                        if(destSize>0) {
630                            *dest++=RLM_CHAR;
631                        }
632                        --destSize;
633                    }
634                }
635            }
636        }
637    }
638
639    return u_terminateUChars(saveDest, destCapacity, destCapacity-destSize, pErrorCode);
640}
641