1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6*   Copyright (C) 2002-2011, International Business Machines
7*   Corporation and others.  All Rights Reserved.
8*
9*******************************************************************************
10*   file name:  propsvec.c
11*   encoding:   US-ASCII
12*   tab size:   8 (not used)
13*   indentation:4
14*
15*   created on: 2002feb22
16*   created by: Markus W. Scherer
17*
18*   Store bits (Unicode character properties) in bit set vectors.
19*/
20
21#include <stdlib.h>
22#include "unicode/utypes.h"
23#include "cmemory.h"
24#include "utrie.h"
25#include "utrie2.h"
26#include "uarrsort.h"
27#include "propsvec.h"
28#include "uassert.h"
29
30struct UPropsVectors {
31    uint32_t *v;
32    int32_t columns;  /* number of columns, plus two for start & limit values */
33    int32_t maxRows;
34    int32_t rows;
35    int32_t prevRow;  /* search optimization: remember last row seen */
36    UBool isCompacted;
37};
38
39#define UPVEC_INITIAL_ROWS (1<<12)
40#define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
41#define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
42
43U_CAPI UPropsVectors * U_EXPORT2
44upvec_open(int32_t columns, UErrorCode *pErrorCode) {
45    UPropsVectors *pv;
46    uint32_t *v, *row;
47    uint32_t cp;
48
49    if(U_FAILURE(*pErrorCode)) {
50        return NULL;
51    }
52    if(columns<1) {
53        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
54        return NULL;
55    }
56    columns+=2; /* count range start and limit columns */
57
58    pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
59    v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
60    if(pv==NULL || v==NULL) {
61        uprv_free(pv);
62        uprv_free(v);
63        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
64        return NULL;
65    }
66    uprv_memset(pv, 0, sizeof(UPropsVectors));
67    pv->v=v;
68    pv->columns=columns;
69    pv->maxRows=UPVEC_INITIAL_ROWS;
70    pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
71
72    /* set the all-Unicode row and the special-value rows */
73    row=pv->v;
74    uprv_memset(row, 0, pv->rows*columns*4);
75    row[0]=0;
76    row[1]=0x110000;
77    row+=columns;
78    for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
79        row[0]=cp;
80        row[1]=cp+1;
81        row+=columns;
82    }
83    return pv;
84}
85
86U_CAPI void U_EXPORT2
87upvec_close(UPropsVectors *pv) {
88    if(pv!=NULL) {
89        uprv_free(pv->v);
90        uprv_free(pv);
91    }
92}
93
94static uint32_t *
95_findRow(UPropsVectors *pv, UChar32 rangeStart) {
96    uint32_t *row;
97    int32_t columns, i, start, limit, prevRow;
98
99    columns=pv->columns;
100    limit=pv->rows;
101    prevRow=pv->prevRow;
102
103    /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
104    row=pv->v+prevRow*columns;
105    if(rangeStart>=(UChar32)row[0]) {
106        if(rangeStart<(UChar32)row[1]) {
107            /* same row as last seen */
108            return row;
109        } else if(rangeStart<(UChar32)(row+=columns)[1]) {
110            /* next row after the last one */
111            pv->prevRow=prevRow+1;
112            return row;
113        } else if(rangeStart<(UChar32)(row+=columns)[1]) {
114            /* second row after the last one */
115            pv->prevRow=prevRow+2;
116            return row;
117        } else if((rangeStart-(UChar32)row[1])<10) {
118            /* we are close, continue looping */
119            prevRow+=2;
120            do {
121                ++prevRow;
122                row+=columns;
123            } while(rangeStart>=(UChar32)row[1]);
124            pv->prevRow=prevRow;
125            return row;
126        }
127    } else if(rangeStart<(UChar32)pv->v[1]) {
128        /* the very first row */
129        pv->prevRow=0;
130        return pv->v;
131    }
132
133    /* do a binary search for the start of the range */
134    start=0;
135    while(start<limit-1) {
136        i=(start+limit)/2;
137        row=pv->v+i*columns;
138        if(rangeStart<(UChar32)row[0]) {
139            limit=i;
140        } else if(rangeStart<(UChar32)row[1]) {
141            pv->prevRow=i;
142            return row;
143        } else {
144            start=i;
145        }
146    }
147
148    /* must be found because all ranges together always cover all of Unicode */
149    pv->prevRow=start;
150    return pv->v+start*columns;
151}
152
153U_CAPI void U_EXPORT2
154upvec_setValue(UPropsVectors *pv,
155               UChar32 start, UChar32 end,
156               int32_t column,
157               uint32_t value, uint32_t mask,
158               UErrorCode *pErrorCode) {
159    uint32_t *firstRow, *lastRow;
160    int32_t columns;
161    UChar32 limit;
162    UBool splitFirstRow, splitLastRow;
163
164    /* argument checking */
165    if(U_FAILURE(*pErrorCode)) {
166        return;
167    }
168    if( pv==NULL ||
169        start<0 || start>end || end>UPVEC_MAX_CP ||
170        column<0 || column>=(pv->columns-2)
171    ) {
172        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
173        return;
174    }
175    if(pv->isCompacted) {
176        *pErrorCode=U_NO_WRITE_PERMISSION;
177        return;
178    }
179    limit=end+1;
180
181    /* initialize */
182    columns=pv->columns;
183    column+=2; /* skip range start and limit columns */
184    value&=mask;
185
186    /* find the rows whose ranges overlap with the input range */
187
188    /* find the first and last rows, always successful */
189    firstRow=_findRow(pv, start);
190    lastRow=_findRow(pv, end);
191
192    /*
193     * Rows need to be split if they partially overlap with the
194     * input range (only possible for the first and last rows)
195     * and if their value differs from the input value.
196     */
197    splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
198    splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
199
200    /* split first/last rows if necessary */
201    if(splitFirstRow || splitLastRow) {
202        int32_t count, rows;
203
204        rows=pv->rows;
205        if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
206            uint32_t *newVectors;
207            int32_t newMaxRows;
208
209            if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
210                newMaxRows=UPVEC_MEDIUM_ROWS;
211            } else if(pv->maxRows<UPVEC_MAX_ROWS) {
212                newMaxRows=UPVEC_MAX_ROWS;
213            } else {
214                /* Implementation bug, or UPVEC_MAX_ROWS too low. */
215                *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
216                return;
217            }
218            newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
219            if(newVectors==NULL) {
220                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
221                return;
222            }
223            uprv_memcpy(newVectors, pv->v, (size_t)rows*columns*4);
224            firstRow=newVectors+(firstRow-pv->v);
225            lastRow=newVectors+(lastRow-pv->v);
226            uprv_free(pv->v);
227            pv->v=newVectors;
228            pv->maxRows=newMaxRows;
229        }
230
231        /* count the number of row cells to move after the last row, and move them */
232        count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
233        if(count>0) {
234            uprv_memmove(
235                lastRow+(1+splitFirstRow+splitLastRow)*columns,
236                lastRow+columns,
237                count*4);
238        }
239        pv->rows=rows+splitFirstRow+splitLastRow;
240
241        /* split the first row, and move the firstRow pointer to the second part */
242        if(splitFirstRow) {
243            /* copy all affected rows up one and move the lastRow pointer */
244            count = (int32_t)((lastRow-firstRow)+columns);
245            uprv_memmove(firstRow+columns, firstRow, (size_t)count*4);
246            lastRow+=columns;
247
248            /* split the range and move the firstRow pointer */
249            firstRow[1]=firstRow[columns]=(uint32_t)start;
250            firstRow+=columns;
251        }
252
253        /* split the last row */
254        if(splitLastRow) {
255            /* copy the last row data */
256            uprv_memcpy(lastRow+columns, lastRow, (size_t)columns*4);
257
258            /* split the range and move the firstRow pointer */
259            lastRow[1]=lastRow[columns]=(uint32_t)limit;
260        }
261    }
262
263    /* set the "row last seen" to the last row for the range */
264    pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
265
266    /* set the input value in all remaining rows */
267    firstRow+=column;
268    lastRow+=column;
269    mask=~mask;
270    for(;;) {
271        *firstRow=(*firstRow&mask)|value;
272        if(firstRow==lastRow) {
273            break;
274        }
275        firstRow+=columns;
276    }
277}
278
279U_CAPI uint32_t U_EXPORT2
280upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
281    uint32_t *row;
282    UPropsVectors *ncpv;
283
284    if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
285        return 0;
286    }
287    ncpv=(UPropsVectors *)pv;
288    row=_findRow(ncpv, c);
289    return row[2+column];
290}
291
292U_CAPI uint32_t * U_EXPORT2
293upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
294             UChar32 *pRangeStart, UChar32 *pRangeEnd) {
295    uint32_t *row;
296    int32_t columns;
297
298    if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
299        return NULL;
300    }
301
302    columns=pv->columns;
303    row=pv->v+rowIndex*columns;
304    if(pRangeStart!=NULL) {
305        *pRangeStart=(UChar32)row[0];
306    }
307    if(pRangeEnd!=NULL) {
308        *pRangeEnd=(UChar32)row[1]-1;
309    }
310    return row+2;
311}
312
313static int32_t U_CALLCONV
314upvec_compareRows(const void *context, const void *l, const void *r) {
315    const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
316    const UPropsVectors *pv=(const UPropsVectors *)context;
317    int32_t i, count, columns;
318
319    count=columns=pv->columns; /* includes start/limit columns */
320
321    /* start comparing after start/limit but wrap around to them */
322    i=2;
323    do {
324        if(left[i]!=right[i]) {
325            return left[i]<right[i] ? -1 : 1;
326        }
327        if(++i==columns) {
328            i=0;
329        }
330    } while(--count>0);
331
332    return 0;
333}
334
335U_CAPI void U_EXPORT2
336upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
337    uint32_t *row;
338    int32_t i, columns, valueColumns, rows, count;
339    UChar32 start, limit;
340
341    /* argument checking */
342    if(U_FAILURE(*pErrorCode)) {
343        return;
344    }
345    if(handler==NULL) {
346        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
347        return;
348    }
349    if(pv->isCompacted) {
350        return;
351    }
352
353    /* Set the flag now: Sorting and compacting destroys the builder data structure. */
354    pv->isCompacted=TRUE;
355
356    rows=pv->rows;
357    columns=pv->columns;
358    U_ASSERT(columns>=3); /* upvec_open asserts this */
359    valueColumns=columns-2; /* not counting start & limit */
360
361    /* sort the properties vectors to find unique vector values */
362    uprv_sortArray(pv->v, rows, columns*4,
363                   upvec_compareRows, pv, FALSE, pErrorCode);
364    if(U_FAILURE(*pErrorCode)) {
365        return;
366    }
367
368    /*
369     * Find and set the special values.
370     * This has to do almost the same work as the compaction below,
371     * to find the indexes where the special-value rows will move.
372     */
373    row=pv->v;
374    count=-valueColumns;
375    for(i=0; i<rows; ++i) {
376        start=(UChar32)row[0];
377
378        /* count a new values vector if it is different from the current one */
379        if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
380            count+=valueColumns;
381        }
382
383        if(start>=UPVEC_FIRST_SPECIAL_CP) {
384            handler(context, start, start, count, row+2, valueColumns, pErrorCode);
385            if(U_FAILURE(*pErrorCode)) {
386                return;
387            }
388        }
389
390        row+=columns;
391    }
392
393    /* count is at the beginning of the last vector, add valueColumns to include that last vector */
394    count+=valueColumns;
395
396    /* Call the handler once more to signal the start of delivering real values. */
397    handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
398            count, row-valueColumns, valueColumns, pErrorCode);
399    if(U_FAILURE(*pErrorCode)) {
400        return;
401    }
402
403    /*
404     * Move vector contents up to a contiguous array with only unique
405     * vector values, and call the handler function for each vector.
406     *
407     * This destroys the Properties Vector structure and replaces it
408     * with an array of just vector values.
409     */
410    row=pv->v;
411    count=-valueColumns;
412    for(i=0; i<rows; ++i) {
413        /* fetch these first before memmove() may overwrite them */
414        start=(UChar32)row[0];
415        limit=(UChar32)row[1];
416
417        /* add a new values vector if it is different from the current one */
418        if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
419            count+=valueColumns;
420            uprv_memmove(pv->v+count, row+2, (size_t)valueColumns*4);
421        }
422
423        if(start<UPVEC_FIRST_SPECIAL_CP) {
424            handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
425            if(U_FAILURE(*pErrorCode)) {
426                return;
427            }
428        }
429
430        row+=columns;
431    }
432
433    /* count is at the beginning of the last vector, add one to include that last vector */
434    pv->rows=count/valueColumns+1;
435}
436
437U_CAPI const uint32_t * U_EXPORT2
438upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
439    if(!pv->isCompacted) {
440        return NULL;
441    }
442    if(pRows!=NULL) {
443        *pRows=pv->rows;
444    }
445    if(pColumns!=NULL) {
446        *pColumns=pv->columns-2;
447    }
448    return pv->v;
449}
450
451U_CAPI uint32_t * U_EXPORT2
452upvec_cloneArray(const UPropsVectors *pv,
453                 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
454    uint32_t *clonedArray;
455    int32_t byteLength;
456
457    if(U_FAILURE(*pErrorCode)) {
458        return NULL;
459    }
460    if(!pv->isCompacted) {
461        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
462        return NULL;
463    }
464    byteLength=pv->rows*(pv->columns-2)*4;
465    clonedArray=(uint32_t *)uprv_malloc(byteLength);
466    if(clonedArray==NULL) {
467        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
468        return NULL;
469    }
470    uprv_memcpy(clonedArray, pv->v, byteLength);
471    if(pRows!=NULL) {
472        *pRows=pv->rows;
473    }
474    if(pColumns!=NULL) {
475        *pColumns=pv->columns-2;
476    }
477    return clonedArray;
478}
479
480U_CAPI UTrie2 * U_EXPORT2
481upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
482    UPVecToUTrie2Context toUTrie2={ NULL, 0, 0, 0 };
483    upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
484    utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
485    if(U_FAILURE(*pErrorCode)) {
486        utrie2_close(toUTrie2.trie);
487        toUTrie2.trie=NULL;
488    }
489    return toUTrie2.trie;
490}
491
492/*
493 * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
494 * some 16-bit field and builds and returns a UTrie2.
495 */
496
497U_CAPI void U_CALLCONV
498upvec_compactToUTrie2Handler(void *context,
499                             UChar32 start, UChar32 end,
500                             int32_t rowIndex, uint32_t *row, int32_t columns,
501                             UErrorCode *pErrorCode) {
502    UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
503    if(start<UPVEC_FIRST_SPECIAL_CP) {
504        utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
505    } else {
506        switch(start) {
507        case UPVEC_INITIAL_VALUE_CP:
508            toUTrie2->initialValue=rowIndex;
509            break;
510        case UPVEC_ERROR_VALUE_CP:
511            toUTrie2->errorValue=rowIndex;
512            break;
513        case UPVEC_START_REAL_VALUES_CP:
514            toUTrie2->maxValue=rowIndex;
515            if(rowIndex>0xffff) {
516                /* too many rows for a 16-bit trie */
517                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
518            } else {
519                toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
520                                           toUTrie2->errorValue, pErrorCode);
521            }
522            break;
523        default:
524            break;
525        }
526    }
527}
528