1/*
2*******************************************************************************
3*
4*   Copyright (C) 2002-2011, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  propsvec.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2002feb22
14*   created by: Markus W. Scherer
15*
16*   Store bits (Unicode character properties) in bit set vectors.
17*/
18
19#include <stdlib.h>
20#include "unicode/utypes.h"
21#include "cmemory.h"
22#include "utrie.h"
23#include "utrie2.h"
24#include "uarrsort.h"
25#include "propsvec.h"
26#include "uassert.h"
27
28struct UPropsVectors {
29    uint32_t *v;
30    int32_t columns;  /* number of columns, plus two for start & limit values */
31    int32_t maxRows;
32    int32_t rows;
33    int32_t prevRow;  /* search optimization: remember last row seen */
34    UBool isCompacted;
35};
36
37#define UPVEC_INITIAL_ROWS (1<<12)
38#define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
39#define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
40
41U_CAPI UPropsVectors * U_EXPORT2
42upvec_open(int32_t columns, UErrorCode *pErrorCode) {
43    UPropsVectors *pv;
44    uint32_t *v, *row;
45    uint32_t cp;
46
47    if(U_FAILURE(*pErrorCode)) {
48        return NULL;
49    }
50    if(columns<1) {
51        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
52        return NULL;
53    }
54    columns+=2; /* count range start and limit columns */
55
56    pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
57    v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
58    if(pv==NULL || v==NULL) {
59        uprv_free(pv);
60        uprv_free(v);
61        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
62        return NULL;
63    }
64    uprv_memset(pv, 0, sizeof(UPropsVectors));
65    pv->v=v;
66    pv->columns=columns;
67    pv->maxRows=UPVEC_INITIAL_ROWS;
68    pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
69
70    /* set the all-Unicode row and the special-value rows */
71    row=pv->v;
72    uprv_memset(row, 0, pv->rows*columns*4);
73    row[0]=0;
74    row[1]=0x110000;
75    row+=columns;
76    for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
77        row[0]=cp;
78        row[1]=cp+1;
79        row+=columns;
80    }
81    return pv;
82}
83
84U_CAPI void U_EXPORT2
85upvec_close(UPropsVectors *pv) {
86    if(pv!=NULL) {
87        uprv_free(pv->v);
88        uprv_free(pv);
89    }
90}
91
92static uint32_t *
93_findRow(UPropsVectors *pv, UChar32 rangeStart) {
94    uint32_t *row;
95    int32_t columns, i, start, limit, prevRow;
96
97    columns=pv->columns;
98    limit=pv->rows;
99    prevRow=pv->prevRow;
100
101    /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
102    row=pv->v+prevRow*columns;
103    if(rangeStart>=(UChar32)row[0]) {
104        if(rangeStart<(UChar32)row[1]) {
105            /* same row as last seen */
106            return row;
107        } else if(rangeStart<(UChar32)(row+=columns)[1]) {
108            /* next row after the last one */
109            pv->prevRow=prevRow+1;
110            return row;
111        } else if(rangeStart<(UChar32)(row+=columns)[1]) {
112            /* second row after the last one */
113            pv->prevRow=prevRow+2;
114            return row;
115        } else if((rangeStart-(UChar32)row[1])<10) {
116            /* we are close, continue looping */
117            prevRow+=2;
118            do {
119                ++prevRow;
120                row+=columns;
121            } while(rangeStart>=(UChar32)row[1]);
122            pv->prevRow=prevRow;
123            return row;
124        }
125    } else if(rangeStart<(UChar32)pv->v[1]) {
126        /* the very first row */
127        pv->prevRow=0;
128        return pv->v;
129    }
130
131    /* do a binary search for the start of the range */
132    start=0;
133    while(start<limit-1) {
134        i=(start+limit)/2;
135        row=pv->v+i*columns;
136        if(rangeStart<(UChar32)row[0]) {
137            limit=i;
138        } else if(rangeStart<(UChar32)row[1]) {
139            pv->prevRow=i;
140            return row;
141        } else {
142            start=i;
143        }
144    }
145
146    /* must be found because all ranges together always cover all of Unicode */
147    pv->prevRow=start;
148    return pv->v+start*columns;
149}
150
151U_CAPI void U_EXPORT2
152upvec_setValue(UPropsVectors *pv,
153               UChar32 start, UChar32 end,
154               int32_t column,
155               uint32_t value, uint32_t mask,
156               UErrorCode *pErrorCode) {
157    uint32_t *firstRow, *lastRow;
158    int32_t columns;
159    UChar32 limit;
160    UBool splitFirstRow, splitLastRow;
161
162    /* argument checking */
163    if(U_FAILURE(*pErrorCode)) {
164        return;
165    }
166    if( pv==NULL ||
167        start<0 || start>end || end>UPVEC_MAX_CP ||
168        column<0 || column>=(pv->columns-2)
169    ) {
170        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
171        return;
172    }
173    if(pv->isCompacted) {
174        *pErrorCode=U_NO_WRITE_PERMISSION;
175        return;
176    }
177    limit=end+1;
178
179    /* initialize */
180    columns=pv->columns;
181    column+=2; /* skip range start and limit columns */
182    value&=mask;
183
184    /* find the rows whose ranges overlap with the input range */
185
186    /* find the first and last rows, always successful */
187    firstRow=_findRow(pv, start);
188    lastRow=_findRow(pv, end);
189
190    /*
191     * Rows need to be split if they partially overlap with the
192     * input range (only possible for the first and last rows)
193     * and if their value differs from the input value.
194     */
195    splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
196    splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
197
198    /* split first/last rows if necessary */
199    if(splitFirstRow || splitLastRow) {
200        int32_t count, rows;
201
202        rows=pv->rows;
203        if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
204            uint32_t *newVectors;
205            int32_t newMaxRows;
206
207            if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
208                newMaxRows=UPVEC_MEDIUM_ROWS;
209            } else if(pv->maxRows<UPVEC_MAX_ROWS) {
210                newMaxRows=UPVEC_MAX_ROWS;
211            } else {
212                /* Implementation bug, or UPVEC_MAX_ROWS too low. */
213                *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
214                return;
215            }
216            newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
217            if(newVectors==NULL) {
218                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
219                return;
220            }
221            uprv_memcpy(newVectors, pv->v, rows*columns*4);
222            firstRow=newVectors+(firstRow-pv->v);
223            lastRow=newVectors+(lastRow-pv->v);
224            uprv_free(pv->v);
225            pv->v=newVectors;
226            pv->maxRows=newMaxRows;
227        }
228
229        /* count the number of row cells to move after the last row, and move them */
230        count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
231        if(count>0) {
232            uprv_memmove(
233                lastRow+(1+splitFirstRow+splitLastRow)*columns,
234                lastRow+columns,
235                count*4);
236        }
237        pv->rows=rows+splitFirstRow+splitLastRow;
238
239        /* split the first row, and move the firstRow pointer to the second part */
240        if(splitFirstRow) {
241            /* copy all affected rows up one and move the lastRow pointer */
242            count = (int32_t)((lastRow-firstRow)+columns);
243            uprv_memmove(firstRow+columns, firstRow, count*4);
244            lastRow+=columns;
245
246            /* split the range and move the firstRow pointer */
247            firstRow[1]=firstRow[columns]=(uint32_t)start;
248            firstRow+=columns;
249        }
250
251        /* split the last row */
252        if(splitLastRow) {
253            /* copy the last row data */
254            uprv_memcpy(lastRow+columns, lastRow, columns*4);
255
256            /* split the range and move the firstRow pointer */
257            lastRow[1]=lastRow[columns]=(uint32_t)limit;
258        }
259    }
260
261    /* set the "row last seen" to the last row for the range */
262    pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
263
264    /* set the input value in all remaining rows */
265    firstRow+=column;
266    lastRow+=column;
267    mask=~mask;
268    for(;;) {
269        *firstRow=(*firstRow&mask)|value;
270        if(firstRow==lastRow) {
271            break;
272        }
273        firstRow+=columns;
274    }
275}
276
277U_CAPI uint32_t U_EXPORT2
278upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
279    uint32_t *row;
280    UPropsVectors *ncpv;
281
282    if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
283        return 0;
284    }
285    ncpv=(UPropsVectors *)pv;
286    row=_findRow(ncpv, c);
287    return row[2+column];
288}
289
290U_CAPI uint32_t * U_EXPORT2
291upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
292             UChar32 *pRangeStart, UChar32 *pRangeEnd) {
293    uint32_t *row;
294    int32_t columns;
295
296    if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
297        return NULL;
298    }
299
300    columns=pv->columns;
301    row=pv->v+rowIndex*columns;
302    if(pRangeStart!=NULL) {
303        *pRangeStart=(UChar32)row[0];
304    }
305    if(pRangeEnd!=NULL) {
306        *pRangeEnd=(UChar32)row[1]-1;
307    }
308    return row+2;
309}
310
311static int32_t U_CALLCONV
312upvec_compareRows(const void *context, const void *l, const void *r) {
313    const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
314    const UPropsVectors *pv=(const UPropsVectors *)context;
315    int32_t i, count, columns;
316
317    count=columns=pv->columns; /* includes start/limit columns */
318
319    /* start comparing after start/limit but wrap around to them */
320    i=2;
321    do {
322        if(left[i]!=right[i]) {
323            return left[i]<right[i] ? -1 : 1;
324        }
325        if(++i==columns) {
326            i=0;
327        }
328    } while(--count>0);
329
330    return 0;
331}
332
333U_CAPI void U_EXPORT2
334upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
335    uint32_t *row;
336    int32_t i, columns, valueColumns, rows, count;
337    UChar32 start, limit;
338
339    /* argument checking */
340    if(U_FAILURE(*pErrorCode)) {
341        return;
342    }
343    if(handler==NULL) {
344        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
345        return;
346    }
347    if(pv->isCompacted) {
348        return;
349    }
350
351    /* Set the flag now: Sorting and compacting destroys the builder data structure. */
352    pv->isCompacted=TRUE;
353
354    rows=pv->rows;
355    columns=pv->columns;
356    U_ASSERT(columns>=3); /* upvec_open asserts this */
357    valueColumns=columns-2; /* not counting start & limit */
358
359    /* sort the properties vectors to find unique vector values */
360    uprv_sortArray(pv->v, rows, columns*4,
361                   upvec_compareRows, pv, FALSE, pErrorCode);
362    if(U_FAILURE(*pErrorCode)) {
363        return;
364    }
365
366    /*
367     * Find and set the special values.
368     * This has to do almost the same work as the compaction below,
369     * to find the indexes where the special-value rows will move.
370     */
371    row=pv->v;
372    count=-valueColumns;
373    for(i=0; i<rows; ++i) {
374        start=(UChar32)row[0];
375
376        /* count a new values vector if it is different from the current one */
377        if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
378            count+=valueColumns;
379        }
380
381        if(start>=UPVEC_FIRST_SPECIAL_CP) {
382            handler(context, start, start, count, row+2, valueColumns, pErrorCode);
383            if(U_FAILURE(*pErrorCode)) {
384                return;
385            }
386        }
387
388        row+=columns;
389    }
390
391    /* count is at the beginning of the last vector, add valueColumns to include that last vector */
392    count+=valueColumns;
393
394    /* Call the handler once more to signal the start of delivering real values. */
395    handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
396            count, row-valueColumns, valueColumns, pErrorCode);
397    if(U_FAILURE(*pErrorCode)) {
398        return;
399    }
400
401    /*
402     * Move vector contents up to a contiguous array with only unique
403     * vector values, and call the handler function for each vector.
404     *
405     * This destroys the Properties Vector structure and replaces it
406     * with an array of just vector values.
407     */
408    row=pv->v;
409    count=-valueColumns;
410    for(i=0; i<rows; ++i) {
411        /* fetch these first before memmove() may overwrite them */
412        start=(UChar32)row[0];
413        limit=(UChar32)row[1];
414
415        /* add a new values vector if it is different from the current one */
416        if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
417            count+=valueColumns;
418            uprv_memmove(pv->v+count, row+2, valueColumns*4);
419        }
420
421        if(start<UPVEC_FIRST_SPECIAL_CP) {
422            handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
423            if(U_FAILURE(*pErrorCode)) {
424                return;
425            }
426        }
427
428        row+=columns;
429    }
430
431    /* count is at the beginning of the last vector, add one to include that last vector */
432    pv->rows=count/valueColumns+1;
433}
434
435U_CAPI const uint32_t * U_EXPORT2
436upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
437    if(!pv->isCompacted) {
438        return NULL;
439    }
440    if(pRows!=NULL) {
441        *pRows=pv->rows;
442    }
443    if(pColumns!=NULL) {
444        *pColumns=pv->columns-2;
445    }
446    return pv->v;
447}
448
449U_CAPI uint32_t * U_EXPORT2
450upvec_cloneArray(const UPropsVectors *pv,
451                 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
452    uint32_t *clonedArray;
453    int32_t byteLength;
454
455    if(U_FAILURE(*pErrorCode)) {
456        return NULL;
457    }
458    if(!pv->isCompacted) {
459        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
460        return NULL;
461    }
462    byteLength=pv->rows*(pv->columns-2)*4;
463    clonedArray=(uint32_t *)uprv_malloc(byteLength);
464    if(clonedArray==NULL) {
465        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
466        return NULL;
467    }
468    uprv_memcpy(clonedArray, pv->v, byteLength);
469    if(pRows!=NULL) {
470        *pRows=pv->rows;
471    }
472    if(pColumns!=NULL) {
473        *pColumns=pv->columns-2;
474    }
475    return clonedArray;
476}
477
478U_CAPI UTrie2 * U_EXPORT2
479upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
480    UPVecToUTrie2Context toUTrie2={ NULL };
481    upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
482    utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
483    if(U_FAILURE(*pErrorCode)) {
484        utrie2_close(toUTrie2.trie);
485        toUTrie2.trie=NULL;
486    }
487    return toUTrie2.trie;
488}
489
490/*
491 * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
492 * some 16-bit field and builds and returns a UTrie2.
493 */
494
495U_CAPI void U_CALLCONV
496upvec_compactToUTrie2Handler(void *context,
497                             UChar32 start, UChar32 end,
498                             int32_t rowIndex, uint32_t *row, int32_t columns,
499                             UErrorCode *pErrorCode) {
500    UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
501    if(start<UPVEC_FIRST_SPECIAL_CP) {
502        utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
503    } else {
504        switch(start) {
505        case UPVEC_INITIAL_VALUE_CP:
506            toUTrie2->initialValue=rowIndex;
507            break;
508        case UPVEC_ERROR_VALUE_CP:
509            toUTrie2->errorValue=rowIndex;
510            break;
511        case UPVEC_START_REAL_VALUES_CP:
512            toUTrie2->maxValue=rowIndex;
513            if(rowIndex>0xffff) {
514                /* too many rows for a 16-bit trie */
515                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
516            } else {
517                toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
518                                           toUTrie2->errorValue, pErrorCode);
519            }
520            break;
521        default:
522            break;
523        }
524    }
525}
526