propsvec.c revision b0ac937921a2c196d8b9da665135bf6ba01a1ccf
1/*
2*******************************************************************************
3*
4*   Copyright (C) 2002-2009, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  propsvec.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2002feb22
14*   created by: Markus W. Scherer
15*
16*   Store bits (Unicode character properties) in bit set vectors.
17*/
18
19#include <stdlib.h>
20#include "unicode/utypes.h"
21#include "cmemory.h"
22#include "utrie.h"
23#include "utrie2.h"
24#include "uarrsort.h"
25#include "propsvec.h"
26
27struct UPropsVectors {
28    uint32_t *v;
29    int32_t columns;  /* number of columns, plus two for start & limit values */
30    int32_t maxRows;
31    int32_t rows;
32    int32_t prevRow;  /* search optimization: remember last row seen */
33    UBool isCompacted;
34};
35
36#define UPVEC_INITIAL_ROWS (1<<12)
37#define UPVEC_MEDIUM_ROWS ((int32_t)1<<16)
38#define UPVEC_MAX_ROWS (UPVEC_MAX_CP+1)
39
40U_CAPI UPropsVectors * U_EXPORT2
41upvec_open(int32_t columns, UErrorCode *pErrorCode) {
42    UPropsVectors *pv;
43    uint32_t *v, *row;
44    uint32_t cp;
45
46    if(U_FAILURE(*pErrorCode)) {
47        return NULL;
48    }
49    if(columns<1) {
50        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
51        return NULL;
52    }
53    columns+=2; /* count range start and limit columns */
54
55    pv=(UPropsVectors *)uprv_malloc(sizeof(UPropsVectors));
56    v=(uint32_t *)uprv_malloc(UPVEC_INITIAL_ROWS*columns*4);
57    if(pv==NULL || v==NULL) {
58        uprv_free(pv);
59        uprv_free(v);
60        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
61        return NULL;
62    }
63    uprv_memset(pv, 0, sizeof(UPropsVectors));
64    pv->v=v;
65    pv->columns=columns;
66    pv->maxRows=UPVEC_INITIAL_ROWS;
67    pv->rows=2+(UPVEC_MAX_CP-UPVEC_FIRST_SPECIAL_CP);
68
69    /* set the all-Unicode row and the special-value rows */
70    row=pv->v;
71    uprv_memset(row, 0, pv->rows*columns*4);
72    row[0]=0;
73    row[1]=0x110000;
74    row+=columns;
75    for(cp=UPVEC_FIRST_SPECIAL_CP; cp<=UPVEC_MAX_CP; ++cp) {
76        row[0]=cp;
77        row[1]=cp+1;
78        row+=columns;
79    }
80    return pv;
81}
82
83U_CAPI void U_EXPORT2
84upvec_close(UPropsVectors *pv) {
85    if(pv!=NULL) {
86        uprv_free(pv->v);
87        uprv_free(pv);
88    }
89}
90
91static uint32_t *
92_findRow(UPropsVectors *pv, UChar32 rangeStart) {
93    uint32_t *row;
94    int32_t columns, i, start, limit, prevRow, rows;
95
96    columns=pv->columns;
97    rows=limit=pv->rows;
98    prevRow=pv->prevRow;
99
100    /* check the vicinity of the last-seen row (start searching with an unrolled loop) */
101    row=pv->v+prevRow*columns;
102    if(rangeStart>=(UChar32)row[0]) {
103        if(rangeStart<(UChar32)row[1]) {
104            /* same row as last seen */
105            return row;
106        } else if(rangeStart<(UChar32)(row+=columns)[1]) {
107            /* next row after the last one */
108            pv->prevRow=prevRow+1;
109            return row;
110        } else if(rangeStart<(UChar32)(row+=columns)[1]) {
111            /* second row after the last one */
112            pv->prevRow=prevRow+2;
113            return row;
114        } else if((rangeStart-(UChar32)row[1])<10) {
115            /* we are close, continue looping */
116            prevRow+=2;
117            do {
118                ++prevRow;
119                row+=columns;
120            } while(rangeStart>=(UChar32)row[1]);
121            pv->prevRow=prevRow;
122            return row;
123        }
124    } else if(rangeStart<(UChar32)pv->v[1]) {
125        /* the very first row */
126        pv->prevRow=0;
127        return pv->v;
128    }
129
130    /* do a binary search for the start of the range */
131    start=0;
132    while(start<limit-1) {
133        i=(start+limit)/2;
134        row=pv->v+i*columns;
135        if(rangeStart<(UChar32)row[0]) {
136            limit=i;
137        } else if(rangeStart<(UChar32)row[1]) {
138            pv->prevRow=i;
139            return row;
140        } else {
141            start=i;
142        }
143    }
144
145    /* must be found because all ranges together always cover all of Unicode */
146    pv->prevRow=start;
147    return pv->v+start*columns;
148}
149
150U_CAPI void U_EXPORT2
151upvec_setValue(UPropsVectors *pv,
152               UChar32 start, UChar32 end,
153               int32_t column,
154               uint32_t value, uint32_t mask,
155               UErrorCode *pErrorCode) {
156    uint32_t *firstRow, *lastRow;
157    int32_t columns;
158    UChar32 limit;
159    UBool splitFirstRow, splitLastRow;
160
161    /* argument checking */
162    if(U_FAILURE(*pErrorCode)) {
163        return;
164    }
165    if( pv==NULL ||
166        start<0 || start>end || end>UPVEC_MAX_CP ||
167        column<0 || column>=(pv->columns-2)
168    ) {
169        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
170        return;
171    }
172    if(pv->isCompacted) {
173        *pErrorCode=U_NO_WRITE_PERMISSION;
174        return;
175    }
176    limit=end+1;
177
178    /* initialize */
179    columns=pv->columns;
180    column+=2; /* skip range start and limit columns */
181    value&=mask;
182
183    /* find the rows whose ranges overlap with the input range */
184
185    /* find the first and last rows, always successful */
186    firstRow=_findRow(pv, start);
187    lastRow=_findRow(pv, end);
188
189    /*
190     * Rows need to be split if they partially overlap with the
191     * input range (only possible for the first and last rows)
192     * and if their value differs from the input value.
193     */
194    splitFirstRow= (UBool)(start!=(UChar32)firstRow[0] && value!=(firstRow[column]&mask));
195    splitLastRow= (UBool)(limit!=(UChar32)lastRow[1] && value!=(lastRow[column]&mask));
196
197    /* split first/last rows if necessary */
198    if(splitFirstRow || splitLastRow) {
199        int32_t count, rows;
200
201        rows=pv->rows;
202        if((rows+splitFirstRow+splitLastRow)>pv->maxRows) {
203            uint32_t *newVectors;
204            int32_t newMaxRows;
205
206            if(pv->maxRows<UPVEC_MEDIUM_ROWS) {
207                newMaxRows=UPVEC_MEDIUM_ROWS;
208            } else if(pv->maxRows<UPVEC_MAX_ROWS) {
209                newMaxRows=UPVEC_MAX_ROWS;
210            } else {
211                /* Implementation bug, or UPVEC_MAX_ROWS too low. */
212                *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
213                return;
214            }
215            newVectors=(uint32_t *)uprv_malloc(newMaxRows*columns*4);
216            if(newVectors==NULL) {
217                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
218                return;
219            }
220            uprv_memcpy(newVectors, pv->v, rows*columns*4);
221            firstRow=newVectors+(firstRow-pv->v);
222            lastRow=newVectors+(lastRow-pv->v);
223            uprv_free(pv->v);
224            pv->v=newVectors;
225            pv->maxRows=newMaxRows;
226        }
227
228        /* count the number of row cells to move after the last row, and move them */
229        count = (int32_t)((pv->v+rows*columns)-(lastRow+columns));
230        if(count>0) {
231            uprv_memmove(
232                lastRow+(1+splitFirstRow+splitLastRow)*columns,
233                lastRow+columns,
234                count*4);
235        }
236        pv->rows=rows+splitFirstRow+splitLastRow;
237
238        /* split the first row, and move the firstRow pointer to the second part */
239        if(splitFirstRow) {
240            /* copy all affected rows up one and move the lastRow pointer */
241            count = (int32_t)((lastRow-firstRow)+columns);
242            uprv_memmove(firstRow+columns, firstRow, count*4);
243            lastRow+=columns;
244
245            /* split the range and move the firstRow pointer */
246            firstRow[1]=firstRow[columns]=(uint32_t)start;
247            firstRow+=columns;
248        }
249
250        /* split the last row */
251        if(splitLastRow) {
252            /* copy the last row data */
253            uprv_memcpy(lastRow+columns, lastRow, columns*4);
254
255            /* split the range and move the firstRow pointer */
256            lastRow[1]=lastRow[columns]=(uint32_t)limit;
257        }
258    }
259
260    /* set the "row last seen" to the last row for the range */
261    pv->prevRow=(int32_t)((lastRow-(pv->v))/columns);
262
263    /* set the input value in all remaining rows */
264    firstRow+=column;
265    lastRow+=column;
266    mask=~mask;
267    for(;;) {
268        *firstRow=(*firstRow&mask)|value;
269        if(firstRow==lastRow) {
270            break;
271        }
272        firstRow+=columns;
273    }
274}
275
276U_CAPI uint32_t U_EXPORT2
277upvec_getValue(const UPropsVectors *pv, UChar32 c, int32_t column) {
278    uint32_t *row;
279
280    if(pv->isCompacted || c<0 || c>UPVEC_MAX_CP || column<0 || column>=(pv->columns-2)) {
281        return 0;
282    }
283    row=_findRow((UPropsVectors *)pv, c);
284    return row[2+column];
285}
286
287U_CAPI uint32_t * U_EXPORT2
288upvec_getRow(const UPropsVectors *pv, int32_t rowIndex,
289             UChar32 *pRangeStart, UChar32 *pRangeEnd) {
290    uint32_t *row;
291    int32_t columns;
292
293    if(pv->isCompacted || rowIndex<0 || rowIndex>=pv->rows) {
294        return NULL;
295    }
296
297    columns=pv->columns;
298    row=pv->v+rowIndex*columns;
299    if(pRangeStart!=NULL) {
300        *pRangeStart=(UChar32)row[0];
301    }
302    if(pRangeEnd!=NULL) {
303        *pRangeEnd=(UChar32)row[1]-1;
304    }
305    return row+2;
306}
307
308static int32_t U_CALLCONV
309upvec_compareRows(const void *context, const void *l, const void *r) {
310    const uint32_t *left=(const uint32_t *)l, *right=(const uint32_t *)r;
311    const UPropsVectors *pv=(const UPropsVectors *)context;
312    int32_t i, count, columns;
313
314    count=columns=pv->columns; /* includes start/limit columns */
315
316    /* start comparing after start/limit but wrap around to them */
317    i=2;
318    do {
319        if(left[i]!=right[i]) {
320            return left[i]<right[i] ? -1 : 1;
321        }
322        if(++i==columns) {
323            i=0;
324        }
325    } while(--count>0);
326
327    return 0;
328}
329
330U_CAPI void U_EXPORT2
331upvec_compact(UPropsVectors *pv, UPVecCompactHandler *handler, void *context, UErrorCode *pErrorCode) {
332    uint32_t *row;
333    int32_t i, columns, valueColumns, rows, count;
334    UChar32 start, limit;
335
336    /* argument checking */
337    if(U_FAILURE(*pErrorCode)) {
338        return;
339    }
340    if(handler==NULL) {
341        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
342        return;
343    }
344    if(pv->isCompacted) {
345        return;
346    }
347
348    /* Set the flag now: Sorting and compacting destroys the builder data structure. */
349    pv->isCompacted=TRUE;
350
351    rows=pv->rows;
352    columns=pv->columns;
353    valueColumns=columns-2; /* not counting start & limit */
354
355    /* sort the properties vectors to find unique vector values */
356    uprv_sortArray(pv->v, rows, columns*4,
357                   upvec_compareRows, pv, FALSE, pErrorCode);
358    if(U_FAILURE(*pErrorCode)) {
359        return;
360    }
361
362    /*
363     * Find and set the special values.
364     * This has to do almost the same work as the compaction below,
365     * to find the indexes where the special-value rows will move.
366     */
367    row=pv->v;
368    count=-valueColumns;
369    for(i=0; i<rows; ++i) {
370        start=(UChar32)row[0];
371
372        /* count a new values vector if it is different from the current one */
373        if(count<0 || 0!=uprv_memcmp(row+2, row-valueColumns, valueColumns*4)) {
374            count+=valueColumns;
375        }
376
377        if(start>=UPVEC_FIRST_SPECIAL_CP) {
378            handler(context, start, start, count, row+2, valueColumns, pErrorCode);
379            if(U_FAILURE(*pErrorCode)) {
380                return;
381            }
382        }
383
384        row+=columns;
385    }
386
387    /* count is at the beginning of the last vector, add valueColumns to include that last vector */
388    count+=valueColumns;
389
390    /* Call the handler once more to signal the start of delivering real values. */
391    handler(context, UPVEC_START_REAL_VALUES_CP, UPVEC_START_REAL_VALUES_CP,
392            count, row-valueColumns, valueColumns, pErrorCode);
393    if(U_FAILURE(*pErrorCode)) {
394        return;
395    }
396
397    /*
398     * Move vector contents up to a contiguous array with only unique
399     * vector values, and call the handler function for each vector.
400     *
401     * This destroys the Properties Vector structure and replaces it
402     * with an array of just vector values.
403     */
404    row=pv->v;
405    count=-valueColumns;
406    for(i=0; i<rows; ++i) {
407        /* fetch these first before memmove() may overwrite them */
408        start=(UChar32)row[0];
409        limit=(UChar32)row[1];
410
411        /* add a new values vector if it is different from the current one */
412        if(count<0 || 0!=uprv_memcmp(row+2, pv->v+count, valueColumns*4)) {
413            count+=valueColumns;
414            uprv_memmove(pv->v+count, row+2, valueColumns*4);
415        }
416
417        if(start<UPVEC_FIRST_SPECIAL_CP) {
418            handler(context, start, limit-1, count, pv->v+count, valueColumns, pErrorCode);
419            if(U_FAILURE(*pErrorCode)) {
420                return;
421            }
422        }
423
424        row+=columns;
425    }
426
427    /* count is at the beginning of the last vector, add one to include that last vector */
428    pv->rows=count/valueColumns+1;
429}
430
431U_CAPI const uint32_t * U_EXPORT2
432upvec_getArray(const UPropsVectors *pv, int32_t *pRows, int32_t *pColumns) {
433    if(!pv->isCompacted) {
434        return NULL;
435    }
436    if(pRows!=NULL) {
437        *pRows=pv->rows;
438    }
439    if(pColumns!=NULL) {
440        *pColumns=pv->columns-2;
441    }
442    return pv->v;
443}
444
445U_CAPI uint32_t * U_EXPORT2
446upvec_cloneArray(const UPropsVectors *pv,
447                 int32_t *pRows, int32_t *pColumns, UErrorCode *pErrorCode) {
448    uint32_t *clonedArray;
449    int32_t byteLength;
450
451    if(U_FAILURE(*pErrorCode)) {
452        return NULL;
453    }
454    if(!pv->isCompacted) {
455        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
456        return NULL;
457    }
458    byteLength=pv->rows*(pv->columns-2)*4;
459    clonedArray=(uint32_t *)uprv_malloc(byteLength);
460    if(clonedArray==NULL) {
461        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
462        return NULL;
463    }
464    uprv_memcpy(clonedArray, pv->v, byteLength);
465    if(pRows!=NULL) {
466        *pRows=pv->rows;
467    }
468    if(pColumns!=NULL) {
469        *pColumns=pv->columns-2;
470    }
471    return clonedArray;
472}
473
474U_CAPI UTrie2 * U_EXPORT2
475upvec_compactToUTrie2WithRowIndexes(UPropsVectors *pv, UErrorCode *pErrorCode) {
476    UPVecToUTrie2Context toUTrie2={ NULL };
477    upvec_compact(pv, upvec_compactToUTrie2Handler, &toUTrie2, pErrorCode);
478    utrie2_freeze(toUTrie2.trie, UTRIE2_16_VALUE_BITS, pErrorCode);
479    if(U_FAILURE(*pErrorCode)) {
480        utrie2_close(toUTrie2.trie);
481        toUTrie2.trie=NULL;
482    }
483    return toUTrie2.trie;
484}
485
486/*
487 * TODO(markus): Add upvec_16BitsToUTrie2() function that enumerates all rows, extracts
488 * some 16-bit field and builds and returns a UTrie2.
489 */
490
491U_CAPI void U_CALLCONV
492upvec_compactToUTrieHandler(void *context,
493                            UChar32 start, UChar32 end,
494                            int32_t rowIndex, uint32_t *row, int32_t columns,
495                            UErrorCode *pErrorCode) {
496    UPVecToUTrieContext *toUTrie=(UPVecToUTrieContext *)context;
497    if(start<UPVEC_FIRST_SPECIAL_CP) {
498        if(!utrie_setRange32(toUTrie->newTrie, start, end+1, (uint32_t)rowIndex, TRUE)) {
499            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
500        }
501    } else {
502        switch(start) {
503        case UPVEC_INITIAL_VALUE_CP:
504            toUTrie->initialValue=rowIndex;
505            break;
506        case UPVEC_START_REAL_VALUES_CP:
507            if(rowIndex>0xffff) {
508                /* too many rows for a 16-bit trie */
509                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
510            } else {
511                toUTrie->newTrie=utrie_open(NULL, NULL, toUTrie->capacity,
512                                            toUTrie->initialValue, toUTrie->initialValue,
513                                            toUTrie->latin1Linear);
514                if(toUTrie->newTrie==NULL) {
515                    *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
516                }
517            }
518            break;
519        default:
520            break;
521        }
522    }
523}
524
525U_CAPI void U_CALLCONV
526upvec_compactToUTrie2Handler(void *context,
527                             UChar32 start, UChar32 end,
528                             int32_t rowIndex, uint32_t *row, int32_t columns,
529                             UErrorCode *pErrorCode) {
530    UPVecToUTrie2Context *toUTrie2=(UPVecToUTrie2Context *)context;
531    if(start<UPVEC_FIRST_SPECIAL_CP) {
532        utrie2_setRange32(toUTrie2->trie, start, end, (uint32_t)rowIndex, TRUE, pErrorCode);
533    } else {
534        switch(start) {
535        case UPVEC_INITIAL_VALUE_CP:
536            toUTrie2->initialValue=rowIndex;
537            break;
538        case UPVEC_ERROR_VALUE_CP:
539            toUTrie2->errorValue=rowIndex;
540            break;
541        case UPVEC_START_REAL_VALUES_CP:
542            toUTrie2->maxValue=rowIndex;
543            if(rowIndex>0xffff) {
544                /* too many rows for a 16-bit trie */
545                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
546            } else {
547                toUTrie2->trie=utrie2_open(toUTrie2->initialValue,
548                                           toUTrie2->errorValue, pErrorCode);
549            }
550            break;
551        default:
552            break;
553        }
554    }
555}
556