1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru******************************************************************************* 3b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* Copyright (C) 2001-2011, International Business Machines 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru******************************************************************************* 6b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho* file name: bocsu.cpp 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* encoding: US-ASCII 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* tab size: 8 (not used) 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* indentation:4 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Author: Markus W. Scherer 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Modification history: 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 05/18/2001 weiv Made into separate module 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_COLLATION 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 2283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius#include "unicode/bytestream.h" 2383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius#include "unicode/utf16.h" 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "bocsu.h" 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * encode one difference value -0x10ffff..+0x10ffff in 1..3 bytes, 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * preserving lexical order 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CFUNC uint8_t * 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruu_writeDiff(int32_t diff, uint8_t *p) { 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(diff>=SLOPE_REACH_NEG_1) { 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(diff<=SLOPE_REACH_POS_1) { 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *p++=(uint8_t)(SLOPE_MIDDLE+diff); 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(diff<=SLOPE_REACH_POS_2) { 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *p++=(uint8_t)(SLOPE_START_POS_2+(diff/SLOPE_TAIL_COUNT)); 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *p++=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(diff<=SLOPE_REACH_POS_3) { 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru diff/=SLOPE_TAIL_COUNT; 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *p=(uint8_t)(SLOPE_START_POS_3+(diff/SLOPE_TAIL_COUNT)); 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p+=3; 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p[3]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru diff/=SLOPE_TAIL_COUNT; 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru diff/=SLOPE_TAIL_COUNT; 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT); 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *p=SLOPE_MAX; 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p+=4; 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t m; 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(diff>=SLOPE_REACH_NEG_2) { 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *p++=(uint8_t)(SLOPE_START_NEG_2+diff); 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *p++=(uint8_t)(SLOPE_MIN+m); 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(diff>=SLOPE_REACH_NEG_3) { 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p[2]=(uint8_t)(SLOPE_MIN+m); 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p[1]=(uint8_t)(SLOPE_MIN+m); 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *p=(uint8_t)(SLOPE_START_NEG_3+diff); 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p+=3; 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p[3]=(uint8_t)(SLOPE_MIN+m); 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p[2]=(uint8_t)(SLOPE_MIN+m); 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m); 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p[1]=(uint8_t)(SLOPE_MIN+m); 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *p=SLOPE_MIN; 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p+=4; 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return p; 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Encode the code points of a string as 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * a sequence of byte-encoded differences (slope detection), 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * preserving lexical order. 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Optimize the difference-taking for runs of Unicode text within 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * small scripts: 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Most small scripts are allocated within aligned 128-blocks of Unicode 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * code points. Lexical order is preserved if "prev" is always moved 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * into the middle of such a block. 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Additionally, "prev" is moved from anywhere in the Unihan 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * area into the middle of that area. 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note that the identical-level run in a sort key is generated from 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * NFD text - there are never Hangul characters included. 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 98b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoU_CFUNC void 9983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Corneliusu_writeIdenticalLevelRun(const UChar *s, int32_t length, icu::ByteSink &sink) { 100b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho char scratch[64]; 101b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t capacity; 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 103b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar32 prev=0; 104b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t i=0; 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while(i<length) { 106b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho char *buffer=sink.GetAppendBuffer(1, length*2, scratch, (int32_t)sizeof(scratch), &capacity); 107b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho uint8_t *p; 108b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // We must have capacity>=SLOPE_MAX_BYTES in case u_writeDiff() writes that much, 109b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // but we do not want to force the sink.GetAppendBuffer() to allocate 110b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // for a large min_capacity because we might actually only write one byte. 111b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(capacity<16) { 112b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho buffer=scratch; 113b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho capacity=(int32_t)sizeof(scratch); 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 115b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho p=reinterpret_cast<uint8_t *>(buffer); 116b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho uint8_t *lastSafe=p+capacity-SLOPE_MAX_BYTES; 117b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while(i<length && p<=lastSafe) { 118b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if(prev<0x4e00 || prev>=0xa000) { 119b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho prev=(prev&~0x7f)-SLOPE_REACH_NEG_1; 120b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } else { 121b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho /* 122b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Unihan U+4e00..U+9fa5: 123b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * double-bytes down from the upper end 124b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho */ 125b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho prev=0x9fff-SLOPE_REACH_POS_2; 126b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 127b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 128b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar32 c; 129b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho U16_NEXT(s, i, length, c); 130b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho p=u_writeDiff(c-prev, p); 131b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho prev=c; 132b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 133b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho sink.Append(buffer, (int32_t)(p-reinterpret_cast<uint8_t *>(buffer))); 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CFUNC int32_t 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruu_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p) { 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t *p0 = p; 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(first<0x4e00 || first>=0xa000) { 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru first=(first&~0x7f)-SLOPE_REACH_NEG_1; 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Unihan U+4e00..U+9fa5: 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * double-bytes down from the upper end 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru first=0x9fff-SLOPE_REACH_POS_2; 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru p=u_writeDiff(second-first, p); 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (int32_t)(p-p0); 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_COLLATION */ 155