/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucdmerge.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003feb20
*   created by: Markus W. Scherer
*
*   Simple tool for Unicode Character Database files with semicolon-delimited fields.
*   Merges adjacent, identical per-code point data lines into one line with range syntax.
*   Reads its input from stdin and writes the result to stdout.
*
*   To compile, just call a C compiler/linker with this source file.
*   On Windows: cl ucdmerge.c
*/

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* capacity of each line buffer in bytes, including the terminating NUL */
enum { LINE_CAPACITY=2000 };

/* return a pointer to the first character of s that is neither a space nor a tab */
static const char *
skipWhitespace(const char *s) {
    while(*s==' ' || *s=='\t') {
        ++s;
    }
    return s;
}

/*
 * Return the first character position after the end of the data in l:
 * the start of the whitespace run preceding the first '#', or the end of
 * the string if there is no '#'.
 */
static const char *
endOfData(const char *l) {
    const char *end=strchr(l, '#');

    if(end==NULL) {
        return l+strlen(l);
    }
    /* ignore whitespace before the comment */
    while(end!=l && (end[-1]==' ' || end[-1]=='\t')) {
        --end;
    }
    return end;
}

/*
 * Compare the data portions of two lines: everything after the first
 * semicolon up to the end of data (see endOfData()).
 * Returns nonzero if both portions are byte-for-byte identical.
 * A line without a semicolon has no data portion and never compares equal.
 */
static int
sameData(const char *l1, const char *l2) {
    const char *end1, *end2;
    size_t length;

    l1=strchr(l1, ';');
    l2=strchr(l2, ';');
    if(l1==NULL || l2==NULL) {
        return 0;
    }
    ++l1;
    ++l2;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(l1);
    end2=endOfData(l2);

    /* compare the line data portions */
    length=(size_t)(end1-l1);
    return length==(size_t)(end2-l2) && 0==memcmp(l1, l2, length);
}

/*
 * Read one line from stdin into line[capacity] and remove the trailing newline.
 * Returns 1 if a line was read, 0 at end of input (or on a read error; line is
 * then set to the empty string), and -1 if the line does not fit into the buffer.
 */
static int
readLine(char *line, size_t capacity) {
    size_t length;
    int next;

    if(fgets(line, (int)capacity, stdin)==NULL) {
        line[0]=0;
        return 0;
    }

    length=strlen(line);
    if(length>0 && line[length-1]=='\n') {
        line[length-1]=0;
        return 1;
    }
    if(length+1<capacity) {
        /* the buffer was not filled: the last line of the input has no newline */
        return 1;
    }

    /*
     * The buffer is full without a newline. This is still a complete line
     * if the very next character ends it; consume that terminator.
     */
    next=getc(stdin);
    if(next==EOF || next=='\n') {
        return 1;
    }
    return -1;
}

/*
 * Parse the initial hexadecimal code point of a data line.
 * Returns the code point if the line starts with a single non-negative number
 * that is followed (after optional spaces/tabs) by a semicolon, otherwise -1.
 */
static long
parseCodePoint(const char *line) {
    char *end;
    long c=strtol(line, &end, 16);

    if(end==line || *skipWhitespace(end)!=';' || c<0) {
        return -1;
    }
    return c;
}

/*
 * Output one merged line for the code point range first..last.
 * The data fields are taken from firstLine, starting at its first semicolon;
 * the comments of firstLine and lastLine are combined if present.
 * firstLine must contain a semicolon.
 */
static void
printRange(long first, long last, const char *firstLine, const char *lastLine) {
    const char *data=strchr(firstLine, ';');    /* first line starting from the first ; */
    const char *comment=strchr(lastLine, '#');  /* comment of the last line, if any */
    unsigned long start=(unsigned long)first, limit=(unsigned long)last;

    if(comment==NULL) {
        /* no comment in last line */
        printf("%04lX..%04lX%s\n", start, limit, data);
    } else if(strchr(firstLine, '#')==NULL) {
        /* no comment in first line, append the one from the last line */
        printf("%04lX..%04lX%s%s\n", start, limit, data, comment);
    } else {
        /* merge comments from both lines: first comment, "..", last comment after # and spaces */
        printf("%04lX..%04lX%s..%s\n", start, limit, data, skipWhitespace(comment+1));
    }
}

/*
 * Copy stdin to stdout, collapsing each run of lines that have consecutive
 * code points and identical data into a single "first..last" range line.
 * Lines that do not start with "codepoint;" are passed through unchanged.
 * Returns EXIT_FAILURE if an input line is too long for the line buffer
 * (a pending range is then not written) or if a stream error occurred.
 */
int
main(void) {
    static char line[LINE_CAPACITY], firstLine[LINE_CAPACITY], lastLine[LINE_CAPACITY];
    long first, last, c;
    int status, finished;

    first=last=-1;
    finished=0;

    for(;;) {
        status=readLine(line, sizeof(line));
        if(status<0) {
            fprintf(stderr, "ucdmerge: input line longer than %d characters\n", LINE_CAPACITY-1);
            return EXIT_FAILURE;
        }
        if(status>0) {
            /* parse the initial code point, if any */
            c=parseCodePoint(line);
        } else {
            c=-1;
            finished=1;
        }

        /*
         * c>=-1 here, so c-1 cannot overflow (unlike last+1 when strtol()
         * saturated at LONG_MAX). sameData() is only reached with c>=1,
         * that is, when line is known to contain a semicolon.
         */
        if(last>=0 && (c-1!=last || !sameData(firstLine, line))) {
            /* output the current range */
            if(first==last) {
                /* there was no range, just output the one line we found */
                puts(firstLine);
            } else {
                /* there was a real range, merge their lines */
                printRange(first, last, firstLine, lastLine);
            }
            first=last=-1;
        }

        if(c<0) {
            if(finished) {
                break;
            }

            /* no data on this line, output as is */
            puts(line);
        } else {
            /* data on this line, store for possible range compaction */
            if(last<0) {
                /* set as the first line in a possible range */
                first=last=c;
                strcpy(firstLine, line);
                lastLine[0]=0;
            } else /* must be c==(last+1) && sameData() because of previous conditions */ {
                /* continue with the current range */
                last=c;
                strcpy(lastLine, line);
            }
        }
    }

    /* report failures that would otherwise silently truncate the output */
    if(fflush(stdout)!=0 || ferror(stdin) || ferror(stdout)) {
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}