/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucdmerge.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003feb20
*   created by: Markus W. Scherer
*
*   Simple tool for Unicode Character Database files with semicolon-delimited fields.
*   Merges adjacent, identical per-code point data lines into one line with range syntax.
*
*   To compile, just call a C compiler/linker with this source file.
*   On Windows: cl ucdmerge.c
*/

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* capacity of each line buffer, including the terminating NUL */
#define MAX_LINE_LENGTH 2000

/*
 * Highest Unicode code point. Larger parsed values are not treated as data
 * lines; this also keeps last+1 from overflowing when strtol() saturates.
 */
#define MAX_CODE_POINT 0x10ffffL

/* results of readLine() */
enum {
    READ_OK,        /* a line was stored in the buffer */
    READ_END,       /* end of input or read error, nothing was stored */
    READ_TOO_LONG   /* the input line does not fit into the buffer */
};

/* return a pointer to the first character in s that is not a space or tab */
static const char *
skipWhitespace(const char *s) {
    while(*s==' ' || *s=='\t') {
        ++s;
    }
    return s;
}

/*
 * Return the first character position after the end of the data in l:
 * the start of the whitespace that precedes a '#' comment,
 * or the terminating NUL if there is no comment.
 */
static const char *
endOfData(const char *l) {
    const char *end;

    end=strchr(l, '#');
    if(end==NULL) {
        return l+strlen(l);
    }

    /* ignore whitespace before the comment */
    while(end!=l && (*(end-1)==' ' || *(end-1)=='\t')) {
        --end;
    }
    return end;
}

/*
 * Compare the data portions of two lines: everything after the first
 * semicolon up to the end of data (see endOfData()).
 * Returns non-zero if they are identical, 0 if they differ or
 * if either line does not contain a semicolon.
 */
static int
sameData(const char *l1, const char *l2) {
    const char *end1, *end2;
    size_t length;

    /* find the first semicolon in each line */
    l1=strchr(l1, ';');
    l2=strchr(l2, ';');
    if(l1==NULL || l2==NULL) {
        return 0;
    }
    ++l1;
    ++l2;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(l1);
    end2=endOfData(l2);

    /* compare the line data portions */
    length=(size_t)(end1-l1);
    return length==(size_t)(end2-l2) && 0==memcmp(l1, l2, length);
}

/*
 * Read one line from stdin into buffer (capacity bytes including the NUL)
 * and remove the trailing newline, like gets() but bounded.
 * Returns READ_OK, READ_END or READ_TOO_LONG.
 */
static int
readLine(char *buffer, size_t capacity) {
    size_t length;
    int next;

    if(fgets(buffer, (int)capacity, stdin)==NULL) {
        return READ_END;
    }

    length=strlen(buffer);
    if(length>0 && buffer[length-1]=='\n') {
        buffer[length-1]=0;
        return READ_OK;
    }
    if(length+1<capacity) {
        /* the buffer was not filled: last line of the input without a newline */
        return READ_OK;
    }

    /*
     * The buffer is full and no newline was stored.
     * The line still fits if it ends exactly here.
     */
    next=getc(stdin);
    if(next==EOF || next=='\n') {
        return READ_OK;
    }
    return READ_TOO_LONG;
}

/*
 * Parse the initial hexadecimal code point of a data line.
 * Returns the code point if the line starts with a single code point
 * (0..MAX_CODE_POINT) followed by optional whitespace and a semicolon,
 * otherwise -1.
 */
static long
parseCodePoint(const char *line) {
    char *end;
    long c;

    c=strtol(line, &end, 16);
    if(end!=line && *skipWhitespace(end)==';' && 0<=c && c<=MAX_CODE_POINT) {
        /* single code point followed by semicolon and data */
        return c;
    }
    return -1;
}

/*
 * Output the collected range first..last.
 * firstLine is the complete first line of the range; lastLine is the complete
 * last line of the range and is only used if first!=last.
 * Both lines must contain a semicolon.
 */
static void
printRange(long first, long last, const char *firstLine, const char *lastLine) {
    const char *data, *comment;

    if(first==last) {
        /* there was no range, just output the one line we found */
        puts(firstLine);
        return;
    }

    /* there was a real range, merge their lines */
    data=strchr(firstLine, ';');    /* first line starting from the first ; */
    comment=strchr(lastLine, '#');  /* comment from the last line, if any */
    if(comment==NULL) {
        /* no comment in last line */
        printf("%04lX..%04lX%s\n", first, last, data);
    } else if(strchr(firstLine, '#')==NULL) {
        /* no comment in first line */
        printf("%04lX..%04lX%s%s\n", first, last, data, comment);
    } else {
        /* merge comments from both lines: append the last comment after # and spaces */
        printf("%04lX..%04lX%s..%s\n", first, last, data, skipWhitespace(comment+1));
    }
}

/*
 * Filter stdin to stdout: lines without a leading "code point;" are copied
 * as is, runs of lines with consecutive code points and identical data are
 * merged into one line with first..last range syntax.
 * Returns 0 on success, EXIT_FAILURE if an input line is too long or
 * reading stdin failed.
 */
extern int
main(int argc, char *argv[]) {
    static char line[MAX_LINE_LENGTH], firstLine[MAX_LINE_LENGTH], lastLine[MAX_LINE_LENGTH];
    long first, last, c;
    int status, finished;

    (void)argc;
    (void)argv;

    first=last=-1;
    finished=0;

    for(;;) {
        status=readLine(line, sizeof(line));
        if(status==READ_TOO_LONG) {
            fprintf(stderr, "ucdmerge: input line longer than %d characters\n", MAX_LINE_LENGTH-1);
            return EXIT_FAILURE;
        } else if(status==READ_OK) {
            /* parse the initial code point, if any */
            c=parseCodePoint(line);
        } else {
            line[0]=0;
            c=-1;
            finished=1;
        }

        /* sameData() is only reached for c==(last+1), so line is a data line with a semicolon */
        if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
            /* output the current range */
            printRange(first, last, firstLine, lastLine);
            first=last=-1;
        }

        if(c<0) {
            if(finished) {
                break;
            }

            /* no data on this line, output as is */
            puts(line);
        } else {
            /* data on this line, store for possible range compaction */
            if(last<0) {
                /* set as the first line in a possible range */
                first=last=c;
                strcpy(firstLine, line);
                lastLine[0]=0;
            } else /* must be c==(last+1) && sameData() because of previous conditions */ {
                /* continue with the current range */
                last=c;
                strcpy(lastLine, line);
            }
        }
    }

    if(ferror(stdin)) {
        fprintf(stderr, "ucdmerge: error reading standard input\n");
        return EXIT_FAILURE;
    }
    return 0;
}