1/*
2*******************************************************************************
3*
4*   Copyright (C) 2003, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7*******************************************************************************
8*   file name:  ucdmerge.c
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2003feb20
14*   created by: Markus W. Scherer
15*
16*   Simple tool for Unicode Character Database files with semicolon-delimited fields.
17*   Merges adjacent, identical per-code point data lines into one line with range syntax.
18*
19*   To compile, just call a C compiler/linker with this source file.
20*   On Windows: cl ucdmerge.c
21*/
22
23#include <stdio.h>
24#include <string.h>
25#include <stdlib.h>
26
/*
 * Skip leading blanks.
 * Returns a pointer to the first character of s that is neither a space
 * nor a horizontal tab; this may be the terminating NUL.
 */
static const char *
skipWhitespace(const char *s) {
    /* strspn() counts the initial run made up only of the listed characters */
    return s+strspn(s, " \t");
}
34
/*
 * Find where the data portion of l stops.
 * If l contains a '#', the data ends before it and before any spaces/tabs
 * that immediately precede it; otherwise the data ends at the terminating NUL.
 * Returns a pointer to the first character after the data.
 */
static char *
endOfData(const char *l) {
    char *limit=strchr(l, '#');

    if(limit==NULL) {
        /* no comment: the data runs up to the string terminator */
        return strchr(l, '\0');
    }

    /* back up over blanks between the data and the '#', but never before l */
    while(limit!=l && (limit[-1]==' ' || limit[-1]=='\t')) {
        --limit;
    }
    return limit;
}
52
/*
 * Compare the data portions of two lines.
 * The data portion starts right after the first semicolon and ends where
 * endOfData() says it does (before a '#' comment and the blanks preceding it,
 * or at the end of the string).
 * Returns non-zero if both data portions are byte-for-byte identical.
 * Returns 0 if they differ or if either line has no semicolon.
 */
static int
sameData(const char *l1, const char *l2) {
    const char *end1, *end2;
    size_t length;

    /* find the first semicolon in each line; without one there is no data to compare */
    l1=strchr(l1, ';');
    l2=strchr(l2, ';');
    if(l1==NULL || l2==NULL) {
        return 0;
    }
    ++l1;
    ++l2;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(l1);
    end2=endOfData(l2);

    /* compare the line data portions; endOfData() never returns less than its argument */
    length=(size_t)(end1-l1);
    return length==(size_t)(end2-l2) && 0==memcmp(l1, l2, length);
}
70
71extern int
72main(int argc, const char *argv[]) {
73    static char line[2000], firstLine[2000], lastLine[2000];
74    char *end;
75    long first, last, c;
76    int finished;
77
78    first=last=-1;
79    finished=0;
80
81    for(;;) {
82        if(gets(line)!=NULL) {
83            /* parse the initial code point, if any */
84            c=strtol(line, &end, 16);
85            if(end!=line && *skipWhitespace(end)==';') {
86                /* single code point followed by semicolon and data, keep c */
87            } else {
88                c=-1;
89            }
90        } else {
91            line[0]=0;
92            c=-1;
93            finished=1;
94        }
95
96        if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
97            /* output the current range */
98            if(first==last) {
99                /* there was no range, just output the one line we found */
100                puts(firstLine);
101            } else {
102                /* there was a real range, merge their lines */
103                end=strchr(lastLine, '#');
104                if(end==NULL) {
105                    /* no comment in second line */
106                    printf("%04lX..%04lX%s\n",
107                            first, last,            /* code point range */
108                            strchr(firstLine, ';'));/* first line starting from the first ; */
109                } else if(strchr(firstLine, '#')==NULL) {
110                    /* no comment in first line */
111                    printf("%04lX..%04lX%s%s\n",
112                            first, last,            /* code point range */
113                            strchr(firstLine, ';'), /* first line starting from the first ; */
114                            end);                   /* comment from second line */
115                } else {
116                    /* merge comments from both lines */
117                    printf("%04lX..%04lX%s..%s\n",
118                            first, last,            /* code point range */
119                            strchr(firstLine, ';'), /* first line starting from the first ; */
120                            skipWhitespace(end+1)); /* comment from second line, after # and spaces */
121                }
122            }
123            first=last=-1;
124        }
125
126        if(c<0) {
127            if(finished) {
128                break;
129            }
130
131            /* no data on this line, output as is */
132            puts(line);
133        } else {
134            /* data on this line, store for possible range compaction */
135            if(last<0) {
136                /* set as the first line in a possible range */
137                first=last=c;
138                strcpy(firstLine, line);
139                lastLine[0]=0;
140            } else /* must be c==(last+1) && sameData() because of previous conditions */ {
141                /* continue with the current range */
142                last=c;
143                strcpy(lastLine, line);
144            }
145        }
146    }
147
148    return 0;
149}
150