1ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho/*
2ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*******************************************************************************
3ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*
4ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   Copyright (C) 2003, International Business Machines
5ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   Corporation and others.  All Rights Reserved.
6ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*
7ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*******************************************************************************
8ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   file name:  ucdmerge.c
9ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   encoding:   US-ASCII
10ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   tab size:   8 (not used)
11ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   indentation:4
12ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*
13ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   created on: 2003feb20
14ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   created by: Markus W. Scherer
15ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*
16ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   Simple tool for Unicode Character Database files with semicolon-delimited fields.
17ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   Merges adjacent, identical per-code point data lines into one line with range syntax.
18ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*
19ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   To compile, just call a C compiler/linker with this source file.
20ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*   On Windows: cl ucdmerge.c
21ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho*/
22ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
23ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho#include <stdio.h>
24ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho#include <string.h>
25ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho#include <stdlib.h>
26ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
/*
 * Advance past any leading blanks and tabs.
 * Returns a pointer to the first character of s that is neither ' ' nor '\t'
 * (possibly the terminating NUL).
 */
static const char *
skipWhitespace(const char *s) {
    for(;;) {
        char ch=*s;

        if(ch!=' ' && ch!='\t') {
            return s;
        }
        ++s;
    }
}
34ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
/*
 * Find where the data portion of the string l stops.
 *
 * If l contains a '#', the data ends before the first '#', not counting any
 * blanks or tabs that immediately precede it. Otherwise the data runs to the
 * end of the string.
 *
 * Returns a pointer to the first character after the data.
 */
static char *
endOfData(const char *l) {
    char *limit=strchr(l, '#');

    if(limit==NULL) {
        /* no comment: the data extends to the terminating NUL */
        return strchr(l, 0);
    }

    /* back up over whitespace in front of the comment, but never before l */
    while(limit!=l && (limit[-1]==' ' || limit[-1]=='\t')) {
        --limit;
    }
    return limit;
}
52ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
/*
 * Compare the data portions of two lines.
 *
 * The data portion of a line starts right after its first semicolon and
 * stops where endOfData() says it does (before a '#' comment and the
 * whitespace in front of it, or at the end of the string).
 *
 * Returns non-zero if both data portions are byte-for-byte identical,
 * 0 otherwise. A line without any semicolon has no data portion and
 * never compares equal.
 */
static int
sameData(const char *l1, const char *l2) {
    const char *semicolon1, *semicolon2;
    const char *end1, *end2;
    size_t length1, length2;

    /* find the first semicolon in each line */
    semicolon1=strchr(l1, ';');
    semicolon2=strchr(l2, ';');
    if(semicolon1==NULL || semicolon2==NULL) {
        /* do not compute NULL+1; such a line cannot match anything */
        return 0;
    }
    l1=semicolon1+1;
    l2=semicolon2+1;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(l1);
    end2=endOfData(l2);

    /* compare the line data portions; endOfData() never returns less than its argument */
    length1=(size_t)(end1-l1);
    length2=(size_t)(end2-l2);
    return length1==length2 && 0==memcmp(l1, l2, length1);
}
70ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
71ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehoextern int
72ea1f1813c8b13a850b13f256aeb5152bb0942e81clairehomain(int argc, const char *argv[]) {
73ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    static char line[2000], firstLine[2000], lastLine[2000];
74ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    char *end;
75ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    long first, last, c;
76ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    int finished;
77ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
78ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    first=last=-1;
79ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    finished=0;
80ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
81ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    for(;;) {
82ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        if(gets(line)!=NULL) {
83ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            /* parse the initial code point, if any */
84ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            c=strtol(line, &end, 16);
85ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            if(end!=line && *skipWhitespace(end)==';') {
86ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                /* single code point followed by semicolon and data, keep c */
87ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            } else {
88ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                c=-1;
89ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            }
90ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        } else {
91ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            line[0]=0;
92ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            c=-1;
93ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            finished=1;
94ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        }
95ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
96ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
97ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            /* output the current range */
98ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            if(first==last) {
99ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                /* there was no range, just output the one line we found */
100ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                puts(firstLine);
101ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            } else {
102ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                /* there was a real range, merge their lines */
103ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                end=strchr(lastLine, '#');
104ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                if(end==NULL) {
105ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    /* no comment in second line */
106ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    printf("%04lX..%04lX%s\n",
107ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                            first, last,            /* code point range */
108ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                            strchr(firstLine, ';'));/* first line starting from the first ; */
109ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                } else if(strchr(firstLine, '#')==NULL) {
110ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    /* no comment in first line */
111ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    printf("%04lX..%04lX%s%s\n",
112ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                            first, last,            /* code point range */
113ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                            strchr(firstLine, ';'), /* first line starting from the first ; */
114ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                            end);                   /* comment from second line */
115ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                } else {
116ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    /* merge comments from both lines */
117ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                    printf("%04lX..%04lX%s..%s\n",
118ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                            first, last,            /* code point range */
119ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                            strchr(firstLine, ';'), /* first line starting from the first ; */
120ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                            skipWhitespace(end+1)); /* comment from second line, after # and spaces */
121ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                }
122ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            }
123ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            first=last=-1;
124ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        }
125ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
126ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        if(c<0) {
127ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            if(finished) {
128ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                break;
129ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            }
130ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
131ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            /* no data on this line, output as is */
132ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            puts(line);
133ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        } else {
134ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            /* data on this line, store for possible range compaction */
135ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            if(last<0) {
136ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                /* set as the first line in a possible range */
137ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                first=last=c;
138ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                strcpy(firstLine, line);
139ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                lastLine[0]=0;
140ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            } else /* must be c==(last+1) && sameData() because of previous conditions */ {
141ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                /* continue with the current range */
142ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                last=c;
143ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho                strcpy(lastLine, line);
144ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho            }
145ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho        }
146ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    }
147ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho
148ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho    return 0;
149ea1f1813c8b13a850b13f256aeb5152bb0942e81claireho}
150