1//
2// A utility for finding substring embeddings in patterns
3
4#include <stdio.h>
5#include <string.h>
6#include <stdlib.h>
7
8#define MAXPATHS (256*1024)
9
10//
11//
12static void die(
13  const char*msg
14) {
15  fprintf(stderr,"%s\n",msg);
16  exit(1);
17}
18
19
20// Finds the index of an entry, only used on xxx_key arrays
21// Caveat: the table has to be sorted
22static int find_in(
23  char *tab[],
24  int max,
25  const char *pat
26) {
27  int left=0, right=max-1;
28  while (left <= right) {
29    int mid = ((right-left)/2)+left;
30    int v   = strcmp(pat,tab[mid]);
31    if (v>0) {
32      left = mid + 1;
33    } else if (v<0) {
34      right = mid -1;
35    } else {
36      return mid;
37    }
38  }
39  return -1;
40}
41
42
43// used by partition (which is used by qsort_arr)
44//
45static void swap2(
46  char *a[],
47  char *b[],
48  int i,
49  int j
50) {
51  if (i==j) return;
52  char*v;
53  v=a[i]; a[i]=a[j]; a[j]=v;
54  v=b[i]; b[i]=b[j]; b[j]=v;
55}
56
57
58// used by qsort_arr
59//
60static int partition(
61  char *a[],
62  char *b[],
63  int left,
64  int right,
65  int p
66) {
67  const char *pivotValue = a[p];
68  int i;
69  swap2(a,b,p,right); // Move pivot to end
70  p = left;
71  for (i=left; i<right; i++) {
72    if (strcmp(a[i],pivotValue)<=0) {
73      swap2(a,b,p,i);
74      p++;
75    }
76  }
77  swap2(a,b,right,p); // Move pivot to its final place
78  return p;
79}
80
81
82//
83//
84static void qsort_arr(
85  char *a[],
86  char *b[],
87  int left,
88  int right
89) {
90  while (right > left) {
91    int p = left + (right-left)/2; //select a pivot
92    p = partition(a,b, left, right, p);
93    if ((p-1) - left < right - (p+1)) {
94      qsort_arr(a,b, left, p-1);
95      left  = p+1;
96    } else {
97      qsort_arr(a,b, p+1, right);
98      right = p-1;
99    }
100  }
101}
102
103
104// Removes extra '0' entries from the string
105//
106static char* compact(
107  char *expr
108) {
109  int l=strlen(expr);
110  int i,j;
111  for (i=0,j=0; i<l; i++) {
112    if (expr[i]!='0') expr[j++] = expr[i];
113  }
114  expr[j]=0;
115  return expr;
116}
117
118
119// convert 'n1im' to 0n1i0m0 expressed as a string
120//
121static void expand(
122  char *expr,
123  const char *pat,
124  int l
125) {
126  int  el = 0;
127  char last = '.';
128  int  i;
129  for (i=0; i<l; i++) {
130    char c = pat[i];
131    if ( (last<'0' || last>'9')
132      && (c   <'0' || c   >'9')
133      ) {
134      expr[el++] = '0';
135    }
136    expr[el++] = c;
137    last = c;
138  }
139  if (last<'0' || last>'9') expr[el++] = '0';
140  expr[el]=0;
141}
142
143
144// Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
145// The second pattern needs to be a right side match of the first
146// (modulo digits)
147static char *combine(
148  char *expr,
149  const char *subexpr
150) {
151  int l1 = strlen(expr);
152  int l2 = strlen(subexpr);
153  int off = l1-l2;
154  int j;
155  // this works also for utf8 sequences because the substring is identical
156  // to the last substring-length bytes of expr except for the (single byte)
157  // hyphenation encoders
158  for (j=0; j<l2; j++) {
159    if (subexpr[j]>expr[off+j]) {
160      expr[off+j] = subexpr[j];
161    }
162  }
163  return expr;
164}
165
166
167//
168//
169int main(int argc, const char* argv[]) {
170  FILE *in, *out;
171  char *pattab_key[MAXPATHS];
172  char *pattab_val[MAXPATHS];
173  int   patterns = 0;
174  char *newpattab_key[MAXPATHS];
175  char *newpattab_val[MAXPATHS];
176  int   newpatterns = 0;
177  char format[132]; // 64+65+newline+zero+spare
178  int p;
179  if (argc!=3) die("Usage: <orig-file> <new-file>\n");
180  if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input");
181  if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output");
182  // read all patterns and split in pure text (_key) & expanded patterns (_val)
183  while(fgets(format,132,in)) {
184    int l = strlen(format);
185    if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp
186    if (format[0]=='%' || format[0]==0) {
187      // skip
188    } else {
189      if (format[l-1]=='%') {
190        l--;
191        format[l] = 0; // remove '%'
192      }
193      int i,j;
194      char *pat = (char*) malloc(l+1);
195      char *org = (char*) malloc(l*2+1);
196      expand(org,format,l);
197      // remove hyphenation encoders (digits) from pat
198      for (i=0,j=0; i<l; i++) {
199        // odd, but utf-8 proof
200        char c = format[i];
201        if (c<'0' || c>'9') pat[j++]=c;
202      }
203      pat[j]=0;
204      p = patterns;
205      pattab_key[patterns]   = pat;
206      pattab_val[patterns++] = org;
207      if (patterns>MAXPATHS) die("to many base patterns");
208    }
209  }
210  fclose(in);
211  // As we use binairy search, make sure it is sorted
212  qsort_arr(pattab_key,pattab_val,0,patterns-1);
213
214  for (p=0; p<patterns; p++) {
215    char *pat = pattab_key[p];
216    int   patsize = strlen(pat);
217    int   j,l;
218    for (l=1; l<=patsize; l++) {
219      for (j=1; j<=l; j++) {
220        int i = l-j;
221        int  subpat_ndx;
222        char subpat[132];
223        strncpy(subpat,pat+i,j); subpat[j]=0;
224        if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) {
225          int   newpat_ndx;
226          char *newpat=malloc(l+1);
227      //printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]);
228          strncpy(newpat, pat+0,l); newpat[l]=0;
229          if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) {
230            char *neworg = malloc(132); // TODO: compute exact length
231            expand(neworg,newpat,l);
232            newpattab_key[newpatterns]   = newpat;
233            newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]);
234            if (newpatterns>MAXPATHS) die("to many new patterns");
235    //printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_val[p],pattab_val[subpat_ndx],neworg);
236          } else {
237            free(newpat);
238            newpattab_val[newpat_ndx] = combine(
239              newpattab_val[newpat_ndx], pattab_val[subpat_ndx] );
240          }
241        }
242      }
243    }
244  }
245
246  /* for some tiny extra speed, one could forget the free()s
247   * as the memory is freed anyway on exit().
248   * However, the gain is minimal and now the code can be cleanly
249   * incorporated into other code */
250  for (p=0; p<newpatterns; p++) {
251    fprintf(out,"%s\n",compact(newpattab_val[p]));
252    free(newpattab_key[p]);
253    free(newpattab_val[p]);
254  }
255  fclose(out);
256
257  for (p=0; p<patterns; p++) {
258    free(pattab_key[p]);
259    free(pattab_val[p]);
260  }
261  return 0;
262}
263