1/* Regular expression tests.
2   Copyright (C) 2003 Free Software Foundation, Inc.
3   This file is part of the GNU C Library.
4   Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
5
6   The GNU C Library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License as published by the Free Software Foundation; either
9   version 2.1 of the License, or (at your option) any later version.
10
11   The GNU C Library is distributed in the hope that it will be useful,
12   but WITHOUT ANY WARRANTY; without even the implied warranty of
13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14   Lesser General Public License for more details.
15
16   You should have received a copy of the GNU Lesser General Public
17   License along with the GNU C Library; if not, write to the Free
18   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19   02110-1301 USA.  */
20
21#ifdef HAVE_CONFIG_H
22#include "config.h"
23#endif
24
25#include <sys/types.h>
26#ifdef HAVE_MCHECK_H
27#include <mcheck.h>
28#endif
29#include <regex.h>
30#include <stdio.h>
31#include <stdlib.h>
32#include <string.h>
33#include <locale.h>
34#include <getopt.h>
35
/* Expand the escape letters used by the test file, in place: in STR
   every 'N' becomes a newline, 'T' a tab, 'S' a space and 'Z' a NUL
   byte.  Scanning always resumes with the byte after the letter just
   rewritten, so letters that follow a 'Z' (up to the original
   terminator) are still processed.  */
static void
replace_special_chars (char *str)
{
  char *hit = strpbrk (str, "NTSZ");

  while (hit != NULL)
    {
      if (*hit == 'N')
        *hit = '\n';
      else if (*hit == 'T')
        *hit = '\t';
      else if (*hit == 'S')
        *hit = ' ';
      else if (*hit == 'Z')
        *hit = '\0';

      hit = strpbrk (hit + 1, "NTSZ");
    }
}
48
/* Convert the word-boundary notation in STR, in place, to the
   backslash form: each "[[:<:]]" is rewritten as "\<" and each
   "[[:>:]]" as "\>".  Every rewrite shortens the string by five
   bytes; the remainder, terminating NUL included, is shifted down.  */
static void
glibc_re_syntax (char *str)
{
  /* One past the terminating NUL, so that moving the tail also moves
     the terminator.  */
  char *limit = str + strlen (str) + 1;
  char *cur = strstr (str, "[[:");

  while (cur != NULL)
    {
      char kind = cur[3];

      if ((kind != '<' && kind != '>')
          || strncmp (cur + 4, ":]]", 3) != 0)
        {
          /* Some other bracket construct: step over its opener.  */
          cur = strstr (cur + 3, "[[:");
          continue;
        }

      cur[0] = '\\';
      cur[1] = kind;
      memmove (cur + 2, cur + 7, limit - (cur + 7));
      limit -= 5;
      cur = strstr (cur + 2, "[[:");
    }
}
67
/* Store at DST the two-byte UTF-8 sequence that stands in for the
   ASCII letter C and return the position just past it:

     a -> U+00E1 (a acute)   A -> U+00C1 (A acute)
     b -> U+010D (c caron)   B -> U+010C (C caron)
     c -> U+010F (d caron)   C -> U+010E (D caron)
     d -> U+00E9 (e acute)   D -> U+00C9 (E acute)

   For any other C nothing is stored and DST is returned unchanged.  */
static char *
mb_replace (char *dst, const char c)
{
  static const struct
  {
    char ascii;
    char utf8[2];
  } map[] =
    {
      { 'a', { '\xc3', '\xa1' } },
      { 'A', { '\xc3', '\x81' } },
      { 'b', { '\xc4', '\x8d' } },
      { 'B', { '\xc4', '\x8c' } },
      { 'c', { '\xc4', '\x8f' } },
      { 'C', { '\xc4', '\x8e' } },
      { 'd', { '\xc3', '\xa9' } },
      { 'D', { '\xc3', '\x89' } }
    };
  size_t k;

  for (k = 0; k < sizeof (map) / sizeof (map[0]); ++k)
    if (map[k].ascii == c)
      {
        *dst++ = map[k].utf8[0];
        *dst++ = map[k].utf8[1];
        break;
      }

  return dst;
}
112
/* Return a freshly malloc'ed copy of STR in which every character that
   occurs in LETTERS has been replaced by its two-byte stand-in (see
   mb_replace); all other characters are copied unchanged.  Returns
   NULL when STR is NULL or when the allocation fails.  The caller
   frees the result.  */
static char *
mb_frob_string (const char *str, const char *letters)
{
  char *out, *wp;

  if (str == NULL)
    return NULL;

  /* Each input byte yields at most two output bytes.  */
  out = malloc (2 * strlen (str) + 1);
  if (out == NULL)
    return NULL;

  wp = out;
  while (*str != '\0')
    {
      char ch = *str++;

      if (strchr (letters, ch) != NULL)
        wp = mb_replace (wp, ch);
      else
        *wp++ = ch;
    }
  *wp = '\0';

  return out;
}
134
/* Variant of mb_frob_string for patterns: characters occurring in
   LETTERS are replaced by their two-byte stand-ins, except inside
   "[:" ... ":]", "[." ... ".]" and "[=" ... "=]", whose contents are
   copied verbatim.  Returns a malloc'ed string the caller frees, or
   NULL when STR is NULL or the allocation fails.  */

static char *
mb_frob_pattern (const char *str, const char *letters)
{
  char *out, *wp;
  const char *rp;
  int bracketed = 0;

  if (str == NULL)
    return NULL;

  /* Each input byte yields at most two output bytes.  */
  out = malloc (2 * strlen (str) + 1);
  if (out == NULL)
    return NULL;

  wp = out;
  for (rp = str; *rp != '\0'; ++rp)
    {
      if (!bracketed && strchr (letters, *rp) != NULL)
        {
          wp = mb_replace (wp, *rp);
          continue;
        }

      if (bracketed)
        {
          /* Being inside a bracket construct implies an opener was
             already seen, so rp[-1] lies within the string.  */
          if (*rp == ']' && strchr (":.=", rp[-1]) != NULL)
            bracketed = 0;
        }
      else if (*rp == '[' && strchr (":.=", rp[1]) != NULL)
        bracketed = 1;

      *wp++ = *rp;
    }
  *wp = '\0';

  return out;
}
166
/* Compare the match recorded in RM[IDX] against the expectation MATCH,
   where offsets in RM refer to STRING:

     "-"      the slot must be unset (both offsets -1);
     "@TEXT"  the slot must be an empty match, and STRING at that
              offset must start with TEXT (when TEXT is empty, a single
              byte is compared against TEXT's terminator);
     other    the matched substring must equal MATCH exactly.

   Returns 0 when the expectation holds.  Otherwise prints a diagnostic
   prefixed with FAIL and returns 1.  */
static int
check_match (regmatch_t *rm, int idx, const char *string,
             const char *match, const char *fail)
{
  regoff_t start = rm[idx].rm_so;
  regoff_t stop = rm[idx].rm_eo;

  if (strcmp (match, "-") == 0)
    {
      if (start != -1 || stop != -1)
        {
          printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
          return 1;
        }
      return 0;
    }

  if (start == -1 || stop == -1)
    {
      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
      return 1;
    }

  if (match[0] == '@')
    {
      const char *text = match + 1;
      size_t cmp_len = strlen (text);

      if (start != stop)
        {
          printf ("%s rm[%d] not empty\n", fail, idx);
          return 1;
        }

      /* Always compare at least one byte.  */
      if (cmp_len == 0)
        cmp_len = 1;

      if (strncmp (string + start, text, cmp_len) != 0)
        goto mismatch;
      return 0;
    }

  {
    size_t span = (size_t) (stop - start);

    if (span == strlen (match)
        && strncmp (string + start, match, span) == 0)
      return 0;
  }

mismatch:
  printf ("%s rm[%d] not matching %s\n", fail, idx, match);
  return 1;
}
212
/* Run a single test case.  PATTERN is compiled with CFLAGS.

   When EFLAGS is -1 the case expects compilation to fail and STRING
   names the expected error without its "REG_" prefix; the name "EMPTY"
   is additionally accepted when compilation succeeds.

   Otherwise the compiled pattern is executed on STRING with EFLAGS.
   A NULL EXPECT means the execution is expected not to match.  With a
   non-NULL EXPECT, and unless REG_NOSUB is set in CFLAGS, the overall
   match is checked against EXPECT and, when MATCHES is non-NULL, the
   subexpressions 1..9 are checked against the comma-separated entries
   of MATCHES; slots beyond the last entry must be unset.  MATCHES is
   split in place while checking and restored afterwards.

   Returns 0 on success; otherwise prints a diagnostic prefixed with
   FAIL and returns 1.  */
static int
test (const char *pattern, int cflags, const char *string, int eflags,
      char *expect, char *matches, const char *fail)
{
  static const struct
  {
    reg_errcode_t code;
    const char *name;
  } codes[] =
    {
      { REG_NOERROR, "NOERROR" },   { REG_NOMATCH, "NOMATCH" },
      { REG_BADPAT, "BADPAT" },     { REG_ECOLLATE, "ECOLLATE" },
      { REG_ECTYPE, "ECTYPE" },     { REG_EESCAPE, "EESCAPE" },
      { REG_ESUBREG, "ESUBREG" },   { REG_EBRACK, "EBRACK" },
      { REG_EPAREN, "EPAREN" },     { REG_EBRACE, "EBRACE" },
      { REG_BADBR, "BADBR" },       { REG_ERANGE, "ERANGE" },
      { REG_ESPACE, "ESPACE" },     { REG_BADRPT, "BADRPT" }
    };
  regex_t re;
  regmatch_t rm[10];
  int rc, group, status;

  rc = regcomp (&re, pattern, cflags);

  if (rc != 0 && eflags == -1)
    {
      /* A compile error was expected: check that it is the right one.  */
      size_t k;

      for (k = 0; k < sizeof (codes) / sizeof (codes[0]); ++k)
        {
          if (rc != codes[k].code)
            continue;
          if (strcmp (string, codes[k].name) == 0)
            return 0;
          printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
                  fail, codes[k].name, string);
          return 1;
        }

      printf ("%s regcomp return value REG_%d\n", fail, rc);
      return 1;
    }

  if (rc != 0)
    {
      char buf[500];

      regerror (rc, &re, buf, sizeof (buf));
      printf ("%s regcomp failed: %s\n", fail, buf);
      return 1;
    }

  if (eflags == -1)
    {
      regfree (&re);

      /* The test case file relies on the rxspencer behaviour that
         compiling an empty expression reports REG_EMPTY.  That does
         not happen here, so a successful compile is accepted for
         that expectation.  */
      if (strcmp (string, "EMPTY") == 0)
        return 0;

      printf ("%s regcomp unexpectedly succeeded\n", fail);
      return 1;
    }

  /* The compiled pattern is not needed once regexec has filled RM.  */
  rc = regexec (&re, string, 10, rm, eflags);
  regfree (&re);

  if (rc != 0)
    {
      if (expect == NULL)
        return 0;
      printf ("%s regexec failed\n", fail);
      return 1;
    }

  if (expect == NULL)
    {
      printf ("%s regexec unexpectedly succeeded\n", fail);
      return 1;
    }

  if (cflags & REG_NOSUB)
    return 0;

  status = check_match (rm, 0, string, expect, fail);
  if (matches == NULL)
    return status;

  for (group = 1; group < 10 && status == 0; ++group)
    {
      /* Cut the current entry out of the list for the duration of the
         check, then put the comma back.  */
      char *comma = matches != NULL ? strchr (matches, ',') : NULL;

      if (comma != NULL)
        *comma = '\0';

      status = check_match (rm, group, string,
                            matches != NULL ? matches : "-", fail);

      if (comma != NULL)
        {
          *comma = ',';
          matches = comma + 1;
        }
      else
        matches = NULL;
    }

  return status;
}
317
/* Run one test case with the characters listed in LETTERS replaced by
   two-byte UTF-8 stand-ins in PATTERN, STRING, EXPECT and MATCHES
   (bracket constructs in PATTERN are left alone, see mb_frob_pattern).
   When EFLAGS is -1, STRING is the name of the expected compile error
   rather than subject text, so it is passed through untouched.

   Returns the result of test () on the converted inputs, or 1 after
   printing a diagnostic prefixed with FAIL when a converted copy could
   not be allocated.  All temporary copies are freed before returning.  */
static int
mb_test (const char *pattern, int cflags, const char *string, int eflags,
         char *expect, const char *matches, const char *letters,
         const char *fail)
{
  char *pattern_mb = mb_frob_pattern (pattern, letters);
  const char *string_mb
    = eflags == -1 ? string : mb_frob_string (string, letters);
  char *expect_mb = mb_frob_string (expect, letters);
  char *matches_mb = mb_frob_string (matches, letters);
  int ret = 0;

  /* EXPECT and MATCHES may legitimately be NULL, in which case their
     converted copies are NULL as well; only a NULL copy of a non-NULL
     input indicates an allocation failure.  */
  if (!pattern_mb || !string_mb
      || (expect && !expect_mb) || (matches && !matches_mb))
    {
      /* Terminate the line so the message does not run into the next
         line of output.  */
      printf ("%s %m\n", fail);
      ret = 1;
    }
  else
    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
                matches_mb, fail);

  free (matches_mb);
  free (expect_mb);
  /* STRING_MB aliases STRING when no conversion was done.  */
  if (string_mb != string)
    free ((char *) string_mb);
  free (pattern_mb);
  return ret;
}
347
/* Run the test case once for every non-empty combination of the letter
   pairs a/A, b/B, c/C and d/D, replacing the selected letters by
   multibyte characters (see mb_test).  A combination is skipped when
   one of its selected pairs occurs in neither PATTERN nor STRING.
   Returns nonzero if any of the runs failed.  */
static int
mb_tests (const char *pattern, int cflags, const char *string, int eflags,
          char *expect, const char *matches)
{
  static const char pairs[4][2] =
    {
      { 'a', 'A' }, { 'b', 'B' }, { 'c', 'C' }, { 'd', 'D' }
    };
  int failed = 0;
  int mask;
  char letters[9], fail[20];

  /* The tests aren't supposed to work with xdigit, since a-dA-D are
     hex digits while their multibyte replacements are not.
     XXX: regex ATM handles only single byte equivalence classes, so
     patterns using [[=b=]] are skipped as well.  */
  if (strstr (pattern, "[:xdigit:]") != NULL
      || strstr (pattern, "[[=b=]]") != NULL)
    return 0;

  for (mask = 1; mask < 16; ++mask)
    {
      char *wp = letters;
      int bit, usable = 1;

      for (bit = 0; bit < 4 && usable; ++bit)
        {
          char lower = pairs[bit][0];
          char upper = pairs[bit][1];

          if (!(mask & (1 << bit)))
            continue;

          if (strchr (pattern, lower) != NULL
              || strchr (string, lower) != NULL
              || strchr (pattern, upper) != NULL
              || strchr (string, upper) != NULL)
            {
              *wp++ = lower;
              *wp++ = upper;
            }
          else
            usable = 0;
        }

      if (!usable)
        continue;

      *wp = '\0';
      /* At most eight letters, so the message fits FAIL exactly.  */
      sprintf (fail, "UTF-8 %s FAIL", letters);
      failed |= mb_test (pattern, cflags, string, eflags, expect, matches,
                         letters, fail);
    }

  return failed;
}
403
/* Read test cases from the file named as the only non-option argument
   and run each of them.  Every line that is neither empty nor a '#'
   comment is echoed and split at tabs into: pattern, flags, string
   and, optionally, the expected match and the expected subexpression
   matches.  A field consisting of two double quotes denotes the empty
   string.  Each case runs in the "C" locale; with --utf8, cases that
   pass are rerun in the cs_CZ.UTF-8 locale and then with letters
   replaced by multibyte characters.  Returns 0 if nothing failed and
   1 otherwise.  */
int
main (int argc, char **argv)
{
  static int test_utf8 = 0;
  static const struct option options[] =
    {
      { "utf8", no_argument, &test_utf8, 1 },
      { NULL, 0, NULL, 0 }
    };
  FILE *f;
  char *line = NULL;
  size_t line_len = 0;
  ssize_t len;
  int ret = 0;

#ifdef HAVE_MCHECK_H
  mtrace ();
#endif

  /* There are only flag-setting long options, so nothing needs to be
     done per option.  */
  while (getopt_long (argc, argv, "", options, NULL) >= 0)
    continue;

  if (argc != optind + 1)
    {
      fprintf (stderr, "Missing test filename\n");
      return 1;
    }

  f = fopen (argv[optind], "r");
  if (f == NULL)
    {
      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
      return 1;
    }

  for (;;)
    {
      char *pattern, *flagstr, *string, *expect, *matches = NULL;
      const char *fp;
      int cflags = REG_EXTENDED, eflags = 0;
      int try_bre_ere = 0, supported = 1;
      int bre_cflags;

      len = getline (&line, &line_len, f);
      if (len <= 0)
        break;

      if (line[len - 1] == '\n')
        line[len - 1] = '\0';

      /* Skip comments and empty lines.  */
      if (*line == '\0' || *line == '#')
        continue;

      puts (line);
      fflush (stdout);

      /* The first three fields are mandatory.  */
      pattern = strtok (line, "\t");
      if (pattern == NULL)
        continue;
      if (strcmp (pattern, "\"\"") == 0)
        pattern += 2;

      flagstr = strtok (NULL, "\t");
      if (flagstr == NULL)
        continue;

      string = strtok (NULL, "\t");
      if (string == NULL)
        continue;
      if (strcmp (string, "\"\"") == 0)
        string += 2;

      for (fp = flagstr; *fp != '\0'; ++fp)
        switch (*fp)
          {
          case 'b':
            cflags &= ~REG_EXTENDED;
            break;
          case '&':
            try_bre_ere = 1;
            break;
          case 'C':
            /* A compile error is expected.  */
            eflags = -1;
            break;
          case 'i':
            cflags |= REG_ICASE;
            break;
          case 's':
            cflags |= REG_NOSUB;
            break;
          case 'n':
            cflags |= REG_NEWLINE;
            break;
          case '^':
            eflags |= REG_NOTBOL;
            break;
          case '$':
            eflags |= REG_NOTEOL;
            break;
          case 'm':
          case 'p':
          case '#':
            /* Not supported.  */
            supported = 0;
            break;
          default:
            /* '-' and any other character change nothing.  */
            break;
          }

      if (!supported)
        continue;

      replace_special_chars (pattern);
      glibc_re_syntax (pattern);
      if (eflags != -1)
        replace_special_chars (string);

      expect = strtok (NULL, "\t");
      if (expect != NULL)
        {
          replace_special_chars (expect);
          matches = strtok (NULL, "\t");
          if (matches != NULL)
            replace_special_chars (matches);
        }

      bre_cflags = cflags & ~REG_EXTENDED;

      if (setlocale (LC_ALL, "C") == NULL)
        {
          puts ("setlocale C failed");
          ret = 1;
        }

      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags, expect,
                       matches, "FAIL")))
        {
          ret = 1;
          continue;
        }

      if (!test_utf8)
        continue;

      if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
        {
          puts ("setlocale cs_CZ.UTF-8 failed");
          ret = 1;
          continue;
        }

      /* Short-circuit evaluation keeps the multibyte runs from being
         attempted once a plain UTF-8 locale run has failed.  */
      if (test (pattern, cflags, string, eflags, expect, matches,
                "UTF-8 FAIL")
          || (try_bre_ere
              && test (pattern, bre_cflags, string, eflags, expect,
                       matches, "UTF-8 FAIL"))
          || mb_tests (pattern, cflags, string, eflags, expect, matches)
          || (try_bre_ere
              && mb_tests (pattern, bre_cflags, string, eflags, expect,
                           matches)))
        ret = 1;
    }

  free (line);
  fclose (f);
  return ret;
}
561