1/* -*- buffer-read-only: t -*- vi: set ro: */
2/* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3/* Extended regular expression matching and search library.
4   Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5   Free Software Foundation, Inc.
6   This file is part of the GNU C Library.
7   Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8
9   This program is free software; you can redistribute it and/or modify
10   it under the terms of the GNU General Public License as published by
11   the Free Software Foundation; either version 3, or (at your option)
12   any later version.
13
14   This program is distributed in the hope that it will be useful,
15   but WITHOUT ANY WARRANTY; without even the implied warranty of
16   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   GNU General Public License for more details.
18
19   You should have received a copy of the GNU General Public License along
20   with this program; if not, write to the Free Software Foundation,
21   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22
23static void re_string_construct_common (const char *str, Idx len,
24					re_string_t *pstr,
25					RE_TRANSLATE_TYPE trans, bool icase,
26					const re_dfa_t *dfa) internal_function;
27static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
28					  const re_node_set *nodes,
29					  re_hashval_t hash) internal_function;
30static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
31					  const re_node_set *nodes,
32					  unsigned int context,
33					  re_hashval_t hash) internal_function;
34
35/* Functions for string operation.  */
36
37/* This function allocate the buffers.  It is necessary to call
38   re_string_reconstruct before using the object.  */
39
40static reg_errcode_t
41internal_function
42re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
43		    RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
44{
45  reg_errcode_t ret;
46  Idx init_buf_len;
47
48  /* Ensure at least one character fits into the buffers.  */
49  if (init_len < dfa->mb_cur_max)
50    init_len = dfa->mb_cur_max;
51  init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
52  re_string_construct_common (str, len, pstr, trans, icase, dfa);
53
54  ret = re_string_realloc_buffers (pstr, init_buf_len);
55  if (BE (ret != REG_NOERROR, 0))
56    return ret;
57
58  pstr->word_char = dfa->word_char;
59  pstr->word_ops_used = dfa->word_ops_used;
60  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
61  pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
62  pstr->valid_raw_len = pstr->valid_len;
63  return REG_NOERROR;
64}
65
66/* This function allocate the buffers, and initialize them.  */
67
68static reg_errcode_t
69internal_function
70re_string_construct (re_string_t *pstr, const char *str, Idx len,
71		     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
72{
73  reg_errcode_t ret;
74  memset (pstr, '\0', sizeof (re_string_t));
75  re_string_construct_common (str, len, pstr, trans, icase, dfa);
76
77  if (len > 0)
78    {
79      ret = re_string_realloc_buffers (pstr, len + 1);
80      if (BE (ret != REG_NOERROR, 0))
81	return ret;
82    }
83  pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
84
85  if (icase)
86    {
87#ifdef RE_ENABLE_I18N
88      if (dfa->mb_cur_max > 1)
89	{
90	  while (1)
91	    {
92	      ret = build_wcs_upper_buffer (pstr);
93	      if (BE (ret != REG_NOERROR, 0))
94		return ret;
95	      if (pstr->valid_raw_len >= len)
96		break;
97	      if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
98		break;
99	      ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
100	      if (BE (ret != REG_NOERROR, 0))
101		return ret;
102	    }
103	}
104      else
105#endif /* RE_ENABLE_I18N  */
106	build_upper_buffer (pstr);
107    }
108  else
109    {
110#ifdef RE_ENABLE_I18N
111      if (dfa->mb_cur_max > 1)
112	build_wcs_buffer (pstr);
113      else
114#endif /* RE_ENABLE_I18N  */
115	{
116	  if (trans != NULL)
117	    re_string_translate_buffer (pstr);
118	  else
119	    {
120	      pstr->valid_len = pstr->bufs_len;
121	      pstr->valid_raw_len = pstr->bufs_len;
122	    }
123	}
124    }
125
126  return REG_NOERROR;
127}
128
129/* Helper functions for re_string_allocate, and re_string_construct.  */
130
131static reg_errcode_t
132internal_function
133re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
134{
135#ifdef RE_ENABLE_I18N
136  if (pstr->mb_cur_max > 1)
137    {
138      wint_t *new_wcs;
139
140      /* Avoid overflow.  */
141      size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
142      if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
143	return REG_ESPACE;
144
145      new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
146      if (BE (new_wcs == NULL, 0))
147	return REG_ESPACE;
148      pstr->wcs = new_wcs;
149      if (pstr->offsets != NULL)
150	{
151	  Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
152	  if (BE (new_offsets == NULL, 0))
153	    return REG_ESPACE;
154	  pstr->offsets = new_offsets;
155	}
156    }
157#endif /* RE_ENABLE_I18N  */
158  if (pstr->mbs_allocated)
159    {
160      unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
161					   new_buf_len);
162      if (BE (new_mbs == NULL, 0))
163	return REG_ESPACE;
164      pstr->mbs = new_mbs;
165    }
166  pstr->bufs_len = new_buf_len;
167  return REG_NOERROR;
168}
169
170
171static void
172internal_function
173re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
174			    RE_TRANSLATE_TYPE trans, bool icase,
175			    const re_dfa_t *dfa)
176{
177  pstr->raw_mbs = (const unsigned char *) str;
178  pstr->len = len;
179  pstr->raw_len = len;
180  pstr->trans = trans;
181  pstr->icase = icase;
182  pstr->mbs_allocated = (trans != NULL || icase);
183  pstr->mb_cur_max = dfa->mb_cur_max;
184  pstr->is_utf8 = dfa->is_utf8;
185  pstr->map_notascii = dfa->map_notascii;
186  pstr->stop = pstr->len;
187  pstr->raw_stop = pstr->stop;
188}
189
190#ifdef RE_ENABLE_I18N
191
192/* Build wide character buffer PSTR->WCS.
193   If the byte sequence of the string are:
194     <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
195   Then wide character buffer will be:
196     <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
197   We use WEOF for padding, they indicate that the position isn't
198   a first byte of a multibyte character.
199
200   Note that this function assumes PSTR->VALID_LEN elements are already
201   built and starts from PSTR->VALID_LEN.  */
202
203static void
204internal_function
205build_wcs_buffer (re_string_t *pstr)
206{
207#ifdef _LIBC
208  unsigned char buf[MB_LEN_MAX];
209  assert (MB_LEN_MAX >= pstr->mb_cur_max);
210#else
211  unsigned char buf[64];
212#endif
213  mbstate_t prev_st;
214  Idx byte_idx, end_idx, remain_len;
215  size_t mbclen;
216
217  /* Build the buffers from pstr->valid_len to either pstr->len or
218     pstr->bufs_len.  */
219  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
220  for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
221    {
222      wchar_t wc;
223      const char *p;
224
225      remain_len = end_idx - byte_idx;
226      prev_st = pstr->cur_state;
227      /* Apply the translation if we need.  */
228      if (BE (pstr->trans != NULL, 0))
229	{
230	  int i, ch;
231
232	  for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
233	    {
234	      ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
235	      buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
236	    }
237	  p = (const char *) buf;
238	}
239      else
240	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
241      mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
242      if (BE (mbclen == (size_t) -2, 0))
243	{
244	  /* The buffer doesn't have enough space, finish to build.  */
245	  pstr->cur_state = prev_st;
246	  break;
247	}
248      else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
249	{
250	  /* We treat these cases as a singlebyte character.  */
251	  mbclen = 1;
252	  wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
253	  if (BE (pstr->trans != NULL, 0))
254	    wc = pstr->trans[wc];
255	  pstr->cur_state = prev_st;
256	}
257
258      /* Write wide character and padding.  */
259      pstr->wcs[byte_idx++] = wc;
260      /* Write paddings.  */
261      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
262	pstr->wcs[byte_idx++] = WEOF;
263    }
264  pstr->valid_len = byte_idx;
265  pstr->valid_raw_len = byte_idx;
266}
267
268/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
269   but for REG_ICASE.  */
270
271static reg_errcode_t
272internal_function
273build_wcs_upper_buffer (re_string_t *pstr)
274{
275  mbstate_t prev_st;
276  Idx src_idx, byte_idx, end_idx, remain_len;
277  size_t mbclen;
278#ifdef _LIBC
279  char buf[MB_LEN_MAX];
280  assert (MB_LEN_MAX >= pstr->mb_cur_max);
281#else
282  char buf[64];
283#endif
284
285  byte_idx = pstr->valid_len;
286  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
287
288  /* The following optimization assumes that ASCII characters can be
289     mapped to wide characters with a simple cast.  */
290  if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
291    {
292      while (byte_idx < end_idx)
293	{
294	  wchar_t wc;
295
296	  if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
297	      && mbsinit (&pstr->cur_state))
298	    {
299	      /* In case of a singlebyte character.  */
300	      pstr->mbs[byte_idx]
301		= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
302	      /* The next step uses the assumption that wchar_t is encoded
303		 ASCII-safe: all ASCII values can be converted like this.  */
304	      pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
305	      ++byte_idx;
306	      continue;
307	    }
308
309	  remain_len = end_idx - byte_idx;
310	  prev_st = pstr->cur_state;
311	  mbclen = __mbrtowc (&wc,
312			      ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
313			       + byte_idx), remain_len, &pstr->cur_state);
314	  if (BE (mbclen < (size_t) -2, 1))
315	    {
316	      wchar_t wcu = wc;
317	      if (iswlower (wc))
318		{
319		  size_t mbcdlen;
320
321		  wcu = towupper (wc);
322		  mbcdlen = wcrtomb (buf, wcu, &prev_st);
323		  if (BE (mbclen == mbcdlen, 1))
324		    memcpy (pstr->mbs + byte_idx, buf, mbclen);
325		  else
326		    {
327		      src_idx = byte_idx;
328		      goto offsets_needed;
329		    }
330		}
331	      else
332		memcpy (pstr->mbs + byte_idx,
333			pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
334	      pstr->wcs[byte_idx++] = wcu;
335	      /* Write paddings.  */
336	      for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
337		pstr->wcs[byte_idx++] = WEOF;
338	    }
339	  else if (mbclen == (size_t) -1 || mbclen == 0)
340	    {
341	      /* It is an invalid character or '\0'.  Just use the byte.  */
342	      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
343	      pstr->mbs[byte_idx] = ch;
344	      /* And also cast it to wide char.  */
345	      pstr->wcs[byte_idx++] = (wchar_t) ch;
346	      if (BE (mbclen == (size_t) -1, 0))
347		pstr->cur_state = prev_st;
348	    }
349	  else
350	    {
351	      /* The buffer doesn't have enough space, finish to build.  */
352	      pstr->cur_state = prev_st;
353	      break;
354	    }
355	}
356      pstr->valid_len = byte_idx;
357      pstr->valid_raw_len = byte_idx;
358      return REG_NOERROR;
359    }
360  else
361    for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
362      {
363	wchar_t wc;
364	const char *p;
365      offsets_needed:
366	remain_len = end_idx - byte_idx;
367	prev_st = pstr->cur_state;
368	if (BE (pstr->trans != NULL, 0))
369	  {
370	    int i, ch;
371
372	    for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
373	      {
374		ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
375		buf[i] = pstr->trans[ch];
376	      }
377	    p = (const char *) buf;
378	  }
379	else
380	  p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
381	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
382	if (BE (mbclen < (size_t) -2, 1))
383	  {
384	    wchar_t wcu = wc;
385	    if (iswlower (wc))
386	      {
387		size_t mbcdlen;
388
389		wcu = towupper (wc);
390		mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
391		if (BE (mbclen == mbcdlen, 1))
392		  memcpy (pstr->mbs + byte_idx, buf, mbclen);
393		else if (mbcdlen != (size_t) -1)
394		  {
395		    size_t i;
396
397		    if (byte_idx + mbcdlen > pstr->bufs_len)
398		      {
399			pstr->cur_state = prev_st;
400			break;
401		      }
402
403		    if (pstr->offsets == NULL)
404		      {
405			pstr->offsets = re_malloc (Idx, pstr->bufs_len);
406
407			if (pstr->offsets == NULL)
408			  return REG_ESPACE;
409		      }
410		    if (!pstr->offsets_needed)
411		      {
412			for (i = 0; i < (size_t) byte_idx; ++i)
413			  pstr->offsets[i] = i;
414			pstr->offsets_needed = 1;
415		      }
416
417		    memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
418		    pstr->wcs[byte_idx] = wcu;
419		    pstr->offsets[byte_idx] = src_idx;
420		    for (i = 1; i < mbcdlen; ++i)
421		      {
422			pstr->offsets[byte_idx + i]
423			  = src_idx + (i < mbclen ? i : mbclen - 1);
424			pstr->wcs[byte_idx + i] = WEOF;
425		      }
426		    pstr->len += mbcdlen - mbclen;
427		    if (pstr->raw_stop > src_idx)
428		      pstr->stop += mbcdlen - mbclen;
429		    end_idx = (pstr->bufs_len > pstr->len)
430			      ? pstr->len : pstr->bufs_len;
431		    byte_idx += mbcdlen;
432		    src_idx += mbclen;
433		    continue;
434		  }
435                else
436                  memcpy (pstr->mbs + byte_idx, p, mbclen);
437	      }
438	    else
439	      memcpy (pstr->mbs + byte_idx, p, mbclen);
440
441	    if (BE (pstr->offsets_needed != 0, 0))
442	      {
443		size_t i;
444		for (i = 0; i < mbclen; ++i)
445		  pstr->offsets[byte_idx + i] = src_idx + i;
446	      }
447	    src_idx += mbclen;
448
449	    pstr->wcs[byte_idx++] = wcu;
450	    /* Write paddings.  */
451	    for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
452	      pstr->wcs[byte_idx++] = WEOF;
453	  }
454	else if (mbclen == (size_t) -1 || mbclen == 0)
455	  {
456	    /* It is an invalid character or '\0'.  Just use the byte.  */
457	    int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
458
459	    if (BE (pstr->trans != NULL, 0))
460	      ch = pstr->trans [ch];
461	    pstr->mbs[byte_idx] = ch;
462
463	    if (BE (pstr->offsets_needed != 0, 0))
464	      pstr->offsets[byte_idx] = src_idx;
465	    ++src_idx;
466
467	    /* And also cast it to wide char.  */
468	    pstr->wcs[byte_idx++] = (wchar_t) ch;
469	    if (BE (mbclen == (size_t) -1, 0))
470	      pstr->cur_state = prev_st;
471	  }
472	else
473	  {
474	    /* The buffer doesn't have enough space, finish to build.  */
475	    pstr->cur_state = prev_st;
476	    break;
477	  }
478      }
479  pstr->valid_len = byte_idx;
480  pstr->valid_raw_len = src_idx;
481  return REG_NOERROR;
482}
483
484/* Skip characters until the index becomes greater than NEW_RAW_IDX.
485   Return the index.  */
486
487static Idx
488internal_function
489re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
490{
491  mbstate_t prev_st;
492  Idx rawbuf_idx;
493  size_t mbclen;
494  wint_t wc = WEOF;
495
496  /* Skip the characters which are not necessary to check.  */
497  for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
498       rawbuf_idx < new_raw_idx;)
499    {
500      wchar_t wc2;
501      Idx remain_len;
502      remain_len = pstr->len - rawbuf_idx;
503      prev_st = pstr->cur_state;
504      mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
505			  remain_len, &pstr->cur_state);
506      if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
507	{
508	  /* We treat these cases as a single byte character.  */
509	  if (mbclen == 0 || remain_len == 0)
510	    wc = L'\0';
511	  else
512	    wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
513	  mbclen = 1;
514	  pstr->cur_state = prev_st;
515	}
516      else
517	wc = wc2;
518      /* Then proceed the next character.  */
519      rawbuf_idx += mbclen;
520    }
521  *last_wc = wc;
522  return rawbuf_idx;
523}
524#endif /* RE_ENABLE_I18N  */
525
526/* Build the buffer PSTR->MBS, and apply the translation if we need.
527   This function is used in case of REG_ICASE.  */
528
529static void
530internal_function
531build_upper_buffer (re_string_t *pstr)
532{
533  Idx char_idx, end_idx;
534  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
535
536  for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
537    {
538      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
539      if (BE (pstr->trans != NULL, 0))
540	ch = pstr->trans[ch];
541      if (islower (ch))
542	pstr->mbs[char_idx] = toupper (ch);
543      else
544	pstr->mbs[char_idx] = ch;
545    }
546  pstr->valid_len = char_idx;
547  pstr->valid_raw_len = char_idx;
548}
549
550/* Apply TRANS to the buffer in PSTR.  */
551
552static void
553internal_function
554re_string_translate_buffer (re_string_t *pstr)
555{
556  Idx buf_idx, end_idx;
557  end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558
559  for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560    {
561      int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
562      pstr->mbs[buf_idx] = pstr->trans[ch];
563    }
564
565  pstr->valid_len = buf_idx;
566  pstr->valid_raw_len = buf_idx;
567}
568
569/* This function re-construct the buffers.
570   Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
571   convert to upper case in case of REG_ICASE, apply translation.  */
572
573static reg_errcode_t
574internal_function
575re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
576{
577  Idx offset;
578
579  if (BE (pstr->raw_mbs_idx <= idx, 0))
580    offset = idx - pstr->raw_mbs_idx;
581  else
582    {
583      /* Reset buffer.  */
584#ifdef RE_ENABLE_I18N
585      if (pstr->mb_cur_max > 1)
586	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
587#endif /* RE_ENABLE_I18N */
588      pstr->len = pstr->raw_len;
589      pstr->stop = pstr->raw_stop;
590      pstr->valid_len = 0;
591      pstr->raw_mbs_idx = 0;
592      pstr->valid_raw_len = 0;
593      pstr->offsets_needed = 0;
594      pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
595			   : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
596      if (!pstr->mbs_allocated)
597	pstr->mbs = (unsigned char *) pstr->raw_mbs;
598      offset = idx;
599    }
600
601  if (BE (offset != 0, 1))
602    {
603      /* Should the already checked characters be kept?  */
604      if (BE (offset < pstr->valid_raw_len, 1))
605	{
606	  /* Yes, move them to the front of the buffer.  */
607#ifdef RE_ENABLE_I18N
608	  if (BE (pstr->offsets_needed, 0))
609	    {
610	      Idx low = 0, high = pstr->valid_len, mid;
611	      do
612		{
613		  mid = (high + low) / 2;
614		  if (pstr->offsets[mid] > offset)
615		    high = mid;
616		  else if (pstr->offsets[mid] < offset)
617		    low = mid + 1;
618		  else
619		    break;
620		}
621	      while (low < high);
622	      if (pstr->offsets[mid] < offset)
623		++mid;
624	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
625							eflags);
626	      /* This can be quite complicated, so handle specially
627		 only the common and easy case where the character with
628		 different length representation of lower and upper
629		 case is present at or after offset.  */
630	      if (pstr->valid_len > offset
631		  && mid == offset && pstr->offsets[mid] == offset)
632		{
633		  memmove (pstr->wcs, pstr->wcs + offset,
634			   (pstr->valid_len - offset) * sizeof (wint_t));
635		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
636		  pstr->valid_len -= offset;
637		  pstr->valid_raw_len -= offset;
638		  for (low = 0; low < pstr->valid_len; low++)
639		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
640		}
641	      else
642		{
643		  /* Otherwise, just find out how long the partial multibyte
644		     character at offset is and fill it with WEOF/255.  */
645		  pstr->len = pstr->raw_len - idx + offset;
646		  pstr->stop = pstr->raw_stop - idx + offset;
647		  pstr->offsets_needed = 0;
648		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
649		    --mid;
650		  while (mid < pstr->valid_len)
651		    if (pstr->wcs[mid] != WEOF)
652		      break;
653		    else
654		      ++mid;
655		  if (mid == pstr->valid_len)
656		    pstr->valid_len = 0;
657		  else
658		    {
659		      pstr->valid_len = pstr->offsets[mid] - offset;
660		      if (pstr->valid_len)
661			{
662			  for (low = 0; low < pstr->valid_len; ++low)
663			    pstr->wcs[low] = WEOF;
664			  memset (pstr->mbs, 255, pstr->valid_len);
665			}
666		    }
667		  pstr->valid_raw_len = pstr->valid_len;
668		}
669	    }
670	  else
671#endif
672	    {
673	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
674							eflags);
675#ifdef RE_ENABLE_I18N
676	      if (pstr->mb_cur_max > 1)
677		memmove (pstr->wcs, pstr->wcs + offset,
678			 (pstr->valid_len - offset) * sizeof (wint_t));
679#endif /* RE_ENABLE_I18N */
680	      if (BE (pstr->mbs_allocated, 0))
681		memmove (pstr->mbs, pstr->mbs + offset,
682			 pstr->valid_len - offset);
683	      pstr->valid_len -= offset;
684	      pstr->valid_raw_len -= offset;
685#if DEBUG
686	      assert (pstr->valid_len > 0);
687#endif
688	    }
689	}
690      else
691	{
692#ifdef RE_ENABLE_I18N
693	  /* No, skip all characters until IDX.  */
694	  Idx prev_valid_len = pstr->valid_len;
695
696	  if (BE (pstr->offsets_needed, 0))
697	    {
698	      pstr->len = pstr->raw_len - idx + offset;
699	      pstr->stop = pstr->raw_stop - idx + offset;
700	      pstr->offsets_needed = 0;
701	    }
702#endif
703	  pstr->valid_len = 0;
704#ifdef RE_ENABLE_I18N
705	  if (pstr->mb_cur_max > 1)
706	    {
707	      Idx wcs_idx;
708	      wint_t wc = WEOF;
709
710	      if (pstr->is_utf8)
711		{
712		  const unsigned char *raw, *p, *end;
713
714		  /* Special case UTF-8.  Multi-byte chars start with any
715		     byte other than 0x80 - 0xbf.  */
716		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
717		  end = raw + (offset - pstr->mb_cur_max);
718		  if (end < pstr->raw_mbs)
719		    end = pstr->raw_mbs;
720		  p = raw + offset - 1;
721#ifdef _LIBC
722		  /* We know the wchar_t encoding is UCS4, so for the simple
723		     case, ASCII characters, skip the conversion step.  */
724		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
725		    {
726		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
727		      /* pstr->valid_len = 0; */
728		      wc = (wchar_t) *p;
729		    }
730		  else
731#endif
732		    for (; p >= end; --p)
733		      if ((*p & 0xc0) != 0x80)
734			{
735			  mbstate_t cur_state;
736			  wchar_t wc2;
737			  Idx mlen = raw + pstr->len - p;
738			  unsigned char buf[6];
739			  size_t mbclen;
740
741			  if (BE (pstr->trans != NULL, 0))
742			    {
743			      int i = mlen < 6 ? mlen : 6;
744			      while (--i >= 0)
745				buf[i] = pstr->trans[p[i]];
746			    }
747			  /* XXX Don't use mbrtowc, we know which conversion
748			     to use (UTF-8 -> UCS4).  */
749			  memset (&cur_state, 0, sizeof (cur_state));
750			  mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
751					      &cur_state);
752			  if (raw + offset - p <= mbclen
753			      && mbclen < (size_t) -2)
754			    {
755			      memset (&pstr->cur_state, '\0',
756				      sizeof (mbstate_t));
757			      pstr->valid_len = mbclen - (raw + offset - p);
758			      wc = wc2;
759			    }
760			  break;
761			}
762		}
763
764	      if (wc == WEOF)
765		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
766	      if (wc == WEOF)
767		pstr->tip_context
768		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
769	      else
770		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
771				      && IS_WIDE_WORD_CHAR (wc))
772				     ? CONTEXT_WORD
773				     : ((IS_WIDE_NEWLINE (wc)
774					 && pstr->newline_anchor)
775					? CONTEXT_NEWLINE : 0));
776	      if (BE (pstr->valid_len, 0))
777		{
778		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
779		    pstr->wcs[wcs_idx] = WEOF;
780		  if (pstr->mbs_allocated)
781		    memset (pstr->mbs, 255, pstr->valid_len);
782		}
783	      pstr->valid_raw_len = pstr->valid_len;
784	    }
785	  else
786#endif /* RE_ENABLE_I18N */
787	    {
788	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
789	      pstr->valid_raw_len = 0;
790	      if (pstr->trans)
791		c = pstr->trans[c];
792	      pstr->tip_context = (bitset_contain (pstr->word_char, c)
793				   ? CONTEXT_WORD
794				   : ((IS_NEWLINE (c) && pstr->newline_anchor)
795				      ? CONTEXT_NEWLINE : 0));
796	    }
797	}
798      if (!BE (pstr->mbs_allocated, 0))
799	pstr->mbs += offset;
800    }
801  pstr->raw_mbs_idx = idx;
802  pstr->len -= offset;
803  pstr->stop -= offset;
804
805  /* Then build the buffers.  */
806#ifdef RE_ENABLE_I18N
807  if (pstr->mb_cur_max > 1)
808    {
809      if (pstr->icase)
810	{
811	  reg_errcode_t ret = build_wcs_upper_buffer (pstr);
812	  if (BE (ret != REG_NOERROR, 0))
813	    return ret;
814	}
815      else
816	build_wcs_buffer (pstr);
817    }
818  else
819#endif /* RE_ENABLE_I18N */
820    if (BE (pstr->mbs_allocated, 0))
821      {
822	if (pstr->icase)
823	  build_upper_buffer (pstr);
824	else if (pstr->trans != NULL)
825	  re_string_translate_buffer (pstr);
826      }
827    else
828      pstr->valid_len = pstr->len;
829
830  pstr->cur_idx = 0;
831  return REG_NOERROR;
832}
833
834static unsigned char
835internal_function __attribute ((pure))
836re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
837{
838  int ch;
839  Idx off;
840
841  /* Handle the common (easiest) cases first.  */
842  if (BE (!pstr->mbs_allocated, 1))
843    return re_string_peek_byte (pstr, idx);
844
845#ifdef RE_ENABLE_I18N
846  if (pstr->mb_cur_max > 1
847      && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
848    return re_string_peek_byte (pstr, idx);
849#endif
850
851  off = pstr->cur_idx + idx;
852#ifdef RE_ENABLE_I18N
853  if (pstr->offsets_needed)
854    off = pstr->offsets[off];
855#endif
856
857  ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
858
859#ifdef RE_ENABLE_I18N
860  /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
861     this function returns CAPITAL LETTER I instead of first byte of
862     DOTLESS SMALL LETTER I.  The latter would confuse the parser,
863     since peek_byte_case doesn't advance cur_idx in any way.  */
864  if (pstr->offsets_needed && !isascii (ch))
865    return re_string_peek_byte (pstr, idx);
866#endif
867
868  return ch;
869}
870
871static unsigned char
872internal_function __attribute ((pure))
873re_string_fetch_byte_case (re_string_t *pstr)
874{
875  if (BE (!pstr->mbs_allocated, 1))
876    return re_string_fetch_byte (pstr);
877
878#ifdef RE_ENABLE_I18N
879  if (pstr->offsets_needed)
880    {
881      Idx off;
882      int ch;
883
884      /* For tr_TR.UTF-8 [[:islower:]] there is
885	 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
886	 in that case the whole multi-byte character and return
887	 the original letter.  On the other side, with
888	 [[: DOTLESS SMALL LETTER I return [[:I, as doing
889	 anything else would complicate things too much.  */
890
891      if (!re_string_first_byte (pstr, pstr->cur_idx))
892	return re_string_fetch_byte (pstr);
893
894      off = pstr->offsets[pstr->cur_idx];
895      ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
896
897      if (! isascii (ch))
898	return re_string_fetch_byte (pstr);
899
900      re_string_skip_bytes (pstr,
901			    re_string_char_size_at (pstr, pstr->cur_idx));
902      return ch;
903    }
904#endif
905
906  return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
907}
908
909static void
910internal_function
911re_string_destruct (re_string_t *pstr)
912{
913#ifdef RE_ENABLE_I18N
914  re_free (pstr->wcs);
915  re_free (pstr->offsets);
916#endif /* RE_ENABLE_I18N  */
917  if (pstr->mbs_allocated)
918    re_free (pstr->mbs);
919}
920
921/* Return the context at IDX in INPUT.  */
922
923static unsigned int
924internal_function
925re_string_context_at (const re_string_t *input, Idx idx, int eflags)
926{
927  int c;
928  if (BE (! REG_VALID_INDEX (idx), 0))
929    /* In this case, we use the value stored in input->tip_context,
930       since we can't know the character in input->mbs[-1] here.  */
931    return input->tip_context;
932  if (BE (idx == input->len, 0))
933    return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934	    : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
935#ifdef RE_ENABLE_I18N
936  if (input->mb_cur_max > 1)
937    {
938      wint_t wc;
939      Idx wc_idx = idx;
940      while(input->wcs[wc_idx] == WEOF)
941	{
942#ifdef DEBUG
943	  /* It must not happen.  */
944	  assert (REG_VALID_INDEX (wc_idx));
945#endif
946	  --wc_idx;
947	  if (! REG_VALID_INDEX (wc_idx))
948	    return input->tip_context;
949	}
950      wc = input->wcs[wc_idx];
951      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
952	return CONTEXT_WORD;
953      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954	      ? CONTEXT_NEWLINE : 0);
955    }
956  else
957#endif
958    {
959      c = re_string_byte_at (input, idx);
960      if (bitset_contain (input->word_char, c))
961	return CONTEXT_WORD;
962      return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
963    }
964}
965
966/* Functions for set operation.  */
967
968static reg_errcode_t
969internal_function
970re_node_set_alloc (re_node_set *set, Idx size)
971{
972  set->alloc = size;
973  set->nelem = 0;
974  set->elems = re_malloc (Idx, size);
975  if (BE (set->elems == NULL, 0))
976    return REG_ESPACE;
977  return REG_NOERROR;
978}
979
980static reg_errcode_t
981internal_function
982re_node_set_init_1 (re_node_set *set, Idx elem)
983{
984  set->alloc = 1;
985  set->nelem = 1;
986  set->elems = re_malloc (Idx, 1);
987  if (BE (set->elems == NULL, 0))
988    {
989      set->alloc = set->nelem = 0;
990      return REG_ESPACE;
991    }
992  set->elems[0] = elem;
993  return REG_NOERROR;
994}
995
996static reg_errcode_t
997internal_function
998re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
999{
1000  set->alloc = 2;
1001  set->elems = re_malloc (Idx, 2);
1002  if (BE (set->elems == NULL, 0))
1003    return REG_ESPACE;
1004  if (elem1 == elem2)
1005    {
1006      set->nelem = 1;
1007      set->elems[0] = elem1;
1008    }
1009  else
1010    {
1011      set->nelem = 2;
1012      if (elem1 < elem2)
1013	{
1014	  set->elems[0] = elem1;
1015	  set->elems[1] = elem2;
1016	}
1017      else
1018	{
1019	  set->elems[0] = elem2;
1020	  set->elems[1] = elem1;
1021	}
1022    }
1023  return REG_NOERROR;
1024}
1025
1026static reg_errcode_t
1027internal_function
1028re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1029{
1030  dest->nelem = src->nelem;
1031  if (src->nelem > 0)
1032    {
1033      dest->alloc = dest->nelem;
1034      dest->elems = re_malloc (Idx, dest->alloc);
1035      if (BE (dest->elems == NULL, 0))
1036	{
1037	  dest->alloc = dest->nelem = 0;
1038	  return REG_ESPACE;
1039	}
1040      memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1041    }
1042  else
1043    re_node_set_init_empty (dest);
1044  return REG_NOERROR;
1045}
1046
1047/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1048   DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1049   Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1050
1051static reg_errcode_t
1052internal_function
1053re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1054			   const re_node_set *src2)
1055{
1056  Idx i1, i2, is, id, delta, sbase;
1057  if (src1->nelem == 0 || src2->nelem == 0)
1058    return REG_NOERROR;
1059
1060  /* We need dest->nelem + 2 * elems_in_intersection; this is a
1061     conservative estimate.  */
1062  if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1063    {
1064      Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1065      Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1066      if (BE (new_elems == NULL, 0))
1067        return REG_ESPACE;
1068      dest->elems = new_elems;
1069      dest->alloc = new_alloc;
1070    }
1071
1072  /* Find the items in the intersection of SRC1 and SRC2, and copy
1073     into the top of DEST those that are not already in DEST itself.  */
1074  sbase = dest->nelem + src1->nelem + src2->nelem;
1075  i1 = src1->nelem - 1;
1076  i2 = src2->nelem - 1;
1077  id = dest->nelem - 1;
1078  for (;;)
1079    {
1080      if (src1->elems[i1] == src2->elems[i2])
1081	{
1082	  /* Try to find the item in DEST.  Maybe we could binary search?  */
1083	  while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1084	    --id;
1085
1086          if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1087            dest->elems[--sbase] = src1->elems[i1];
1088
1089	  if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1090	    break;
1091	}
1092
1093      /* Lower the highest of the two items.  */
1094      else if (src1->elems[i1] < src2->elems[i2])
1095	{
1096	  if (! REG_VALID_INDEX (--i2))
1097	    break;
1098	}
1099      else
1100	{
1101	  if (! REG_VALID_INDEX (--i1))
1102	    break;
1103	}
1104    }
1105
1106  id = dest->nelem - 1;
1107  is = dest->nelem + src1->nelem + src2->nelem - 1;
1108  delta = is - sbase + 1;
1109
1110  /* Now copy.  When DELTA becomes zero, the remaining
1111     DEST elements are already in place; this is more or
1112     less the same loop that is in re_node_set_merge.  */
1113  dest->nelem += delta;
1114  if (delta > 0 && REG_VALID_INDEX (id))
1115    for (;;)
1116      {
1117        if (dest->elems[is] > dest->elems[id])
1118          {
1119            /* Copy from the top.  */
1120            dest->elems[id + delta--] = dest->elems[is--];
1121            if (delta == 0)
1122              break;
1123          }
1124        else
1125          {
1126            /* Slide from the bottom.  */
1127            dest->elems[id + delta] = dest->elems[id];
1128            if (! REG_VALID_INDEX (--id))
1129              break;
1130          }
1131      }
1132
1133  /* Copy remaining SRC elements.  */
1134  memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1135
1136  return REG_NOERROR;
1137}
1138
1139/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1140   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1141
1142static reg_errcode_t
1143internal_function
1144re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1145			const re_node_set *src2)
1146{
1147  Idx i1, i2, id;
1148  if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1149    {
1150      dest->alloc = src1->nelem + src2->nelem;
1151      dest->elems = re_malloc (Idx, dest->alloc);
1152      if (BE (dest->elems == NULL, 0))
1153	return REG_ESPACE;
1154    }
1155  else
1156    {
1157      if (src1 != NULL && src1->nelem > 0)
1158	return re_node_set_init_copy (dest, src1);
1159      else if (src2 != NULL && src2->nelem > 0)
1160	return re_node_set_init_copy (dest, src2);
1161      else
1162	re_node_set_init_empty (dest);
1163      return REG_NOERROR;
1164    }
1165  for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1166    {
1167      if (src1->elems[i1] > src2->elems[i2])
1168	{
1169	  dest->elems[id++] = src2->elems[i2++];
1170	  continue;
1171	}
1172      if (src1->elems[i1] == src2->elems[i2])
1173	++i2;
1174      dest->elems[id++] = src1->elems[i1++];
1175    }
1176  if (i1 < src1->nelem)
1177    {
1178      memcpy (dest->elems + id, src1->elems + i1,
1179	     (src1->nelem - i1) * sizeof (Idx));
1180      id += src1->nelem - i1;
1181    }
1182  else if (i2 < src2->nelem)
1183    {
1184      memcpy (dest->elems + id, src2->elems + i2,
1185	     (src2->nelem - i2) * sizeof (Idx));
1186      id += src2->nelem - i2;
1187    }
1188  dest->nelem = id;
1189  return REG_NOERROR;
1190}
1191
1192/* Calculate the union set of the sets DEST and SRC. And store it to
1193   DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1194
1195static reg_errcode_t
1196internal_function
1197re_node_set_merge (re_node_set *dest, const re_node_set *src)
1198{
1199  Idx is, id, sbase, delta;
1200  if (src == NULL || src->nelem == 0)
1201    return REG_NOERROR;
1202  if (dest->alloc < 2 * src->nelem + dest->nelem)
1203    {
1204      Idx new_alloc = 2 * (src->nelem + dest->alloc);
1205      Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1206      if (BE (new_buffer == NULL, 0))
1207	return REG_ESPACE;
1208      dest->elems = new_buffer;
1209      dest->alloc = new_alloc;
1210    }
1211
1212  if (BE (dest->nelem == 0, 0))
1213    {
1214      dest->nelem = src->nelem;
1215      memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1216      return REG_NOERROR;
1217    }
1218
1219  /* Copy into the top of DEST the items of SRC that are not
1220     found in DEST.  Maybe we could binary search in DEST?  */
1221  for (sbase = dest->nelem + 2 * src->nelem,
1222       is = src->nelem - 1, id = dest->nelem - 1;
1223       REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1224    {
1225      if (dest->elems[id] == src->elems[is])
1226        is--, id--;
1227      else if (dest->elems[id] < src->elems[is])
1228        dest->elems[--sbase] = src->elems[is--];
1229      else /* if (dest->elems[id] > src->elems[is]) */
1230        --id;
1231    }
1232
1233  if (REG_VALID_INDEX (is))
1234    {
1235      /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1236      sbase -= is + 1;
1237      memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1238    }
1239
1240  id = dest->nelem - 1;
1241  is = dest->nelem + 2 * src->nelem - 1;
1242  delta = is - sbase + 1;
1243  if (delta == 0)
1244    return REG_NOERROR;
1245
1246  /* Now copy.  When DELTA becomes zero, the remaining
1247     DEST elements are already in place.  */
1248  dest->nelem += delta;
1249  for (;;)
1250    {
1251      if (dest->elems[is] > dest->elems[id])
1252        {
1253	  /* Copy from the top.  */
1254          dest->elems[id + delta--] = dest->elems[is--];
1255	  if (delta == 0)
1256	    break;
1257	}
1258      else
1259        {
1260          /* Slide from the bottom.  */
1261          dest->elems[id + delta] = dest->elems[id];
1262	  if (! REG_VALID_INDEX (--id))
1263	    {
1264	      /* Copy remaining SRC elements.  */
1265	      memcpy (dest->elems, dest->elems + sbase,
1266	              delta * sizeof (Idx));
1267	      break;
1268	    }
1269	}
1270    }
1271
1272  return REG_NOERROR;
1273}
1274
1275/* Insert the new element ELEM to the re_node_set* SET.
1276   SET should not already have ELEM.
1277   Return true if successful.  */
1278
1279static bool
1280internal_function
1281re_node_set_insert (re_node_set *set, Idx elem)
1282{
1283  Idx idx;
1284  /* In case the set is empty.  */
1285  if (set->alloc == 0)
1286    return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1287
1288  if (BE (set->nelem, 0) == 0)
1289    {
1290      /* We already guaranteed above that set->alloc != 0.  */
1291      set->elems[0] = elem;
1292      ++set->nelem;
1293      return true;
1294    }
1295
1296  /* Realloc if we need.  */
1297  if (set->alloc == set->nelem)
1298    {
1299      Idx *new_elems;
1300      set->alloc = set->alloc * 2;
1301      new_elems = re_realloc (set->elems, Idx, set->alloc);
1302      if (BE (new_elems == NULL, 0))
1303	return false;
1304      set->elems = new_elems;
1305    }
1306
1307  /* Move the elements which follows the new element.  Test the
1308     first element separately to skip a check in the inner loop.  */
1309  if (elem < set->elems[0])
1310    {
1311      idx = 0;
1312      for (idx = set->nelem; idx > 0; idx--)
1313        set->elems[idx] = set->elems[idx - 1];
1314    }
1315  else
1316    {
1317      for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1318        set->elems[idx] = set->elems[idx - 1];
1319    }
1320
1321  /* Insert the new element.  */
1322  set->elems[idx] = elem;
1323  ++set->nelem;
1324  return true;
1325}
1326
1327/* Insert the new element ELEM to the re_node_set* SET.
1328   SET should not already have any element greater than or equal to ELEM.
1329   Return true if successful.  */
1330
1331static bool
1332internal_function
1333re_node_set_insert_last (re_node_set *set, Idx elem)
1334{
1335  /* Realloc if we need.  */
1336  if (set->alloc == set->nelem)
1337    {
1338      Idx *new_elems;
1339      set->alloc = (set->alloc + 1) * 2;
1340      new_elems = re_realloc (set->elems, Idx, set->alloc);
1341      if (BE (new_elems == NULL, 0))
1342	return false;
1343      set->elems = new_elems;
1344    }
1345
1346  /* Insert the new element.  */
1347  set->elems[set->nelem++] = elem;
1348  return true;
1349}
1350
1351/* Compare two node sets SET1 and SET2.
1352   Return true if SET1 and SET2 are equivalent.  */
1353
1354static bool
1355internal_function __attribute ((pure))
1356re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1357{
1358  Idx i;
1359  if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1360    return false;
1361  for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1362    if (set1->elems[i] != set2->elems[i])
1363      return false;
1364  return true;
1365}
1366
1367/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1368
1369static Idx
1370internal_function __attribute ((pure))
1371re_node_set_contains (const re_node_set *set, Idx elem)
1372{
1373  __re_size_t idx, right, mid;
1374  if (! REG_VALID_NONZERO_INDEX (set->nelem))
1375    return 0;
1376
1377  /* Binary search the element.  */
1378  idx = 0;
1379  right = set->nelem - 1;
1380  while (idx < right)
1381    {
1382      mid = (idx + right) / 2;
1383      if (set->elems[mid] < elem)
1384	idx = mid + 1;
1385      else
1386	right = mid;
1387    }
1388  return set->elems[idx] == elem ? idx + 1 : 0;
1389}
1390
1391static void
1392internal_function
1393re_node_set_remove_at (re_node_set *set, Idx idx)
1394{
1395  if (idx < 0 || idx >= set->nelem)
1396    return;
1397  --set->nelem;
1398  for (; idx < set->nelem; idx++)
1399    set->elems[idx] = set->elems[idx + 1];
1400}
1401
1402
1403/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1404   Or return REG_MISSING if an error occurred.  */
1405
1406static Idx
1407internal_function
1408re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1409{
1410  if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1411    {
1412      size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1413      Idx *new_nexts, *new_indices;
1414      re_node_set *new_edests, *new_eclosures;
1415      re_token_t *new_nodes;
1416      size_t max_object_size =
1417	MAX (sizeof (re_token_t),
1418	     MAX (sizeof (re_node_set),
1419		  sizeof (Idx)));
1420
1421      /* Avoid overflows.  */
1422      if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1423	return REG_MISSING;
1424
1425      new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1426      if (BE (new_nodes == NULL, 0))
1427	return REG_MISSING;
1428      dfa->nodes = new_nodes;
1429      new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1430      new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1431      new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1432      new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1433      if (BE (new_nexts == NULL || new_indices == NULL
1434	      || new_edests == NULL || new_eclosures == NULL, 0))
1435	return REG_MISSING;
1436      dfa->nexts = new_nexts;
1437      dfa->org_indices = new_indices;
1438      dfa->edests = new_edests;
1439      dfa->eclosures = new_eclosures;
1440      dfa->nodes_alloc = new_nodes_alloc;
1441    }
1442  dfa->nodes[dfa->nodes_len] = token;
1443  dfa->nodes[dfa->nodes_len].constraint = 0;
1444#ifdef RE_ENABLE_I18N
1445  {
1446  int type = token.type;
1447  dfa->nodes[dfa->nodes_len].accept_mb =
1448    (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1449  }
1450#endif
1451  dfa->nexts[dfa->nodes_len] = REG_MISSING;
1452  re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1453  re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1454  return dfa->nodes_len++;
1455}
1456
1457static inline re_hashval_t
1458internal_function
1459calc_state_hash (const re_node_set *nodes, unsigned int context)
1460{
1461  re_hashval_t hash = nodes->nelem + context;
1462  Idx i;
1463  for (i = 0 ; i < nodes->nelem ; i++)
1464    hash += nodes->elems[i];
1465  return hash;
1466}
1467
1468/* Search for the state whose node_set is equivalent to NODES.
1469   Return the pointer to the state, if we found it in the DFA.
1470   Otherwise create the new one and return it.  In case of an error
1471   return NULL and set the error code in ERR.
1472   Note: - We assume NULL as the invalid state, then it is possible that
1473	   return value is NULL and ERR is REG_NOERROR.
1474	 - We never return non-NULL value in case of any errors, it is for
1475	   optimization.  */
1476
1477static re_dfastate_t *
1478internal_function
1479re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1480		  const re_node_set *nodes)
1481{
1482  re_hashval_t hash;
1483  re_dfastate_t *new_state;
1484  struct re_state_table_entry *spot;
1485  Idx i;
1486#ifdef lint
1487  /* Suppress bogus uninitialized-variable warnings.  */
1488  *err = REG_NOERROR;
1489#endif
1490  if (BE (nodes->nelem == 0, 0))
1491    {
1492      *err = REG_NOERROR;
1493      return NULL;
1494    }
1495  hash = calc_state_hash (nodes, 0);
1496  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1497
1498  for (i = 0 ; i < spot->num ; i++)
1499    {
1500      re_dfastate_t *state = spot->array[i];
1501      if (hash != state->hash)
1502	continue;
1503      if (re_node_set_compare (&state->nodes, nodes))
1504	return state;
1505    }
1506
1507  /* There are no appropriate state in the dfa, create the new one.  */
1508  new_state = create_ci_newstate (dfa, nodes, hash);
1509  if (BE (new_state == NULL, 0))
1510    *err = REG_ESPACE;
1511
1512  return new_state;
1513}
1514
1515/* Search for the state whose node_set is equivalent to NODES and
1516   whose context is equivalent to CONTEXT.
1517   Return the pointer to the state, if we found it in the DFA.
1518   Otherwise create the new one and return it.  In case of an error
1519   return NULL and set the error code in ERR.
1520   Note: - We assume NULL as the invalid state, then it is possible that
1521	   return value is NULL and ERR is REG_NOERROR.
1522	 - We never return non-NULL value in case of any errors, it is for
1523	   optimization.  */
1524
1525static re_dfastate_t *
1526internal_function
1527re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1528			  const re_node_set *nodes, unsigned int context)
1529{
1530  re_hashval_t hash;
1531  re_dfastate_t *new_state;
1532  struct re_state_table_entry *spot;
1533  Idx i;
1534#ifdef lint
1535  /* Suppress bogus uninitialized-variable warnings.  */
1536  *err = REG_NOERROR;
1537#endif
1538  if (nodes->nelem == 0)
1539    {
1540      *err = REG_NOERROR;
1541      return NULL;
1542    }
1543  hash = calc_state_hash (nodes, context);
1544  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1545
1546  for (i = 0 ; i < spot->num ; i++)
1547    {
1548      re_dfastate_t *state = spot->array[i];
1549      if (state->hash == hash
1550	  && state->context == context
1551	  && re_node_set_compare (state->entrance_nodes, nodes))
1552	return state;
1553    }
1554  /* There are no appropriate state in `dfa', create the new one.  */
1555  new_state = create_cd_newstate (dfa, nodes, context, hash);
1556  if (BE (new_state == NULL, 0))
1557    *err = REG_ESPACE;
1558
1559  return new_state;
1560}
1561
1562/* Finish initialization of the new state NEWSTATE, and using its hash value
1563   HASH put in the appropriate bucket of DFA's state table.  Return value
1564   indicates the error code if failed.  */
1565
1566static reg_errcode_t
1567register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1568		re_hashval_t hash)
1569{
1570  struct re_state_table_entry *spot;
1571  reg_errcode_t err;
1572  Idx i;
1573
1574  newstate->hash = hash;
1575  err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1576  if (BE (err != REG_NOERROR, 0))
1577    return REG_ESPACE;
1578  for (i = 0; i < newstate->nodes.nelem; i++)
1579    {
1580      Idx elem = newstate->nodes.elems[i];
1581      if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1582	if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1583	  return REG_ESPACE;
1584    }
1585
1586  spot = dfa->state_table + (hash & dfa->state_hash_mask);
1587  if (BE (spot->alloc <= spot->num, 0))
1588    {
1589      Idx new_alloc = 2 * spot->num + 2;
1590      re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1591					      new_alloc);
1592      if (BE (new_array == NULL, 0))
1593	return REG_ESPACE;
1594      spot->array = new_array;
1595      spot->alloc = new_alloc;
1596    }
1597  spot->array[spot->num++] = newstate;
1598  return REG_NOERROR;
1599}
1600
1601static void
1602free_state (re_dfastate_t *state)
1603{
1604  re_node_set_free (&state->non_eps_nodes);
1605  re_node_set_free (&state->inveclosure);
1606  if (state->entrance_nodes != &state->nodes)
1607    {
1608      re_node_set_free (state->entrance_nodes);
1609      re_free (state->entrance_nodes);
1610    }
1611  re_node_set_free (&state->nodes);
1612  re_free (state->word_trtable);
1613  re_free (state->trtable);
1614  re_free (state);
1615}
1616
1617/* Create the new state which is independ of contexts.
1618   Return the new state if succeeded, otherwise return NULL.  */
1619
1620static re_dfastate_t *
1621internal_function
1622create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1623		    re_hashval_t hash)
1624{
1625  Idx i;
1626  reg_errcode_t err;
1627  re_dfastate_t *newstate;
1628
1629  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1630  if (BE (newstate == NULL, 0))
1631    return NULL;
1632  err = re_node_set_init_copy (&newstate->nodes, nodes);
1633  if (BE (err != REG_NOERROR, 0))
1634    {
1635      re_free (newstate);
1636      return NULL;
1637    }
1638
1639  newstate->entrance_nodes = &newstate->nodes;
1640  for (i = 0 ; i < nodes->nelem ; i++)
1641    {
1642      re_token_t *node = dfa->nodes + nodes->elems[i];
1643      re_token_type_t type = node->type;
1644      if (type == CHARACTER && !node->constraint)
1645	continue;
1646#ifdef RE_ENABLE_I18N
1647      newstate->accept_mb |= node->accept_mb;
1648#endif /* RE_ENABLE_I18N */
1649
1650      /* If the state has the halt node, the state is a halt state.  */
1651      if (type == END_OF_RE)
1652	newstate->halt = 1;
1653      else if (type == OP_BACK_REF)
1654	newstate->has_backref = 1;
1655      else if (type == ANCHOR || node->constraint)
1656	newstate->has_constraint = 1;
1657    }
1658  err = register_state (dfa, newstate, hash);
1659  if (BE (err != REG_NOERROR, 0))
1660    {
1661      free_state (newstate);
1662      newstate = NULL;
1663    }
1664  return newstate;
1665}
1666
1667/* Create the new state which is depend on the context CONTEXT.
1668   Return the new state if succeeded, otherwise return NULL.  */
1669
1670static re_dfastate_t *
1671internal_function
1672create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1673		    unsigned int context, re_hashval_t hash)
1674{
1675  Idx i, nctx_nodes = 0;
1676  reg_errcode_t err;
1677  re_dfastate_t *newstate;
1678
1679  newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1680  if (BE (newstate == NULL, 0))
1681    return NULL;
1682  err = re_node_set_init_copy (&newstate->nodes, nodes);
1683  if (BE (err != REG_NOERROR, 0))
1684    {
1685      re_free (newstate);
1686      return NULL;
1687    }
1688
1689  newstate->context = context;
1690  newstate->entrance_nodes = &newstate->nodes;
1691
1692  for (i = 0 ; i < nodes->nelem ; i++)
1693    {
1694      re_token_t *node = dfa->nodes + nodes->elems[i];
1695      re_token_type_t type = node->type;
1696      unsigned int constraint = node->constraint;
1697
1698      if (type == CHARACTER && !constraint)
1699	continue;
1700#ifdef RE_ENABLE_I18N
1701      newstate->accept_mb |= node->accept_mb;
1702#endif /* RE_ENABLE_I18N */
1703
1704      /* If the state has the halt node, the state is a halt state.  */
1705      if (type == END_OF_RE)
1706	newstate->halt = 1;
1707      else if (type == OP_BACK_REF)
1708	newstate->has_backref = 1;
1709
1710      if (constraint)
1711	{
1712	  if (newstate->entrance_nodes == &newstate->nodes)
1713	    {
1714	      newstate->entrance_nodes = re_malloc (re_node_set, 1);
1715	      if (BE (newstate->entrance_nodes == NULL, 0))
1716		{
1717		  free_state (newstate);
1718		  return NULL;
1719		}
1720	      re_node_set_init_copy (newstate->entrance_nodes, nodes);
1721	      nctx_nodes = 0;
1722	      newstate->has_constraint = 1;
1723	    }
1724
1725	  if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1726	    {
1727	      re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1728	      ++nctx_nodes;
1729	    }
1730	}
1731    }
1732  err = register_state (dfa, newstate, hash);
1733  if (BE (err != REG_NOERROR, 0))
1734    {
1735      free_state (newstate);
1736      newstate = NULL;
1737    }
1738  return  newstate;
1739}
1740