105436638acc7c010349a69c3395f1a57c642dc62Ying Wang/* Iterating through multibyte strings: macros for multi-byte encodings.
205436638acc7c010349a69c3395f1a57c642dc62Ying Wang   Copyright (C) 2001, 2005, 2007, 2009-2012 Free Software Foundation, Inc.
305436638acc7c010349a69c3395f1a57c642dc62Ying Wang
405436638acc7c010349a69c3395f1a57c642dc62Ying Wang   This program is free software: you can redistribute it and/or modify
505436638acc7c010349a69c3395f1a57c642dc62Ying Wang   it under the terms of the GNU General Public License as published by
605436638acc7c010349a69c3395f1a57c642dc62Ying Wang   the Free Software Foundation; either version 3 of the License, or
705436638acc7c010349a69c3395f1a57c642dc62Ying Wang   (at your option) any later version.
805436638acc7c010349a69c3395f1a57c642dc62Ying Wang
905436638acc7c010349a69c3395f1a57c642dc62Ying Wang   This program is distributed in the hope that it will be useful,
1005436638acc7c010349a69c3395f1a57c642dc62Ying Wang   but WITHOUT ANY WARRANTY; without even the implied warranty of
1105436638acc7c010349a69c3395f1a57c642dc62Ying Wang   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
1205436638acc7c010349a69c3395f1a57c642dc62Ying Wang   GNU General Public License for more details.
1305436638acc7c010349a69c3395f1a57c642dc62Ying Wang
1405436638acc7c010349a69c3395f1a57c642dc62Ying Wang   You should have received a copy of the GNU General Public License
1505436638acc7c010349a69c3395f1a57c642dc62Ying Wang   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
1605436638acc7c010349a69c3395f1a57c642dc62Ying Wang
1705436638acc7c010349a69c3395f1a57c642dc62Ying Wang/* Written by Bruno Haible <bruno@clisp.org>.  */
1805436638acc7c010349a69c3395f1a57c642dc62Ying Wang
1905436638acc7c010349a69c3395f1a57c642dc62Ying Wang/* The macros in this file implement forward iteration through a
2005436638acc7c010349a69c3395f1a57c642dc62Ying Wang   multi-byte string, without knowing its length a-priori.
2105436638acc7c010349a69c3395f1a57c642dc62Ying Wang
2205436638acc7c010349a69c3395f1a57c642dc62Ying Wang   With these macros, an iteration loop that looks like
2305436638acc7c010349a69c3395f1a57c642dc62Ying Wang
2405436638acc7c010349a69c3395f1a57c642dc62Ying Wang      char *iter;
2505436638acc7c010349a69c3395f1a57c642dc62Ying Wang      for (iter = buf; *iter != '\0'; iter++)
2605436638acc7c010349a69c3395f1a57c642dc62Ying Wang        {
2705436638acc7c010349a69c3395f1a57c642dc62Ying Wang          do_something (*iter);
2805436638acc7c010349a69c3395f1a57c642dc62Ying Wang        }
2905436638acc7c010349a69c3395f1a57c642dc62Ying Wang
3005436638acc7c010349a69c3395f1a57c642dc62Ying Wang   becomes
3105436638acc7c010349a69c3395f1a57c642dc62Ying Wang
3205436638acc7c010349a69c3395f1a57c642dc62Ying Wang      mbui_iterator_t iter;
3305436638acc7c010349a69c3395f1a57c642dc62Ying Wang      for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter))
3405436638acc7c010349a69c3395f1a57c642dc62Ying Wang        {
3505436638acc7c010349a69c3395f1a57c642dc62Ying Wang          do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
3605436638acc7c010349a69c3395f1a57c642dc62Ying Wang        }
3705436638acc7c010349a69c3395f1a57c642dc62Ying Wang
3805436638acc7c010349a69c3395f1a57c642dc62Ying Wang   The benefit of these macros over plain use of mbrtowc is:
3905436638acc7c010349a69c3395f1a57c642dc62Ying Wang   - Handling of invalid multibyte sequences is possible without
4005436638acc7c010349a69c3395f1a57c642dc62Ying Wang     making the code more complicated, while still preserving the
4105436638acc7c010349a69c3395f1a57c642dc62Ying Wang     invalid multibyte sequences.
4205436638acc7c010349a69c3395f1a57c642dc62Ying Wang
4305436638acc7c010349a69c3395f1a57c642dc62Ying Wang   Compared to mbiter.h, the macros here don't need to know the string's
4405436638acc7c010349a69c3395f1a57c642dc62Ying Wang   length a-priori.  The downside is that at each step, the look-ahead
4505436638acc7c010349a69c3395f1a57c642dc62Ying Wang   that guards against overrunning the terminating '\0' is more expensive.
4605436638acc7c010349a69c3395f1a57c642dc62Ying Wang   The mbui_* macros are therefore suitable when there is a high probability
4705436638acc7c010349a69c3395f1a57c642dc62Ying Wang   that only the first few multibyte characters need to be inspected.
4805436638acc7c010349a69c3395f1a57c642dc62Ying Wang   Whereas the mbi_* macros are better if usually the iteration runs
4905436638acc7c010349a69c3395f1a57c642dc62Ying Wang   through the entire string.
5005436638acc7c010349a69c3395f1a57c642dc62Ying Wang
5105436638acc7c010349a69c3395f1a57c642dc62Ying Wang   mbui_iterator_t
5205436638acc7c010349a69c3395f1a57c642dc62Ying Wang     is a type usable for variable declarations.
5305436638acc7c010349a69c3395f1a57c642dc62Ying Wang
5405436638acc7c010349a69c3395f1a57c642dc62Ying Wang   mbui_init (iter, startptr)
5505436638acc7c010349a69c3395f1a57c642dc62Ying Wang     initializes the iterator, starting at startptr.
5605436638acc7c010349a69c3395f1a57c642dc62Ying Wang
5705436638acc7c010349a69c3395f1a57c642dc62Ying Wang   mbui_avail (iter)
5805436638acc7c010349a69c3395f1a57c642dc62Ying Wang     returns true if there are more multibyte characters available before
5905436638acc7c010349a69c3395f1a57c642dc62Ying Wang     the end of string is reached. In this case, mbui_cur (iter) is
6005436638acc7c010349a69c3395f1a57c642dc62Ying Wang     initialized to the next multibyte character.
6105436638acc7c010349a69c3395f1a57c642dc62Ying Wang
6205436638acc7c010349a69c3395f1a57c642dc62Ying Wang   mbui_advance (iter)
6305436638acc7c010349a69c3395f1a57c642dc62Ying Wang     advances the iterator by one multibyte character.
6405436638acc7c010349a69c3395f1a57c642dc62Ying Wang
6505436638acc7c010349a69c3395f1a57c642dc62Ying Wang   mbui_cur (iter)
6605436638acc7c010349a69c3395f1a57c642dc62Ying Wang     returns the current multibyte character, of type mbchar_t.  All the
6705436638acc7c010349a69c3395f1a57c642dc62Ying Wang     macros defined in mbchar.h can be used on it.
6805436638acc7c010349a69c3395f1a57c642dc62Ying Wang
6905436638acc7c010349a69c3395f1a57c642dc62Ying Wang   mbui_cur_ptr (iter)
     returns a pointer to the beginning of the current multibyte character.
7105436638acc7c010349a69c3395f1a57c642dc62Ying Wang
7205436638acc7c010349a69c3395f1a57c642dc62Ying Wang   mbui_reloc (iter, ptrdiff)
7305436638acc7c010349a69c3395f1a57c642dc62Ying Wang     relocates iterator when the string is moved by ptrdiff bytes.
7405436638acc7c010349a69c3395f1a57c642dc62Ying Wang
7505436638acc7c010349a69c3395f1a57c642dc62Ying Wang   mbui_copy (&destiter, &srciter)
7605436638acc7c010349a69c3395f1a57c642dc62Ying Wang     copies srciter to destiter.
7705436638acc7c010349a69c3395f1a57c642dc62Ying Wang
7805436638acc7c010349a69c3395f1a57c642dc62Ying Wang   Here are the function prototypes of the macros.
7905436638acc7c010349a69c3395f1a57c642dc62Ying Wang
8005436638acc7c010349a69c3395f1a57c642dc62Ying Wang   extern void          mbui_init (mbui_iterator_t iter, const char *startptr);
8105436638acc7c010349a69c3395f1a57c642dc62Ying Wang   extern bool          mbui_avail (mbui_iterator_t iter);
8205436638acc7c010349a69c3395f1a57c642dc62Ying Wang   extern void          mbui_advance (mbui_iterator_t iter);
8305436638acc7c010349a69c3395f1a57c642dc62Ying Wang   extern mbchar_t      mbui_cur (mbui_iterator_t iter);
8405436638acc7c010349a69c3395f1a57c642dc62Ying Wang   extern const char *  mbui_cur_ptr (mbui_iterator_t iter);
8505436638acc7c010349a69c3395f1a57c642dc62Ying Wang   extern void          mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff);
8605436638acc7c010349a69c3395f1a57c642dc62Ying Wang   extern void          mbui_copy (mbui_iterator_t *new, const mbui_iterator_t *old);
8705436638acc7c010349a69c3395f1a57c642dc62Ying Wang */
8805436638acc7c010349a69c3395f1a57c642dc62Ying Wang
8905436638acc7c010349a69c3395f1a57c642dc62Ying Wang#ifndef _MBUITER_H
9005436638acc7c010349a69c3395f1a57c642dc62Ying Wang#define _MBUITER_H 1
9105436638acc7c010349a69c3395f1a57c642dc62Ying Wang
9205436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <assert.h>
9305436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <stdbool.h>
9405436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <stddef.h>
9505436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <stdlib.h>
9605436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <string.h>
9705436638acc7c010349a69c3395f1a57c642dc62Ying Wang
9805436638acc7c010349a69c3395f1a57c642dc62Ying Wang/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
9905436638acc7c010349a69c3395f1a57c642dc62Ying Wang   <wchar.h>.
10005436638acc7c010349a69c3395f1a57c642dc62Ying Wang   BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
10105436638acc7c010349a69c3395f1a57c642dc62Ying Wang   <wchar.h>.  */
10205436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <stdio.h>
10305436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <time.h>
10405436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <wchar.h>
10505436638acc7c010349a69c3395f1a57c642dc62Ying Wang
10605436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include "mbchar.h"
10705436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include "strnlen1.h"
10805436638acc7c010349a69c3395f1a57c642dc62Ying Wang
10905436638acc7c010349a69c3395f1a57c642dc62Ying Wang_GL_INLINE_HEADER_BEGIN
/* Linkage specifier applied to the inline functions defined in this header.
   It is only given its default value here when nothing has defined it
   earlier.  */
#if !defined MBUITER_INLINE
# define MBUITER_INLINE _GL_INLINE
#endif
11305436638acc7c010349a69c3395f1a57c642dc62Ying Wang
11405436638acc7c010349a69c3395f1a57c642dc62Ying Wangstruct mbuiter_multi
11505436638acc7c010349a69c3395f1a57c642dc62Ying Wang{
11605436638acc7c010349a69c3395f1a57c642dc62Ying Wang  bool in_shift;        /* true if next byte may not be interpreted as ASCII */
11705436638acc7c010349a69c3395f1a57c642dc62Ying Wang  mbstate_t state;      /* if in_shift: current shift state */
11805436638acc7c010349a69c3395f1a57c642dc62Ying Wang  bool next_done;       /* true if mbui_avail has already filled the following */
11905436638acc7c010349a69c3395f1a57c642dc62Ying Wang  struct mbchar cur;    /* the current character:
12005436638acc7c010349a69c3395f1a57c642dc62Ying Wang        const char *cur.ptr             pointer to current character
12105436638acc7c010349a69c3395f1a57c642dc62Ying Wang        The following are only valid after mbui_avail.
12205436638acc7c010349a69c3395f1a57c642dc62Ying Wang        size_t cur.bytes                number of bytes of current character
12305436638acc7c010349a69c3395f1a57c642dc62Ying Wang        bool cur.wc_valid               true if wc is a valid wide character
12405436638acc7c010349a69c3395f1a57c642dc62Ying Wang        wchar_t cur.wc                  if wc_valid: the current character
12505436638acc7c010349a69c3395f1a57c642dc62Ying Wang        */
12605436638acc7c010349a69c3395f1a57c642dc62Ying Wang};
12705436638acc7c010349a69c3395f1a57c642dc62Ying Wang
12805436638acc7c010349a69c3395f1a57c642dc62Ying WangMBUITER_INLINE void
12905436638acc7c010349a69c3395f1a57c642dc62Ying Wangmbuiter_multi_next (struct mbuiter_multi *iter)
13005436638acc7c010349a69c3395f1a57c642dc62Ying Wang{
13105436638acc7c010349a69c3395f1a57c642dc62Ying Wang  if (iter->next_done)
13205436638acc7c010349a69c3395f1a57c642dc62Ying Wang    return;
13305436638acc7c010349a69c3395f1a57c642dc62Ying Wang  if (iter->in_shift)
13405436638acc7c010349a69c3395f1a57c642dc62Ying Wang    goto with_shift;
13505436638acc7c010349a69c3395f1a57c642dc62Ying Wang  /* Handle most ASCII characters quickly, without calling mbrtowc().  */
13605436638acc7c010349a69c3395f1a57c642dc62Ying Wang  if (is_basic (*iter->cur.ptr))
13705436638acc7c010349a69c3395f1a57c642dc62Ying Wang    {
13805436638acc7c010349a69c3395f1a57c642dc62Ying Wang      /* These characters are part of the basic character set.  ISO C 99
13905436638acc7c010349a69c3395f1a57c642dc62Ying Wang         guarantees that their wide character code is identical to their
14005436638acc7c010349a69c3395f1a57c642dc62Ying Wang         char code.  */
14105436638acc7c010349a69c3395f1a57c642dc62Ying Wang      iter->cur.bytes = 1;
14205436638acc7c010349a69c3395f1a57c642dc62Ying Wang      iter->cur.wc = *iter->cur.ptr;
14305436638acc7c010349a69c3395f1a57c642dc62Ying Wang      iter->cur.wc_valid = true;
14405436638acc7c010349a69c3395f1a57c642dc62Ying Wang    }
14505436638acc7c010349a69c3395f1a57c642dc62Ying Wang  else
14605436638acc7c010349a69c3395f1a57c642dc62Ying Wang    {
14705436638acc7c010349a69c3395f1a57c642dc62Ying Wang      assert (mbsinit (&iter->state));
14805436638acc7c010349a69c3395f1a57c642dc62Ying Wang      iter->in_shift = true;
14905436638acc7c010349a69c3395f1a57c642dc62Ying Wang    with_shift:
15005436638acc7c010349a69c3395f1a57c642dc62Ying Wang      iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
15105436638acc7c010349a69c3395f1a57c642dc62Ying Wang                                 strnlen1 (iter->cur.ptr, MB_CUR_MAX),
15205436638acc7c010349a69c3395f1a57c642dc62Ying Wang                                 &iter->state);
15305436638acc7c010349a69c3395f1a57c642dc62Ying Wang      if (iter->cur.bytes == (size_t) -1)
15405436638acc7c010349a69c3395f1a57c642dc62Ying Wang        {
15505436638acc7c010349a69c3395f1a57c642dc62Ying Wang          /* An invalid multibyte sequence was encountered.  */
15605436638acc7c010349a69c3395f1a57c642dc62Ying Wang          iter->cur.bytes = 1;
15705436638acc7c010349a69c3395f1a57c642dc62Ying Wang          iter->cur.wc_valid = false;
15805436638acc7c010349a69c3395f1a57c642dc62Ying Wang          /* Whether to set iter->in_shift = false and reset iter->state
15905436638acc7c010349a69c3395f1a57c642dc62Ying Wang             or not is not very important; the string is bogus anyway.  */
16005436638acc7c010349a69c3395f1a57c642dc62Ying Wang        }
16105436638acc7c010349a69c3395f1a57c642dc62Ying Wang      else if (iter->cur.bytes == (size_t) -2)
16205436638acc7c010349a69c3395f1a57c642dc62Ying Wang        {
16305436638acc7c010349a69c3395f1a57c642dc62Ying Wang          /* An incomplete multibyte character at the end.  */
16405436638acc7c010349a69c3395f1a57c642dc62Ying Wang          iter->cur.bytes = strlen (iter->cur.ptr);
16505436638acc7c010349a69c3395f1a57c642dc62Ying Wang          iter->cur.wc_valid = false;
16605436638acc7c010349a69c3395f1a57c642dc62Ying Wang          /* Whether to set iter->in_shift = false and reset iter->state
16705436638acc7c010349a69c3395f1a57c642dc62Ying Wang             or not is not important; the string end is reached anyway.  */
16805436638acc7c010349a69c3395f1a57c642dc62Ying Wang        }
16905436638acc7c010349a69c3395f1a57c642dc62Ying Wang      else
17005436638acc7c010349a69c3395f1a57c642dc62Ying Wang        {
17105436638acc7c010349a69c3395f1a57c642dc62Ying Wang          if (iter->cur.bytes == 0)
17205436638acc7c010349a69c3395f1a57c642dc62Ying Wang            {
17305436638acc7c010349a69c3395f1a57c642dc62Ying Wang              /* A null wide character was encountered.  */
17405436638acc7c010349a69c3395f1a57c642dc62Ying Wang              iter->cur.bytes = 1;
17505436638acc7c010349a69c3395f1a57c642dc62Ying Wang              assert (*iter->cur.ptr == '\0');
17605436638acc7c010349a69c3395f1a57c642dc62Ying Wang              assert (iter->cur.wc == 0);
17705436638acc7c010349a69c3395f1a57c642dc62Ying Wang            }
17805436638acc7c010349a69c3395f1a57c642dc62Ying Wang          iter->cur.wc_valid = true;
17905436638acc7c010349a69c3395f1a57c642dc62Ying Wang
18005436638acc7c010349a69c3395f1a57c642dc62Ying Wang          /* When in the initial state, we can go back treating ASCII
18105436638acc7c010349a69c3395f1a57c642dc62Ying Wang             characters more quickly.  */
18205436638acc7c010349a69c3395f1a57c642dc62Ying Wang          if (mbsinit (&iter->state))
18305436638acc7c010349a69c3395f1a57c642dc62Ying Wang            iter->in_shift = false;
18405436638acc7c010349a69c3395f1a57c642dc62Ying Wang        }
18505436638acc7c010349a69c3395f1a57c642dc62Ying Wang    }
18605436638acc7c010349a69c3395f1a57c642dc62Ying Wang  iter->next_done = true;
18705436638acc7c010349a69c3395f1a57c642dc62Ying Wang}
18805436638acc7c010349a69c3395f1a57c642dc62Ying Wang
18905436638acc7c010349a69c3395f1a57c642dc62Ying WangMBUITER_INLINE void
19005436638acc7c010349a69c3395f1a57c642dc62Ying Wangmbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff)
19105436638acc7c010349a69c3395f1a57c642dc62Ying Wang{
19205436638acc7c010349a69c3395f1a57c642dc62Ying Wang  iter->cur.ptr += ptrdiff;
19305436638acc7c010349a69c3395f1a57c642dc62Ying Wang}
19405436638acc7c010349a69c3395f1a57c642dc62Ying Wang
19505436638acc7c010349a69c3395f1a57c642dc62Ying WangMBUITER_INLINE void
19605436638acc7c010349a69c3395f1a57c642dc62Ying Wangmbuiter_multi_copy (struct mbuiter_multi *new_iter, const struct mbuiter_multi *old_iter)
19705436638acc7c010349a69c3395f1a57c642dc62Ying Wang{
19805436638acc7c010349a69c3395f1a57c642dc62Ying Wang  if ((new_iter->in_shift = old_iter->in_shift))
19905436638acc7c010349a69c3395f1a57c642dc62Ying Wang    memcpy (&new_iter->state, &old_iter->state, sizeof (mbstate_t));
20005436638acc7c010349a69c3395f1a57c642dc62Ying Wang  else
20105436638acc7c010349a69c3395f1a57c642dc62Ying Wang    memset (&new_iter->state, 0, sizeof (mbstate_t));
20205436638acc7c010349a69c3395f1a57c642dc62Ying Wang  new_iter->next_done = old_iter->next_done;
20305436638acc7c010349a69c3395f1a57c642dc62Ying Wang  mb_copy (&new_iter->cur, &old_iter->cur);
20405436638acc7c010349a69c3395f1a57c642dc62Ying Wang}
20505436638acc7c010349a69c3395f1a57c642dc62Ying Wang
/* Iteration macros.  */
typedef struct mbuiter_multi mbui_iterator_t;

/* Position iter at startptr, outside of any shift sequence, with a zeroed
   conversion state and no character decoded yet.  */
#define mbui_init(iter, startptr) \
  ((iter).cur.ptr = (startptr), \
   (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \
   (iter).next_done = false)

/* Decode the character at the current position, unless that has been done
   already, and yield true unless mb_isnul reports it as the terminating
   null character.  */
#define mbui_avail(iter) \
  (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur))

/* Step over the current character.  Uses the byte count stored in
   (iter).cur.bytes, and marks the new position as not yet decoded.  */
#define mbui_advance(iter) \
  ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false)

/* Access to the current character.  */
#define mbui_cur(iter) (iter).cur
#define mbui_cur_ptr(iter) (iter).cur.ptr

/* Relocation.  The arguments are parenthesized, like in the other macros,
   so that arbitrary argument expressions expand safely.  */
#define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&(iter), (ptrdiff))

/* Copying an iterator.  Takes pointers: mbui_copy (&destiter, &srciter).  */
#define mbui_copy mbuiter_multi_copy
22605436638acc7c010349a69c3395f1a57c642dc62Ying Wang
22705436638acc7c010349a69c3395f1a57c642dc62Ying Wang_GL_INLINE_HEADER_END
22805436638acc7c010349a69c3395f1a57c642dc62Ying Wang
22905436638acc7c010349a69c3395f1a57c642dc62Ying Wang#endif /* _MBUITER_H */
230