/* Iterating through multibyte strings: macros for multi-byte encodings.
   Copyright (C) 2001, 2005, 2007, 2009-2012 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

/* Written by Bruno Haible <bruno@clisp.org>.  */

/* The macros in this file implement forward iteration through a
   multi-byte string, without knowing its length a-priori.

   With these macros, an iteration loop that looks like

      char *iter;
      for (iter = buf; *iter != '\0'; iter++)
        {
          do_something (*iter);
        }

   becomes

      mbui_iterator_t iter;
      for (mbui_init (iter, buf); mbui_avail (iter); mbui_advance (iter))
        {
          do_something (mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
        }

   The benefit of these macros over plain use of mbrtowc is:
   - Handling of invalid multibyte sequences is possible without
     making the code more complicated, while still preserving the
     invalid multibyte sequences.

   Compared to mbiter.h, the macros here don't need to know the string's
   length a-priori.  The downside is that at each step, the look-ahead
   that guards against overrunning the terminating '\0' is more expensive.
   The mbui_* macros are therefore suitable when there is a high probability
   that only the first few multibyte characters need to be inspected.
   Whereas the mbi_* macros are better if usually the iteration runs
   through the entire string.

   mbui_iterator_t
     is a type usable for variable declarations.

   mbui_init (iter, startptr)
     initializes the iterator, starting at startptr.

   mbui_avail (iter)
     returns true if there are more multibyte characters available before
     the end of string is reached.  In this case, mbui_cur (iter) is
     initialized to the next multibyte character.

   mbui_advance (iter)
     advances the iterator by one multibyte character.

   mbui_cur (iter)
     returns the current multibyte character, of type mbchar_t.  All the
     macros defined in mbchar.h can be used on it.

   mbui_cur_ptr (iter)
     returns a pointer to the beginning of the current multibyte character.

   mbui_reloc (iter, ptrdiff)
     relocates iterator when the string is moved by ptrdiff bytes.

   mbui_copy (&destiter, &srciter)
     copies srciter to destiter.

   Here are the function prototypes of the macros.

   extern void          mbui_init (mbui_iterator_t iter, const char *startptr);
   extern bool          mbui_avail (mbui_iterator_t iter);
   extern void          mbui_advance (mbui_iterator_t iter);
   extern mbchar_t      mbui_cur (mbui_iterator_t iter);
   extern const char *  mbui_cur_ptr (mbui_iterator_t iter);
   extern void          mbui_reloc (mbui_iterator_t iter, ptrdiff_t ptrdiff);
   extern void          mbui_copy (mbui_iterator_t *new, const mbui_iterator_t *old);
 */

#ifndef _MBUITER_H
9005436638acc7c010349a69c3395f1a57c642dc62Ying Wang#define _MBUITER_H 1 9105436638acc7c010349a69c3395f1a57c642dc62Ying Wang 9205436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <assert.h> 9305436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <stdbool.h> 9405436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <stddef.h> 9505436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <stdlib.h> 9605436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <string.h> 9705436638acc7c010349a69c3395f1a57c642dc62Ying Wang 9805436638acc7c010349a69c3395f1a57c642dc62Ying Wang/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before 9905436638acc7c010349a69c3395f1a57c642dc62Ying Wang <wchar.h>. 10005436638acc7c010349a69c3395f1a57c642dc62Ying Wang BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before 10105436638acc7c010349a69c3395f1a57c642dc62Ying Wang <wchar.h>. */ 10205436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <stdio.h> 10305436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <time.h> 10405436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include <wchar.h> 10505436638acc7c010349a69c3395f1a57c642dc62Ying Wang 10605436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include "mbchar.h" 10705436638acc7c010349a69c3395f1a57c642dc62Ying Wang#include "strnlen1.h" 10805436638acc7c010349a69c3395f1a57c642dc62Ying Wang 10905436638acc7c010349a69c3395f1a57c642dc62Ying Wang_GL_INLINE_HEADER_BEGIN 11005436638acc7c010349a69c3395f1a57c642dc62Ying Wang#ifndef MBUITER_INLINE 11105436638acc7c010349a69c3395f1a57c642dc62Ying Wang# define MBUITER_INLINE _GL_INLINE 11205436638acc7c010349a69c3395f1a57c642dc62Ying Wang#endif 11305436638acc7c010349a69c3395f1a57c642dc62Ying Wang 11405436638acc7c010349a69c3395f1a57c642dc62Ying Wangstruct mbuiter_multi 11505436638acc7c010349a69c3395f1a57c642dc62Ying Wang{ 11605436638acc7c010349a69c3395f1a57c642dc62Ying Wang bool in_shift; /* true if next byte may not be interpreted as ASCII */ 
11705436638acc7c010349a69c3395f1a57c642dc62Ying Wang mbstate_t state; /* if in_shift: current shift state */ 11805436638acc7c010349a69c3395f1a57c642dc62Ying Wang bool next_done; /* true if mbui_avail has already filled the following */ 11905436638acc7c010349a69c3395f1a57c642dc62Ying Wang struct mbchar cur; /* the current character: 12005436638acc7c010349a69c3395f1a57c642dc62Ying Wang const char *cur.ptr pointer to current character 12105436638acc7c010349a69c3395f1a57c642dc62Ying Wang The following are only valid after mbui_avail. 12205436638acc7c010349a69c3395f1a57c642dc62Ying Wang size_t cur.bytes number of bytes of current character 12305436638acc7c010349a69c3395f1a57c642dc62Ying Wang bool cur.wc_valid true if wc is a valid wide character 12405436638acc7c010349a69c3395f1a57c642dc62Ying Wang wchar_t cur.wc if wc_valid: the current character 12505436638acc7c010349a69c3395f1a57c642dc62Ying Wang */ 12605436638acc7c010349a69c3395f1a57c642dc62Ying Wang}; 12705436638acc7c010349a69c3395f1a57c642dc62Ying Wang 12805436638acc7c010349a69c3395f1a57c642dc62Ying WangMBUITER_INLINE void 12905436638acc7c010349a69c3395f1a57c642dc62Ying Wangmbuiter_multi_next (struct mbuiter_multi *iter) 13005436638acc7c010349a69c3395f1a57c642dc62Ying Wang{ 13105436638acc7c010349a69c3395f1a57c642dc62Ying Wang if (iter->next_done) 13205436638acc7c010349a69c3395f1a57c642dc62Ying Wang return; 13305436638acc7c010349a69c3395f1a57c642dc62Ying Wang if (iter->in_shift) 13405436638acc7c010349a69c3395f1a57c642dc62Ying Wang goto with_shift; 13505436638acc7c010349a69c3395f1a57c642dc62Ying Wang /* Handle most ASCII characters quickly, without calling mbrtowc(). */ 13605436638acc7c010349a69c3395f1a57c642dc62Ying Wang if (is_basic (*iter->cur.ptr)) 13705436638acc7c010349a69c3395f1a57c642dc62Ying Wang { 13805436638acc7c010349a69c3395f1a57c642dc62Ying Wang /* These characters are part of the basic character set. 
ISO C 99 13905436638acc7c010349a69c3395f1a57c642dc62Ying Wang guarantees that their wide character code is identical to their 14005436638acc7c010349a69c3395f1a57c642dc62Ying Wang char code. */ 14105436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->cur.bytes = 1; 14205436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->cur.wc = *iter->cur.ptr; 14305436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->cur.wc_valid = true; 14405436638acc7c010349a69c3395f1a57c642dc62Ying Wang } 14505436638acc7c010349a69c3395f1a57c642dc62Ying Wang else 14605436638acc7c010349a69c3395f1a57c642dc62Ying Wang { 14705436638acc7c010349a69c3395f1a57c642dc62Ying Wang assert (mbsinit (&iter->state)); 14805436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->in_shift = true; 14905436638acc7c010349a69c3395f1a57c642dc62Ying Wang with_shift: 15005436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr, 15105436638acc7c010349a69c3395f1a57c642dc62Ying Wang strnlen1 (iter->cur.ptr, MB_CUR_MAX), 15205436638acc7c010349a69c3395f1a57c642dc62Ying Wang &iter->state); 15305436638acc7c010349a69c3395f1a57c642dc62Ying Wang if (iter->cur.bytes == (size_t) -1) 15405436638acc7c010349a69c3395f1a57c642dc62Ying Wang { 15505436638acc7c010349a69c3395f1a57c642dc62Ying Wang /* An invalid multibyte sequence was encountered. */ 15605436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->cur.bytes = 1; 15705436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->cur.wc_valid = false; 15805436638acc7c010349a69c3395f1a57c642dc62Ying Wang /* Whether to set iter->in_shift = false and reset iter->state 15905436638acc7c010349a69c3395f1a57c642dc62Ying Wang or not is not very important; the string is bogus anyway. 
*/ 16005436638acc7c010349a69c3395f1a57c642dc62Ying Wang } 16105436638acc7c010349a69c3395f1a57c642dc62Ying Wang else if (iter->cur.bytes == (size_t) -2) 16205436638acc7c010349a69c3395f1a57c642dc62Ying Wang { 16305436638acc7c010349a69c3395f1a57c642dc62Ying Wang /* An incomplete multibyte character at the end. */ 16405436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->cur.bytes = strlen (iter->cur.ptr); 16505436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->cur.wc_valid = false; 16605436638acc7c010349a69c3395f1a57c642dc62Ying Wang /* Whether to set iter->in_shift = false and reset iter->state 16705436638acc7c010349a69c3395f1a57c642dc62Ying Wang or not is not important; the string end is reached anyway. */ 16805436638acc7c010349a69c3395f1a57c642dc62Ying Wang } 16905436638acc7c010349a69c3395f1a57c642dc62Ying Wang else 17005436638acc7c010349a69c3395f1a57c642dc62Ying Wang { 17105436638acc7c010349a69c3395f1a57c642dc62Ying Wang if (iter->cur.bytes == 0) 17205436638acc7c010349a69c3395f1a57c642dc62Ying Wang { 17305436638acc7c010349a69c3395f1a57c642dc62Ying Wang /* A null wide character was encountered. */ 17405436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->cur.bytes = 1; 17505436638acc7c010349a69c3395f1a57c642dc62Ying Wang assert (*iter->cur.ptr == '\0'); 17605436638acc7c010349a69c3395f1a57c642dc62Ying Wang assert (iter->cur.wc == 0); 17705436638acc7c010349a69c3395f1a57c642dc62Ying Wang } 17805436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->cur.wc_valid = true; 17905436638acc7c010349a69c3395f1a57c642dc62Ying Wang 18005436638acc7c010349a69c3395f1a57c642dc62Ying Wang /* When in the initial state, we can go back treating ASCII 18105436638acc7c010349a69c3395f1a57c642dc62Ying Wang characters more quickly. 
*/ 18205436638acc7c010349a69c3395f1a57c642dc62Ying Wang if (mbsinit (&iter->state)) 18305436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->in_shift = false; 18405436638acc7c010349a69c3395f1a57c642dc62Ying Wang } 18505436638acc7c010349a69c3395f1a57c642dc62Ying Wang } 18605436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->next_done = true; 18705436638acc7c010349a69c3395f1a57c642dc62Ying Wang} 18805436638acc7c010349a69c3395f1a57c642dc62Ying Wang 18905436638acc7c010349a69c3395f1a57c642dc62Ying WangMBUITER_INLINE void 19005436638acc7c010349a69c3395f1a57c642dc62Ying Wangmbuiter_multi_reloc (struct mbuiter_multi *iter, ptrdiff_t ptrdiff) 19105436638acc7c010349a69c3395f1a57c642dc62Ying Wang{ 19205436638acc7c010349a69c3395f1a57c642dc62Ying Wang iter->cur.ptr += ptrdiff; 19305436638acc7c010349a69c3395f1a57c642dc62Ying Wang} 19405436638acc7c010349a69c3395f1a57c642dc62Ying Wang 19505436638acc7c010349a69c3395f1a57c642dc62Ying WangMBUITER_INLINE void 19605436638acc7c010349a69c3395f1a57c642dc62Ying Wangmbuiter_multi_copy (struct mbuiter_multi *new_iter, const struct mbuiter_multi *old_iter) 19705436638acc7c010349a69c3395f1a57c642dc62Ying Wang{ 19805436638acc7c010349a69c3395f1a57c642dc62Ying Wang if ((new_iter->in_shift = old_iter->in_shift)) 19905436638acc7c010349a69c3395f1a57c642dc62Ying Wang memcpy (&new_iter->state, &old_iter->state, sizeof (mbstate_t)); 20005436638acc7c010349a69c3395f1a57c642dc62Ying Wang else 20105436638acc7c010349a69c3395f1a57c642dc62Ying Wang memset (&new_iter->state, 0, sizeof (mbstate_t)); 20205436638acc7c010349a69c3395f1a57c642dc62Ying Wang new_iter->next_done = old_iter->next_done; 20305436638acc7c010349a69c3395f1a57c642dc62Ying Wang mb_copy (&new_iter->cur, &old_iter->cur); 20405436638acc7c010349a69c3395f1a57c642dc62Ying Wang} 20505436638acc7c010349a69c3395f1a57c642dc62Ying Wang 20605436638acc7c010349a69c3395f1a57c642dc62Ying Wang/* Iteration macros. 
*/ 20705436638acc7c010349a69c3395f1a57c642dc62Ying Wangtypedef struct mbuiter_multi mbui_iterator_t; 20805436638acc7c010349a69c3395f1a57c642dc62Ying Wang#define mbui_init(iter, startptr) \ 20905436638acc7c010349a69c3395f1a57c642dc62Ying Wang ((iter).cur.ptr = (startptr), \ 21005436638acc7c010349a69c3395f1a57c642dc62Ying Wang (iter).in_shift = false, memset (&(iter).state, '\0', sizeof (mbstate_t)), \ 21105436638acc7c010349a69c3395f1a57c642dc62Ying Wang (iter).next_done = false) 21205436638acc7c010349a69c3395f1a57c642dc62Ying Wang#define mbui_avail(iter) \ 21305436638acc7c010349a69c3395f1a57c642dc62Ying Wang (mbuiter_multi_next (&(iter)), !mb_isnul ((iter).cur)) 21405436638acc7c010349a69c3395f1a57c642dc62Ying Wang#define mbui_advance(iter) \ 21505436638acc7c010349a69c3395f1a57c642dc62Ying Wang ((iter).cur.ptr += (iter).cur.bytes, (iter).next_done = false) 21605436638acc7c010349a69c3395f1a57c642dc62Ying Wang 21705436638acc7c010349a69c3395f1a57c642dc62Ying Wang/* Access to the current character. */ 21805436638acc7c010349a69c3395f1a57c642dc62Ying Wang#define mbui_cur(iter) (iter).cur 21905436638acc7c010349a69c3395f1a57c642dc62Ying Wang#define mbui_cur_ptr(iter) (iter).cur.ptr 22005436638acc7c010349a69c3395f1a57c642dc62Ying Wang 22105436638acc7c010349a69c3395f1a57c642dc62Ying Wang/* Relocation. */ 22205436638acc7c010349a69c3395f1a57c642dc62Ying Wang#define mbui_reloc(iter, ptrdiff) mbuiter_multi_reloc (&iter, ptrdiff) 22305436638acc7c010349a69c3395f1a57c642dc62Ying Wang 22405436638acc7c010349a69c3395f1a57c642dc62Ying Wang/* Copying an iterator. */ 22505436638acc7c010349a69c3395f1a57c642dc62Ying Wang#define mbui_copy mbuiter_multi_copy 22605436638acc7c010349a69c3395f1a57c642dc62Ying Wang 22705436638acc7c010349a69c3395f1a57c642dc62Ying Wang_GL_INLINE_HEADER_END 22805436638acc7c010349a69c3395f1a57c642dc62Ying Wang 22905436638acc7c010349a69c3395f1a57c642dc62Ying Wang#endif /* _MBUITER_H */ 230