wchar.cpp revision 086bb382db3de7459bc5fad6bb1c257ca331b0e8
1/*	$OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */
2
3/*-
4 * Copyright (c) 2002-2004 Tim J. Robbins
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <errno.h>
30#include <sys/param.h>
31#include <string.h>
32#include <wchar.h>
33#include <uchar.h>
34
35#include "private/bionic_mbstate.h"
36
37//
38// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a
39// 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where
40// mbstate_t was only 4 bytes.
41//
// The state is the UTF-8 sequence. We only support sequences of <= 4 bytes, so
// the LP32 mbstate_t already has enough space (out of the 4 available bytes we
// only need 3, since we should never need to store the entire sequence in the
// intermediary state).
46//
47// The C standard leaves the conversion state undefined after a bad conversion.
48// To avoid unexpected failures due to the possible use of the internal private
49// state we always reset the conversion state when encountering illegal
50// sequences.
51//
52// We also implement the POSIX interface directly rather than being accessed via
53// function pointers.
54//
55
56int mbsinit(const mbstate_t* ps) {
57  return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0));
58}
59
// Converts the next multibyte character of `s` (looking at no more than `n`
// bytes) into a wide character stored through `pwc`, by delegating to
// mbrtoc32().
//
// When `ps` is NULL, a conversion state private to this function is used in
// its place. Whatever mbrtoc32() returns is handed back unchanged.
size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) {
  static mbstate_t __private_state;
  if (ps == NULL) {
    ps = &__private_state;
  }

  // Our wchar_t is UTF-32, so the char32_t decoder can write directly to it.
  char32_t* out = reinterpret_cast<char32_t*>(pwc);
  return mbrtoc32(out, s, n, ps);
}
67
68size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
69  static mbstate_t __private_state;
70  mbstate_t* state = (ps == NULL) ? &__private_state : ps;
71  size_t i, o, r;
72
73  if (dst == NULL) {
74    /*
75     * The fast path in the loop below is not safe if an ASCII
76     * character appears as anything but the first byte of a
77     * multibyte sequence. Check now to avoid doing it in the loop.
78     */
79    if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
80        && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
81      return reset_and_return_illegal(EILSEQ, state);
82    }
83    for (i = o = 0; i < nmc; i += r, o++) {
84      if (static_cast<uint8_t>((*src)[i]) < 0x80) {
85        // Fast path for plain ASCII characters.
86        if ((*src)[i] == '\0') {
87          *src = nullptr;
88          return reset_and_return(o, state);
89        }
90        r = 1;
91      } else {
92        r = mbrtowc(NULL, *src + i, nmc - i, state);
93        if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
94          return reset_and_return_illegal(EILSEQ, state);
95        }
96        if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
97          return reset_and_return_illegal(EILSEQ, state);
98        }
99        if (r == 0) {
100          *src = nullptr;
101          return reset_and_return(o, state);
102        }
103      }
104    }
105    return reset_and_return(o, state);
106  }
107
108  /*
109   * The fast path in the loop below is not safe if an ASCII
110   * character appears as anything but the first byte of a
111   * multibyte sequence. Check now to avoid doing it in the loop.
112   */
113  if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
114      && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
115    return reset_and_return_illegal(EILSEQ, state);
116  }
117  for (i = o = 0; i < nmc && o < len; i += r, o++) {
118    if (static_cast<uint8_t>((*src)[i]) < 0x80) {
119      // Fast path for plain ASCII characters.
120      dst[o] = (*src)[i];
121      r = 1;
122      if ((*src)[i] == '\0') {
123        *src = nullptr;
124        return reset_and_return(o, state);
125      }
126    } else {
127      r = mbrtowc(dst + o, *src + i, nmc - i, state);
128      if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
129        *src += i;
130        return reset_and_return_illegal(EILSEQ, state);
131      }
132      if (r == __MB_ERR_INCOMPLETE_SEQUENCE) {
133        *src += nmc;
134        return reset_and_return(EILSEQ, state);
135      }
136      if (r == 0) {
137        *src = NULL;
138        return reset_and_return(o, state);
139      }
140    }
141  }
142  *src += i;
143  return reset_and_return(o, state);
144}
145
// Converts the multibyte string at `*src` to at most `len` wide characters.
//
// This is mbsnrtowcs() with the input byte limit set to SIZE_MAX, i.e. with no
// practical bound on how many input bytes may be examined; its result is
// returned unchanged.
size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
  const size_t input_byte_limit = SIZE_MAX;
  return mbsnrtowcs(dst, src, input_byte_limit, len, ps);
}
149
// Converts the wide character `wc` to its multibyte form in `s` by delegating
// to c32rtomb().
//
// When `ps` is NULL, a conversion state private to this function is used in
// its place. Whatever c32rtomb() returns is handed back unchanged.
size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) {
  static mbstate_t __private_state;
  if (ps == NULL) {
    ps = &__private_state;
  }

  // Our wchar_t is UTF-32, so the value can be passed on as a char32_t.
  const char32_t code_point = static_cast<char32_t>(wc);
  return c32rtomb(s, code_point, ps);
}
157
158size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
159  static mbstate_t __private_state;
160  mbstate_t* state = (ps == NULL) ? &__private_state : ps;
161
162  if (!mbsinit(state)) {
163    return reset_and_return_illegal(EILSEQ, state);
164  }
165
166  char buf[MB_LEN_MAX];
167  size_t i, o, r;
168  if (dst == NULL) {
169    for (i = o = 0; i < nwc; i++, o += r) {
170      wchar_t wc = (*src)[i];
171      if (static_cast<uint32_t>(wc) < 0x80) {
172        // Fast path for plain ASCII characters.
173        if (wc == 0) {
174          return o;
175        }
176        r = 1;
177      } else {
178        r = wcrtomb(buf, wc, state);
179        if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
180          return r;
181        }
182      }
183    }
184    return o;
185  }
186
187  for (i = o = 0; i < nwc && o < len; i++, o += r) {
188    wchar_t wc = (*src)[i];
189    if (static_cast<uint32_t>(wc) < 0x80) {
190      // Fast path for plain ASCII characters.
191      dst[o] = wc;
192      if (wc == 0) {
193        *src = NULL;
194        return o;
195      }
196      r = 1;
197    } else if (len - o >= sizeof(buf)) {
198      // Enough space to translate in-place.
199      r = wcrtomb(dst + o, wc, state);
200      if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
201        *src += i;
202        return r;
203      }
204    } else {
205      // May not be enough space; use temp buffer.
206      r = wcrtomb(buf, wc, state);
207      if (r == __MB_ERR_ILLEGAL_SEQUENCE) {
208        *src += i;
209        return r;
210      }
211      if (r > len - o) {
212        break;
213      }
214      memcpy(dst + o, buf, r);
215    }
216  }
217  *src += i;
218  return o;
219}
220
// Converts the wide string at `*src` to at most `len` bytes of multibyte
// output.
//
// This is wcsnrtombs() with the input wide-character limit set to SIZE_MAX,
// i.e. with no practical bound on how many wide characters may be consumed;
// its result is returned unchanged.
size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) {
  const size_t input_wchar_limit = SIZE_MAX;
  return wcsnrtombs(dst, src, input_wchar_limit, len, ps);
}
224
// Locale-taking variant of wcscoll(). The locale argument is accepted but not
// consulted: the comparison is delegated to wcscoll() and its result is
// returned as-is.
int wcscoll_l(const wchar_t* ws1, const wchar_t* ws2, locale_t /* unused */) {
  const int ordering = wcscoll(ws1, ws2);
  return ordering;
}
228
// Locale-taking variant of wcsxfrm(). The locale argument is accepted but not
// consulted: the transformation is delegated to wcsxfrm() and its result is
// returned as-is.
size_t wcsxfrm_l(wchar_t* dest, const wchar_t* src, size_t n, locale_t /* unused */) {
  const size_t transformed_length = wcsxfrm(dest, src, n);
  return transformed_length;
}
232
// Locale-taking variant of wcstoll(). The locale argument is accepted but not
// consulted: parsing is delegated to wcstoll() and its result is returned
// as-is.
long long wcstoll_l(const wchar_t* nptr, wchar_t** endptr, int base, locale_t /* unused */) {
  const long long parsed = wcstoll(nptr, endptr, base);
  return parsed;
}
237
// Locale-taking variant of wcstoull(). The locale argument is accepted but
// not consulted: parsing is delegated to wcstoull() and its result is
// returned as-is.
unsigned long long wcstoull_l(const wchar_t* nptr, wchar_t** endptr, int base,
                              locale_t /* unused */) {
  const unsigned long long parsed = wcstoull(nptr, endptr, base);
  return parsed;
}
242
// Locale-taking variant of wcstold(). The locale argument is accepted but not
// consulted: parsing is delegated to wcstold() and its result is returned
// as-is.
long double wcstold_l(const wchar_t* nptr, wchar_t** endptr, locale_t /* unused */) {
  const long double parsed = wcstold(nptr, endptr);
  return parsed;
}
246