1/* $OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */ 2 3/*- 4 * Copyright (c) 2002-2004 Tim J. Robbins 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29#include <errno.h> 30#include <sys/param.h> 31#include <string.h> 32#include <wchar.h> 33#include <uchar.h> 34 35#include "private/bionic_mbstate.h" 36 37// 38// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a 39// 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where 40// mbstate_t was only 4 bytes. 41// 42// The state is the UTF-8 sequence. We only support <= 4-bytes sequences so LP32 43// mbstate_t already has enough space (out of the 4 available bytes we only 44// need 3 since we should never need to store the entire sequence in the 45// intermediary state). 46// 47// The C standard leaves the conversion state undefined after a bad conversion. 48// To avoid unexpected failures due to the possible use of the internal private 49// state we always reset the conversion state when encountering illegal 50// sequences. 51// 52// We also implement the POSIX interface directly rather than being accessed via 53// function pointers. 54// 55 56int mbsinit(const mbstate_t* ps) { 57 return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0)); 58} 59 60size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) { 61 static mbstate_t __private_state; 62 mbstate_t* state = (ps == NULL) ? &__private_state : ps; 63 64 // Our wchar_t is UTF-32 65 return mbrtoc32(reinterpret_cast<char32_t*>(pwc), s, n, state); 66} 67 68size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) { 69 static mbstate_t __private_state; 70 mbstate_t* state = (ps == NULL) ? &__private_state : ps; 71 size_t i, o, r; 72 73 if (dst == NULL) { 74 /* 75 * The fast path in the loop below is not safe if an ASCII 76 * character appears as anything but the first byte of a 77 * multibyte sequence. Check now to avoid doing it in the loop. 78 */ 79 if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0) 80 && (static_cast<uint8_t>((*src)[0]) < 0x80)) { 81 return reset_and_return_illegal(EILSEQ, state); 82 } 83 for (i = o = 0; i < nmc; i += r, o++) { 84 if (static_cast<uint8_t>((*src)[i]) < 0x80) { 85 // Fast path for plain ASCII characters. 86 if ((*src)[i] == '\0') { 87 *src = nullptr; 88 return reset_and_return(o, state); 89 } 90 r = 1; 91 } else { 92 r = mbrtowc(NULL, *src + i, nmc - i, state); 93 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 94 return reset_and_return_illegal(EILSEQ, state); 95 } 96 if (r == __MB_ERR_INCOMPLETE_SEQUENCE) { 97 return reset_and_return_illegal(EILSEQ, state); 98 } 99 if (r == 0) { 100 *src = nullptr; 101 return reset_and_return(o, state); 102 } 103 } 104 } 105 return reset_and_return(o, state); 106 } 107 108 /* 109 * The fast path in the loop below is not safe if an ASCII 110 * character appears as anything but the first byte of a 111 * multibyte sequence. Check now to avoid doing it in the loop. 112 */ 113 if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0) 114 && (static_cast<uint8_t>((*src)[0]) < 0x80)) { 115 return reset_and_return_illegal(EILSEQ, state); 116 } 117 for (i = o = 0; i < nmc && o < len; i += r, o++) { 118 if (static_cast<uint8_t>((*src)[i]) < 0x80) { 119 // Fast path for plain ASCII characters. 120 dst[o] = (*src)[i]; 121 r = 1; 122 if ((*src)[i] == '\0') { 123 *src = nullptr; 124 return reset_and_return(o, state); 125 } 126 } else { 127 r = mbrtowc(dst + o, *src + i, nmc - i, state); 128 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 129 *src += i; 130 return reset_and_return_illegal(EILSEQ, state); 131 } 132 if (r == __MB_ERR_INCOMPLETE_SEQUENCE) { 133 *src += nmc; 134 return reset_and_return(EILSEQ, state); 135 } 136 if (r == 0) { 137 *src = NULL; 138 return reset_and_return(o, state); 139 } 140 } 141 } 142 *src += i; 143 return reset_and_return(o, state); 144} 145 146size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) { 147 return mbsnrtowcs(dst, src, SIZE_MAX, len, ps); 148} 149 150size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) { 151 static mbstate_t __private_state; 152 mbstate_t* state = (ps == NULL) ? &__private_state : ps; 153 154 // Our wchar_t is UTF-32 155 return c32rtomb(s, static_cast<char32_t>(wc), state); 156} 157 158size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) { 159 static mbstate_t __private_state; 160 mbstate_t* state = (ps == NULL) ? &__private_state : ps; 161 162 if (!mbsinit(state)) { 163 return reset_and_return_illegal(EILSEQ, state); 164 } 165 166 char buf[MB_LEN_MAX]; 167 size_t i, o, r; 168 if (dst == NULL) { 169 for (i = o = 0; i < nwc; i++, o += r) { 170 wchar_t wc = (*src)[i]; 171 if (static_cast<uint32_t>(wc) < 0x80) { 172 // Fast path for plain ASCII characters. 173 if (wc == 0) { 174 return o; 175 } 176 r = 1; 177 } else { 178 r = wcrtomb(buf, wc, state); 179 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 180 return r; 181 } 182 } 183 } 184 return o; 185 } 186 187 for (i = o = 0; i < nwc && o < len; i++, o += r) { 188 wchar_t wc = (*src)[i]; 189 if (static_cast<uint32_t>(wc) < 0x80) { 190 // Fast path for plain ASCII characters. 191 dst[o] = wc; 192 if (wc == 0) { 193 *src = NULL; 194 return o; 195 } 196 r = 1; 197 } else if (len - o >= sizeof(buf)) { 198 // Enough space to translate in-place. 199 r = wcrtomb(dst + o, wc, state); 200 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 201 *src += i; 202 return r; 203 } 204 } else { 205 // May not be enough space; use temp buffer. 206 r = wcrtomb(buf, wc, state); 207 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 208 *src += i; 209 return r; 210 } 211 if (r > len - o) { 212 break; 213 } 214 memcpy(dst + o, buf, r); 215 } 216 } 217 *src += i; 218 return o; 219} 220 221size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) { 222 return wcsnrtombs(dst, src, SIZE_MAX, len, ps); 223} 224 225int wcscoll_l(const wchar_t *ws1, const wchar_t *ws2, locale_t) { 226 return wcscoll(ws1, ws2); 227} 228 229size_t wcsxfrm_l(wchar_t *dest, const wchar_t *src, size_t n, locale_t) { 230 return wcsxfrm(dest, src, n); 231} 232 233long long wcstoll_l(const wchar_t *nptr, wchar_t **endptr, int base, 234 locale_t) { 235 return wcstoll(nptr, endptr, base); 236} 237 238unsigned long long wcstoull_l(const wchar_t *nptr, wchar_t **endptr, 239 int base, locale_t) { 240 return wcstoull(nptr, endptr, base); 241} 242 243long double wcstold_l(const wchar_t *nptr, wchar_t **endptr, locale_t) { 244 return wcstold(nptr, endptr); 245} 246