// wchar.cpp revision 7a7f9952c12b216fbf91fc4cdbb97045e8861115
1/* $OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */ 2 3/*- 4 * Copyright (c) 2002-2004 Tim J. Robbins 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29#include <errno.h> 30#include <sys/param.h> 31#include <string.h> 32#include <wchar.h> 33#include <uchar.h> 34 35#include "private/bionic_mbstate.h" 36 37// 38// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a 39// 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where 40// mbstate_t was only 4 bytes. 41// 42// The state is the UTF-8 sequence. 
We only support <= 4-bytes sequences so LP32 43// mbstate_t already has enough space (out of the 4 available bytes we only 44// need 3 since we should never need to store the entire sequence in the 45// intermediary state). 46// 47// The C standard leaves the conversion state undefined after a bad conversion. 48// To avoid unexpected failures due to the possible use of the internal private 49// state we always reset the conversion state when encountering illegal 50// sequences. 51// 52// We also implement the POSIX interface directly rather than being accessed via 53// function pointers. 54// 55 56int mbsinit(const mbstate_t* ps) { 57 return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0)); 58} 59 60size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) { 61 static mbstate_t __private_state; 62 mbstate_t* state = (ps == NULL) ? &__private_state : ps; 63 64 // Our wchar_t is UTF-32 65 return mbrtoc32(reinterpret_cast<char32_t*>(pwc), s, n, state); 66} 67 68size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) { 69 static mbstate_t __private_state; 70 mbstate_t* state = (ps == NULL) ? &__private_state : ps; 71 size_t i, o, r; 72 73 if (dst == NULL) { 74 /* 75 * The fast path in the loop below is not safe if an ASCII 76 * character appears as anything but the first byte of a 77 * multibyte sequence. Check now to avoid doing it in the loop. 78 */ 79 if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0) 80 && (static_cast<uint8_t>((*src)[0]) < 0x80)) { 81 return reset_and_return_illegal(EILSEQ, state); 82 } 83 for (i = o = 0; i < nmc; i += r, o++) { 84 if (static_cast<uint8_t>((*src)[i]) < 0x80) { 85 // Fast path for plain ASCII characters. 
86 if ((*src)[i] == '\0') { 87 return reset_and_return(o, state); 88 } 89 r = 1; 90 } else { 91 r = mbrtowc(NULL, *src + i, nmc - i, state); 92 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 93 return reset_and_return_illegal(EILSEQ, state); 94 } 95 if (r == __MB_ERR_INCOMPLETE_SEQUENCE) { 96 return reset_and_return_illegal(EILSEQ, state); 97 } 98 if (r == 0) { 99 return reset_and_return(o, state); 100 } 101 } 102 } 103 return reset_and_return(o, state); 104 } 105 106 /* 107 * The fast path in the loop below is not safe if an ASCII 108 * character appears as anything but the first byte of a 109 * multibyte sequence. Check now to avoid doing it in the loop. 110 */ 111 if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0) 112 && (static_cast<uint8_t>((*src)[0]) < 0x80)) { 113 return reset_and_return_illegal(EILSEQ, state); 114 } 115 for (i = o = 0; i < nmc && o < len; i += r, o++) { 116 if (static_cast<uint8_t>((*src)[i]) < 0x80) { 117 // Fast path for plain ASCII characters. 118 dst[o] = (*src)[i]; 119 if ((*src)[i] == '\0') { 120 *src = NULL; 121 return reset_and_return_illegal(EILSEQ, state); 122 } 123 r = 1; 124 } else { 125 r = mbrtowc(dst + o, *src + i, nmc - i, state); 126 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 127 *src += i; 128 return reset_and_return_illegal(EILSEQ, state); 129 } 130 if (r == __MB_ERR_INCOMPLETE_SEQUENCE) { 131 *src += nmc; 132 return reset_and_return(EILSEQ, state); 133 } 134 if (r == 0) { 135 *src = NULL; 136 return reset_and_return(o, state); 137 } 138 } 139 } 140 *src += i; 141 return reset_and_return(o, state); 142} 143 144size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) { 145 return mbsnrtowcs(dst, src, SIZE_MAX, len, ps); 146} 147 148size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) { 149 static mbstate_t __private_state; 150 mbstate_t* state = (ps == NULL) ? 
&__private_state : ps; 151 152 // Our wchar_t is UTF-32 153 return c32rtomb(s, static_cast<char32_t>(wc), state); 154} 155 156size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) { 157 static mbstate_t __private_state; 158 mbstate_t* state = (ps == NULL) ? &__private_state : ps; 159 160 if (!mbsinit(state)) { 161 return reset_and_return_illegal(EILSEQ, state); 162 } 163 164 char buf[MB_LEN_MAX]; 165 size_t i, o, r; 166 if (dst == NULL) { 167 for (i = o = 0; i < nwc; i++, o += r) { 168 wchar_t wc = (*src)[i]; 169 if (static_cast<uint32_t>(wc) < 0x80) { 170 // Fast path for plain ASCII characters. 171 if (wc == 0) { 172 return o; 173 } 174 r = 1; 175 } else { 176 r = wcrtomb(buf, wc, state); 177 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 178 return r; 179 } 180 } 181 } 182 return o; 183 } 184 185 for (i = o = 0; i < nwc && o < len; i++, o += r) { 186 wchar_t wc = (*src)[i]; 187 if (static_cast<uint32_t>(wc) < 0x80) { 188 // Fast path for plain ASCII characters. 189 dst[o] = wc; 190 if (wc == 0) { 191 *src = NULL; 192 return o; 193 } 194 r = 1; 195 } else if (len - o >= sizeof(buf)) { 196 // Enough space to translate in-place. 197 r = wcrtomb(dst + o, wc, state); 198 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 199 *src += i; 200 return r; 201 } 202 } else { 203 // May not be enough space; use temp buffer. 204 r = wcrtomb(buf, wc, state); 205 if (r == __MB_ERR_ILLEGAL_SEQUENCE) { 206 *src += i; 207 return r; 208 } 209 if (r > len - o) { 210 break; 211 } 212 memcpy(dst + o, buf, r); 213 } 214 } 215 *src += i; 216 return o; 217} 218 219size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) { 220 return wcsnrtombs(dst, src, SIZE_MAX, len, ps); 221} 222