1328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko/*===--- ConvertUTF.h - Universal Character Names conversions ---------------===
2328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
3328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *                     The LLVM Compiler Infrastructure
4328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
5328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * This file is distributed under the University of Illinois Open Source
6328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * License. See LICENSE.TXT for details.
7328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
8328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *==------------------------------------------------------------------------==*/
9328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko/*
10328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * Copyright 2001-2004 Unicode, Inc.
11328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
12328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * Disclaimer
13328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
14328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * This source code is provided as is by Unicode, Inc. No claims are
15328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * made as to fitness for any particular purpose. No warranties of any
16328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * kind are expressed or implied. The recipient agrees to determine
17328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * applicability of information provided. If this file has been
18328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * purchased on magnetic or optical media from Unicode, Inc., the
19328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * sole remedy for any claim will be exchange of defective media
20328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * within 90 days of receipt.
21328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
22328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * Limitations on Rights to Redistribute This Code
23328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
24328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * Unicode, Inc. hereby grants the right to freely use the information
25328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * supplied in this file in the creation of products supporting the
26328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * Unicode Standard, and to make copies of this file in any form
27328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * for internal or external distribution as long as this notice
28328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * remains attached.
29328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko */
30328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
31328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko/* ---------------------------------------------------------------------
32328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
    Conversions between UTF-32, UTF-16, and UTF-8.  Header file.

    Several functions are included here, forming a complete set of
    conversions between the three formats.  UTF-7 is not included
    here, but is handled in a separate source file.
38328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
39328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    Each of these routines takes pointers to input buffers and output
40328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    buffers.  The input buffers are const.
41328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
42328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    Each routine converts the text between *sourceStart and sourceEnd,
43328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    putting the result into the buffer between *targetStart and
44328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    targetEnd. Note: the end pointers are *after* the last item: e.g.
45328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    *(sourceEnd - 1) is the last item.
46328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
47328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    The return result indicates whether the conversion was successful,
48328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    and if not, whether the problem was in the source or target buffers.
49328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    (Only the first encountered problem is indicated.)
50328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
51328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    After the conversion, *sourceStart and *targetStart are both
52328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    updated to point to the end of last text successfully converted in
53328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    the respective buffers.
54328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
55328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    Input parameters:
56328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko        sourceStart - pointer to a pointer to the source buffer.
57328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko                The contents of this are modified on return so that
58328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko                it points at the next thing to be converted.
59328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko        targetStart - similarly, pointer to pointer to the target buffer.
60328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko        sourceEnd, targetEnd - respectively pointers to the ends of the
61328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko                two buffers, for overflow checking only.
62328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
63328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    These conversion functions take a ConversionFlags argument. When this
64328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    flag is set to strict, both irregular sequences and isolated surrogates
65328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    will cause an error.  When the flag is set to lenient, both irregular
66328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    sequences and isolated surrogates are converted.
67328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
68328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    Whether the flag is strict or lenient, all illegal sequences will cause
69328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
70328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
71328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    must check for illegal sequences.
72328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
73328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    When the flag is set to lenient, characters over 0x10FFFF are converted
74328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    to the replacement character; otherwise (when the flag is set to strict)
75328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    they constitute an error.
76328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
77328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    Output parameters:
78328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko        The value "sourceIllegal" is returned from some routines if the input
79328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko        sequence is malformed.  When "sourceIllegal" is returned, the source
80328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko        value will point to the illegal value that caused the problem. E.g.,
81328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko        in UTF-8 when a sequence is malformed, it points to the start of the
82328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko        malformed sequence.
83328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
84328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    Author: Mark E. Davis, 1994.
85328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    Rev History: Rick McGowan, fixes & updates May 2001.
86328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko         Fixes & updates, Sept 2001.
87328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
88328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko------------------------------------------------------------------------ */
89328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
9006c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner#ifndef LLVM_SUPPORT_CONVERTUTF_H
9106c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner#define LLVM_SUPPORT_CONVERTUTF_H
92328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
93328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko/* ---------------------------------------------------------------------
94328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    The following 4 definitions are compiler-specific.
95328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    The C standard does not guarantee that wchar_t has at least
96328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    16 bits, so wchar_t is no less portable than unsigned short!
97328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    All should be unsigned values to avoid sign extension during
98328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    bit mask & shift operations.
99328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko------------------------------------------------------------------------ */
100328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
typedef unsigned int    UTF32;   /* at least 32 bits */
typedef unsigned short  UTF16;   /* at least 16 bits */
typedef unsigned char   UTF8;    /* typically 8 bits */
typedef unsigned char   Boolean; /* 0 or 1 */

/* Some fundamental constants.
   Every cast expression is wrapped in its own parentheses so that the macro
   expands to a single primary expression in any context (for example as the
   operand of sizeof, or next to a postfix operator). */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16 ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF)

/* Size, in bytes, that an output buffer needs in order to hold the UTF-8
   encoding of a single code point. */
#define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4

/* The UTF-16 byte order mark, and the value it has when its two bytes are
   swapped. */
#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE  0xFEFF
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
11706c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner
/* Status code returned by every conversion routine.  Only the first problem
   encountered is reported. */
typedef enum {
  /* The conversion completed successfully. */
  conversionOK = 0,
  /* The source ended in the middle of a character. */
  sourceExhausted = 1,
  /* The target buffer has insufficient room for the conversion. */
  targetExhausted = 2,
  /* The source sequence is illegal or malformed. */
  sourceIllegal = 3
} ConversionResult;
124328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
/* Strictness selector accepted by the conversion routines. */
typedef enum {
  /* Irregular sequences and isolated surrogates cause an error. */
  strictConversion = 0,
  /* Irregular sequences and isolated surrogates are converted. */
  lenientConversion = 1
} ConversionFlags;
129328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
130328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko/* This is for C++ and does no harm in C */
131328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko#ifdef __cplusplus
132328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenkoextern "C" {
133328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko#endif
134328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
135328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri GribenkoConversionResult ConvertUTF8toUTF16 (
136328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  const UTF8** sourceStart, const UTF8* sourceEnd,
137328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
138328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
139cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen Hines/**
140cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen Hines * Convert a partial UTF8 sequence to UTF32.  If the sequence ends in an
141cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen Hines * incomplete code unit sequence, returns \c sourceExhausted.
142cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen Hines */
143cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen HinesConversionResult ConvertUTF8toUTF32Partial(
144cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen Hines  const UTF8** sourceStart, const UTF8* sourceEnd,
145cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen Hines  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
146cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen Hines
147cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen Hines/**
148cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen Hines * Convert a partial UTF8 sequence to UTF32.  If the sequence ends in an
149cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen Hines * incomplete code unit sequence, returns \c sourceIllegal.
150cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen Hines */
151cd81d94322a39503e4a3e87b6ee03d4fcb3465fbStephen HinesConversionResult ConvertUTF8toUTF32(
152328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  const UTF8** sourceStart, const UTF8* sourceEnd,
153328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
154328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
155328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri GribenkoConversionResult ConvertUTF16toUTF8 (
156328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  const UTF16** sourceStart, const UTF16* sourceEnd,
157328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
158328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
159328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri GribenkoConversionResult ConvertUTF32toUTF8 (
160328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  const UTF32** sourceStart, const UTF32* sourceEnd,
161328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
162328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
163328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri GribenkoConversionResult ConvertUTF16toUTF32 (
164328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  const UTF16** sourceStart, const UTF16* sourceEnd,
165328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
166328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
167328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri GribenkoConversionResult ConvertUTF32toUTF16 (
168328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  const UTF32** sourceStart, const UTF32* sourceEnd,
169328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
170328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
171328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri GribenkoBoolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
172328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
173328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri GribenkoBoolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
174328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
175328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenkounsigned getNumBytesForUTF8(UTF8 firstByte);
176328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
177328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko#ifdef __cplusplus
178328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko}
179328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
180328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko/*************************************************************************/
181328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko/* Below are LLVM-specific wrappers of the functions above. */
182328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include <cstddef>
#include <string>
185328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
186328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenkonamespace llvm {
187328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
188328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko/**
189328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
190328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * WideCharWidth. The converted data is written to ResultPtr, which needs to
191328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * point to at least WideCharWidth * (Source.Size() + 1) bytes. On success,
192328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * ResultPtr will point one after the end of the copied string. On failure,
193328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * ResultPtr will not be changed, and ErrorPtr will be set to the location of
194328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * the first character which could not be converted.
195328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * \return true on success.
196328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko */
197328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenkobool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
198328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko                       char *&ResultPtr, const UTF8 *&ErrorPtr);
199328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
200328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko/**
201328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * Convert an Unicode code point to UTF8 sequence.
202328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
203328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * \param Source a Unicode code point.
204328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least
205328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes.  On success \c ResultPtr is
206328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * updated one past end of the converted sequence.
207328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
208328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * \returns true on success.
209328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko */
210328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenkobool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);
211328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
212328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko/**
213328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * Convert the first UTF8 sequence in the given source buffer to a UTF32
214328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * code point.
215328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
216328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * \param [in,out] source A pointer to the source buffer. If the conversion
217328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * succeeds, this pointer will be updated to point to the byte just past the
218328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * end of the converted sequence.
219328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * \param sourceEnd A pointer just past the end of the source buffer.
220328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * \param [out] target The converted code
221328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * \param flags Whether the conversion is strict or lenient.
222328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
223328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * \returns conversionOK on success
224328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko *
225328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko * \sa ConvertUTF8toUTF32
226328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko */
227328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenkostatic inline ConversionResult convertUTF8Sequence(const UTF8 **source,
228328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko                                                   const UTF8 *sourceEnd,
229328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko                                                   UTF32 *target,
230328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko                                                   ConversionFlags flags) {
231328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  if (*source == sourceEnd)
232328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    return sourceExhausted;
233328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  unsigned size = getNumBytesForUTF8(**source);
234328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  if ((ptrdiff_t)size > sourceEnd - *source)
235328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko    return sourceExhausted;
236328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko  return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
237328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko}
23806c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner
23906c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner/**
24006c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner * Returns true if a blob of text starts with a UTF-16 big or little endian byte
24106c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner * order mark.
24206c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner */
24306c847e83e558f0cc6fea742498b2730eb6837c6Reid Klecknerbool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
24406c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner
24506c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner/**
24606c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
24706c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner *
24806c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
24906c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner * \param [out] Out Converted UTF-8 is stored here on success.
25006c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner * \returns true on success
25106c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner */
25206c847e83e558f0cc6fea742498b2730eb6837c6Reid Klecknerbool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
25306c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner
254ff91f2ef47cd16adffaa16ee83c45f6c3933a415Alex Rosenberg} /* end namespace llvm */
255328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
256328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko#endif
257328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
258328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko/* --------------------------------------------------------------------- */
259328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko
260328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko#endif
261