1328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----=== 2328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko// 3328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko// The LLVM Compiler Infrastructure 4328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko// 5328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko// This file is distributed under the University of Illinois Open Source 6328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko// License. See LICENSE.TXT for details. 7328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko// 8328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko//===----------------------------------------------------------------------===// 9328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko 10328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko#include "llvm/Support/ConvertUTF.h" 1106c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner#include "llvm/Support/SwapByteOrder.h" 1206c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner#include <string> 1306c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner#include <vector> 14328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko 15328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenkonamespace llvm { 16328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko 17328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenkobool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source, 18328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko char *&ResultPtr, const UTF8 *&ErrorPtr) { 19328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4); 20328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko ConversionResult result = conversionOK; 21328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko // Copy the character span over. 22328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko if (WideCharWidth == 1) { 23328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin()); 24328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) { 25328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko result = sourceIllegal; 26328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko ErrorPtr = Pos; 27328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko } else { 28328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko memcpy(ResultPtr, Source.data(), Source.size()); 29328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko ResultPtr += Source.size(); 30328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko } 31328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko } else if (WideCharWidth == 2) { 32328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko const UTF8 *sourceStart = (const UTF8*)Source.data(); 33328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko // FIXME: Make the type of the result buffer correct instead of 34328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko // using reinterpret_cast. 35328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr); 36328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko ConversionFlags flags = strictConversion; 37328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko result = ConvertUTF8toUTF16( 38328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko &sourceStart, sourceStart + Source.size(), 39328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko &targetStart, targetStart + 2*Source.size(), flags); 40328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko if (result == conversionOK) 41328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko ResultPtr = reinterpret_cast<char*>(targetStart); 42328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko else 43328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko ErrorPtr = sourceStart; 44328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko } else if (WideCharWidth == 4) { 45328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko const UTF8 *sourceStart = (const UTF8*)Source.data(); 46328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko // FIXME: Make the type of the result buffer correct instead of 47328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko // using reinterpret_cast. 48328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr); 49328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko ConversionFlags flags = strictConversion; 50328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko result = ConvertUTF8toUTF32( 51328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko &sourceStart, sourceStart + Source.size(), 52328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko &targetStart, targetStart + 4*Source.size(), flags); 53328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko if (result == conversionOK) 54328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko ResultPtr = reinterpret_cast<char*>(targetStart); 55328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko else 56328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko ErrorPtr = sourceStart; 57328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko } 58328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko assert((result != targetExhausted) 59328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko && "ConvertUTF8toUTFXX exhausted target buffer"); 60328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko return result == conversionOK; 61328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko} 62328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko 63328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenkobool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) { 64328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko const UTF32 *SourceStart = &Source; 65328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko const UTF32 *SourceEnd = SourceStart + 1; 66328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr); 67328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko UTF8 *TargetEnd = TargetStart + 4; 68328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd, 69328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko &TargetStart, TargetEnd, 70328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko strictConversion); 71328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko if (CR != conversionOK) 72328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko return false; 73328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko 74328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko ResultPtr = reinterpret_cast<char*>(TargetStart); 75328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko return true; 76328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko} 77328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko 7806c847e83e558f0cc6fea742498b2730eb6837c6Reid Klecknerbool hasUTF16ByteOrderMark(ArrayRef<char> S) { 7906c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner return (S.size() >= 2 && 8006c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner ((S[0] == '\xff' && S[1] == '\xfe') || 8106c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner (S[0] == '\xfe' && S[1] == '\xff'))); 8206c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner} 8306c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner 8406c847e83e558f0cc6fea742498b2730eb6837c6Reid Klecknerbool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) { 8506c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner assert(Out.empty()); 8606c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner 8706c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner // Error out on an uneven byte count. 8806c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner if (SrcBytes.size() % 2) 8906c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner return false; 9006c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner 9106c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner // Avoid OOB by returning early on empty input. 9206c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner if (SrcBytes.empty()) 9306c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner return true; 9406c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner 9506c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner const UTF16 *Src = reinterpret_cast<const UTF16 *>(SrcBytes.begin()); 9606c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner const UTF16 *SrcEnd = reinterpret_cast<const UTF16 *>(SrcBytes.end()); 9706c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner 9806c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner // Byteswap if necessary. 9906c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner std::vector<UTF16> ByteSwapped; 10006c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) { 10106c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd); 10206c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner for (unsigned I = 0, E = ByteSwapped.size(); I != E; ++I) 10306c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner ByteSwapped[I] = llvm::sys::SwapByteOrder_16(ByteSwapped[I]); 10406c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner Src = &ByteSwapped[0]; 10506c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1; 10606c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner } 10706c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner 10806c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner // Skip the BOM for conversion. 10906c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE) 11006c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner Src++; 11106c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner 11206c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner // Just allocate enough space up front. We'll shrink it later. 11306c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 11406c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]); 11506c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner UTF8 *DstEnd = Dst + Out.size(); 11606c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner 11706c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner ConversionResult CR = 11806c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion); 11906c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner assert(CR != targetExhausted); 12006c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner 12106c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner if (CR != conversionOK) { 12206c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner Out.clear(); 12306c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner return false; 12406c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner } 12506c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner 12606c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]); 12706c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner return true; 12806c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner} 12906c847e83e558f0cc6fea742498b2730eb6837c6Reid Kleckner 130328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko} // end namespace llvm 131328027bf269bb0c108bd8533908ccb36ba11e9f0Dmitri Gribenko 132