1//===--- Encoding.h - Format C++ code -------------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9/// 10/// \file 11/// \brief Contains functions for text encoding manipulation. Supports UTF-8, 12/// 8-bit encodings and escape sequences in C++ string literals. 13/// 14//===----------------------------------------------------------------------===// 15 16#ifndef LLVM_CLANG_FORMAT_ENCODING_H 17#define LLVM_CLANG_FORMAT_ENCODING_H 18 19#include "clang/Basic/LLVM.h" 20#include "llvm/Support/ConvertUTF.h" 21 22namespace clang { 23namespace format { 24namespace encoding { 25 26enum Encoding { 27 Encoding_UTF8, 28 Encoding_Unknown // We treat all other encodings as 8-bit encodings. 29}; 30 31/// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8, 32/// it is considered UTF8, otherwise we treat it as some 8-bit encoding. 33inline Encoding detectEncoding(StringRef Text) { 34 const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin()); 35 const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end()); 36 if (::isLegalUTF8String(&Ptr, BufEnd)) 37 return Encoding_UTF8; 38 return Encoding_Unknown; 39} 40 41inline unsigned getCodePointCountUTF8(StringRef Text) { 42 unsigned CodePoints = 0; 43 for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) { 44 ++CodePoints; 45 } 46 return CodePoints; 47} 48 49/// \brief Gets the number of code points in the Text using the specified 50/// Encoding. 51inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) { 52 switch (Encoding) { 53 case Encoding_UTF8: 54 return getCodePointCountUTF8(Text); 55 default: 56 return Text.size(); 57 } 58} 59 60/// \brief Gets the number of bytes in a sequence representing a single 61/// codepoint and starting with FirstChar in the specified Encoding. 62inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) { 63 switch (Encoding) { 64 case Encoding_UTF8: 65 return getNumBytesForUTF8(FirstChar); 66 default: 67 return 1; 68 } 69} 70 71inline bool isOctDigit(char c) { return '0' <= c && c <= '7'; } 72 73inline bool isHexDigit(char c) { 74 return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || 75 ('A' <= c && c <= 'F'); 76} 77 78/// \brief Gets the length of an escape sequence inside a C++ string literal. 79/// Text should span from the beginning of the escape sequence (starting with a 80/// backslash) to the end of the string literal. 81inline unsigned getEscapeSequenceLength(StringRef Text) { 82 assert(Text[0] == '\\'); 83 if (Text.size() < 2) 84 return 1; 85 86 switch (Text[1]) { 87 case 'u': 88 return 6; 89 case 'U': 90 return 10; 91 case 'x': { 92 unsigned I = 2; // Point after '\x'. 93 while (I < Text.size() && isHexDigit(Text[I])) 94 ++I; 95 return I; 96 } 97 default: 98 if (isOctDigit(Text[1])) { 99 unsigned I = 1; 100 while (I < Text.size() && I < 4 && isOctDigit(Text[I])) 101 ++I; 102 return I; 103 } 104 return 2; 105 } 106} 107 108} // namespace encoding 109} // namespace format 110} // namespace clang 111 112#endif // LLVM_CLANG_FORMAT_ENCODING_H 113