//===--- Encoding.h - Format C++ code -------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
/// 8-bit encodings and escape sequences in C++ string literals.
1300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// 1400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko//===----------------------------------------------------------------------===// 1500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 1600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#ifndef LLVM_CLANG_FORMAT_ENCODING_H 1700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#define LLVM_CLANG_FORMAT_ENCODING_H 1800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 1900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#include "clang/Basic/LLVM.h" 2000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#include "llvm/Support/ConvertUTF.h" 210b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko#include "llvm/Support/Unicode.h" 2200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 2300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkonamespace clang { 2400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkonamespace format { 2500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkonamespace encoding { 2600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 2700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoenum Encoding { 2800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko Encoding_UTF8, 2900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko Encoding_Unknown // We treat all other encodings as 8-bit encodings. 3000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko}; 3100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 3200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8, 3300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// it is considered UTF8, otherwise we treat it as some 8-bit encoding. 
3400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline Encoding detectEncoding(StringRef Text) { 3500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin()); 3600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end()); 3700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko if (::isLegalUTF8String(&Ptr, BufEnd)) 3800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko return Encoding_UTF8; 3900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko return Encoding_Unknown; 4000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} 4100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 4200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getCodePointCountUTF8(StringRef Text) { 4300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko unsigned CodePoints = 0; 4400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) { 4500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko ++CodePoints; 4600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko } 4700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko return CodePoints; 4800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} 4900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 5000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Gets the number of code points in the Text using the specified 5100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// Encoding. 
5200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getCodePointCount(StringRef Text, Encoding Encoding) { 5300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko switch (Encoding) { 542a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper case Encoding_UTF8: 552a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper return getCodePointCountUTF8(Text); 562a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper default: 572a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper return Text.size(); 5800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko } 5900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} 6000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 610b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// \brief Returns the number of columns required to display the \p Text on a 620b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// generic Unicode-capable terminal. Text is assumed to use the specified 630b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// \p Encoding. 640b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienkoinline unsigned columnWidth(StringRef Text, Encoding Encoding) { 650b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko if (Encoding == Encoding_UTF8) { 660b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text); 67651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines // FIXME: Figure out the correct way to handle this in the presence of both 68651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines // printable and unprintable multi-byte UTF-8 characters. Falling back to 69651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines // returning the number of bytes may cause problems, as columnWidth suddenly 70651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines // becomes non-additive. 
710b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko if (ContentWidth >= 0) 720b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko return ContentWidth; 730b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko } 740b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko return Text.size(); 750b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko} 760b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko 770b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// \brief Returns the number of columns required to display the \p Text, 780b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// starting from the \p StartColumn on a terminal with the \p TabWidth. The 790b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// text is assumed to use the specified \p Encoding. 800b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienkoinline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, 810b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko unsigned TabWidth, Encoding Encoding) { 820b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko unsigned TotalWidth = 0; 830b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko StringRef Tail = Text; 840b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko for (;;) { 850b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko StringRef::size_type TabPos = Tail.find('\t'); 860b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko if (TabPos == StringRef::npos) 870b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko return TotalWidth + columnWidth(Tail, Encoding); 88651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding); 890b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth; 900b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko Tail = Tail.substr(TabPos + 1); 
910b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko } 920b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko} 930b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko 9400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Gets the number of bytes in a sequence representing a single 9500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// codepoint and starting with FirstChar in the specified Encoding. 9600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) { 9700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko switch (Encoding) { 982a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper case Encoding_UTF8: 992a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper return getNumBytesForUTF8(FirstChar); 1002a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper default: 1012a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper return 1; 10200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko } 10300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} 10400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 1052a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasperinline bool isOctDigit(char c) { return '0' <= c && c <= '7'; } 10600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 10700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline bool isHexDigit(char c) { 10800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || 10900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko ('A' <= c && c <= 'F'); 11000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} 11100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 11200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Gets the length of an escape sequence inside a C++ string literal. 
11300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// Text should span from the beginning of the escape sequence (starting with a 11400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// backslash) to the end of the string literal. 11500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getEscapeSequenceLength(StringRef Text) { 11600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko assert(Text[0] == '\\'); 11700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko if (Text.size() < 2) 11800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko return 1; 11900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 12000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko switch (Text[1]) { 12100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko case 'u': 12200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko return 6; 12300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko case 'U': 12400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko return 10; 12500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko case 'x': { 12600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko unsigned I = 2; // Point after '\x'. 
12700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko while (I < Text.size() && isHexDigit(Text[I])) 12800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko ++I; 12900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko return I; 13000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko } 13100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko default: 13200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko if (isOctDigit(Text[1])) { 13300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko unsigned I = 1; 13400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko while (I < Text.size() && I < 4 && isOctDigit(Text[I])) 13500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko ++I; 13600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko return I; 13700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko } 13800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko return 2; 13900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko } 14000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} 14100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 14200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} // namespace encoding 14300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} // namespace format 14400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} // namespace clang 14500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko 14600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#endif // LLVM_CLANG_FORMAT_ENCODING_H 147