100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko//===--- Encoding.h - Format C++ code -------------------------------------===//
200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko//
300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko//                     The LLVM Compiler Infrastructure
400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko//
500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko// This file is distributed under the University of Illinois Open Source
600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko// License. See LICENSE.TXT for details.
700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko//
800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko//===----------------------------------------------------------------------===//
900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko///
1000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \file
1100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
1200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// 8-bit encodings and escape sequences in C++ string literals.
1300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko///
1400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko//===----------------------------------------------------------------------===//
1500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
16176edba5311f6eff0cad2631449885ddf4fbc9eaStephen Hines#ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
17176edba5311f6eff0cad2631449885ddf4fbc9eaStephen Hines#define LLVM_CLANG_LIB_FORMAT_ENCODING_H
1800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
1900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#include "clang/Basic/LLVM.h"
204967a710c84587c654b56c828382219c3937dacbPirama Arumuga Nainar#include "llvm/ADT/StringRef.h"
2100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#include "llvm/Support/ConvertUTF.h"
220b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko#include "llvm/Support/Unicode.h"
2300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
2400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkonamespace clang {
2500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkonamespace format {
2600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkonamespace encoding {
2700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
/// \brief Text encodings distinguished by the formatter.
enum Encoding {
  /// The text is treated as UTF-8.
  Encoding_UTF8 = 0,
  /// Any other encoding; all of them are treated as 8-bit encodings.
  Encoding_Unknown = 1
};
3200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
3300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8,
3400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
3500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline Encoding detectEncoding(StringRef Text) {
3600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin());
3700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end());
3800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  if (::isLegalUTF8String(&Ptr, BufEnd))
3900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    return Encoding_UTF8;
4000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  return Encoding_Unknown;
4100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko}
4200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
4300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getCodePointCountUTF8(StringRef Text) {
4400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  unsigned CodePoints = 0;
4500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) {
4600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    ++CodePoints;
4700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  }
4800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  return CodePoints;
4900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko}
5000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
5100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Gets the number of code points in the Text using the specified
5200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// Encoding.
5300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
5400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  switch (Encoding) {
552a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper  case Encoding_UTF8:
562a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper    return getCodePointCountUTF8(Text);
572a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper  default:
582a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper    return Text.size();
5900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  }
6000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko}
6100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
620b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// \brief Returns the number of columns required to display the \p Text on a
630b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// generic Unicode-capable terminal. Text is assumed to use the specified
640b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// \p Encoding.
650b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienkoinline unsigned columnWidth(StringRef Text, Encoding Encoding) {
660b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  if (Encoding == Encoding_UTF8) {
670b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
68651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    // FIXME: Figure out the correct way to handle this in the presence of both
69651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    // printable and unprintable multi-byte UTF-8 characters. Falling back to
70651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    // returning the number of bytes may cause problems, as columnWidth suddenly
71651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    // becomes non-additive.
720b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    if (ContentWidth >= 0)
730b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko      return ContentWidth;
740b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  }
750b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  return Text.size();
760b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko}
770b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko
780b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// \brief Returns the number of columns required to display the \p Text,
790b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// starting from the \p StartColumn on a terminal with the \p TabWidth. The
800b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// text is assumed to use the specified \p Encoding.
810b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienkoinline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
820b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko                                    unsigned TabWidth, Encoding Encoding) {
830b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  unsigned TotalWidth = 0;
840b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  StringRef Tail = Text;
850b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  for (;;) {
860b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    StringRef::size_type TabPos = Tail.find('\t');
870b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    if (TabPos == StringRef::npos)
880b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko      return TotalWidth + columnWidth(Tail, Encoding);
89651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
900b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
910b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    Tail = Tail.substr(TabPos + 1);
920b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  }
930b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko}
940b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko
9500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Gets the number of bytes in a sequence representing a single
9600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// codepoint and starting with FirstChar in the specified Encoding.
9700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
9800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  switch (Encoding) {
992a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper  case Encoding_UTF8:
1002a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper    return getNumBytesForUTF8(FirstChar);
1012a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper  default:
1022a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper    return 1;
10300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  }
10400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko}
10500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
/// \brief Returns true if \p c is one of the octal digits '0' through '7'.
inline bool isOctDigit(char c) {
  return c >= '0' && c <= '7';
}
10700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
/// \brief Returns true if \p c is a hexadecimal digit: '0'-'9', 'a'-'f' or
/// 'A'-'F'.
inline bool isHexDigit(char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'a' && c <= 'f')
    return true;
  return c >= 'A' && c <= 'F';
}
11200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
11300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Gets the length of an escape sequence inside a C++ string literal.
11400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// Text should span from the beginning of the escape sequence (starting with a
11500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// backslash) to the end of the string literal.
11600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getEscapeSequenceLength(StringRef Text) {
11700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  assert(Text[0] == '\\');
11800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  if (Text.size() < 2)
11900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    return 1;
12000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
12100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  switch (Text[1]) {
12200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  case 'u':
12300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    return 6;
12400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  case 'U':
12500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    return 10;
12600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  case 'x': {
12700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    unsigned I = 2; // Point after '\x'.
12800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    while (I < Text.size() && isHexDigit(Text[I]))
12900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko      ++I;
13000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    return I;
13100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  }
13200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  default:
13300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    if (isOctDigit(Text[1])) {
13400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko      unsigned I = 1;
13500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko      while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
13600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko        ++I;
13700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko      return I;
13800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    }
13987d948ecccffea9e9e37d0d053b246e2d6d6c47bPirama Arumuga Nainar    return 1 + getNumBytesForUTF8(Text[1]);
14000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  }
14100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko}
14200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
14300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} // namespace encoding
14400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} // namespace format
14500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} // namespace clang
14600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
147176edba5311f6eff0cad2631449885ddf4fbc9eaStephen Hines#endif
148