//===--- Encoding.h - Format C++ code -------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Contains functions for text encoding manipulation. Supports UTF-8,
/// 8-bit encodings and escape sequences in C++ string literals.
///
//===----------------------------------------------------------------------===//
1500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
1600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#ifndef LLVM_CLANG_FORMAT_ENCODING_H
1700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#define LLVM_CLANG_FORMAT_ENCODING_H
1800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
1900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#include "clang/Basic/LLVM.h"
2000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#include "llvm/Support/ConvertUTF.h"
210b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko#include "llvm/Support/Unicode.h"
2200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
2300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkonamespace clang {
2400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkonamespace format {
2500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkonamespace encoding {
2600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
/// \brief Text encodings distinguished by the formatter.
enum Encoding {
  /// The text is treated as UTF-8.
  Encoding_UTF8,
  /// Any other encoding; all of them are treated as 8-bit encodings.
  Encoding_Unknown
};
3100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
3200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8,
3300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
3400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline Encoding detectEncoding(StringRef Text) {
3500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin());
3600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end());
3700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  if (::isLegalUTF8String(&Ptr, BufEnd))
3800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    return Encoding_UTF8;
3900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  return Encoding_Unknown;
4000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko}
4100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
4200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getCodePointCountUTF8(StringRef Text) {
4300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  unsigned CodePoints = 0;
4400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) {
4500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    ++CodePoints;
4600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  }
4700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  return CodePoints;
4800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko}
4900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
5000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Gets the number of code points in the Text using the specified
5100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// Encoding.
5200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
5300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  switch (Encoding) {
542a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper  case Encoding_UTF8:
552a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper    return getCodePointCountUTF8(Text);
562a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper  default:
572a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper    return Text.size();
5800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  }
5900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko}
6000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
610b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// \brief Returns the number of columns required to display the \p Text on a
620b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// generic Unicode-capable terminal. Text is assumed to use the specified
630b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// \p Encoding.
640b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienkoinline unsigned columnWidth(StringRef Text, Encoding Encoding) {
650b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  if (Encoding == Encoding_UTF8) {
660b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
67651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    // FIXME: Figure out the correct way to handle this in the presence of both
68651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    // printable and unprintable multi-byte UTF-8 characters. Falling back to
69651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    // returning the number of bytes may cause problems, as columnWidth suddenly
70651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    // becomes non-additive.
710b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    if (ContentWidth >= 0)
720b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko      return ContentWidth;
730b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  }
740b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  return Text.size();
750b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko}
760b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko
770b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// \brief Returns the number of columns required to display the \p Text,
780b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// starting from the \p StartColumn on a terminal with the \p TabWidth. The
790b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko/// text is assumed to use the specified \p Encoding.
800b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienkoinline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
810b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko                                    unsigned TabWidth, Encoding Encoding) {
820b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  unsigned TotalWidth = 0;
830b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  StringRef Tail = Text;
840b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  for (;;) {
850b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    StringRef::size_type TabPos = Tail.find('\t');
860b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    if (TabPos == StringRef::npos)
870b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko      return TotalWidth + columnWidth(Tail, Encoding);
88651f13cea278ec967336033dd032faef0e9fc2ecStephen Hines    TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
890b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
900b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko    Tail = Tail.substr(TabPos + 1);
910b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko  }
920b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko}
930b62cc30c9aa462184de0435dc083d944a41d67fAlexander Kornienko
9400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Gets the number of bytes in a sequence representing a single
9500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// codepoint and starting with FirstChar in the specified Encoding.
9600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
9700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  switch (Encoding) {
982a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper  case Encoding_UTF8:
992a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper    return getNumBytesForUTF8(FirstChar);
1002a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper  default:
1012a409b62126d8f0b8f5749d5ed435ad2b394b526Daniel Jasper    return 1;
10200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  }
10300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko}
10400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
/// \brief Returns true if \p c is an octal digit ('0' through '7').
inline bool isOctDigit(char c) { return c >= '0' && c <= '7'; }
10600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
/// \brief Returns true if \p c is a hexadecimal digit: '0'-'9', 'a'-'f' or
/// 'A'-'F'.
inline bool isHexDigit(char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'a' && c <= 'f')
    return true;
  return c >= 'A' && c <= 'F';
}
11100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
11200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// \brief Gets the length of an escape sequence inside a C++ string literal.
11300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// Text should span from the beginning of the escape sequence (starting with a
11400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko/// backslash) to the end of the string literal.
11500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienkoinline unsigned getEscapeSequenceLength(StringRef Text) {
11600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  assert(Text[0] == '\\');
11700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  if (Text.size() < 2)
11800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    return 1;
11900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
12000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  switch (Text[1]) {
12100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  case 'u':
12200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    return 6;
12300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  case 'U':
12400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    return 10;
12500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  case 'x': {
12600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    unsigned I = 2; // Point after '\x'.
12700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    while (I < Text.size() && isHexDigit(Text[I]))
12800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko      ++I;
12900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    return I;
13000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  }
13100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  default:
13200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    if (isOctDigit(Text[1])) {
13300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko      unsigned I = 1;
13400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko      while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
13500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko        ++I;
13600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko      return I;
13700895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    }
13800895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko    return 2;
13900895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko  }
14000895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko}
14100895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
14200895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} // namespace encoding
14300895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} // namespace format
14400895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko} // namespace clang
14500895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko
14600895106f9ed602af67984ec4d225a0cdc8c12afAlexander Kornienko#endif // LLVM_CLANG_FORMAT_ENCODING_H
147