1//===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// \file
11/// \brief Recursive parser implementation for the matcher expression grammar.
12///
13//===----------------------------------------------------------------------===//
14
15#include "clang/ASTMatchers/Dynamic/Parser.h"
16#include "clang/ASTMatchers/Dynamic/Registry.h"
17#include "clang/Basic/CharInfo.h"
18#include "llvm/ADT/Optional.h"
19#include "llvm/ADT/Twine.h"
20#include "llvm/Support/ManagedStatic.h"
21#include <string>
22#include <vector>
23
24namespace clang {
25namespace ast_matchers {
26namespace dynamic {
27
28/// \brief Simple structure to hold information for one token from the parser.
29struct Parser::TokenInfo {
30  /// \brief Different possible tokens.
31  enum TokenKind {
32    TK_Eof,
33    TK_OpenParen,
34    TK_CloseParen,
35    TK_Comma,
36    TK_Period,
37    TK_Literal,
38    TK_Ident,
39    TK_InvalidChar,
40    TK_Error,
41    TK_CodeCompletion
42  };
43
44  /// \brief Some known identifiers.
45  static const char* const ID_Bind;
46
47  TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
48
49  StringRef Text;
50  TokenKind Kind;
51  SourceRange Range;
52  VariantValue Value;
53};
54
55const char* const Parser::TokenInfo::ID_Bind = "bind";
56
57/// \brief Simple tokenizer for the parser.
58class Parser::CodeTokenizer {
59public:
60  explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
61      : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
62        CodeCompletionLocation(nullptr) {
63    NextToken = getNextToken();
64  }
65
66  CodeTokenizer(StringRef MatcherCode, Diagnostics *Error,
67                unsigned CodeCompletionOffset)
68      : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
69        CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) {
70    NextToken = getNextToken();
71  }
72
73  /// \brief Returns but doesn't consume the next token.
74  const TokenInfo &peekNextToken() const { return NextToken; }
75
76  /// \brief Consumes and returns the next token.
77  TokenInfo consumeNextToken() {
78    TokenInfo ThisToken = NextToken;
79    NextToken = getNextToken();
80    return ThisToken;
81  }
82
83  TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
84
85private:
86  TokenInfo getNextToken() {
87    consumeWhitespace();
88    TokenInfo Result;
89    Result.Range.Start = currentLocation();
90
91    if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) {
92      Result.Kind = TokenInfo::TK_CodeCompletion;
93      Result.Text = StringRef(CodeCompletionLocation, 0);
94      CodeCompletionLocation = nullptr;
95      return Result;
96    }
97
98    if (Code.empty()) {
99      Result.Kind = TokenInfo::TK_Eof;
100      Result.Text = "";
101      return Result;
102    }
103
104    switch (Code[0]) {
105    case ',':
106      Result.Kind = TokenInfo::TK_Comma;
107      Result.Text = Code.substr(0, 1);
108      Code = Code.drop_front();
109      break;
110    case '.':
111      Result.Kind = TokenInfo::TK_Period;
112      Result.Text = Code.substr(0, 1);
113      Code = Code.drop_front();
114      break;
115    case '(':
116      Result.Kind = TokenInfo::TK_OpenParen;
117      Result.Text = Code.substr(0, 1);
118      Code = Code.drop_front();
119      break;
120    case ')':
121      Result.Kind = TokenInfo::TK_CloseParen;
122      Result.Text = Code.substr(0, 1);
123      Code = Code.drop_front();
124      break;
125
126    case '"':
127    case '\'':
128      // Parse a string literal.
129      consumeStringLiteral(&Result);
130      break;
131
132    case '0': case '1': case '2': case '3': case '4':
133    case '5': case '6': case '7': case '8': case '9':
134      // Parse an unsigned literal.
135      consumeUnsignedLiteral(&Result);
136      break;
137
138    default:
139      if (isAlphanumeric(Code[0])) {
140        // Parse an identifier
141        size_t TokenLength = 1;
142        while (1) {
143          // A code completion location in/immediately after an identifier will
144          // cause the portion of the identifier before the code completion
145          // location to become a code completion token.
146          if (CodeCompletionLocation == Code.data() + TokenLength) {
147            CodeCompletionLocation = nullptr;
148            Result.Kind = TokenInfo::TK_CodeCompletion;
149            Result.Text = Code.substr(0, TokenLength);
150            Code = Code.drop_front(TokenLength);
151            return Result;
152          }
153          if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength]))
154            break;
155          ++TokenLength;
156        }
157        Result.Kind = TokenInfo::TK_Ident;
158        Result.Text = Code.substr(0, TokenLength);
159        Code = Code.drop_front(TokenLength);
160      } else {
161        Result.Kind = TokenInfo::TK_InvalidChar;
162        Result.Text = Code.substr(0, 1);
163        Code = Code.drop_front(1);
164      }
165      break;
166    }
167
168    Result.Range.End = currentLocation();
169    return Result;
170  }
171
172  /// \brief Consume an unsigned literal.
173  void consumeUnsignedLiteral(TokenInfo *Result) {
174    unsigned Length = 1;
175    if (Code.size() > 1) {
176      // Consume the 'x' or 'b' radix modifier, if present.
177      switch (toLowercase(Code[1])) {
178      case 'x': case 'b': Length = 2;
179      }
180    }
181    while (Length < Code.size() && isHexDigit(Code[Length]))
182      ++Length;
183
184    Result->Text = Code.substr(0, Length);
185    Code = Code.drop_front(Length);
186
187    unsigned Value;
188    if (!Result->Text.getAsInteger(0, Value)) {
189      Result->Kind = TokenInfo::TK_Literal;
190      Result->Value = Value;
191    } else {
192      SourceRange Range;
193      Range.Start = Result->Range.Start;
194      Range.End = currentLocation();
195      Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
196      Result->Kind = TokenInfo::TK_Error;
197    }
198  }
199
200  /// \brief Consume a string literal.
201  ///
202  /// \c Code must be positioned at the start of the literal (the opening
203  /// quote). Consumed until it finds the same closing quote character.
204  void consumeStringLiteral(TokenInfo *Result) {
205    bool InEscape = false;
206    const char Marker = Code[0];
207    for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
208      if (InEscape) {
209        InEscape = false;
210        continue;
211      }
212      if (Code[Length] == '\\') {
213        InEscape = true;
214        continue;
215      }
216      if (Code[Length] == Marker) {
217        Result->Kind = TokenInfo::TK_Literal;
218        Result->Text = Code.substr(0, Length + 1);
219        Result->Value = Code.substr(1, Length - 1).str();
220        Code = Code.drop_front(Length + 1);
221        return;
222      }
223    }
224
225    StringRef ErrorText = Code;
226    Code = Code.drop_front(Code.size());
227    SourceRange Range;
228    Range.Start = Result->Range.Start;
229    Range.End = currentLocation();
230    Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
231    Result->Kind = TokenInfo::TK_Error;
232  }
233
234  /// \brief Consume all leading whitespace from \c Code.
235  void consumeWhitespace() {
236    while (!Code.empty() && isWhitespace(Code[0])) {
237      if (Code[0] == '\n') {
238        ++Line;
239        StartOfLine = Code.drop_front();
240      }
241      Code = Code.drop_front();
242    }
243  }
244
245  SourceLocation currentLocation() {
246    SourceLocation Location;
247    Location.Line = Line;
248    Location.Column = Code.data() - StartOfLine.data() + 1;
249    return Location;
250  }
251
252  StringRef Code;
253  StringRef StartOfLine;
254  unsigned Line;
255  Diagnostics *Error;
256  TokenInfo NextToken;
257  const char *CodeCompletionLocation;
258};
259
260Parser::Sema::~Sema() {}
261
262std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes(
263    llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
264  return std::vector<ArgKind>();
265}
266
267std::vector<MatcherCompletion>
268Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> AcceptedTypes) {
269  return std::vector<MatcherCompletion>();
270}
271
272struct Parser::ScopedContextEntry {
273  Parser *P;
274
275  ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) {
276    P->ContextStack.push_back(std::make_pair(C, 0u));
277  }
278
279  ~ScopedContextEntry() {
280    P->ContextStack.pop_back();
281  }
282
283  void nextArg() {
284    ++P->ContextStack.back().second;
285  }
286};
287
288/// \brief Parse expressions that start with an identifier.
289///
290/// This function can parse named values and matchers.
291/// In case of failure it will try to determine the user's intent to give
292/// an appropriate error message.
293bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) {
294  const TokenInfo NameToken = Tokenizer->consumeNextToken();
295
296  if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) {
297    // Parse as a named value.
298    if (const VariantValue NamedValue =
299            NamedValues ? NamedValues->lookup(NameToken.Text)
300                        : VariantValue()) {
301      *Value = NamedValue;
302      return true;
303    }
304    // If the syntax is correct and the name is not a matcher either, report
305    // unknown named value.
306    if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma ||
307         Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen ||
308         Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) &&
309        !S->lookupMatcherCtor(NameToken.Text)) {
310      Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound)
311          << NameToken.Text;
312      return false;
313    }
314    // Otherwise, fallback to the matcher parser.
315  }
316
317  // Parse as a matcher expression.
318  return parseMatcherExpressionImpl(NameToken, Value);
319}
320
321/// \brief Parse and validate a matcher expression.
322/// \return \c true on success, in which case \c Value has the matcher parsed.
323///   If the input is malformed, or some argument has an error, it
324///   returns \c false.
325bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken,
326                                        VariantValue *Value) {
327  assert(NameToken.Kind == TokenInfo::TK_Ident);
328  const TokenInfo OpenToken = Tokenizer->consumeNextToken();
329  if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
330    Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
331        << OpenToken.Text;
332    return false;
333  }
334
335  llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text);
336
337  if (!Ctor) {
338    Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound)
339        << NameToken.Text;
340    // Do not return here. We need to continue to give completion suggestions.
341  }
342
343  std::vector<ParserValue> Args;
344  TokenInfo EndToken;
345
346  {
347    ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr);
348
349    while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
350      if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
351        // End of args.
352        EndToken = Tokenizer->consumeNextToken();
353        break;
354      }
355      if (Args.size() > 0) {
356        // We must find a , token to continue.
357        const TokenInfo CommaToken = Tokenizer->consumeNextToken();
358        if (CommaToken.Kind != TokenInfo::TK_Comma) {
359          Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
360              << CommaToken.Text;
361          return false;
362        }
363      }
364
365      Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
366                               NameToken.Text, NameToken.Range,
367                               Args.size() + 1);
368      ParserValue ArgValue;
369      ArgValue.Text = Tokenizer->peekNextToken().Text;
370      ArgValue.Range = Tokenizer->peekNextToken().Range;
371      if (!parseExpressionImpl(&ArgValue.Value)) {
372        return false;
373      }
374
375      Args.push_back(ArgValue);
376      SCE.nextArg();
377    }
378  }
379
380  if (EndToken.Kind == TokenInfo::TK_Eof) {
381    Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
382    return false;
383  }
384
385  std::string BindID;
386  if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
387    // Parse .bind("foo")
388    Tokenizer->consumeNextToken();  // consume the period.
389    const TokenInfo BindToken = Tokenizer->consumeNextToken();
390    if (BindToken.Kind == TokenInfo::TK_CodeCompletion) {
391      addCompletion(BindToken, MatcherCompletion("bind(\"", "bind", 1));
392      return false;
393    }
394
395    const TokenInfo OpenToken = Tokenizer->consumeNextToken();
396    const TokenInfo IDToken = Tokenizer->consumeNextToken();
397    const TokenInfo CloseToken = Tokenizer->consumeNextToken();
398
399    // TODO: We could use different error codes for each/some to be more
400    //       explicit about the syntax error.
401    if (BindToken.Kind != TokenInfo::TK_Ident ||
402        BindToken.Text != TokenInfo::ID_Bind) {
403      Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
404      return false;
405    }
406    if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
407      Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
408      return false;
409    }
410    if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
411      Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
412      return false;
413    }
414    if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
415      Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
416      return false;
417    }
418    BindID = IDToken.Value.getString();
419  }
420
421  if (!Ctor)
422    return false;
423
424  // Merge the start and end infos.
425  Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
426                           NameToken.Text, NameToken.Range);
427  SourceRange MatcherRange = NameToken.Range;
428  MatcherRange.End = EndToken.Range.End;
429  VariantMatcher Result = S->actOnMatcherExpression(
430      *Ctor, MatcherRange, BindID, Args, Error);
431  if (Result.isNull()) return false;
432
433  *Value = Result;
434  return true;
435}
436
437// If the prefix of this completion matches the completion token, add it to
438// Completions minus the prefix.
439void Parser::addCompletion(const TokenInfo &CompToken,
440                           const MatcherCompletion& Completion) {
441  if (StringRef(Completion.TypedText).startswith(CompToken.Text) &&
442      Completion.Specificity > 0) {
443    Completions.emplace_back(Completion.TypedText.substr(CompToken.Text.size()),
444                             Completion.MatcherDecl, Completion.Specificity);
445  }
446}
447
448std::vector<MatcherCompletion> Parser::getNamedValueCompletions(
449    ArrayRef<ArgKind> AcceptedTypes) {
450  if (!NamedValues) return std::vector<MatcherCompletion>();
451  std::vector<MatcherCompletion> Result;
452  for (const auto &Entry : *NamedValues) {
453    unsigned Specificity;
454    if (Entry.getValue().isConvertibleTo(AcceptedTypes, &Specificity)) {
455      std::string Decl =
456          (Entry.getValue().getTypeAsString() + " " + Entry.getKey()).str();
457      Result.emplace_back(Entry.getKey(), Decl, Specificity);
458    }
459  }
460  return Result;
461}
462
463void Parser::addExpressionCompletions() {
464  const TokenInfo CompToken = Tokenizer->consumeNextToken();
465  assert(CompToken.Kind == TokenInfo::TK_CodeCompletion);
466
467  // We cannot complete code if there is an invalid element on the context
468  // stack.
469  for (ContextStackTy::iterator I = ContextStack.begin(),
470                                E = ContextStack.end();
471       I != E; ++I) {
472    if (!I->first)
473      return;
474  }
475
476  auto AcceptedTypes = S->getAcceptedCompletionTypes(ContextStack);
477  for (const auto &Completion : S->getMatcherCompletions(AcceptedTypes)) {
478    addCompletion(CompToken, Completion);
479  }
480
481  for (const auto &Completion : getNamedValueCompletions(AcceptedTypes)) {
482    addCompletion(CompToken, Completion);
483  }
484}
485
486/// \brief Parse an <Expresssion>
487bool Parser::parseExpressionImpl(VariantValue *Value) {
488  switch (Tokenizer->nextTokenKind()) {
489  case TokenInfo::TK_Literal:
490    *Value = Tokenizer->consumeNextToken().Value;
491    return true;
492
493  case TokenInfo::TK_Ident:
494    return parseIdentifierPrefixImpl(Value);
495
496  case TokenInfo::TK_CodeCompletion:
497    addExpressionCompletions();
498    return false;
499
500  case TokenInfo::TK_Eof:
501    Error->addError(Tokenizer->consumeNextToken().Range,
502                    Error->ET_ParserNoCode);
503    return false;
504
505  case TokenInfo::TK_Error:
506    // This error was already reported by the tokenizer.
507    return false;
508
509  case TokenInfo::TK_OpenParen:
510  case TokenInfo::TK_CloseParen:
511  case TokenInfo::TK_Comma:
512  case TokenInfo::TK_Period:
513  case TokenInfo::TK_InvalidChar:
514    const TokenInfo Token = Tokenizer->consumeNextToken();
515    Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
516    return false;
517  }
518
519  llvm_unreachable("Unknown token kind.");
520}
521
522static llvm::ManagedStatic<Parser::RegistrySema> DefaultRegistrySema;
523
524Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
525               const NamedValueMap *NamedValues, Diagnostics *Error)
526    : Tokenizer(Tokenizer), S(S ? S : &*DefaultRegistrySema),
527      NamedValues(NamedValues), Error(Error) {}
528
529Parser::RegistrySema::~RegistrySema() {}
530
531llvm::Optional<MatcherCtor>
532Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) {
533  return Registry::lookupMatcherCtor(MatcherName);
534}
535
536VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
537    MatcherCtor Ctor, const SourceRange &NameRange, StringRef BindID,
538    ArrayRef<ParserValue> Args, Diagnostics *Error) {
539  if (BindID.empty()) {
540    return Registry::constructMatcher(Ctor, NameRange, Args, Error);
541  } else {
542    return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
543                                           Error);
544  }
545}
546
547std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes(
548    ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
549  return Registry::getAcceptedCompletionTypes(Context);
550}
551
552std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions(
553    ArrayRef<ArgKind> AcceptedTypes) {
554  return Registry::getMatcherCompletions(AcceptedTypes);
555}
556
557bool Parser::parseExpression(StringRef Code, Sema *S,
558                             const NamedValueMap *NamedValues,
559                             VariantValue *Value, Diagnostics *Error) {
560  CodeTokenizer Tokenizer(Code, Error);
561  if (!Parser(&Tokenizer, S, NamedValues, Error).parseExpressionImpl(Value))
562    return false;
563  if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
564    Error->addError(Tokenizer.peekNextToken().Range,
565                    Error->ET_ParserTrailingCode);
566    return false;
567  }
568  return true;
569}
570
571std::vector<MatcherCompletion>
572Parser::completeExpression(StringRef Code, unsigned CompletionOffset, Sema *S,
573                           const NamedValueMap *NamedValues) {
574  Diagnostics Error;
575  CodeTokenizer Tokenizer(Code, &Error, CompletionOffset);
576  Parser P(&Tokenizer, S, NamedValues, &Error);
577  VariantValue Dummy;
578  P.parseExpressionImpl(&Dummy);
579
580  // Sort by specificity, then by name.
581  std::sort(P.Completions.begin(), P.Completions.end(),
582            [](const MatcherCompletion &A, const MatcherCompletion &B) {
583    if (A.Specificity != B.Specificity)
584      return A.Specificity > B.Specificity;
585    return A.TypedText < B.TypedText;
586  });
587
588  return P.Completions;
589}
590
591llvm::Optional<DynTypedMatcher>
592Parser::parseMatcherExpression(StringRef Code, Sema *S,
593                               const NamedValueMap *NamedValues,
594                               Diagnostics *Error) {
595  VariantValue Value;
596  if (!parseExpression(Code, S, NamedValues, &Value, Error))
597    return llvm::Optional<DynTypedMatcher>();
598  if (!Value.isMatcher()) {
599    Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
600    return llvm::Optional<DynTypedMatcher>();
601  }
602  llvm::Optional<DynTypedMatcher> Result =
603      Value.getMatcher().getSingleMatcher();
604  if (!Result.hasValue()) {
605    Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
606        << Value.getTypeAsString();
607  }
608  return Result;
609}
610
611}  // namespace dynamic
612}  // namespace ast_matchers
613}  // namespace clang
614