1//===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9///
10/// \file
11/// \brief Recursive parser implementation for the matcher expression grammar.
12///
13//===----------------------------------------------------------------------===//
14
15#include "clang/ASTMatchers/Dynamic/Parser.h"
16#include "clang/ASTMatchers/Dynamic/Registry.h"
17#include "clang/Basic/CharInfo.h"
18#include "llvm/ADT/Optional.h"
19#include "llvm/ADT/Twine.h"
20#include <string>
21#include <vector>
22
23namespace clang {
24namespace ast_matchers {
25namespace dynamic {
26
27/// \brief Simple structure to hold information for one token from the parser.
28struct Parser::TokenInfo {
29  /// \brief Different possible tokens.
30  enum TokenKind {
31    TK_Eof,
32    TK_OpenParen,
33    TK_CloseParen,
34    TK_Comma,
35    TK_Period,
36    TK_Literal,
37    TK_Ident,
38    TK_InvalidChar,
39    TK_Error,
40    TK_CodeCompletion
41  };
42
43  /// \brief Some known identifiers.
44  static const char* const ID_Bind;
45
46  TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
47
48  StringRef Text;
49  TokenKind Kind;
50  SourceRange Range;
51  VariantValue Value;
52};
53
54const char* const Parser::TokenInfo::ID_Bind = "bind";
55
56/// \brief Simple tokenizer for the parser.
57class Parser::CodeTokenizer {
58public:
59  explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
60      : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
61        CodeCompletionLocation(nullptr) {
62    NextToken = getNextToken();
63  }
64
65  CodeTokenizer(StringRef MatcherCode, Diagnostics *Error,
66                unsigned CodeCompletionOffset)
67      : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
68        CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) {
69    NextToken = getNextToken();
70  }
71
72  /// \brief Returns but doesn't consume the next token.
73  const TokenInfo &peekNextToken() const { return NextToken; }
74
75  /// \brief Consumes and returns the next token.
76  TokenInfo consumeNextToken() {
77    TokenInfo ThisToken = NextToken;
78    NextToken = getNextToken();
79    return ThisToken;
80  }
81
82  TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
83
84private:
85  TokenInfo getNextToken() {
86    consumeWhitespace();
87    TokenInfo Result;
88    Result.Range.Start = currentLocation();
89
90    if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) {
91      Result.Kind = TokenInfo::TK_CodeCompletion;
92      Result.Text = StringRef(CodeCompletionLocation, 0);
93      CodeCompletionLocation = nullptr;
94      return Result;
95    }
96
97    if (Code.empty()) {
98      Result.Kind = TokenInfo::TK_Eof;
99      Result.Text = "";
100      return Result;
101    }
102
103    switch (Code[0]) {
104    case ',':
105      Result.Kind = TokenInfo::TK_Comma;
106      Result.Text = Code.substr(0, 1);
107      Code = Code.drop_front();
108      break;
109    case '.':
110      Result.Kind = TokenInfo::TK_Period;
111      Result.Text = Code.substr(0, 1);
112      Code = Code.drop_front();
113      break;
114    case '(':
115      Result.Kind = TokenInfo::TK_OpenParen;
116      Result.Text = Code.substr(0, 1);
117      Code = Code.drop_front();
118      break;
119    case ')':
120      Result.Kind = TokenInfo::TK_CloseParen;
121      Result.Text = Code.substr(0, 1);
122      Code = Code.drop_front();
123      break;
124
125    case '"':
126    case '\'':
127      // Parse a string literal.
128      consumeStringLiteral(&Result);
129      break;
130
131    case '0': case '1': case '2': case '3': case '4':
132    case '5': case '6': case '7': case '8': case '9':
133      // Parse an unsigned literal.
134      consumeUnsignedLiteral(&Result);
135      break;
136
137    default:
138      if (isAlphanumeric(Code[0])) {
139        // Parse an identifier
140        size_t TokenLength = 1;
141        while (1) {
142          // A code completion location in/immediately after an identifier will
143          // cause the portion of the identifier before the code completion
144          // location to become a code completion token.
145          if (CodeCompletionLocation == Code.data() + TokenLength) {
146            CodeCompletionLocation = nullptr;
147            Result.Kind = TokenInfo::TK_CodeCompletion;
148            Result.Text = Code.substr(0, TokenLength);
149            Code = Code.drop_front(TokenLength);
150            return Result;
151          }
152          if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength]))
153            break;
154          ++TokenLength;
155        }
156        Result.Kind = TokenInfo::TK_Ident;
157        Result.Text = Code.substr(0, TokenLength);
158        Code = Code.drop_front(TokenLength);
159      } else {
160        Result.Kind = TokenInfo::TK_InvalidChar;
161        Result.Text = Code.substr(0, 1);
162        Code = Code.drop_front(1);
163      }
164      break;
165    }
166
167    Result.Range.End = currentLocation();
168    return Result;
169  }
170
171  /// \brief Consume an unsigned literal.
172  void consumeUnsignedLiteral(TokenInfo *Result) {
173    unsigned Length = 1;
174    if (Code.size() > 1) {
175      // Consume the 'x' or 'b' radix modifier, if present.
176      switch (toLowercase(Code[1])) {
177      case 'x': case 'b': Length = 2;
178      }
179    }
180    while (Length < Code.size() && isHexDigit(Code[Length]))
181      ++Length;
182
183    Result->Text = Code.substr(0, Length);
184    Code = Code.drop_front(Length);
185
186    unsigned Value;
187    if (!Result->Text.getAsInteger(0, Value)) {
188      Result->Kind = TokenInfo::TK_Literal;
189      Result->Value = Value;
190    } else {
191      SourceRange Range;
192      Range.Start = Result->Range.Start;
193      Range.End = currentLocation();
194      Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
195      Result->Kind = TokenInfo::TK_Error;
196    }
197  }
198
199  /// \brief Consume a string literal.
200  ///
201  /// \c Code must be positioned at the start of the literal (the opening
202  /// quote). Consumed until it finds the same closing quote character.
203  void consumeStringLiteral(TokenInfo *Result) {
204    bool InEscape = false;
205    const char Marker = Code[0];
206    for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
207      if (InEscape) {
208        InEscape = false;
209        continue;
210      }
211      if (Code[Length] == '\\') {
212        InEscape = true;
213        continue;
214      }
215      if (Code[Length] == Marker) {
216        Result->Kind = TokenInfo::TK_Literal;
217        Result->Text = Code.substr(0, Length + 1);
218        Result->Value = Code.substr(1, Length - 1).str();
219        Code = Code.drop_front(Length + 1);
220        return;
221      }
222    }
223
224    StringRef ErrorText = Code;
225    Code = Code.drop_front(Code.size());
226    SourceRange Range;
227    Range.Start = Result->Range.Start;
228    Range.End = currentLocation();
229    Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
230    Result->Kind = TokenInfo::TK_Error;
231  }
232
233  /// \brief Consume all leading whitespace from \c Code.
234  void consumeWhitespace() {
235    while (!Code.empty() && isWhitespace(Code[0])) {
236      if (Code[0] == '\n') {
237        ++Line;
238        StartOfLine = Code.drop_front();
239      }
240      Code = Code.drop_front();
241    }
242  }
243
244  SourceLocation currentLocation() {
245    SourceLocation Location;
246    Location.Line = Line;
247    Location.Column = Code.data() - StartOfLine.data() + 1;
248    return Location;
249  }
250
251  StringRef Code;
252  StringRef StartOfLine;
253  unsigned Line;
254  Diagnostics *Error;
255  TokenInfo NextToken;
256  const char *CodeCompletionLocation;
257};
258
259Parser::Sema::~Sema() {}
260
261VariantValue Parser::Sema::getNamedValue(StringRef Name) {
262  return VariantValue();
263}
264
265struct Parser::ScopedContextEntry {
266  Parser *P;
267
268  ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) {
269    P->ContextStack.push_back(std::make_pair(C, 0u));
270  }
271
272  ~ScopedContextEntry() {
273    P->ContextStack.pop_back();
274  }
275
276  void nextArg() {
277    ++P->ContextStack.back().second;
278  }
279};
280
281/// \brief Parse expressions that start with an identifier.
282///
283/// This function can parse named values and matchers.
284/// In case of failure it will try to determine the user's intent to give
285/// an appropriate error message.
286bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) {
287  const TokenInfo NameToken = Tokenizer->consumeNextToken();
288
289  if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) {
290    // Parse as a named value.
291    if (const VariantValue NamedValue = S->getNamedValue(NameToken.Text)) {
292      *Value = NamedValue;
293      return true;
294    }
295    // If the syntax is correct and the name is not a matcher either, report
296    // unknown named value.
297    if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma ||
298         Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen ||
299         Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) &&
300        !S->lookupMatcherCtor(NameToken.Text)) {
301      Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound)
302          << NameToken.Text;
303      return false;
304    }
305    // Otherwise, fallback to the matcher parser.
306  }
307
308  // Parse as a matcher expression.
309  return parseMatcherExpressionImpl(NameToken, Value);
310}
311
312/// \brief Parse and validate a matcher expression.
313/// \return \c true on success, in which case \c Value has the matcher parsed.
314///   If the input is malformed, or some argument has an error, it
315///   returns \c false.
316bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken,
317                                        VariantValue *Value) {
318  assert(NameToken.Kind == TokenInfo::TK_Ident);
319  const TokenInfo OpenToken = Tokenizer->consumeNextToken();
320  if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
321    Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
322        << OpenToken.Text;
323    return false;
324  }
325
326  llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text);
327
328  if (!Ctor) {
329    Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound)
330        << NameToken.Text;
331    // Do not return here. We need to continue to give completion suggestions.
332  }
333
334  std::vector<ParserValue> Args;
335  TokenInfo EndToken;
336
337  {
338    ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr);
339
340    while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
341      if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
342        // End of args.
343        EndToken = Tokenizer->consumeNextToken();
344        break;
345      }
346      if (Args.size() > 0) {
347        // We must find a , token to continue.
348        const TokenInfo CommaToken = Tokenizer->consumeNextToken();
349        if (CommaToken.Kind != TokenInfo::TK_Comma) {
350          Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
351              << CommaToken.Text;
352          return false;
353        }
354      }
355
356      Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
357                               NameToken.Text, NameToken.Range,
358                               Args.size() + 1);
359      ParserValue ArgValue;
360      ArgValue.Text = Tokenizer->peekNextToken().Text;
361      ArgValue.Range = Tokenizer->peekNextToken().Range;
362      if (!parseExpressionImpl(&ArgValue.Value)) {
363        return false;
364      }
365
366      Args.push_back(ArgValue);
367      SCE.nextArg();
368    }
369  }
370
371  if (EndToken.Kind == TokenInfo::TK_Eof) {
372    Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
373    return false;
374  }
375
376  std::string BindID;
377  if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
378    // Parse .bind("foo")
379    Tokenizer->consumeNextToken();  // consume the period.
380    const TokenInfo BindToken = Tokenizer->consumeNextToken();
381    if (BindToken.Kind == TokenInfo::TK_CodeCompletion) {
382      addCompletion(BindToken, "bind(\"", "bind");
383      return false;
384    }
385
386    const TokenInfo OpenToken = Tokenizer->consumeNextToken();
387    const TokenInfo IDToken = Tokenizer->consumeNextToken();
388    const TokenInfo CloseToken = Tokenizer->consumeNextToken();
389
390    // TODO: We could use different error codes for each/some to be more
391    //       explicit about the syntax error.
392    if (BindToken.Kind != TokenInfo::TK_Ident ||
393        BindToken.Text != TokenInfo::ID_Bind) {
394      Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
395      return false;
396    }
397    if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
398      Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
399      return false;
400    }
401    if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
402      Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
403      return false;
404    }
405    if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
406      Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
407      return false;
408    }
409    BindID = IDToken.Value.getString();
410  }
411
412  if (!Ctor)
413    return false;
414
415  // Merge the start and end infos.
416  Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
417                           NameToken.Text, NameToken.Range);
418  SourceRange MatcherRange = NameToken.Range;
419  MatcherRange.End = EndToken.Range.End;
420  VariantMatcher Result = S->actOnMatcherExpression(
421      *Ctor, MatcherRange, BindID, Args, Error);
422  if (Result.isNull()) return false;
423
424  *Value = Result;
425  return true;
426}
427
428// If the prefix of this completion matches the completion token, add it to
429// Completions minus the prefix.
430void Parser::addCompletion(const TokenInfo &CompToken, StringRef TypedText,
431                           StringRef Decl) {
432  if (TypedText.size() >= CompToken.Text.size() &&
433      TypedText.substr(0, CompToken.Text.size()) == CompToken.Text) {
434    Completions.push_back(
435        MatcherCompletion(TypedText.substr(CompToken.Text.size()), Decl));
436  }
437}
438
439void Parser::addExpressionCompletions() {
440  const TokenInfo CompToken = Tokenizer->consumeNextToken();
441  assert(CompToken.Kind == TokenInfo::TK_CodeCompletion);
442
443  // We cannot complete code if there is an invalid element on the context
444  // stack.
445  for (ContextStackTy::iterator I = ContextStack.begin(),
446                                E = ContextStack.end();
447       I != E; ++I) {
448    if (!I->first)
449      return;
450  }
451
452  std::vector<MatcherCompletion> RegCompletions =
453      Registry::getCompletions(ContextStack);
454  for (std::vector<MatcherCompletion>::iterator I = RegCompletions.begin(),
455                                                E = RegCompletions.end();
456       I != E; ++I) {
457    addCompletion(CompToken, I->TypedText, I->MatcherDecl);
458  }
459}
460
461/// \brief Parse an <Expresssion>
462bool Parser::parseExpressionImpl(VariantValue *Value) {
463  switch (Tokenizer->nextTokenKind()) {
464  case TokenInfo::TK_Literal:
465    *Value = Tokenizer->consumeNextToken().Value;
466    return true;
467
468  case TokenInfo::TK_Ident:
469    return parseIdentifierPrefixImpl(Value);
470
471  case TokenInfo::TK_CodeCompletion:
472    addExpressionCompletions();
473    return false;
474
475  case TokenInfo::TK_Eof:
476    Error->addError(Tokenizer->consumeNextToken().Range,
477                    Error->ET_ParserNoCode);
478    return false;
479
480  case TokenInfo::TK_Error:
481    // This error was already reported by the tokenizer.
482    return false;
483
484  case TokenInfo::TK_OpenParen:
485  case TokenInfo::TK_CloseParen:
486  case TokenInfo::TK_Comma:
487  case TokenInfo::TK_Period:
488  case TokenInfo::TK_InvalidChar:
489    const TokenInfo Token = Tokenizer->consumeNextToken();
490    Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
491    return false;
492  }
493
494  llvm_unreachable("Unknown token kind.");
495}
496
497Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
498               Diagnostics *Error)
499    : Tokenizer(Tokenizer), S(S), Error(Error) {}
500
501Parser::RegistrySema::~RegistrySema() {}
502
503llvm::Optional<MatcherCtor>
504Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) {
505  return Registry::lookupMatcherCtor(MatcherName);
506}
507
508VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
509    MatcherCtor Ctor, const SourceRange &NameRange, StringRef BindID,
510    ArrayRef<ParserValue> Args, Diagnostics *Error) {
511  if (BindID.empty()) {
512    return Registry::constructMatcher(Ctor, NameRange, Args, Error);
513  } else {
514    return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
515                                           Error);
516  }
517}
518
519bool Parser::parseExpression(StringRef Code, VariantValue *Value,
520                             Diagnostics *Error) {
521  RegistrySema S;
522  return parseExpression(Code, &S, Value, Error);
523}
524
525bool Parser::parseExpression(StringRef Code, Sema *S,
526                             VariantValue *Value, Diagnostics *Error) {
527  CodeTokenizer Tokenizer(Code, Error);
528  if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
529  if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
530    Error->addError(Tokenizer.peekNextToken().Range,
531                    Error->ET_ParserTrailingCode);
532    return false;
533  }
534  return true;
535}
536
537std::vector<MatcherCompletion>
538Parser::completeExpression(StringRef Code, unsigned CompletionOffset) {
539  Diagnostics Error;
540  CodeTokenizer Tokenizer(Code, &Error, CompletionOffset);
541  RegistrySema S;
542  Parser P(&Tokenizer, &S, &Error);
543  VariantValue Dummy;
544  P.parseExpressionImpl(&Dummy);
545
546  return P.Completions;
547}
548
549llvm::Optional<DynTypedMatcher>
550Parser::parseMatcherExpression(StringRef Code, Diagnostics *Error) {
551  RegistrySema S;
552  return parseMatcherExpression(Code, &S, Error);
553}
554
555llvm::Optional<DynTypedMatcher>
556Parser::parseMatcherExpression(StringRef Code, Parser::Sema *S,
557                               Diagnostics *Error) {
558  VariantValue Value;
559  if (!parseExpression(Code, S, &Value, Error))
560    return llvm::Optional<DynTypedMatcher>();
561  if (!Value.isMatcher()) {
562    Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
563    return llvm::Optional<DynTypedMatcher>();
564  }
565  llvm::Optional<DynTypedMatcher> Result =
566      Value.getMatcher().getSingleMatcher();
567  if (!Result.hasValue()) {
568    Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
569        << Value.getTypeAsString();
570  }
571  return Result;
572}
573
574}  // namespace dynamic
575}  // namespace ast_matchers
576}  // namespace clang
577