| /** \file |
| * Defines the basic structure to support recognizing by either a lexer, |
| * parser, or tree parser. |
| * \addtogroup BaseRecognizer |
| * @{ |
| */ |
| #ifndef _ANTLR3_BASERECOGNIZER_HPP |
| #define _ANTLR3_BASERECOGNIZER_HPP |
| |
| // [The "BSD licence"] |
| // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB |
| |
| // |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions |
| // are met: |
| // 1. Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // 2. Redistributions in binary form must reproduce the above copyright |
| // notice, this list of conditions and the following disclaimer in the |
| // documentation and/or other materials provided with the distribution. |
| // 3. The name of the author may not be used to endorse or promote products |
| // derived from this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
| // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
| // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
| // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| #include "antlr3defs.hpp" |
| #include "antlr3collections.hpp" |
| |
| ANTLR_BEGIN_NAMESPACE() |
| |
| /** \brief Base tracking context structure for all types of |
| * recognizers. |
| */ |
| template< class ImplTraits, class StreamType > |
| class BaseRecognizer : public ImplTraits::AllocPolicyType |
| { |
| public: |
| typedef typename ImplTraits::AllocPolicyType AllocPolicyType; |
| typedef typename StreamType::IntStreamType IntStreamType; |
| typedef typename ComponentTypeFinder<ImplTraits, StreamType>::ComponentType SuperType; |
| typedef typename StreamType::UnitType UnitType; |
| typedef typename ImplTraits::template ExceptionBaseType<StreamType> ExceptionBaseType; |
| typedef typename ImplTraits::BitsetType BitsetType; |
| typedef typename ImplTraits::BitsetListType BitsetListType; |
| typedef typename ImplTraits::StringType StringType; |
| typedef typename ImplTraits::template RecognizerSharedStateType<StreamType> RecognizerSharedStateType; |
| typedef typename ImplTraits::DebugEventListenerType DebugEventListenerType; |
| typedef typename ImplTraits::LexerType LexerType; |
| typedef typename ImplTraits::ParserType ParserType; |
| typedef typename ImplTraits::TreeParserType TreeParserType; |
| |
| typedef typename AllocPolicyType::template StackType<StringType> StringStackType; |
| typedef typename AllocPolicyType::template ListType<StringType> StringListType; |
| |
| private: |
| /// A pointer to the shared recognizer state, such that multiple |
| /// recognizers can use the same inputs streams and so on (in |
| /// the case of grammar inheritance for instance. |
| /// |
| RecognizerSharedStateType* m_state; |
| |
| /// If set to something other than NULL, then this structure is |
| /// points to an instance of the debugger interface. In general, the |
| /// debugger is only referenced internally in recovery/error operations |
| /// so that it does not cause overhead by having to check this pointer |
| /// in every function/method |
| /// |
| DebugEventListenerType* m_debugger; |
| |
| |
| public: |
| BaseRecognizer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state); |
| |
| SuperType* get_super(); |
| RecognizerSharedStateType* get_state() const; |
| DebugEventListenerType* get_debugger() const; |
| void set_state( RecognizerSharedStateType* state ); |
| void set_debugger( DebugEventListenerType* debugger ); |
| |
| /// Match current input symbol against ttype. Upon error, do one token |
| /// insertion or deletion if possible. |
| /// To turn off single token insertion or deletion error |
| /// recovery, override mismatchRecover() and have it call |
| /// plain mismatch(), which does not recover. Then any error |
| /// in a rule will cause an exception and immediate exit from |
| /// rule. Rule would recover by resynchronizing to the set of |
| /// symbols that can follow rule ref. |
| /// |
| const UnitType* match(ANTLR_UINT32 ttype, BitsetListType* follow); |
| |
| /// Consumes the next token, whatever it is, and resets the recognizer state |
| /// so that it is not in error. |
| /// |
| /// \param recognizer |
| /// Recognizer context pointer |
| /// |
| void matchAny(); |
| |
| /// function that decides if the token ahead of the current one is the |
| /// one we were loking for, in which case the curernt one is very likely extraneous |
| /// and can be reported that way. |
| /// |
| bool mismatchIsUnwantedToken(IntStreamType* input, ANTLR_UINT32 ttype); |
| |
| /// function that decides if the current token is one that can logically |
| /// follow the one we were looking for, in which case the one we were looking for is |
| /// probably missing from the input. |
| /// |
| bool mismatchIsMissingToken(IntStreamType* input, BitsetListType* follow); |
| |
| /// Factor out what to do upon token mismatch so tree parsers can behave |
| /// differently. Override and call mismatchRecover(input, ttype, follow) |
| /// to get single token insertion and deletion. Use this to turn off |
| /// single token insertion and deletion. Override mismatchRecover |
| /// to call this instead. |
| /// |
| /// \remark mismatch only works for parsers and must be overridden for anything else. |
| /// |
| void mismatch(ANTLR_UINT32 ttype, BitsetListType* follow); |
| |
| /// Report a recognition problem. |
| /// |
| /// This method sets errorRecovery to indicate the parser is recovering |
| /// not parsing. Once in recovery mode, no errors are generated. |
| /// To get out of recovery mode, the parser must successfully match |
| /// a token (after a resync). So it will go: |
| /// |
| /// 1. error occurs |
| /// 2. enter recovery mode, report error |
| /// 3. consume until token found in resynch set |
| /// 4. try to resume parsing |
| /// 5. next match() will reset errorRecovery mode |
| /// |
| /// If you override, make sure to update errorCount if you care about that. |
| /// |
| void reportError(); |
| void reportError( ClassForwarder<LexerType> ); |
| template<typename CompType> |
| void reportError( ClassForwarder<CompType> ); |
| |
| /** Function that is called to display a recognition error message. You may |
| * override this function independently of (*reportError)() above as that function calls |
| * this one to do the actual exception printing. |
| */ |
| void displayRecognitionError(ANTLR_UINT8** tokenNames); |
| |
| /// Get number of recognition errors (lexer, parser, tree parser). Each |
| /// recognizer tracks its own number. So parser and lexer each have |
| /// separate count. Does not count the spurious errors found between |
| /// an error and next valid token match |
| /// |
| /// \see reportError() |
| /// |
| ANTLR_UINT32 getNumberOfSyntaxErrors(); |
| |
| /** Function that recovers from an error found in the input stream. |
| * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also |
| * be from a mismatched token that the (*match)() could not recover from. |
| */ |
| void recover(); |
| |
| /** function that is a hook to listen to token consumption during error recovery. |
| * This is mainly used by the debug parser to send events to the listener. |
| */ |
| void beginResync(); |
| |
| /** function that is a hook to listen to token consumption during error recovery. |
| * This is mainly used by the debug parser to send events to the listener. |
| */ |
| void endResync(); |
| |
| /** function that is a hook to listen to token consumption during error recovery. |
| * This is mainly used by the debug parser to send events to the listener. |
| */ |
| void beginBacktrack(ANTLR_UINT32 level); |
| |
| /** function that is a hook to listen to token consumption during error recovery. |
| * This is mainly used by the debug parser to send events to the listener. |
| */ |
| void endBacktrack(ANTLR_UINT32 level, bool successful); |
| |
| /// Compute the error recovery set for the current rule. |
| /// Documentation below is from the Java implementation. |
| /// |
| /// During rule invocation, the parser pushes the set of tokens that can |
| /// follow that rule reference on the stack; this amounts to |
| /// computing FIRST of what follows the rule reference in the |
| /// enclosing rule. This local follow set only includes tokens |
| /// from within the rule; i.e., the FIRST computation done by |
| /// ANTLR stops at the end of a rule. |
| // |
| /// EXAMPLE |
| // |
| /// When you find a "no viable alt exception", the input is not |
| /// consistent with any of the alternatives for rule r. The best |
| /// thing to do is to consume tokens until you see something that |
| /// can legally follow a call to r *or* any rule that called r. |
| /// You don't want the exact set of viable next tokens because the |
| /// input might just be missing a token--you might consume the |
| /// rest of the input looking for one of the missing tokens. |
| /// |
| /// Consider grammar: |
| /// |
| /// a : '[' b ']' |
| /// | '(' b ')' |
| /// ; |
| /// b : c '^' INT ; |
| /// c : ID |
| /// | INT |
| /// ; |
| /// |
| /// At each rule invocation, the set of tokens that could follow |
| /// that rule is pushed on a stack. Here are the various "local" |
| /// follow sets: |
| /// |
| /// FOLLOW(b1_in_a) = FIRST(']') = ']' |
| /// FOLLOW(b2_in_a) = FIRST(')') = ')' |
| /// FOLLOW(c_in_b) = FIRST('^') = '^' |
| /// |
| /// Upon erroneous input "[]", the call chain is |
| /// |
| /// a -> b -> c |
| /// |
| /// and, hence, the follow context stack is: |
| /// |
| /// depth local follow set after call to rule |
| /// 0 <EOF> a (from main()) |
| /// 1 ']' b |
| /// 3 '^' c |
| /// |
| /// Notice that ')' is not included, because b would have to have |
| /// been called from a different context in rule a for ')' to be |
| /// included. |
| /// |
| /// For error recovery, we cannot consider FOLLOW(c) |
| /// (context-sensitive or otherwise). We need the combined set of |
| /// all context-sensitive FOLLOW sets--the set of all tokens that |
| /// could follow any reference in the call chain. We need to |
| /// resync to one of those tokens. Note that FOLLOW(c)='^' and if |
| /// we resync'd to that token, we'd consume until EOF. We need to |
| /// sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}. |
| /// In this case, for input "[]", LA(1) is in this set so we would |
| /// not consume anything and after printing an error rule c would |
| /// return normally. It would not find the required '^' though. |
| /// At this point, it gets a mismatched token error and throws an |
| /// exception (since LA(1) is not in the viable following token |
| /// set). The rule exception handler tries to recover, but finds |
| /// the same recovery set and doesn't consume anything. Rule b |
| /// exits normally returning to rule a. Now it finds the ']' (and |
| /// with the successful match exits errorRecovery mode). |
| /// |
| /// So, you can see that the parser walks up call chain looking |
| /// for the token that was a member of the recovery set. |
| /// |
| /// Errors are not generated in errorRecovery mode. |
| /// |
| /// ANTLR's error recovery mechanism is based upon original ideas: |
| /// |
| /// "Algorithms + Data Structures = Programs" by Niklaus Wirth |
| /// |
| /// and |
| /// |
| /// "A note on error recovery in recursive descent parsers": |
| /// http://portal.acm.org/citation.cfm?id=947902.947905 |
| /// |
| /// Later, Josef Grosch had some good ideas: |
| /// |
| /// "Efficient and Comfortable Error Recovery in Recursive Descent |
| /// Parsers": |
| /// ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip |
| /// |
| /// Like Grosch I implemented local FOLLOW sets that are combined |
| /// at run-time upon error to avoid overhead during parsing. |
| /// |
| BitsetType* computeErrorRecoverySet(); |
| |
| /// Compute the context-sensitive FOLLOW set for current rule. |
| /// Documentation below is from the Java runtime. |
| /// |
| /// This is the set of token types that can follow a specific rule |
| /// reference given a specific call chain. You get the set of |
| /// viable tokens that can possibly come next (look ahead depth 1) |
| /// given the current call chain. Contrast this with the |
| /// definition of plain FOLLOW for rule r: |
| /// |
| /// FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)} |
| /// |
| /// where x in T* and alpha, beta in V*; T is set of terminals and |
| /// V is the set of terminals and non terminals. In other words, |
| /// FOLLOW(r) is the set of all tokens that can possibly follow |
| /// references to r in///any* sentential form (context). At |
| /// runtime, however, we know precisely which context applies as |
| /// we have the call chain. We may compute the exact (rather |
| /// than covering superset) set of following tokens. |
| /// |
| /// For example, consider grammar: |
| /// |
| /// stat : ID '=' expr ';' // FOLLOW(stat)=={EOF} |
| /// | "return" expr '.' |
| /// ; |
| /// expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'} |
| /// atom : INT // FOLLOW(atom)=={'+',')',';','.'} |
| /// | '(' expr ')' |
| /// ; |
| /// |
| /// The FOLLOW sets are all inclusive whereas context-sensitive |
| /// FOLLOW sets are precisely what could follow a rule reference. |
| /// For input input "i=(3);", here is the derivation: |
| /// |
| /// stat => ID '=' expr ';' |
| /// => ID '=' atom ('+' atom)* ';' |
| /// => ID '=' '(' expr ')' ('+' atom)* ';' |
| /// => ID '=' '(' atom ')' ('+' atom)* ';' |
| /// => ID '=' '(' INT ')' ('+' atom)* ';' |
| /// => ID '=' '(' INT ')' ';' |
| /// |
| /// At the "3" token, you'd have a call chain of |
| /// |
| /// stat -> expr -> atom -> expr -> atom |
| /// |
| /// What can follow that specific nested ref to atom? Exactly ')' |
| /// as you can see by looking at the derivation of this specific |
| /// input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}. |
| /// |
| /// You want the exact viable token set when recovering from a |
| /// token mismatch. Upon token mismatch, if LA(1) is member of |
| /// the viable next token set, then you know there is most likely |
| /// a missing token in the input stream. "Insert" one by just not |
| /// throwing an exception. |
| /// |
| BitsetType* computeCSRuleFollow(); |
| |
| /// Compute the current followset for the input stream. |
| /// |
| BitsetType* combineFollows(bool exact); |
| |
| /// Attempt to recover from a single missing or extra token. |
| /// |
| /// EXTRA TOKEN |
| /// |
| /// LA(1) is not what we are looking for. If LA(2) has the right token, |
| /// however, then assume LA(1) is some extra spurious token. Delete it |
| /// and LA(2) as if we were doing a normal match(), which advances the |
| /// input. |
| /// |
| /// MISSING TOKEN |
| /// |
| /// If current token is consistent with what could come after |
| /// ttype then it is ok to "insert" the missing token, else throw |
| /// exception For example, Input "i=(3;" is clearly missing the |
| /// ')'. When the parser returns from the nested call to expr, it |
| /// will have call chain: |
| /// |
| /// stat -> expr -> atom |
| /// |
| /// and it will be trying to match the ')' at this point in the |
| /// derivation: |
| /// |
| /// => ID '=' '(' INT ')' ('+' atom)* ';' |
| /// ^ |
| /// match() will see that ';' doesn't match ')' and report a |
| /// mismatched token error. To recover, it sees that LA(1)==';' |
| /// is in the set of tokens that can follow the ')' token |
| /// reference in rule atom. It can assume that you forgot the ')'. |
| /// |
| /// The exception that was passed in, in the java implementation is |
| /// sorted in the recognizer exception stack in the C version. To 'throw' it we set the |
| /// error flag and rules cascade back when this is set. |
| /// |
| const UnitType* recoverFromMismatchedToken( ANTLR_UINT32 ttype, BitsetListType* follow); |
| |
| /** Function that recovers from a mismatched set in the token stream, in a similar manner |
| * to (*recoverFromMismatchedToken) |
| */ |
| const UnitType* recoverFromMismatchedSet(BitsetListType* follow); |
| |
| /** common routine to handle single token insertion for recovery functions. |
| */ |
| /// This code is factored out from mismatched token and mismatched set |
| /// recovery. It handles "single token insertion" error recovery for |
| /// both. No tokens are consumed to recover from insertions. Return |
| /// true if recovery was possible else return false. |
| /// |
| bool recoverFromMismatchedElement(BitsetListType* follow); |
| |
| /** function that consumes input until the next token matches |
| * the given token. |
| */ |
| void consumeUntil(ANTLR_UINT32 tokenType); |
| |
| /** function that consumes input until the next token matches |
| * one in the given set. |
| */ |
| void consumeUntilSet(BitsetType* set); |
| |
| /** function that returns an ANTLR3_LIST of the strings that identify |
| * the rules in the parser that got you to this point. Can be overridden by installing your |
| * own function set. |
| * |
| * \todo Document how to override invocation stack functions. |
| */ |
| StringStackType getRuleInvocationStack(); |
| StringStackType getRuleInvocationStackNamed(ANTLR_UINT8* name); |
| |
| /** function that converts an ANLR3_LIST of tokens to an ANTLR3_LIST of |
| * string token names. As this is mostly used in string template processing it may not be useful |
| * in the C runtime. |
| */ |
| StringListType toStrings( const StringListType& ); |
| |
| /** function to return whether the rule has parsed input starting at the supplied |
| * start index before. If the rule has not parsed input starting from the supplied start index, |
| * then it will return ANTLR3_MEMO_RULE_UNKNOWN. If it has parsed from the suppled start point |
| * then it will return the point where it last stopped parsing after that start point. |
| */ |
| ANTLR_MARKER getRuleMemoization( ANTLR_INTKEY ruleIndex, |
| ANTLR_MARKER ruleParseStart); |
| |
| /** function that determines whether the rule has parsed input at the current index |
| * in the input stream |
| */ |
| bool alreadyParsedRule(ANTLR_MARKER ruleIndex); |
| |
| /** Function that records whether the rule has parsed the input at a |
| * current position successfully or not. |
| */ |
| void memoize(ANTLR_MARKER ruleIndex, |
| ANTLR_MARKER ruleParseStart); |
| |
| /// Function that returns the current input symbol. |
| /// The is placed into any label for the associated token ref; e.g., x=ID. Token |
| /// and tree parsers need to return different objects. Rather than test |
| /// for input stream type or change the IntStream interface, I use |
| /// a simple method to ask the recognizer to tell me what the current |
| /// input symbol is. |
| /// |
| /// This is ignored for lexers and the lexer implementation of this |
| /// function should return NULL. |
| /// |
| const UnitType* getCurrentInputSymbol(IntStreamType* istream); |
| const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<LexerType>); |
| const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<ParserType>); |
| const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<TreeParserType>); |
| |
| /// Conjure up a missing token during error recovery. |
| /// |
| /// The recognizer attempts to recover from single missing |
| /// symbols. But, actions might refer to that missing symbol. |
| /// For example, x=ID {f($x);}. The action clearly assumes |
| /// that there has been an identifier matched previously and that |
| /// $x points at that token. If that token is missing, but |
| /// the next token in the stream is what we want we assume that |
| /// this token is missing and we keep going. Because we |
| /// have to return some token to replace the missing token, |
| /// we have to conjure one up. This method gives the user control |
| /// over the tokens returned for missing tokens. Mostly, |
| /// you will want to create something special for identifier |
| /// tokens. For literals such as '{' and ',', the default |
| /// action in the parser or tree parser works. It simply creates |
| /// a CommonToken of the appropriate type. The text will be the token. |
| /// If you change what tokens must be created by the lexer, |
| /// override this method to create the appropriate tokens. |
| /// |
| UnitType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e, |
| ANTLR_UINT32 expectedTokenType, |
| BitsetListType* follow); |
| |
| /** Function that returns whether the supplied grammar function |
| * will parse the current input stream or not. This is the way that syntactic |
| * predicates are evaluated. Unlike java, C is perfectly happy to invoke code |
| * via a pointer to a function (hence that's what all the ANTLR3 C interfaces |
| * do. |
| */ |
| template<typename Predicate> |
| bool synpred( ClassForwarder<Predicate> ); |
| |
| //In place of exConstruct, just directly instantiate the Exception Object |
| |
| /** Reset the recognizer |
| */ |
| void reset(); |
| void reset( ClassForwarder<LexerType> ); |
| template<typename CompType> |
| void reset( ClassForwarder<CompType> ); |
| |
| void exConstruct(); |
| |
| ~BaseRecognizer(); |
| |
| }; |
| |
| ANTLR_END_NAMESPACE() |
| |
| #include "antlr3baserecognizer.inl" |
| |
| /// @} |
| /// |
| |
| #endif /* _ANTLR3_BASERECOGNIZER_H */ |
| |