src/llvm-project/clang-tools-extra/pseudo/include/clang-pseudo/GLR.h - toolchain/rustc - Git at Google

 //===--- GLR.h - Implement a GLR parsing algorithm ---------------*- C++-*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This implements a standard Generalized LR (GLR) parsing algorithm.
 //
 // The GLR parser behaves as a normal LR parser until it encounters a conflict.
 // To handle a conflict (where there are multiple actions could perform), the
 // parser will simulate nondeterminism by doing a breadth-first search
 // over all the possibilities.
 //
 // Basic mechanisims of the GLR parser:
 //  - A number of processes are operated in parallel.
 //  - Each process has its own parsing stack and behaves as a standard
 //    determinism LR parser.
 //  - When a process encounters a conflict, it will be fork (one for each
 //    avaiable action).
 //  - When a process encounters an error, it is abandoned.
 //  - All process are synchronized by the lookahead token: they perfrom shift
 //    action at the same time, which means some processes need wait until other
 //    processes have performed all reduce actions.
 //
 //===----------------------------------------------------------------------===//

 #ifndef CLANG_PSEUDO_GLR_H
 #define CLANG_PSEUDO_GLR_H

 #include "clang-pseudo/Forest.h"
 #include "clang-pseudo/Language.h"
 #include "clang-pseudo/grammar/Grammar.h"
 #include "clang-pseudo/grammar/LRTable.h"
 #include "llvm/Support/Allocator.h"
 #include <vector>

 namespace clang {
 namespace pseudo {

 // A Graph-Structured Stack efficiently represents all parse stacks of a GLR
 // parser.
 //
 // Each node stores a parse state, the last parsed ForestNode, and the parent
 // node. There may be several heads (top of stack), and the parser operates by:
 // - shift: pushing terminal symbols on top of the stack
 // - reduce: replace N symbols on top of the stack with one nonterminal
 //
 // The structure is a DAG rather than a linear stack:
 // - GLR allows multiple actions (conflicts) on the same head, producing forks
 //   where several nodes have the same parent
 // - The parser merges nodes with the same (state, ForestNode), producing joins
 //   where one node has multiple parents
 //
 // The parser is responsible for creating nodes and keeping track of the set of
 // heads. The GSS class is mostly an arena for them.
 struct GSS {
   // A node represents a partial parse of the input up to some point.
   //
   // It is the equivalent of a frame in an LR parse stack.
   // Like such a frame, it has an LR parse state and a syntax-tree node
   // representing the last parsed symbol (a ForestNode in our case).
   // Unlike a regular LR stack frame, it may have multiple parents.
   //
   // Nodes are not exactly pushed and popped on the stack: pushing is just
   // allocating a new head node with a parent pointer to the old head. Popping
   // is just forgetting about a node and remembering its parent instead.
   struct alignas(struct Node *) Node {
     // LR state describing how parsing should continue from this head.
     LRTable::StateID State;
     // Used internally to track reachability during garbage collection.
     bool GCParity;
     // Have we already used this node for error recovery? (prevents loops)
     mutable bool Recovered = false;
     // Number of the parents of this node.
     // The parents hold previous parsed symbols, and may resume control after
     // this node is reduced.
     unsigned ParentCount;
     // The parse node for the last parsed symbol.
     // This symbol appears on the left of the dot in the parse state's items.
     // (In the literature, the node is attached to the *edge* to the parent).
     const ForestNode *Payload = nullptr;

     llvm::ArrayRef<const Node *> parents() const {
       return llvm::ArrayRef(reinterpret_cast<const Node *const *>(this + 1),
                             ParentCount);
     };
     // Parents are stored as a trailing array of Node*.
   };

   // Allocates a new node in the graph.
   const Node *addNode(LRTable::StateID State, const ForestNode *Symbol,
                       llvm::ArrayRef<const Node *> Parents);
   // Frees all nodes not reachable as ancestors of Roots, and returns the count.
   // Calling this periodically prevents steady memory growth of the GSS.
   unsigned gc(std::vector<const Node *> &&Roots);

   size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
   size_t nodesCreated() const { return NodesCreated; }

 private:
   // Nodes are recycled using freelists.
   // They are variable size, so use one free-list per distinct #parents.
   std::vector<std::vector<Node *>> FreeList;
   Node *allocate(unsigned Parents);
   void destroy(Node *N);
   // The list of nodes created and not destroyed - our candidates for gc().
   std::vector<Node *> Alive;
   bool GCParity = false; // All nodes should match this, except during GC.

   llvm::BumpPtrAllocator Arena;
   unsigned NodesCreated = 0;
 };
 llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &);

 // Parameters for the GLR parsing.
 struct ParseParams {
   // The token stream to parse.
   const TokenStream &Code;

   // Arena for data structure used by the GLR algorithm.
   ForestArena &Forest;  // Storage for the output forest.
   GSS &GSStack;         // Storage for parsing stacks.
 };

 // Parses the given token stream as the start symbol with the GLR algorithm,
 // and returns a forest node of the start symbol.
 //
 // A rule `_ := StartSymbol` must exit for the chosen start symbol.
 //
 // If the parsing fails, we model it as an opaque node in the forest.
 ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
                      const Language &Lang);

 // Shift a token onto all OldHeads, placing the results into NewHeads.
 //
 // Exposed for testing only.
 void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
               const ForestNode &NextTok, const ParseParams &Params,
               const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
 // Applies available reductions on Heads, appending resulting heads to the list.
 //
 // Exposed for testing only.
 void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
                const ParseParams &Params, const Language &Lang);

 // Heuristically recover from a state where no further parsing is possible.
 //
 // OldHeads is the parse state at TokenIndex.
 // This function consumes zero or more tokens by advancing TokenIndex,
 // and places any recovery states created in NewHeads.
 //
 // On failure, NewHeads is empty and TokenIndex is unchanged.
 //
 // WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
 // there is a risk of the parser falling into an infinite loop, creating an
 // endless sequence of recovery nodes.
 // Generally it is safe for recovery to match 0 tokens against sequence symbols
 // like `statement-seq`, as the grammar won't permit another statement-seq
 // immediately afterwards. However recovery strategies for `statement` should
 // consume at least one token, as statements may be adjacent in the input.
 void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
                 unsigned &TokenIndex, const ParseParams &Params,
                 const Language &Lang, std::vector<const GSS::Node *> &NewHeads);

 } // namespace pseudo
 } // namespace clang

 #endif // CLANG_PSEUDO_GLR_H
	//===--- GLR.h - Implement a GLR parsing algorithm ---------------- C++--===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This implements a standard Generalized LR (GLR) parsing algorithm.
	//
	// The GLR parser behaves as a normal LR parser until it encounters a conflict.
	// To handle a conflict (where there are multiple actions could perform), the
	// parser will simulate nondeterminism by doing a breadth-first search
	// over all the possibilities.
	//
	// Basic mechanisims of the GLR parser:
	// - A number of processes are operated in parallel.
	// - Each process has its own parsing stack and behaves as a standard
	// determinism LR parser.
	// - When a process encounters a conflict, it will be fork (one for each
	// avaiable action).
	// - When a process encounters an error, it is abandoned.
	// - All process are synchronized by the lookahead token: they perfrom shift
	// action at the same time, which means some processes need wait until other
	// processes have performed all reduce actions.
	//
	//===----------------------------------------------------------------------===//

	#ifndef CLANG_PSEUDO_GLR_H
	#define CLANG_PSEUDO_GLR_H

	#include "clang-pseudo/Forest.h"
	#include "clang-pseudo/Language.h"
	#include "clang-pseudo/grammar/Grammar.h"
	#include "clang-pseudo/grammar/LRTable.h"
	#include "llvm/Support/Allocator.h"
	#include <vector>

	namespace clang {
	namespace pseudo {

	// A Graph-Structured Stack efficiently represents all parse stacks of a GLR
	// parser.
	//
	// Each node stores a parse state, the last parsed ForestNode, and the parent
	// node. There may be several heads (top of stack), and the parser operates by:
	// - shift: pushing terminal symbols on top of the stack
	// - reduce: replace N symbols on top of the stack with one nonterminal
	//
	// The structure is a DAG rather than a linear stack:
	// - GLR allows multiple actions (conflicts) on the same head, producing forks
	// where several nodes have the same parent
	// - The parser merges nodes with the same (state, ForestNode), producing joins
	// where one node has multiple parents
	//
	// The parser is responsible for creating nodes and keeping track of the set of
	// heads. The GSS class is mostly an arena for them.
	struct GSS {
	// A node represents a partial parse of the input up to some point.
	//
	// It is the equivalent of a frame in an LR parse stack.
	// Like such a frame, it has an LR parse state and a syntax-tree node
	// representing the last parsed symbol (a ForestNode in our case).
	// Unlike a regular LR stack frame, it may have multiple parents.
	//
	// Nodes are not exactly pushed and popped on the stack: pushing is just
	// allocating a new head node with a parent pointer to the old head. Popping
	// is just forgetting about a node and remembering its parent instead.
	struct alignas(struct Node *) Node {
	// LR state describing how parsing should continue from this head.
	LRTable::StateID State;
	// Used internally to track reachability during garbage collection.
	bool GCParity;
	// Have we already used this node for error recovery? (prevents loops)
	mutable bool Recovered = false;
	// Number of the parents of this node.
	// The parents hold previous parsed symbols, and may resume control after
	// this node is reduced.
	unsigned ParentCount;
	// The parse node for the last parsed symbol.
	// This symbol appears on the left of the dot in the parse state's items.
	// (In the literature, the node is attached to the edge to the parent).
	const ForestNode *Payload = nullptr;

	llvm::ArrayRef<const Node *> parents() const {
	return llvm::ArrayRef(reinterpret_cast<const Node const >(this + 1),
	ParentCount);
	};
	// Parents are stored as a trailing array of Node*.
	};

	// Allocates a new node in the graph.
	const Node addNode(LRTable::StateID State, const ForestNode Symbol,
	llvm::ArrayRef<const Node *> Parents);
	// Frees all nodes not reachable as ancestors of Roots, and returns the count.
	// Calling this periodically prevents steady memory growth of the GSS.
	unsigned gc(std::vector<const Node *> &&Roots);

	size_t bytes() const { return Arena.getTotalMemory() + sizeof(*this); }
	size_t nodesCreated() const { return NodesCreated; }

	private:
	// Nodes are recycled using freelists.
	// They are variable size, so use one free-list per distinct #parents.
	std::vector<std::vector<Node *>> FreeList;
	Node *allocate(unsigned Parents);
	void destroy(Node *N);
	// The list of nodes created and not destroyed - our candidates for gc().
	std::vector<Node *> Alive;
	bool GCParity = false; // All nodes should match this, except during GC.

	llvm::BumpPtrAllocator Arena;
	unsigned NodesCreated = 0;
	};
	llvm::raw_ostream &operator<<(llvm::raw_ostream &, const GSS::Node &);

	// Parameters for the GLR parsing.
	struct ParseParams {
	// The token stream to parse.
	const TokenStream &Code;

	// Arena for data structure used by the GLR algorithm.
	ForestArena &Forest; // Storage for the output forest.
	GSS &GSStack; // Storage for parsing stacks.
	};

	// Parses the given token stream as the start symbol with the GLR algorithm,
	// and returns a forest node of the start symbol.
	//
	// A rule `_ := StartSymbol` must exit for the chosen start symbol.
	//
	// If the parsing fails, we model it as an opaque node in the forest.
	ForestNode &glrParse(const ParseParams &Params, SymbolID StartSymbol,
	const Language &Lang);

	// Shift a token onto all OldHeads, placing the results into NewHeads.
	//
	// Exposed for testing only.
	void glrShift(llvm::ArrayRef<const GSS::Node *> OldHeads,
	const ForestNode &NextTok, const ParseParams &Params,
	const Language &Lang, std::vector<const GSS::Node *> &NewHeads);
	// Applies available reductions on Heads, appending resulting heads to the list.
	//
	// Exposed for testing only.
	void glrReduce(std::vector<const GSS::Node *> &Heads, SymbolID Lookahead,
	const ParseParams &Params, const Language &Lang);

	// Heuristically recover from a state where no further parsing is possible.
	//
	// OldHeads is the parse state at TokenIndex.
	// This function consumes zero or more tokens by advancing TokenIndex,
	// and places any recovery states created in NewHeads.
	//
	// On failure, NewHeads is empty and TokenIndex is unchanged.
	//
	// WARNING: glrRecover acts as a "fallback shift". If it consumes no tokens,
	// there is a risk of the parser falling into an infinite loop, creating an
	// endless sequence of recovery nodes.
	// Generally it is safe for recovery to match 0 tokens against sequence symbols
	// like `statement-seq`, as the grammar won't permit another statement-seq
	// immediately afterwards. However recovery strategies for `statement` should
	// consume at least one token, as statements may be adjacent in the input.
	void glrRecover(llvm::ArrayRef<const GSS::Node *> OldHeads,
	unsigned &TokenIndex, const ParseParams &Params,
	const Language &Lang, std::vector<const GSS::Node *> &NewHeads);

	} // namespace pseudo
	} // namespace clang

	#endif // CLANG_PSEUDO_GLR_H