mirror of
https://github.com/rust-lang/rust.git
synced 2024-11-24 07:44:10 +00:00
Document how the parsing works
This commit is contained in:
parent
9e2c056478
commit
59087840f5
@ -33,19 +33,22 @@ The centerpiece of this whole endeavor is the syntax tree, in the
|
||||
|
||||
The syntax tree is produced using a three-staged process.
|
||||
|
||||
First, a raw text is split into tokens with a lexer. Lexer has a
|
||||
peculiar signature: it is an `Fn(&str) -> Token`, where token is a
|
||||
pair of `SyntaxKind` (you should have read the `tree` module and RFC
|
||||
First, a raw text is split into tokens with a lexer (the `lexer` module).
|
||||
Lexer has a peculiar signature: it is an `Fn(&str) -> Token`, where token
|
||||
is a pair of `SyntaxKind` (you should have read the `tree` module and RFC
|
||||
by this time! :)) and a len. That is, lexer chomps only the first
|
||||
token of the input. This forces the lexer to be stateless, and makes
|
||||
it possible to implement incremental relexing easily.
|
||||
|
||||
Then, the bulk of work, the parser turns a stream of tokens into a
|
||||
stream of events. Not that parser **does not** construct a tree right
|
||||
away. This is done for several reasons:
|
||||
stream of events (the `parser` module; of particular interest are
|
||||
the `parser/event` and `parser/parser` modules, which contain parsing
|
||||
API, and the `parser/grammar` module, which contains actual parsing code
|
||||
for various Rust syntactic constructs). Note that the parser **does not**
|
||||
construct a tree right away. This is done for several reasons:
|
||||
|
||||
* to decouple the actual tree data structure from the parser: you can
|
||||
build any datastructre you want from the stream of events
|
||||
build any data structure you want from the stream of events
|
||||
|
||||
* to make parsing fast: you can produce a list of events without
|
||||
allocations
|
||||
@ -77,12 +80,6 @@ And at last, the TreeBuilder converts a flat stream of events into a
|
||||
tree structure. It also *should* be responsible for attaching comments
|
||||
and rebalancing the tree, but it does not do this yet :)
|
||||
|
||||
|
||||
## Error reporing
|
||||
|
||||
TODO: describe how stuff like `skip_to_first` works
|
||||
|
||||
|
||||
## Validator
|
||||
|
||||
Parser and lexer accept a lot of *invalid* code intentionally. The
|
||||
|
@ -42,7 +42,7 @@ pub(crate) enum Event {
|
||||
/// |
|
||||
/// 'foo'
|
||||
///
|
||||
/// See also `CompleteMarker::precede`.
|
||||
/// See also `CompletedMarker::precede`.
|
||||
Start {
|
||||
kind: SyntaxKind,
|
||||
forward_parent: Option<u32>,
|
||||
|
@ -1,4 +1,27 @@
|
||||
use parser::parser::{Parser};
|
||||
//! This is the actual "grammar" of the Rust language.
|
||||
//!
|
||||
//! Each function in this module and its children corresponds
|
||||
//! to a production of the formal grammar. Submodules roughly
|
||||
//! correspond to different *areas* of the grammar. By convention,
|
||||
//! each submodule starts with `use super::*` import and exports
|
||||
//! "public" productions via `pub(super)`.
|
||||
//!
|
||||
//! See docs for `Parser` to learn about the API available to the grammar,
|
||||
//! and see docs for `Event` to learn how this actually manages to
|
||||
//! produce parse trees.
|
||||
//!
|
||||
//! Code in this module also contains inline tests, which start with
|
||||
//! `// test name-of-the-test` comment and look like this:
|
||||
//!
|
||||
//! ```
|
||||
//! // test fn_item_with_zero_parameters
|
||||
//! // fn foo() {}
|
||||
//! ```
|
||||
//!
|
||||
//! After adding a new inline-test, run `cargo collect-tests` to extract
|
||||
//! it as a standalone text-fixture into `tests/data/parser/inline`, and
|
||||
//! run `cargo test` once to create the "gold" value.
|
||||
use parser::parser::Parser;
|
||||
use parser::token_set::TokenSet;
|
||||
use SyntaxKind;
|
||||
use syntax_kinds::*;
|
||||
|
@ -4,6 +4,9 @@ use parser::event::Event;
|
||||
use SyntaxKind;
|
||||
use syntax_kinds::{TOMBSTONE, EOF};
|
||||
|
||||
/// Implementation details of `Parser`, extracted
|
||||
/// to a separate struct in order not to pollute
|
||||
/// the public API of the `Parser`.
|
||||
pub(crate) struct ParserImpl<'t> {
|
||||
inp: &'t ParserInput<'t>,
|
||||
|
||||
|
@ -4,51 +4,72 @@ use syntax_kinds::ERROR;
|
||||
pub(super) mod imp;
|
||||
use self::imp::ParserImpl;
|
||||
|
||||
/// `Parser` struct provides the low-level API for
|
||||
/// navigating through the stream of tokens and
|
||||
/// constructing the parse tree. The actual parsing
|
||||
/// happens in the `grammar` module.
|
||||
///
|
||||
/// However, the result of this `Parser` is not a real
|
||||
/// tree, but rather a flat stream of events of the form
|
||||
/// "start expression, consume number literal,
|
||||
/// finish expression". See `Event` docs for more.
|
||||
pub(crate) struct Parser<'t>(pub(super) ParserImpl<'t>);
|
||||
|
||||
|
||||
impl<'t> Parser<'t> {
|
||||
/// Returns the kind of the current token.
|
||||
/// If parser has already reached the end of input,
|
||||
/// the special `EOF` kind is returned.
|
||||
pub(crate) fn current(&self) -> SyntaxKind {
|
||||
self.nth(0)
|
||||
}
|
||||
|
||||
/// Lookahead operation: returns the kind of the next nth
|
||||
/// token.
|
||||
pub(crate) fn nth(&self, n: u32) -> SyntaxKind {
|
||||
self.0.nth(n)
|
||||
}
|
||||
|
||||
/// Checks if the current token is `kind`.
|
||||
pub(crate) fn at(&self, kind: SyntaxKind) -> bool {
|
||||
self.current() == kind
|
||||
}
|
||||
|
||||
pub(crate) fn at_kw(&self, t: &str) -> bool {
|
||||
/// Checks if the current token is a contextual keyword with text `t`.
|
||||
pub(crate) fn at_contextual_kw(&self, t: &str) -> bool {
|
||||
self.0.at_kw(t)
|
||||
}
|
||||
|
||||
/// Starts a new node in the syntax tree. All nodes and tokens
|
||||
/// consumed between the `start` and the corresponding `Marker::complete`
|
||||
/// belong to the same node.
|
||||
pub(crate) fn start(&mut self) -> Marker {
|
||||
Marker(self.0.start())
|
||||
}
|
||||
|
||||
/// Advances the parser by one token.
|
||||
pub(crate) fn bump(&mut self) {
|
||||
self.0.bump();
|
||||
}
|
||||
|
||||
/// Advances the parser by one token, remapping its kind.
|
||||
/// This is useful to create contextual keywords from
|
||||
/// identifiers. For example, the lexer creates a `union`
|
||||
/// *identifier* token, but the parser remaps it to the
|
||||
/// `union` keyword, and keyword is what ends up in the
|
||||
/// final tree.
|
||||
pub(crate) fn bump_remap(&mut self, kind: SyntaxKind) {
|
||||
self.0.bump_remap(kind);
|
||||
}
|
||||
|
||||
/// Emit error with the `message`
|
||||
/// TODO: this should be much more fancy and support
|
||||
/// structured errors with spans and notes, like rustc
|
||||
/// does.
|
||||
pub(crate) fn error<T: Into<String>>(&mut self, message: T) {
|
||||
self.0.error(message.into())
|
||||
}
|
||||
|
||||
pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool {
|
||||
if self.at(kind) {
|
||||
self.bump();
|
||||
return true;
|
||||
}
|
||||
self.error(format!("expected {:?}", kind));
|
||||
false
|
||||
}
|
||||
|
||||
/// Consume the next token if it is `kind`.
|
||||
pub(crate) fn eat(&mut self, kind: SyntaxKind) -> bool {
|
||||
if !self.at(kind) {
|
||||
return false;
|
||||
@ -57,6 +78,17 @@ impl<'t> Parser<'t> {
|
||||
true
|
||||
}
|
||||
|
||||
/// Consume the next token if it is `kind` or emit an error
|
||||
/// otherwise.
|
||||
pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool {
|
||||
if self.eat(kind) {
|
||||
return true;
|
||||
}
|
||||
self.error(format!("expected {:?}", kind));
|
||||
false
|
||||
}
|
||||
|
||||
/// Create an error node and consume the next token.
|
||||
pub(crate) fn err_and_bump(&mut self, message: &str) {
|
||||
let m = self.start();
|
||||
self.error(message);
|
||||
@ -65,9 +97,11 @@ impl<'t> Parser<'t> {
|
||||
}
|
||||
}
|
||||
|
||||
/// See `Parser::start`.
|
||||
pub(crate) struct Marker(u32);
|
||||
|
||||
impl Marker {
|
||||
/// Finishes the syntax tree node and assigns `kind` to it.
|
||||
pub(crate) fn complete(self, p: &mut Parser, kind: SyntaxKind) -> CompletedMarker {
|
||||
let pos = self.0;
|
||||
::std::mem::forget(self);
|
||||
@ -75,6 +109,8 @@ impl Marker {
|
||||
CompletedMarker(pos)
|
||||
}
|
||||
|
||||
/// Abandons the syntax tree node. All its children
|
||||
/// are attached to its parent instead.
|
||||
pub(crate) fn abandon(self, p: &mut Parser) {
|
||||
let pos = self.0;
|
||||
::std::mem::forget(self);
|
||||
@ -94,6 +130,13 @@ impl Drop for Marker {
|
||||
pub(crate) struct CompletedMarker(u32);
|
||||
|
||||
impl CompletedMarker {
|
||||
/// This one is tricky :-)
|
||||
/// This method allows creating a new node which starts
|
||||
/// *before* the current one. That is, parser could start
|
||||
/// node `A`, then complete it, and then after parsing the
|
||||
/// whole `A`, decide that it should have started some node
|
||||
/// `B` before starting `A`. `precede` allows doing exactly
|
||||
/// that. See also docs about `forward_parent` in `Event::Start`.
|
||||
pub(crate) fn precede(self, p: &mut Parser) -> Marker {
|
||||
Marker(p.0.precede(self.0))
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user