From 21699c41af325786cbda1dba2379710f98613c68 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 30 Mar 2022 11:21:32 +1100 Subject: [PATCH 1/5] Simplify exit of `Delimited` submatchers. Currently, we detect an exit from a `Delimited` submatcher when `idx` exceeds the bounds of the current submatcher *and* there is a `stack` entry. This commit changes it to something simpler: just look for a `CloseDelim` token. --- compiler/rustc_expand/src/mbe/macro_parser.rs | 30 ++++++++----------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/compiler/rustc_expand/src/mbe/macro_parser.rs b/compiler/rustc_expand/src/mbe/macro_parser.rs index 18302ffdb8e..1f86223634f 100644 --- a/compiler/rustc_expand/src/mbe/macro_parser.rs +++ b/compiler/rustc_expand/src/mbe/macro_parser.rs @@ -75,7 +75,7 @@ crate use ParseResult::*; use crate::mbe::{self, SequenceRepetition, TokenTree}; -use rustc_ast::token::{self, DocComment, Nonterminal, Token}; +use rustc_ast::token::{self, DocComment, Nonterminal, Token, TokenKind}; use rustc_parse::parser::{NtOrTt, Parser}; use rustc_session::parse::ParseSess; use rustc_span::symbol::MacroRulesNormalizedIdent; @@ -454,18 +454,6 @@ impl<'tt> TtParser<'tt> { let mut eof_mps = EofMatcherPositions::None; while let Some(mut mp) = self.cur_mps.pop() { - // Backtrack out of delimited submatcher when necessary. When backtracking out again, - // we need to advance the "dot" past the delimiters in the parent matcher(s). - while mp.idx >= mp.tts.len() { - match mp.stack.pop() { - Some(MatcherPosFrame { tts, idx }) => { - mp.tts = tts; - mp.idx = idx + 1; - } - None => break, - } - } - // Get the current position of the "dot" (`idx`) in `mp` and the number of token // trees in the matcher (`len`). let idx = mp.idx; @@ -516,11 +504,10 @@ impl<'tt> TtParser<'tt> { TokenTree::Delimited(_, delimited) => { // To descend into a delimited submatcher, we push the current matcher onto - // a stack and push a new mp containing the submatcher onto `cur_mps`. - // - // At the beginning of the loop, if we reach the end of the delimited - // submatcher, we pop the stack to backtrack out of the descent. Note that - // we use `all_tts` to include the open and close delimiter tokens. + // a stack and push a new mp containing the submatcher onto `cur_mps`. When + // we reach the closing delimiter, we will pop the stack to backtrack out + // of the descent. Note that we use `all_tts` to include the open and close + // delimiter tokens. let tts = mem::replace(&mut mp.tts, &delimited.all_tts); let idx = mp.idx; mp.stack.push(MatcherPosFrame { tts, idx }); @@ -542,6 +529,13 @@ impl<'tt> TtParser<'tt> { mp.idx += 1; self.cur_mps.push(mp); } else if token_name_eq(&t, token) { + if let TokenKind::CloseDelim(_) = token.kind { + // Ascend out of the delimited submatcher. + debug_assert_eq!(idx, len - 1); + let frame = mp.stack.pop().unwrap(); + mp.tts = frame.tts; + mp.idx = frame.idx; + } mp.idx += 1; self.next_mps.push(mp); } From 2e423c7fd06223a49e3f85636c620a7a70fbc221 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 30 Mar 2022 16:30:35 +1100 Subject: [PATCH 2/5] Remove `MatcherPos::match_lo`. It's redundant w.r.t. other fields. --- compiler/rustc_expand/src/mbe/macro_parser.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/compiler/rustc_expand/src/mbe/macro_parser.rs b/compiler/rustc_expand/src/mbe/macro_parser.rs index 1f86223634f..fad13d5350e 100644 --- a/compiler/rustc_expand/src/mbe/macro_parser.rs +++ b/compiler/rustc_expand/src/mbe/macro_parser.rs @@ -134,10 +134,6 @@ struct MatcherPos<'tt> { /// The number of sequences this mp is within. seq_depth: usize, - /// The position in `matches` of the first metavar in this (sub)matcher. Zero if there are - /// no metavars. - match_lo: usize, - /// The position in `matches` of the next metavar to be matched against the source token /// stream. Should not be used if there are no metavars. match_cur: usize, @@ -152,7 +148,7 @@ struct MatcherPos<'tt> { // This type is used a lot. Make sure it doesn't unintentionally get bigger. #[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))] -rustc_data_structures::static_assert_size!(MatcherPos<'_>, 104); +rustc_data_structures::static_assert_size!(MatcherPos<'_>, 96); impl<'tt> MatcherPos<'tt> { fn top_level(matcher: &'tt [TokenTree], empty_matches: Lrc) -> Self { @@ -161,7 +157,6 @@ impl<'tt> MatcherPos<'tt> { idx: 0, matches: empty_matches, seq_depth: 0, - match_lo: 0, match_cur: 0, stack: smallvec![], sequence: None, @@ -178,14 +173,13 @@ impl<'tt> MatcherPos<'tt> { idx: 0, matches: parent.matches.clone(), seq_depth: parent.seq_depth, - match_lo: parent.match_cur, match_cur: parent.match_cur, sequence: Some(MatcherPosSequence { parent, seq }), stack: smallvec![], }; // Start with an empty vec for each metavar within the sequence. Note that `mp.seq_depth` // must have the parent's depth at this point for these `push_match` calls to work. - for idx in mp.match_lo..mp.match_lo + seq.num_captures { + for idx in mp.match_cur..mp.match_cur + seq.num_captures { mp.push_match(idx, MatchedSeq(empty_matches.clone())); } mp.seq_depth += 1; @@ -554,7 +548,7 @@ impl<'tt> TtParser<'tt> { // is finished. let mut new_mp = sequence.parent.clone(); new_mp.matches = mp.matches.clone(); - new_mp.match_cur = mp.match_lo + sequence.seq.num_captures; + new_mp.match_cur = mp.match_cur; new_mp.idx += 1; self.cur_mps.push(new_mp); } @@ -575,7 +569,7 @@ impl<'tt> TtParser<'tt> { // We don't need a separator. Move the "dot" back to the beginning of the // matcher and try to match again UNLESS we are only allowed to have _one_ // repetition. - mp.match_cur = mp.match_lo; + mp.match_cur -= sequence.seq.num_captures; mp.idx = 0; self.cur_mps.push(mp); } From 048bd67d513d23341431da208aa7ec075d6252b2 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 31 Mar 2022 07:00:07 +1100 Subject: [PATCH 3/5] Clarify `idx` handling in sequences. By adding comments, and improving an assertion. I finally fully understand this part! --- compiler/rustc_expand/src/mbe/macro_parser.rs | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/compiler/rustc_expand/src/mbe/macro_parser.rs b/compiler/rustc_expand/src/mbe/macro_parser.rs index fad13d5350e..3fd81e2d87d 100644 --- a/compiler/rustc_expand/src/mbe/macro_parser.rs +++ b/compiler/rustc_expand/src/mbe/macro_parser.rs @@ -122,7 +122,9 @@ struct MatcherPos<'tt> { /// submatcher, this is just the contents of that submatcher. tts: &'tt [TokenTree], - /// The "dot" position within the current submatcher, i.e. the index into `tts`. + /// The "dot" position within the current submatcher, i.e. the index into `tts`. Can go one or + /// two positions past the final elements in `tts` when dealing with sequences, see + /// `parse_tt_inner` for details. idx: usize, /// This vector ends up with one element per metavar in the *top-level* matcher, even when this @@ -540,12 +542,17 @@ impl<'tt> TtParser<'tt> { } } else if let Some(sequence) = &mp.sequence { // We are past the end of a sequence. - debug_assert!(idx <= len + 1); + // - If it has no separator, we must be only one past the end. + // - If it has a separator, we may be one past the end, in which case we must + // look for a separator. Or we may be two past the end, in which case we have + // already dealt with the separator. + debug_assert!(idx == len || idx == len + 1 && sequence.seq.separator.is_some()); if idx == len { - // Add all matches from the sequence to `parent`, and move the "dot" past the - // sequence in `parent`. This allows for the case where the sequence matching - // is finished. + // Sequence matching may have finished: move the "dot" past the sequence in + // `parent`. This applies whether a separator is used or not. If sequence + // matching hasn't finished, this `new_mp` will fail quietly when it is + // processed next time around the loop. let mut new_mp = sequence.parent.clone(); new_mp.matches = mp.matches.clone(); new_mp.match_cur = mp.match_cur; @@ -553,7 +560,8 @@ impl<'tt> TtParser<'tt> { self.cur_mps.push(new_mp); } - if idx == len && sequence.seq.separator.is_some() { + if sequence.seq.separator.is_some() && idx == len { + // Look for the separator. if sequence .seq .separator @@ -566,9 +574,10 @@ impl<'tt> TtParser<'tt> { self.next_mps.push(mp); } } else if sequence.seq.kleene.op != mbe::KleeneOp::ZeroOrOne { - // We don't need a separator. Move the "dot" back to the beginning of the - // matcher and try to match again UNLESS we are only allowed to have _one_ - // repetition. + // We don't need to look for a separator: either this sequence doesn't have + // one, or it does and we've already handled it. Also, we are allowed to have + // more than one repetition. Move the "dot" back to the beginning of the + // matcher and try to match again. mp.match_cur -= sequence.seq.num_captures; mp.idx = 0; self.cur_mps.push(mp); From f68a0449ed4e49705e2ee70be6e5678aa426b206 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 31 Mar 2022 08:21:36 +1100 Subject: [PATCH 4/5] Remove `MatcherPos::stack`. `parse_tt` needs a way to get from within submatchers make to the enclosing submatchers. Currently it has two distinct mechanisms for this: - `Delimited` submatchers use `MatcherPos::stack` to record stuff about the parent (and further back ancestors). - `Sequence` submatchers use `MatcherPosSequence::parent` to point to the parent matcher position. Having two mechanisms is really confusing, and it took me a long time to understand all this. This commit eliminates `MatcherPos::stack`, and changes `Delimited` submatchers to use the same mechanism as sequence submatchers. That mechanism is also changed a bit: instead of storing the entire parent `MatcherPos`, we now only store the necessary parts from the parent `MatcherPos`. Overall this is a small performance win, with the positives outweighing the negatives, but it's mostly for clarity. --- compiler/rustc_expand/src/lib.rs | 1 + compiler/rustc_expand/src/mbe/macro_parser.rs | 135 ++++++++++-------- 2 files changed, 74 insertions(+), 62 deletions(-) diff --git a/compiler/rustc_expand/src/lib.rs b/compiler/rustc_expand/src/lib.rs index 791ba560607..f5a93905b82 100644 --- a/compiler/rustc_expand/src/lib.rs +++ b/compiler/rustc_expand/src/lib.rs @@ -1,5 +1,6 @@ #![feature(associated_type_bounds)] #![feature(associated_type_defaults)] +#![feature(box_patterns)] #![feature(box_syntax)] #![feature(crate_visibility_modifier)] #![feature(decl_macro)] diff --git a/compiler/rustc_expand/src/mbe/macro_parser.rs b/compiler/rustc_expand/src/mbe/macro_parser.rs index 3fd81e2d87d..0e05440493a 100644 --- a/compiler/rustc_expand/src/mbe/macro_parser.rs +++ b/compiler/rustc_expand/src/mbe/macro_parser.rs @@ -87,17 +87,6 @@ use rustc_data_structures::sync::Lrc; use rustc_span::symbol::Ident; use std::borrow::Cow; use std::collections::hash_map::Entry::{Occupied, Vacant}; -use std::mem; - -/// This is used by `parse_tt_inner` to keep track of delimited submatchers that we have -/// descended into. -#[derive(Clone)] -struct MatcherPosFrame<'tt> { - /// The "parent" matcher that we have descended from. - tts: &'tt [TokenTree], - /// The position of the "dot" in `tt` at the time we descended. - idx: usize, -} // One element is enough to cover 95-99% of vectors for most benchmarks. Also, // vectors longer than one frequently have many elements, not just two or @@ -108,6 +97,33 @@ type NamedMatchVec = SmallVec<[NamedMatch; 1]>; #[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))] rustc_data_structures::static_assert_size!(NamedMatchVec, 48); +#[derive(Clone)] +enum MatcherKind<'tt> { + TopLevel, + Delimited(Box>), + Sequence(Box>), +} + +#[derive(Clone)] +struct DelimitedSubmatcher<'tt> { + parent: Parent<'tt>, +} + +#[derive(Clone)] +struct SequenceSubmatcher<'tt> { + parent: Parent<'tt>, + seq: &'tt SequenceRepetition, +} + +/// Data used to ascend from a submatcher back to its parent matcher. A subset of the fields from +/// `MathcherPos`. +#[derive(Clone)] +struct Parent<'tt> { + tts: &'tt [TokenTree], + idx: usize, + kind: MatcherKind<'tt>, +} + /// A single matcher position, which could be within the top-level matcher, a submatcher, a /// subsubmatcher, etc. For example: /// ```text @@ -140,17 +156,14 @@ struct MatcherPos<'tt> { /// stream. Should not be used if there are no metavars. match_cur: usize, - /// This field is only used if we are matching a sequence. - sequence: Option>, - - /// When we are within a `Delimited` submatcher (or subsubmatcher), this tracks the parent - /// matcher(s). The bottom of the stack is the top-level matcher. - stack: SmallVec<[MatcherPosFrame<'tt>; 1]>, + /// What kind of matcher we are in. For submatchers, this contains enough information to + /// reconstitute a `MatcherPos` within the parent once we ascend out of the submatcher. + kind: MatcherKind<'tt>, } // This type is used a lot. Make sure it doesn't unintentionally get bigger. #[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))] -rustc_data_structures::static_assert_size!(MatcherPos<'_>, 96); +rustc_data_structures::static_assert_size!(MatcherPos<'_>, 64); impl<'tt> MatcherPos<'tt> { fn top_level(matcher: &'tt [TokenTree], empty_matches: Lrc) -> Self { @@ -160,24 +173,26 @@ impl<'tt> MatcherPos<'tt> { matches: empty_matches, seq_depth: 0, match_cur: 0, - stack: smallvec![], - sequence: None, + kind: MatcherKind::TopLevel, } } fn sequence( - parent: Box>, + parent_mp: Box>, seq: &'tt SequenceRepetition, empty_matches: Lrc, ) -> Self { + let seq_kind = box SequenceSubmatcher { + parent: Parent { tts: parent_mp.tts, idx: parent_mp.idx, kind: parent_mp.kind }, + seq, + }; let mut mp = MatcherPos { tts: &seq.tts, idx: 0, - matches: parent.matches.clone(), - seq_depth: parent.seq_depth, - match_cur: parent.match_cur, - sequence: Some(MatcherPosSequence { parent, seq }), - stack: smallvec![], + matches: parent_mp.matches, + seq_depth: parent_mp.seq_depth, + match_cur: parent_mp.match_cur, + kind: MatcherKind::Sequence(seq_kind), }; // Start with an empty vec for each metavar within the sequence. Note that `mp.seq_depth` // must have the parent's depth at this point for these `push_match` calls to work. @@ -222,16 +237,6 @@ impl<'tt> MatcherPos<'tt> { } } -#[derive(Clone)] -struct MatcherPosSequence<'tt> { - /// The parent matcher position. Effectively gives a linked list of matches all the way to the - /// top-level matcher. - parent: Box>, - - /// The sequence itself. - seq: &'tt SequenceRepetition, -} - enum EofMatcherPositions<'tt> { None, One(Box>), @@ -499,15 +504,17 @@ impl<'tt> TtParser<'tt> { } TokenTree::Delimited(_, delimited) => { - // To descend into a delimited submatcher, we push the current matcher onto - // a stack and push a new mp containing the submatcher onto `cur_mps`. When - // we reach the closing delimiter, we will pop the stack to backtrack out - // of the descent. Note that we use `all_tts` to include the open and close - // delimiter tokens. - let tts = mem::replace(&mut mp.tts, &delimited.all_tts); - let idx = mp.idx; - mp.stack.push(MatcherPosFrame { tts, idx }); + // To descend into a delimited submatcher, we update `mp` appropriately, + // including enough information to re-ascend afterwards, and push it onto + // `cur_mps`. Later, when we reach the closing delimiter, we will recover + // the parent matcher position to ascend. Note that we use `all_tts` to + // include the open and close delimiter tokens. + let kind = MatcherKind::Delimited(box DelimitedSubmatcher { + parent: Parent { tts: mp.tts, idx: mp.idx, kind: mp.kind }, + }); + mp.tts = &delimited.all_tts; mp.idx = 0; + mp.kind = kind; self.cur_mps.push(mp); } @@ -528,9 +535,14 @@ impl<'tt> TtParser<'tt> { if let TokenKind::CloseDelim(_) = token.kind { // Ascend out of the delimited submatcher. debug_assert_eq!(idx, len - 1); - let frame = mp.stack.pop().unwrap(); - mp.tts = frame.tts; - mp.idx = frame.idx; + match mp.kind { + MatcherKind::Delimited(submatcher) => { + mp.tts = submatcher.parent.tts; + mp.idx = submatcher.parent.idx; + mp.kind = submatcher.parent.kind; + } + _ => unreachable!(), + } } mp.idx += 1; self.next_mps.push(mp); @@ -540,45 +552,44 @@ impl<'tt> TtParser<'tt> { // These cannot appear in a matcher. TokenTree::MetaVar(..) | TokenTree::MetaVarExpr(..) => unreachable!(), } - } else if let Some(sequence) = &mp.sequence { + } else if let MatcherKind::Sequence(box SequenceSubmatcher { parent, seq }) = &mp.kind { // We are past the end of a sequence. // - If it has no separator, we must be only one past the end. // - If it has a separator, we may be one past the end, in which case we must // look for a separator. Or we may be two past the end, in which case we have // already dealt with the separator. - debug_assert!(idx == len || idx == len + 1 && sequence.seq.separator.is_some()); + debug_assert!(idx == len || idx == len + 1 && seq.separator.is_some()); if idx == len { // Sequence matching may have finished: move the "dot" past the sequence in // `parent`. This applies whether a separator is used or not. If sequence // matching hasn't finished, this `new_mp` will fail quietly when it is // processed next time around the loop. - let mut new_mp = sequence.parent.clone(); - new_mp.matches = mp.matches.clone(); - new_mp.match_cur = mp.match_cur; - new_mp.idx += 1; + let new_mp = box MatcherPos { + tts: parent.tts, + idx: parent.idx + 1, + matches: mp.matches.clone(), // a cheap clone + seq_depth: mp.seq_depth - 1, + match_cur: mp.match_cur, + kind: parent.kind.clone(), // an expensive clone + }; self.cur_mps.push(new_mp); } - if sequence.seq.separator.is_some() && idx == len { + if seq.separator.is_some() && idx == len { // Look for the separator. - if sequence - .seq - .separator - .as_ref() - .map_or(false, |sep| token_name_eq(token, sep)) - { + if seq.separator.as_ref().map_or(false, |sep| token_name_eq(token, sep)) { // The matcher has a separator, and it matches the current token. We can // advance past the separator token. mp.idx += 1; self.next_mps.push(mp); } - } else if sequence.seq.kleene.op != mbe::KleeneOp::ZeroOrOne { + } else if seq.kleene.op != mbe::KleeneOp::ZeroOrOne { // We don't need to look for a separator: either this sequence doesn't have // one, or it does and we've already handled it. Also, we are allowed to have // more than one repetition. Move the "dot" back to the beginning of the // matcher and try to match again. - mp.match_cur -= sequence.seq.num_captures; + mp.match_cur -= seq.num_captures; mp.idx = 0; self.cur_mps.push(mp); } From c6fedd4f1006069786a28e2054442c188408e194 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Thu, 31 Mar 2022 13:50:30 +1100 Subject: [PATCH 5/5] Make `MatcherPos` not derive `Clone`. It's only used in one place, and there we clone and then make a bunch of modifications. It's clearer if we duplicate more explicitly, and there's a symmetry now between `sequence()` and `empty_sequence()`. --- compiler/rustc_expand/src/mbe/macro_parser.rs | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/compiler/rustc_expand/src/mbe/macro_parser.rs b/compiler/rustc_expand/src/mbe/macro_parser.rs index 0e05440493a..618a213055e 100644 --- a/compiler/rustc_expand/src/mbe/macro_parser.rs +++ b/compiler/rustc_expand/src/mbe/macro_parser.rs @@ -132,7 +132,6 @@ struct Parent<'tt> { /// <--------------> first submatcher; three tts, zero metavars /// <--------------------------> top-level matcher; two tts, one metavar /// ``` -#[derive(Clone)] struct MatcherPos<'tt> { /// The tokens that make up the current matcher. When we are within a `Sequence` or `Delimited` /// submatcher, this is just the contents of that submatcher. @@ -177,6 +176,25 @@ impl<'tt> MatcherPos<'tt> { } } + fn empty_sequence( + parent_mp: &MatcherPos<'tt>, + seq: &'tt SequenceRepetition, + empty_matches: Lrc, + ) -> Self { + let mut mp = MatcherPos { + tts: parent_mp.tts, + idx: parent_mp.idx + 1, + matches: parent_mp.matches.clone(), // a cheap clone + seq_depth: parent_mp.seq_depth, + match_cur: parent_mp.match_cur + seq.num_captures, + kind: parent_mp.kind.clone(), // an expensive clone + }; + for idx in parent_mp.match_cur..parent_mp.match_cur + seq.num_captures { + mp.push_match(idx, MatchedSeq(empty_matches.clone())); + } + mp + } + fn sequence( parent_mp: Box>, seq: &'tt SequenceRepetition, @@ -468,13 +486,11 @@ impl<'tt> TtParser<'tt> { let op = seq.kleene.op; if op == mbe::KleeneOp::ZeroOrMore || op == mbe::KleeneOp::ZeroOrOne { // Allow for the possibility of zero matches of this sequence. - let mut new_mp = mp.clone(); - new_mp.match_cur += seq.num_captures; - new_mp.idx += 1; - for idx in mp.match_cur..mp.match_cur + seq.num_captures { - new_mp.push_match(idx, MatchedSeq(self.empty_matches.clone())); - } - self.cur_mps.push(new_mp); + self.cur_mps.push(box MatcherPos::empty_sequence( + &*mp, + &seq, + self.empty_matches.clone(), + )); } // Allow for the possibility of one or more matches of this sequence.