From de145c5b06b424368249d8068d618bcadbe16c12 Mon Sep 17 00:00:00 2001 From: Robin Malfait Date: Mon, 10 Mar 2025 19:08:39 +0100 Subject: [PATCH] Refactor: use compile time type state pattern (#17083) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR implements the state machines using the type state pattern at compile time (via generic types) instead of a runtime state variable. There is no runtime check to see what state we are in, instead we transition to the new state when it's necessary. This has some nice performance improvements for some of the state machines, e.g.: ```diff - ArbitraryVariableMachine: Throughput: 744.92 MB/s + ArbitraryVariableMachine: Throughput: 1.21 GB/s ``` We also don't have to store the current state because each machine runs to completion. It's during execution that we can move to a new state if necessary. Unfortunately the diff is a tiny bit annoying to read, but essentially this is what happened: ### The `enum` is split up in it's individual states as structs: ```rs enum State { A, B, C, } ``` Becomes: ```rs struct A; struct B; struct C; ``` ### Generics The current machine will receive a generic `State` that we can default to the `IdleState`. Then we use `PhantomData` to "use" the type because the generic type is otherwise not used as a concrete value, it's just a marker. ```rs struct MyMachine {} ``` Becomes: ```rs struct MyMachine { _state: std::marker::PhantomData } ``` ### Split Next, the `next` function used to match on the current state, but now each match arm is moved to a dedicated implementation instead: ```rs impl Machine for MyMachine { fn next(&mut self) -> MachineState { match self.state { State::A => { /* … */ }, State::B => { /* … */ }, State::C => { /* … */ }, } } } ``` Becomes: ```rs impl Machine for MyMachine { fn next(&mut self) -> MachineState { /* … */ } } impl Machine for MyMachine { fn next(&mut self) -> MachineState { /* … */ } } impl Machine for MyMachine { fn next(&mut self) -> MachineState { /* … */ } } ``` It's a bit more verbose, but now each state is implemented in its own block. This also removes 2 levels of nesting which is a nice benefit. --- .../extractor/arbitrary_property_machine.rs | 333 ++++++----- .../extractor/arbitrary_variable_machine.rs | 432 +++++++------- .../src/extractor/named_utility_machine.rs | 540 +++++++++--------- .../src/extractor/named_variant_machine.rs | 448 ++++++++------- 4 files changed, 911 insertions(+), 842 deletions(-) diff --git a/crates/oxide/src/extractor/arbitrary_property_machine.rs b/crates/oxide/src/extractor/arbitrary_property_machine.rs index 975c3df98..c1460092f 100644 --- a/crates/oxide/src/extractor/arbitrary_property_machine.rs +++ b/crates/oxide/src/extractor/arbitrary_property_machine.rs @@ -4,6 +4,31 @@ use crate::extractor::machine::{Machine, MachineState}; use crate::extractor::string_machine::StringMachine; use crate::extractor::CssVariableMachine; use classification_macros::ClassifyBytes; +use std::marker::PhantomData; + +#[derive(Debug, Default)] +pub struct IdleState; + +/// Parsing the property, e.g.: +/// +/// ```text +/// [color:red] +/// ^^^^^ +/// +/// [--my-color:red] +/// ^^^^^^^^^^ +/// ``` +#[derive(Debug, Default)] +pub struct ParsingPropertyState; + +/// Parsing the value, e.g.: +/// +/// ```text +/// [color:red] +/// ^^^ +/// ``` +#[derive(Debug, Default)] +pub struct ParsingValueState; /// Extracts arbitrary properties from the input, including the brackets. /// @@ -17,182 +42,98 @@ use classification_macros::ClassifyBytes; /// ^^^^^^^^^^^^^^^^ /// ``` #[derive(Debug, Default)] -pub struct ArbitraryPropertyMachine { +pub struct ArbitraryPropertyMachine { /// Start position of the arbitrary value start_pos: usize, /// Track brackets to ensure they are balanced bracket_stack: BracketStack, - /// Current state of the machine - state: State, - css_variable_machine: CssVariableMachine, string_machine: StringMachine, + + _state: PhantomData, } -#[derive(Debug, Default)] -enum State { - #[default] - Idle, - - /// Parsing the property, e.g.: - /// - /// ```text - /// [color:red] - /// ^^^^^ - /// - /// [--my-color:red] - /// ^^^^^^^^^^ - /// ``` - ParsingProperty, - - /// Parsing the value, e.g.: - /// - /// ```text - /// [color:red] - /// ^^^ - /// ``` - ParsingValue, -} - -impl Machine for ArbitraryPropertyMachine { +impl ArbitraryPropertyMachine { #[inline(always)] - fn reset(&mut self) { - self.start_pos = 0; - self.state = State::Idle; - self.bracket_stack.reset(); + fn transition(&self) -> ArbitraryPropertyMachine { + ArbitraryPropertyMachine { + start_pos: self.start_pos, + bracket_stack: Default::default(), + css_variable_machine: Default::default(), + string_machine: Default::default(), + _state: PhantomData, + } } +} + +impl Machine for ArbitraryPropertyMachine { + #[inline(always)] + fn reset(&mut self) {} + + #[inline] + fn next(&mut self, cursor: &mut cursor::Cursor<'_>) -> MachineState { + match cursor.curr.into() { + // Start of an arbitrary property + Class::OpenBracket => { + self.start_pos = cursor.pos; + cursor.advance(); + self.transition::().next(cursor) + } + + // Anything else is not a valid start of an arbitrary value + _ => MachineState::Idle, + } + } +} + +impl Machine for ArbitraryPropertyMachine { + #[inline(always)] + fn reset(&mut self) {} #[inline] fn next(&mut self, cursor: &mut cursor::Cursor<'_>) -> MachineState { let len = cursor.input.len(); - match self.state { - State::Idle => match cursor.curr.into() { - // Start of an arbitrary property - Class::OpenBracket => { - self.start_pos = cursor.pos; - self.state = State::ParsingProperty; + while cursor.pos < len { + match cursor.curr.into() { + Class::Dash => match cursor.next.into() { + // Start of a CSS variable + // + // E.g.: `[--my-color:red]` + // ^^ + Class::Dash => return self.parse_property_variable(cursor), + + // Dashes are allowed in the property name + // + // E.g.: `[background-color:red]` + // ^ + _ => cursor.advance(), + }, + + // Alpha characters are allowed in the property name + // + // E.g.: `[color:red]` + // ^^^^^ + Class::AlphaLower => cursor.advance(), + + // End of the property name, but there must be at least a single character + Class::Colon if cursor.pos > self.start_pos + 1 => { cursor.advance(); - self.next(cursor) + return self.transition::().next(cursor); } - // Anything else is not a valid start of an arbitrary value - _ => MachineState::Idle, - }, - - State::ParsingProperty => { - while cursor.pos < len { - match cursor.curr.into() { - Class::Dash => match cursor.next.into() { - // Start of a CSS variable - // - // E.g.: `[--my-color:red]` - // ^^ - Class::Dash => return self.parse_property_variable(cursor), - - // Dashes are allowed in the property name - // - // E.g.: `[background-color:red]` - // ^ - _ => cursor.advance(), - }, - - // Alpha characters are allowed in the property name - // - // E.g.: `[color:red]` - // ^^^^^ - Class::AlphaLower => cursor.advance(), - - // End of the property name, but there must be at least a single character - Class::Colon if cursor.pos > self.start_pos + 1 => { - self.state = State::ParsingValue; - cursor.advance(); - return self.next(cursor); - } - - // Anything else is not a valid property character - _ => return self.restart(), - } - } - - self.restart() - } - - State::ParsingValue => { - while cursor.pos < len { - match cursor.curr.into() { - Class::Escape => match cursor.next.into() { - // An escaped whitespace character is not allowed - // - // E.g.: `[color:var(--my-\ color)]` - // ^ - Class::Whitespace => return self.restart(), - - // An escaped character, skip the next character, resume after - // - // E.g.: `[color:var(--my-\#color)]` - // ^ - _ => cursor.advance_twice(), - }, - - Class::OpenParen | Class::OpenBracket | Class::OpenCurly => { - if !self.bracket_stack.push(cursor.curr) { - return self.restart(); - } - cursor.advance(); - } - - Class::CloseParen | Class::CloseBracket | Class::CloseCurly - if !self.bracket_stack.is_empty() => - { - if !self.bracket_stack.pop(cursor.curr) { - return self.restart(); - } - cursor.advance(); - } - - // End of an arbitrary value - // - // 1. All brackets must be balanced - // 2. There must be at least a single character inside the brackets - Class::CloseBracket - if self.start_pos + 1 != cursor.pos - && self.bracket_stack.is_empty() => - { - return self.done(self.start_pos, cursor) - } - - // Start of a string - Class::Quote => return self.parse_string(cursor), - - // Another `:` inside of an arbitrary property is only valid inside of a string or - // inside of brackets. Everywhere else, it's invalid. - // - // E.g.: `[color:red:blue]` - // ^ Not valid - // E.g.: `[background:url(https://example.com)]` - // ^ Valid - // E.g.: `[content:'a:b:c:']` - // ^ ^ ^ Valid - Class::Colon if self.bracket_stack.is_empty() => return self.restart(), - - // Any kind of whitespace is not allowed - Class::Whitespace => return self.restart(), - - // Everything else is valid - _ => cursor.advance(), - }; - } - - self.restart() + // Anything else is not a valid property character + _ => return self.restart(), } } + + self.restart() } } -impl ArbitraryPropertyMachine { +impl ArbitraryPropertyMachine { fn parse_property_variable(&mut self, cursor: &mut cursor::Cursor<'_>) -> MachineState { match self.css_variable_machine.next(cursor) { MachineState::Idle => self.restart(), @@ -202,9 +143,8 @@ impl ArbitraryPropertyMachine { // E.g.: `[--my-color:red]` // ^ Class::Colon => { - self.state = State::ParsingValue; cursor.advance_twice(); - self.next(cursor) + self.transition::().next(cursor) } // Invalid arbitrary property @@ -212,7 +152,86 @@ impl ArbitraryPropertyMachine { }, } } +} +impl Machine for ArbitraryPropertyMachine { + #[inline(always)] + fn reset(&mut self) { + self.bracket_stack.reset(); + } + + #[inline] + fn next(&mut self, cursor: &mut cursor::Cursor<'_>) -> MachineState { + let len = cursor.input.len(); + while cursor.pos < len { + match cursor.curr.into() { + Class::Escape => match cursor.next.into() { + // An escaped whitespace character is not allowed + // + // E.g.: `[color:var(--my-\ color)]` + // ^ + Class::Whitespace => return self.restart(), + + // An escaped character, skip the next character, resume after + // + // E.g.: `[color:var(--my-\#color)]` + // ^ + _ => cursor.advance_twice(), + }, + + Class::OpenParen | Class::OpenBracket | Class::OpenCurly => { + if !self.bracket_stack.push(cursor.curr) { + return self.restart(); + } + cursor.advance(); + } + + Class::CloseParen | Class::CloseBracket | Class::CloseCurly + if !self.bracket_stack.is_empty() => + { + if !self.bracket_stack.pop(cursor.curr) { + return self.restart(); + } + cursor.advance(); + } + + // End of an arbitrary value + // + // 1. All brackets must be balanced + // 2. There must be at least a single character inside the brackets + Class::CloseBracket + if self.start_pos + 1 != cursor.pos && self.bracket_stack.is_empty() => + { + return self.done(self.start_pos, cursor) + } + + // Start of a string + Class::Quote => return self.parse_string(cursor), + + // Another `:` inside of an arbitrary property is only valid inside of a string or + // inside of brackets. Everywhere else, it's invalid. + // + // E.g.: `[color:red:blue]` + // ^ Not valid + // E.g.: `[background:url(https://example.com)]` + // ^ Valid + // E.g.: `[content:'a:b:c:']` + // ^ ^ ^ Valid + Class::Colon if self.bracket_stack.is_empty() => return self.restart(), + + // Any kind of whitespace is not allowed + Class::Whitespace => return self.restart(), + + // Everything else is valid + _ => cursor.advance(), + }; + } + + self.restart() + } +} + +impl ArbitraryPropertyMachine { fn parse_string(&mut self, cursor: &mut cursor::Cursor<'_>) -> MachineState { match self.string_machine.next(cursor) { MachineState::Idle => self.restart(), @@ -271,7 +290,7 @@ enum Class { #[cfg(test)] mod tests { - use super::ArbitraryPropertyMachine; + use super::{ArbitraryPropertyMachine, IdleState}; use crate::extractor::machine::Machine; #[test] @@ -279,8 +298,8 @@ mod tests { fn test_arbitrary_property_machine_performance() { let input = r#"