From 1dcd6e24f4d870db971214a86995e45e382f8804 Mon Sep 17 00:00:00 2001 From: claude Date: Mon, 8 Jun 2026 07:56:20 +0000 Subject: [PATCH] lezer-typst: convert LineCommentContent and MathContent to external tokenizers Both tokens are "read until delimiter" catchalls that match almost every non-newline character, causing buildTokenGroups conflicts with every other literal token in LALR-merged states. Moving them to ExternalTokenizer (the same pattern already used for HeadingTitle, RawBlockBody, etc.) makes them context-isolated: the LR state machine only calls them when those tokens are actually valid, so they never participate in the static token-group overlap check. Also exclude '<' from StrongText/EmphText so Label ('<' LabelName '>') is recognised inside strong/emphasis spans rather than being consumed as plain text. Co-Authored-By: Claude Sonnet 4.6 --- .../source-editor/lezer-typst/tokens.mjs | 36 +++++++++++++++++++ .../source-editor/lezer-typst/typst.grammar | 33 +++++++++-------- 2 files changed, 55 insertions(+), 14 deletions(-) diff --git a/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs b/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs index 0ded687c9a..5161ad47e8 100644 --- a/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs +++ b/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs @@ -10,6 +10,8 @@ import { RawInlineContent, CodeBlockBody, BlockCommentBody, + LineCommentContent, + MathContent, } from './typst.terms.mjs' const BACKTICK = 96 // ` @@ -19,6 +21,7 @@ const NEWLINE = 10 // \n const EQUALS = 61 // = const SPACE = 32 // const TAB = 9 // \t +const DOLLAR = 36 // $ const OPEN_BRACE = 123 // { const CLOSE_BRACE = 125 // } @@ -191,3 +194,36 @@ export const blockCommentTokenizer = new ExternalTokenizer( }, { contextual: false } ) + +// ── lineCommentContentTokenizer ───────────────────────────────────────── +// Emits LineCommentContent — everything from the current position to EOL. +// External rather than a @tokens rule because ![\n]+ conflicts with every +// non-newline literal token in LALR-merged states. External tokenizers are +// context-isolated: only called when the LR state expects this token. +export const lineCommentContentTokenizer = new ExternalTokenizer( + (input, _stack) => { + let hasContent = false + while (input.next !== -1 && input.next !== NEWLINE) { + input.advance() + hasContent = true + } + if (hasContent) input.acceptToken(LineCommentContent) + }, + { contextual: false } +) + +// ── mathContentTokenizer ──────────────────────────────────────────────── +// Emits MathContent — everything between the $...$ delimiters (no newlines). +// External rather than a @tokens rule for the same reason as LineCommentContent: +// ![$\n]+ overlaps with spaces, '<', '@', and other literals in merged states. +export const mathContentTokenizer = new ExternalTokenizer( + (input, _stack) => { + let hasContent = false + while (input.next !== -1 && input.next !== DOLLAR && input.next !== NEWLINE) { + input.advance() + hasContent = true + } + if (hasContent) input.acceptToken(MathContent) + }, + { contextual: false } +) diff --git a/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar b/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar index ccfec7335b..7404ac9b00 100644 --- a/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar +++ b/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar @@ -148,6 +148,14 @@ Escape { "\\" EscapeChar } BlockCommentBody } +@external tokens lineCommentContentTokenizer from "./tokens.mjs" { + LineCommentContent +} + +@external tokens mathContentTokenizer from "./tokens.mjs" { + MathContent +} + // ── Regular tokens ──────────────────────────────────────────────────────── @tokens { // Horizontal whitespace only. Newlines are kept as explicit Newline items @@ -181,16 +189,14 @@ Escape { "\\" EscapeChar } ("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")? } - // Comment content — everything to end of line. - LineCommentContent { ![\n]+ } - - // Math content — everything between the $ delimiters (no crossing newlines). - MathContent { ![$\n]+ } - // Text tokens for markup contexts; each excludes its own delimiters. - // HeadingText is gone: HeadingTitle is now an external token (see above). - StrongText { ![\n*$#`@\\]+ } - EmphText { ![\n_$#`@\\]+ } + // HeadingText, LineCommentContent, and MathContent are external tokens + // (see above) — broad "read-to-delimiter" tokens that would otherwise + // conflict with every other literal token in LALR-merged states. + // '<' is excluded from StrongText/EmphText so that Label ('<' LabelName '>') + // is recognised inside strong/emphasis rather than consumed as plain text. + StrongText { ![\n*$#`<@\\]+ } + EmphText { ![\n_$#`<@\\]+ } // Regular markup: excludes all special-character starters plus whitespace // (whitespace is handled by @skip). The '/' is excluded so that '//' and @@ -210,11 +216,10 @@ Escape { "\\" EscapeChar } // Resolve ambiguities: more-specific tokens win over broader catch-alls. // EscapeChar > spaces: after '\', EscapeChar must win over the skip token // (both match \t; without this, '\t' would be mis-tokenized). - // "(" > "." > text tokens: after '#' CodeIdent, callSuffix delimiters must win - // over MarkupContent/StrongText/EmphText in the LALR-merged state. - // "]" > LineCommentContent: inside #[...], ']' closes the ContentBlock even - // if it appears after '//' (comment does not "protect" the bracket). - @precedence { CodeKeyword CodeBool CodeIdent EscapeChar "(" "." "]" spaces MarkupContent StrongText EmphText LineCommentContent } + // "(" > "." > "]" > text tokens: after '#' CodeIdent, callSuffix delimiters + // must win over MarkupContent/StrongText/EmphText in merged states. + // LineCommentContent and MathContent are external tokens — not listed here. + @precedence { CodeKeyword CodeBool CodeIdent EscapeChar "(" "." "]" spaces MarkupContent StrongText EmphText } } @skip { spaces }