lezer-typst: convert LineCommentContent and MathContent to external tokenizers
Build and Deploy Verso / deploy (push) Successful in 10m10s

Both tokens are "read until delimiter" catchalls that match almost every
non-newline character, causing buildTokenGroups conflicts with every other
literal token in LALR-merged states.  Moving them to ExternalTokenizer (the
same pattern already used for HeadingTitle, RawBlockBody, etc.) makes them
context-isolated: the LR state machine only calls them when those tokens are
actually valid, so they never participate in the static token-group overlap
check.

Also exclude '<' from StrongText/EmphText so Label ('<' LabelName '>') is
recognised inside strong/emphasis spans rather than being consumed as plain
text.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
claude
2026-06-08 07:56:20 +00:00
parent e21f7cc0d5
commit 1dcd6e24f4
2 changed files with 55 additions and 14 deletions
@@ -10,6 +10,8 @@ import {
RawInlineContent,
CodeBlockBody,
BlockCommentBody,
LineCommentContent,
MathContent,
} from './typst.terms.mjs'
const BACKTICK = 96 // `
@@ -19,6 +21,7 @@ const NEWLINE = 10 // \n
const EQUALS = 61 // =
const SPACE = 32 //
const TAB = 9 // \t
const DOLLAR = 36 // $
const OPEN_BRACE = 123 // {
const CLOSE_BRACE = 125 // }
@@ -191,3 +194,36 @@ export const blockCommentTokenizer = new ExternalTokenizer(
},
{ contextual: false }
)
// ── lineCommentContentTokenizer ─────────────────────────────────────────
// Produces LineCommentContent: the run of characters from the current
// position up to (but not including) the next newline, or up to the point
// where input.next reports -1.
// Implemented as an external tokenizer instead of a @tokens rule because the
// equivalent pattern ![\n]+ conflicts with every non-newline literal token in
// LALR-merged states.  An external tokenizer is context-isolated: it is only
// called when the LR state expects this token.
export const lineCommentContentTokenizer = new ExternalTokenizer(
  (input) => {
    // Nothing to consume here: emit no token rather than a zero-length one.
    if (input.next === -1 || input.next === NEWLINE) return

    // At least one character is available; swallow the rest of the line.
    do {
      input.advance()
    } while (input.next !== -1 && input.next !== NEWLINE)

    input.acceptToken(LineCommentContent)
  },
  { contextual: false }
)
// ── mathContentTokenizer ────────────────────────────────────────────────
// Produces MathContent: the text between the $...$ delimiters.  Scanning
// stops at the first '$', at a newline (math content never spans lines), or
// when input.next reports -1.
// Implemented as an external tokenizer for the same reason as
// LineCommentContent: the pattern ![$\n]+ overlaps with spaces, '<', '@', and
// other literals in merged states.
export const mathContentTokenizer = new ExternalTokenizer(
  (input) => {
    let consumed = 0
    for (;;) {
      const ch = input.next
      if (ch === -1 || ch === DOLLAR || ch === NEWLINE) break
      input.advance()
      consumed += 1
    }
    // Only emit a token when at least one character was consumed.
    if (consumed > 0) input.acceptToken(MathContent)
  },
  { contextual: false }
)
@@ -148,6 +148,14 @@ Escape { "\\" EscapeChar }
BlockCommentBody
}
// LineCommentContent is produced by lineCommentContentTokenizer, imported
// from ./tokens.mjs, rather than by a rule in the @tokens block.
@external tokens lineCommentContentTokenizer from "./tokens.mjs" {
LineCommentContent
}
// MathContent is produced by mathContentTokenizer, imported from
// ./tokens.mjs, rather than by a rule in the @tokens block.
@external tokens mathContentTokenizer from "./tokens.mjs" {
MathContent
}
// ── Regular tokens ────────────────────────────────────────────────────────
@tokens {
// Horizontal whitespace only. Newlines are kept as explicit Newline items
@@ -181,16 +189,14 @@ Escape { "\\" EscapeChar }
("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")?
}
// Comment content — everything to end of line.
LineCommentContent { ![\n]+ }
// Math content — everything between the $ delimiters (no crossing newlines).
MathContent { ![$\n]+ }
// Text tokens for markup contexts; each excludes its own delimiters.
// HeadingText is gone: HeadingTitle is now an external token (see above).
StrongText { ![\n*$#`@\\]+ }
EmphText { ![\n_$#`@\\]+ }
// HeadingTitle, LineCommentContent, and MathContent are external tokens
// (see above) — broad "read-to-delimiter" tokens that would otherwise
// conflict with every other literal token in LALR-merged states.
// '<' is excluded from StrongText/EmphText so that Label ('<' LabelName '>')
// is recognised inside strong/emphasis rather than consumed as plain text.
StrongText { ![\n*$#`<@\\]+ }
EmphText { ![\n_$#`<@\\]+ }
// Regular markup: excludes all special-character starters plus whitespace
// (whitespace is handled by @skip). The '/' is excluded so that '//' and
@@ -210,11 +216,10 @@ Escape { "\\" EscapeChar }
// Resolve ambiguities: more-specific tokens win over broader catch-alls.
// EscapeChar > spaces: after '\', EscapeChar must win over the skip token
// (both match \t; without this, '\t' would be mis-tokenized).
// "(" > "." > text tokens: after '#' CodeIdent, callSuffix delimiters must win
// over MarkupContent/StrongText/EmphText in the LALR-merged state.
// "]" > LineCommentContent: inside #[...], ']' closes the ContentBlock even
// if it appears after '//' (comment does not "protect" the bracket).
@precedence { CodeKeyword CodeBool CodeIdent EscapeChar "(" "." "]" spaces MarkupContent StrongText EmphText LineCommentContent }
// "(" > "." > "]" > text tokens: after '#' CodeIdent, callSuffix delimiters
// must win over MarkupContent/StrongText/EmphText in merged states.
// LineCommentContent and MathContent are external tokens — not listed here.
@precedence { CodeKeyword CodeBool CodeIdent EscapeChar "(" "." "]" spaces MarkupContent StrongText EmphText }
}
@skip { spaces }