lezer-typst: convert LineCommentContent and MathContent to external tokenizers
Build and Deploy Verso / deploy (push) Successful in 10m10s
Build and Deploy Verso / deploy (push) Successful in 10m10s
Both tokens are "read until delimiter" catchalls that match almost every
non-newline character, causing buildTokenGroups conflicts with every other
literal token in LALR-merged states. Moving them to ExternalTokenizer (the
same pattern already used for HeadingTitle, RawBlockBody, etc.) makes them
context-isolated: the LR state machine only calls them when those tokens are
actually valid, so they never participate in the static token-group overlap
check.
Also exclude '<' from StrongText/EmphText so Label ('<' LabelName '>') is
recognised inside strong/emphasis spans rather than being consumed as plain
text.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -10,6 +10,8 @@ import {
|
||||
RawInlineContent,
|
||||
CodeBlockBody,
|
||||
BlockCommentBody,
|
||||
LineCommentContent,
|
||||
MathContent,
|
||||
} from './typst.terms.mjs'
|
||||
|
||||
const BACKTICK = 96 // `
|
||||
@@ -19,6 +21,7 @@ const NEWLINE = 10 // \n
|
||||
const EQUALS = 61 // =
|
||||
const SPACE = 32 //
|
||||
const TAB = 9 // \t
|
||||
const DOLLAR = 36 // $
|
||||
const OPEN_BRACE = 123 // {
|
||||
const CLOSE_BRACE = 125 // }
|
||||
|
||||
@@ -191,3 +194,36 @@ export const blockCommentTokenizer = new ExternalTokenizer(
|
||||
},
|
||||
{ contextual: false }
|
||||
)
|
||||
|
||||
// ── lineCommentContentTokenizer ─────────────────────────────────────────
// Produces LineCommentContent: the run of characters from the current
// position up to (but not including) the next newline or the end of input.
// Implemented as an external tokenizer rather than a @tokens rule because
// ![\n]+ conflicts with every non-newline literal token in LALR-merged
// states. External tokenizers are context-isolated: they are only called
// when the LR state expects this token.
export const lineCommentContentTokenizer = new ExternalTokenizer(
  (input, _stack) => {
    // Already at end of input or at the line break: there is no content,
    // and a zero-length token is never accepted.
    if (input.next === -1 || input.next === NEWLINE) return

    // At least one character is available here; consume up to the line end.
    do {
      input.advance()
    } while (input.next !== -1 && input.next !== NEWLINE)

    input.acceptToken(LineCommentContent)
  },
  { contextual: false }
)
|
||||
|
||||
// ── mathContentTokenizer ────────────────────────────────────────────────
// Produces MathContent: the run of characters between the $...$ delimiters,
// never crossing a newline. Implemented as an external tokenizer for the
// same reason as LineCommentContent: the equivalent @tokens rule ![$\n]+
// overlaps with spaces, '<', '@', and other literals in merged states.
export const mathContentTokenizer = new ExternalTokenizer(
  (input, _stack) => {
    // True when the stream sits on something that ends math content:
    // end of input, the closing '$', or a line break.
    const atBoundary = () =>
      input.next === -1 || input.next === DOLLAR || input.next === NEWLINE

    // No characters before the boundary: emit nothing (no empty tokens).
    if (atBoundary()) return

    // At least one character is available here; consume up to the boundary.
    do {
      input.advance()
    } while (!atBoundary())

    input.acceptToken(MathContent)
  },
  { contextual: false }
)
|
||||
|
||||
@@ -148,6 +148,14 @@ Escape { "\\" EscapeChar }
|
||||
BlockCommentBody
|
||||
}
|
||||
|
||||
@external tokens lineCommentContentTokenizer from "./tokens.mjs" {
|
||||
LineCommentContent
|
||||
}
|
||||
|
||||
@external tokens mathContentTokenizer from "./tokens.mjs" {
|
||||
MathContent
|
||||
}
|
||||
|
||||
// ── Regular tokens ────────────────────────────────────────────────────────
|
||||
@tokens {
|
||||
// Horizontal whitespace only. Newlines are kept as explicit Newline items
|
||||
@@ -181,16 +189,14 @@ Escape { "\\" EscapeChar }
|
||||
("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")?
|
||||
}
|
||||
|
||||
// Comment content — everything to end of line.
|
||||
LineCommentContent { ![\n]+ }
|
||||
|
||||
// Math content — everything between the $ delimiters (no crossing newlines).
|
||||
MathContent { ![$\n]+ }
|
||||
|
||||
// Text tokens for markup contexts; each excludes its own delimiters.
|
||||
// HeadingText is gone: HeadingTitle is now an external token (see above).
|
||||
StrongText { ![\n*$#`@\\]+ }
|
||||
EmphText { ![\n_$#`@\\]+ }
|
||||
// HeadingTitle, LineCommentContent, and MathContent are external tokens
|
||||
// (see above) — broad "read-to-delimiter" tokens that would otherwise
|
||||
// conflict with every other literal token in LALR-merged states.
|
||||
// '<' is excluded from StrongText/EmphText so that Label ('<' LabelName '>')
|
||||
// is recognised inside strong/emphasis rather than consumed as plain text.
|
||||
StrongText { ![\n*$#`<@\\]+ }
|
||||
EmphText { ![\n_$#`<@\\]+ }
|
||||
|
||||
// Regular markup: excludes all special-character starters plus whitespace
|
||||
// (whitespace is handled by @skip). The '/' is excluded so that '//' and
|
||||
@@ -210,11 +216,10 @@ Escape { "\\" EscapeChar }
|
||||
// Resolve ambiguities: more-specific tokens win over broader catch-alls.
|
||||
// EscapeChar > spaces: after '\', EscapeChar must win over the skip token
|
||||
// (both match \t; without this, '\t' would be mis-tokenized).
|
||||
// "(" > "." > text tokens: after '#' CodeIdent, callSuffix delimiters must win
|
||||
// over MarkupContent/StrongText/EmphText in the LALR-merged state.
|
||||
// "]" > LineCommentContent: inside #[...], ']' closes the ContentBlock even
|
||||
// if it appears after '//' (comment does not "protect" the bracket).
|
||||
@precedence { CodeKeyword CodeBool CodeIdent EscapeChar "(" "." "]" spaces MarkupContent StrongText EmphText LineCommentContent }
|
||||
// "(" > "." > "]" > text tokens: after '#' CodeIdent, callSuffix delimiters
|
||||
// must win over MarkupContent/StrongText/EmphText in merged states.
|
||||
// LineCommentContent and MathContent are external tokens — not listed here.
|
||||
@precedence { CodeKeyword CodeBool CodeIdent EscapeChar "(" "." "]" spaces MarkupContent StrongText EmphText }
|
||||
}
|
||||
|
||||
@skip { spaces }
|
||||
|
||||
Reference in New Issue
Block a user