fix: make CodeIdent external and replace strongItem*/emphItem* with flat body tokens
Build and Deploy Verso / deploy (push) Successful in 13m36s

Two LALR state-merging bugs prevented Strong/Emphasis nodes from ever being
produced (confirmed: tok-strong/tok-emphasis count = 0 in browser diagnostic).

Bug 1 — _italic_ consumed as CodeIdent:
  CodeIdent was a @tokens rule with identHead = [A-Za-z_], so '_italic_' (the
  entire string including both underscores) matched as one CodeIdent token.
  LALR merging caused CodeIdent to be in item*'s valid set, and CodeIdent >
  "_" in @precedence, so the parser never opened Emphasis.

  Fix: move CodeIdent to an external tokenizer (codeIdentTokenizer) with a
  character-level guard — only fires when the preceding non-whitespace char
  is one of '#', '.', '(', ',' (genuine code-context positions).  In body
  text where the peek-back (which skips spaces and tabs) lands on anything else
  — a newline, ordinary text, or a markup delimiter — the tokenizer returns
  without emitting, letting the "_" literal open Emphasis correctly.

Bug 2 — StrongText never produced inside Strong:
  The strongItem* / emphItem* loops merged with item* states via Lezer's
  aggressive LALR merging.  In the merged state MarkupContent was in the
  valid set (from the item* side) and MarkupContent > StrongText in
  @precedence, so MarkupContent was always produced — not a valid strongItem,
  leading to error recovery with no StrongText in the tree.

  Fix: replace the recursive strongItem* / emphItem* loops with flat external
  tokens StrongBody / EmphBody (contextual: true).  These fire only inside
  Strong → "*" . StrongBody? "*" and Emphasis → "_" . EmphBody? "_", states
  specific enough that canShift is reliable.  They read everything up to the
  closing delimiter or newline in one token, bypassing the LALR merging
  entirely.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
claude
2026-06-08 22:15:22 +00:00
parent f9d46aabeb
commit 4aca4aaac6
3 changed files with 154 additions and 59 deletions
@@ -73,8 +73,8 @@ export const TypstLanguage = LRLanguage.define({
MathContent: t.string, MathContent: t.string,
// Markup emphasis // Markup emphasis
'Strong/"*" Strong/StrongText': t.strong, 'Strong/"*" Strong/StrongBody': t.strong,
'Emphasis/"_" Emphasis/EmphText': t.emphasis, 'Emphasis/"_" Emphasis/EmphBody': t.emphasis,
// Labels (<name>) and references (@name) // Labels (<name>) and references (@name)
'Label/"<" Label/">" Label/LabelName': t.labelName, 'Label/"<" Label/">" Label/LabelName': t.labelName,
@@ -13,19 +13,26 @@ import {
LineCommentContent, LineCommentContent,
MathContent, MathContent,
CodeKeyword, CodeKeyword,
CodeIdent,
StrongBody,
EmphBody,
} from './typst.terms.mjs' } from './typst.terms.mjs'
const BACKTICK = 96 // ` const BACKTICK = 96 // `
const SLASH = 47 // / const SLASH = 47 // /
const STAR = 42 // * const STAR = 42 // *
const NEWLINE = 10 // \n const NEWLINE = 10 // \n
const EQUALS = 61 // = const EQUALS = 61 // =
const SPACE = 32 // const SPACE = 32 //
const TAB = 9 // \t const TAB = 9 // \t
const DOLLAR = 36 // $ const DOLLAR = 36 // $
const OPEN_BRACE = 123 // { const OPEN_BRACE = 123 // {
const CLOSE_BRACE = 125 // } const CLOSE_BRACE = 125 // }
const HASH = 35 // # const HASH = 35 // #
const UNDERSCORE = 95 // _
const DOT = 46 // .
const OPEN_PAREN = 40 // (
const COMMA = 44 // ,
const KEYWORDS = new Set([ const KEYWORDS = new Set([
'let', 'set', 'show', 'import', 'include', 'let', 'set', 'show', 'import', 'include',
@@ -34,6 +41,13 @@ const KEYWORDS = new Set([
'and', 'or', 'not', 'context', 'and', 'or', 'not', 'context',
]) ])
// Literal words that are left to the CodeBool grammar token rather than
// being emitted as identifiers.
const BOOLS = new Set([
  'true',
  'false',
  'none',
  'auto',
])

// ASCII letter test on a character code: 'A'..'Z' (65..90) or 'a'..'z' (97..122).
const isAlpha = ch => {
  const isUpper = ch >= 65 && ch <= 90
  const isLower = ch >= 97 && ch <= 122
  return isUpper || isLower
}

// ASCII digit test on a character code: '0'..'9' (48..57).
const isDigit = ch => 48 <= ch && ch <= 57

// First character of an identifier: a letter or '_'.
const isIdentHead = ch => {
  if (isAlpha(ch)) return true
  return ch === UNDERSCORE
}

// Subsequent identifier characters: letter, digit, '_' or '-' (code 45).
const isIdentTail = ch => {
  if (isAlpha(ch) || isDigit(ch)) return true
  return ch === UNDERSCORE || ch === 45
}
// ── headingTokenizer ──────────────────────────────────────────────────── // ── headingTokenizer ────────────────────────────────────────────────────
// Emits HeadingMark — the "=+" prefix plus the trailing whitespace. // Emits HeadingMark — the "=+" prefix plus the trailing whitespace.
// Only fires at the start of a line (pos 0, or character after '\n'). // Only fires at the start of a line (pos 0, or character after '\n').
@@ -272,15 +286,7 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
let len = 0 let len = 0
while (true) { while (true) {
const ch = input.peek(len) const ch = input.peek(len)
if ((ch >= 65 && ch <= 90) || // AZ if (isIdentHead(ch) || (len > 0 && isIdentTail(ch))) { len++ } else { break }
(ch >= 97 && ch <= 122) || // az
(ch >= 48 && ch <= 57) || // 09
ch === 95 || // _
ch === 45) { // -
len++
} else {
break
}
} }
if (len === 0) return if (len === 0) return
@@ -296,3 +302,90 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
}, },
{ contextual: true } { contextual: true }
) )
// ── codeIdentTokenizer ───────────────────────────────────────────────────
// Emits CodeIdent — identifier tokens inside code expressions (#ident,
// #func(args), #obj.method, etc.).
//
// CodeIdent is an external token (rather than a @tokens rule) so that a
// character-level guard can be applied: the token is only emitted when the
// nearest preceding character that is not a space or tab is one of
// '#', '.', '(' or ','.  This keeps the token from firing in markup body
// text, where '_italic_' would otherwise be consumed as one CodeIdent
// (since '_' is a valid identifier head) instead of opening Emphasis.
//
// Words in KEYWORDS and BOOLS are never emitted here, so that
// codeKeywordTokenizer / CodeBool can handle them without conflict.
//
// NOTE(review): because the guard skips spaces and tabs, prose such as
// "e.g. _x_" or "a, _b_" also passes it; whether that matters depends on
// stack.canShift(CodeIdent) being false in those states — confirm.
export const codeIdentTokenizer = new ExternalTokenizer(
  (input, stack) => {
    if (!stack.canShift(CodeIdent)) return

    // Guard: only fire in code context.  Walk back past horizontal
    // whitespace (@skip) and require a code-mode delimiter before it.
    let back = -1
    let prev = input.peek(back)
    while (prev === SPACE || prev === TAB) {
      back--
      prev = input.peek(back)
    }
    if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA) return

    // Peek ahead to read the full identifier without consuming input.
    // The word is built one character at a time: spreading an arbitrarily
    // long array into String.fromCharCode could exceed the engine's
    // argument limit and throw from inside the tokenizer.
    let len = 0
    let word = ''
    while (true) {
      const ch = input.peek(len)
      if (!(len === 0 ? isIdentHead(ch) : isIdentTail(ch))) break
      word += String.fromCharCode(ch)
      len++
    }
    // No identifier head at the current position.
    if (len === 0) return

    // Let codeKeywordTokenizer handle keywords; let CodeBool handle bools.
    if (KEYWORDS.has(word) || BOOLS.has(word)) return

    for (let i = 0; i < len; i++) input.advance()
    input.acceptToken(CodeIdent)
  },
  { contextual: true }
)
// ── strongBodyTokenizer ──────────────────────────────────────────────────
// Emits StrongBody — the content between the '*' delimiters of a Strong node.
//
// The tokenizer is registered with contextual: true and, like
// codeIdentTokenizer, explicitly checks stack.canShift(StrongBody) before
// reading anything.  The intended position is inside
// Strong → "*" . StrongBody? "*"; the explicit check makes sure the
// tokenizer never swallows text in a state where StrongBody cannot be
// shifted.
//
// Reads everything up to the first '*', newline, or end of input.  The '*'
// itself is not consumed, so a closing delimiter is left for the grammar
// rule.  Nothing is emitted when no character was read.
export const strongBodyTokenizer = new ExternalTokenizer(
  (input, stack) => {
    if (!stack.canShift(StrongBody)) return

    let consumed = 0
    while (input.next !== -1 && input.next !== STAR && input.next !== NEWLINE) {
      input.advance()
      consumed++
    }
    // An empty body ("**") produces no token; the grammar marks it optional.
    if (consumed > 0) input.acceptToken(StrongBody)
  },
  { contextual: true }
)
// ── emphBodyTokenizer ────────────────────────────────────────────────────
// Emits EmphBody — the content between the '_' delimiters of an Emphasis node.
//
// Registered with contextual: true and guarded by stack.canShift(EmphBody),
// so it never swallows text in a state where EmphBody cannot be shifted.
//
// Reads everything up to the first '_', newline, or end of input.  The '_'
// itself is not consumed, so a closing delimiter is left for the grammar
// rule.  Nothing is emitted when no character was read.
export const emphBodyTokenizer = new ExternalTokenizer(
  (input, stack) => {
    if (!stack.canShift(EmphBody)) return

    let consumed = 0
    while (input.next !== -1 && input.next !== UNDERSCORE && input.next !== NEWLINE) {
      input.advance()
      consumed++
    }
    // An empty body ("__") produces no token; the grammar marks it optional.
    if (consumed > 0) input.acceptToken(EmphBody)
  },
  { contextual: true }
)
@@ -7,6 +7,9 @@
// rawInlineTokenizer — single-backtick raw inline content // rawInlineTokenizer — single-backtick raw inline content
// codeBlockTokenizer — brace-depth tracking inside #{ ... } // codeBlockTokenizer — brace-depth tracking inside #{ ... }
// blockCommentTokenizer — depth-tracked nested /* ... */ comments // blockCommentTokenizer — depth-tracked nested /* ... */ comments
// codeIdentTokenizer — CodeIdent: identifier, only fires in code context
// strongBodyTokenizer — StrongBody: content inside *...*
// emphBodyTokenizer — EmphBody: content inside _..._
@top Document { item* } @top Document { item* }
@@ -105,16 +108,15 @@ ContentBlock { "[" item* "]" }
InlineMath { "$" MathContent? "$" } InlineMath { "$" MathContent? "$" }
// ── Markup formatting ───────────────────────────────────────────────────── // ── Markup formatting ─────────────────────────────────────────────────────
// Cross-nesting of Strong/Emphasis is intentionally excluded to avoid a // Strong and Emphasis use flat external body tokens (StrongBody / EmphBody)
// mutual-recursion cycle (Strong→Emphasis→Strong) that causes state explosion // rather than recursive strongItem* / emphItem* loops. The loop approach
// in the Lezer LR automaton builder. StrongText includes '_' and EmphText // triggered LALR state merging that caused item*-level tokens (MarkupContent,
// includes '*', so the nested delimiters are treated as plain text inside the // CodeIdent) to win over StrongText/EmphText inside the construct, so the
// opposite construct rather than producing error nodes. // body nodes were never produced. The flat external tokens are contextual
Strong { "*" strongItem* "*" } // (canShift only fires inside Strong/Emphasis) and reliably avoid those
strongItem { CodeExpr | InlineMath | RawInline | Label | Ref | StrongText } // merged states.
Strong { "*" StrongBody? "*" }
Emphasis { "_" emphItem* "_" } Emphasis { "_" EmphBody? "_" }
emphItem { CodeExpr | InlineMath | RawInline | Label | Ref | EmphText }
// ── Labels and references ───────────────────────────────────────────────── // ── Labels and references ─────────────────────────────────────────────────
Label { "<" LabelName ">" } Label { "<" LabelName ">" }
@@ -162,6 +164,24 @@ Escape { "\\" EscapeChar }
CodeKeyword CodeKeyword
} }
// CodeIdent is declared as an external token, produced by codeIdentTokenizer
// in ./tokens.mjs, so that a character-level guard can be applied: the
// tokenizer only emits when the nearest preceding character that is not a
// space or tab is one of '#', '.', '(', ',' — i.e. a code-expression
// position.  This stops the token from firing in markup body text, where
// LALR state merging would otherwise cause the entire word (including any
// leading '_') to be consumed as a code identifier instead of letting '_'
// open an Emphasis.
@external tokens codeIdentTokenizer from "./tokens.mjs" {
CodeIdent
}
// StrongBody — the optional body of Strong { "*" StrongBody? "*" }.
// Produced by strongBodyTokenizer in ./tokens.mjs, which reads up to the
// next '*' or newline.
@external tokens strongBodyTokenizer from "./tokens.mjs" {
StrongBody
}
// EmphBody — the optional body of Emphasis { "_" EmphBody? "_" }.
// Produced by emphBodyTokenizer in ./tokens.mjs, which reads up to the
// next '_' or newline.
@external tokens emphBodyTokenizer from "./tokens.mjs" {
EmphBody
}
// ── Regular tokens ──────────────────────────────────────────────────────── // ── Regular tokens ────────────────────────────────────────────────────────
@tokens { @tokens {
// Horizontal whitespace only. Newlines are kept as explicit Newline items // Horizontal whitespace only. Newlines are kept as explicit Newline items
@@ -172,11 +192,6 @@ Escape { "\\" EscapeChar }
// Boolean / null literals — distinct from keywords for highlighting. // Boolean / null literals — distinct from keywords for highlighting.
CodeBool { "true" | "false" | "none" | "auto" } CodeBool { "true" | "false" | "none" | "auto" }
// General identifier: [A-Za-z_][A-Za-z0-9_-]*
CodeIdent { identHead identTail* }
identHead { @asciiLetter | "_" }
identTail { @asciiLetter | @digit | "_" | "-" }
// Double-quoted string with backslash escapes (no single-quoted strings in Typst). // Double-quoted string with backslash escapes (no single-quoted strings in Typst).
CodeString { '"' (!["\\\n] | "\\" _)* '"' } CodeString { '"' (!["\\\n] | "\\" _)* '"' }
@@ -186,23 +201,14 @@ Escape { "\\" EscapeChar }
("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")? ("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")?
} }
// Text tokens for markup contexts; each excludes its own delimiters.
// HeadingText, LineCommentContent, and MathContent are external tokens
// (see above) — broad "read-to-delimiter" tokens that would otherwise
// conflict with every other literal token in LALR-merged states.
// '<' is excluded from StrongText/EmphText so that Label ('<' LabelName '>')
// is recognised inside strong/emphasis rather than consumed as plain text.
StrongText { ![\n*$#`<@\\]+ }
EmphText { ![\n_$#`<@\\]+ }
// Regular markup: excludes all special-character starters plus whitespace // Regular markup: excludes all special-character starters plus whitespace
// (whitespace is handled by @skip). The '/' is excluded so that '//' and // (whitespace is handled by @skip). The '/' is excluded so that '//' and
// '/*' are not accidentally consumed as plain text. // '/*' are not accidentally consumed as plain text.
MarkupContent { ![\n \t=*_$#/<@`\\]+ } MarkupContent { ![\n \t=*_$#/<@`\\]+ }
// Label names: identifiers with optional dots/colons (e.g. <sec:intro>). // Label names: identifiers with optional dots/colons (e.g. <sec:intro>).
LabelName { (identHead | @digit) (identTail | "." | ":")* } LabelName { (@asciiLetter | "_" | @digit) (@asciiLetter | @digit | "_" | "-" | "." | ":")* }
RefName { identHead identTail* } RefName { (@asciiLetter | "_") (@asciiLetter | @digit | "_" | "-")* }
// Escape: any single character after backslash. // Escape: any single character after backslash.
EscapeChar { _ } EscapeChar { _ }
@@ -210,19 +216,15 @@ Escape { "\\" EscapeChar }
// Newline item — kept out of @skip so heading detection works. // Newline item — kept out of @skip so heading detection works.
Newline { "\n" } Newline { "\n" }
// Resolve ambiguities: more-specific tokens win over broader catch-alls. // Resolve ambiguities in merged states:
// EscapeChar > spaces: after '\', EscapeChar must win over the skip token // EscapeChar > spaces: after '\', EscapeChar must win over the skip token.
// (both match \t; without this, '\t' would be mis-tokenized). // "(" > "." > "]": callSuffix delimiters must win over MarkupContent after
// "(" > "." > "]" > text tokens: after '#' CodeIdent, callSuffix delimiters // a code identifier (merged states expose these to the markup tokenizer).
// must win over MarkupContent/StrongText/EmphText in merged states. // "_" > MarkupContent: '_' must open Emphasis rather than being swallowed
// LineCommentContent and MathContent are external tokens — not listed here. // by MarkupContent (redundant since '_' is in MarkupContent's exclusion
// "_" added after CodeIdent: KeywordExpr { CodeKeyword CallExpr? } merges // set, but kept for clarity).
// the post-keyword state with markup states where "_" starts Emphasis. // CodeIdent and StrongText/EmphText are now external tokens — not listed.
// CodeIdent wins so '#set _name(...)' is tokenised correctly; in pure markup @precedence { CodeBool EscapeChar "(" "." "]" "_" spaces MarkupContent }
// states CodeIdent is not in the valid set so "_" still opens Emphasis.
// CodeKeyword is now an external token (codeKeywordTokenizer) and therefore
// not listed here — it uses a peek(-1)==='#' guard to stay out of markup.
@precedence { CodeBool CodeIdent EscapeChar "(" "." "]" "_" spaces MarkupContent StrongText EmphText }
} }
@skip { spaces } @skip { spaces }