fix: make CodeIdent external and replace strongItem*/emphItem* with flat body tokens
Build and Deploy Verso / deploy (push) Successful in 13m36s
Build and Deploy Verso / deploy (push) Successful in 13m36s
Two LALR state-merging bugs prevented Strong/Emphasis nodes from ever being
produced (confirmed: tok-strong/tok-emphasis count = 0 in browser diagnostic).
Bug 1 — _italic_ consumed as CodeIdent:
CodeIdent was a @tokens rule with identHead = [A-Za-z_], so '_italic_' (the
entire string including both underscores) matched as one CodeIdent token.
LALR merging caused CodeIdent to be in item*'s valid set, and CodeIdent >
"_" in @precedence, so the parser never opened Emphasis.
Fix: move CodeIdent to an external tokenizer (codeIdentTokenizer) with a
character-level guard — only fires when the preceding non-whitespace char
is one of '#', '.', '(', ',' (genuine code-context positions). In body
text where peek-back (which skips spaces and tabs) lands on a newline,
ordinary text, or a markup delimiter, the tokenizer returns without
emitting, letting "_" open Emphasis correctly.
Bug 2 — StrongText never produced inside Strong:
The strongItem* / emphItem* loops merged with item* states via Lezer's
aggressive LALR merging. In the merged state MarkupContent was in the
valid set (from the item* side) and MarkupContent > StrongText in
@precedence, so MarkupContent was always produced — not a valid strongItem,
leading to error recovery with no StrongText in the tree.
Fix: replace the recursive strongItem* / emphItem* loops with flat external
tokens StrongBody / EmphBody (contextual: true). These fire only inside
Strong → "*" . StrongBody? "*" and Emphasis → "_" . EmphBody? "_", states
specific enough that canShift is reliable. They read everything up to the
closing delimiter or newline in one token, bypassing the LALR merging
entirely.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -73,8 +73,8 @@ export const TypstLanguage = LRLanguage.define({
|
||||
MathContent: t.string,
|
||||
|
||||
// Markup emphasis
|
||||
'Strong/"*" Strong/StrongText': t.strong,
|
||||
'Emphasis/"_" Emphasis/EmphText': t.emphasis,
|
||||
'Strong/"*" Strong/StrongBody': t.strong,
|
||||
'Emphasis/"_" Emphasis/EmphBody': t.emphasis,
|
||||
|
||||
// Labels (<name>) and references (@name)
|
||||
'Label/"<" Label/">" Label/LabelName': t.labelName,
|
||||
|
||||
@@ -13,19 +13,26 @@ import {
|
||||
LineCommentContent,
|
||||
MathContent,
|
||||
CodeKeyword,
|
||||
CodeIdent,
|
||||
StrongBody,
|
||||
EmphBody,
|
||||
} from './typst.terms.mjs'
|
||||
|
||||
const BACKTICK = 96 // `
|
||||
const SLASH = 47 // /
|
||||
const STAR = 42 // *
|
||||
const NEWLINE = 10 // \n
|
||||
const EQUALS = 61 // =
|
||||
const SPACE = 32 //
|
||||
const TAB = 9 // \t
|
||||
const DOLLAR = 36 // $
|
||||
const BACKTICK = 96 // `
|
||||
const SLASH = 47 // /
|
||||
const STAR = 42 // *
|
||||
const NEWLINE = 10 // \n
|
||||
const EQUALS = 61 // =
|
||||
const SPACE = 32 //
|
||||
const TAB = 9 // \t
|
||||
const DOLLAR = 36 // $
|
||||
const OPEN_BRACE = 123 // {
|
||||
const CLOSE_BRACE = 125 // }
|
||||
const HASH = 35 // #
|
||||
const HASH = 35 // #
|
||||
const UNDERSCORE = 95 // _
|
||||
const DOT = 46 // .
|
||||
const OPEN_PAREN = 40 // (
|
||||
const COMMA = 44 // ,
|
||||
|
||||
const KEYWORDS = new Set([
|
||||
'let', 'set', 'show', 'import', 'include',
|
||||
@@ -34,6 +41,13 @@ const KEYWORDS = new Set([
|
||||
'and', 'or', 'not', 'context',
|
||||
])
|
||||
|
||||
const BOOLS = new Set(['true', 'false', 'none', 'auto'])
|
||||
|
||||
const isAlpha = ch => (ch >= 65 && ch <= 90) || (ch >= 97 && ch <= 122)
|
||||
const isDigit = ch => ch >= 48 && ch <= 57
|
||||
const isIdentHead = ch => isAlpha(ch) || ch === UNDERSCORE
|
||||
const isIdentTail = ch => isAlpha(ch) || isDigit(ch) || ch === UNDERSCORE || ch === 45
|
||||
|
||||
// ── headingTokenizer ────────────────────────────────────────────────────
|
||||
// Emits HeadingMark — the "=+" prefix plus the trailing whitespace.
|
||||
// Only fires at the start of a line (pos 0, or character after '\n').
|
||||
@@ -272,15 +286,7 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
|
||||
let len = 0
|
||||
while (true) {
|
||||
const ch = input.peek(len)
|
||||
if ((ch >= 65 && ch <= 90) || // A–Z
|
||||
(ch >= 97 && ch <= 122) || // a–z
|
||||
(ch >= 48 && ch <= 57) || // 0–9
|
||||
ch === 95 || // _
|
||||
ch === 45) { // -
|
||||
len++
|
||||
} else {
|
||||
break
|
||||
}
|
||||
if (isIdentHead(ch) || (len > 0 && isIdentTail(ch))) { len++ } else { break }
|
||||
}
|
||||
|
||||
if (len === 0) return
|
||||
@@ -296,3 +302,90 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
|
||||
},
|
||||
{ contextual: true }
|
||||
)
|
||||
|
||||
// ── codeIdentTokenizer ───────────────────────────────────────────────────
|
||||
// Emits CodeIdent — identifier tokens inside code expressions (#ident,
|
||||
// #func(args), #obj.method, etc.).
|
||||
//
|
||||
// Moving CodeIdent from @tokens to an external tokenizer allows a
|
||||
// character-level guard: we only emit when the preceding non-whitespace
|
||||
// character is one of '#', '.', '(', ',' — genuine code-context positions.
|
||||
// This stops the token from firing in markup body text where LALR-merged
|
||||
// states would otherwise cause '_italic_' to be consumed as one big
|
||||
// CodeIdent (since '_' is a valid identHead) instead of opening Emphasis.
|
||||
//
|
||||
// Keywords and bools are excluded so codeKeywordTokenizer / CodeBool can
|
||||
// handle them without conflict.
|
||||
export const codeIdentTokenizer = new ExternalTokenizer(
|
||||
(input, stack) => {
|
||||
if (!stack.canShift(CodeIdent)) return
|
||||
|
||||
// Guard: only fire in code context.
|
||||
// Walk back past any horizontal whitespace (@skip) to the nearest
|
||||
// non-space character and check that it is a code-mode delimiter.
|
||||
let back = -1
|
||||
while (input.peek(back) === SPACE || input.peek(back) === TAB) back--
|
||||
const prev = input.peek(back)
|
||||
if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA) return
|
||||
|
||||
// Must start with an identifier head character.
|
||||
if (!isIdentHead(input.next)) return
|
||||
|
||||
// Peek ahead to read the full identifier.
|
||||
let len = 0
|
||||
while (true) {
|
||||
const ch = input.peek(len)
|
||||
if (len === 0 ? isIdentHead(ch) : isIdentTail(ch)) { len++ } else { break }
|
||||
}
|
||||
if (len === 0) return
|
||||
|
||||
const chars = []
|
||||
for (let i = 0; i < len; i++) chars.push(input.peek(i))
|
||||
const word = String.fromCharCode(...chars)
|
||||
|
||||
// Let codeKeywordTokenizer handle keywords; let CodeBool handle bools.
|
||||
if (KEYWORDS.has(word) || BOOLS.has(word)) return
|
||||
|
||||
for (let i = 0; i < len; i++) input.advance()
|
||||
input.acceptToken(CodeIdent)
|
||||
},
|
||||
{ contextual: true }
|
||||
)
|
||||
|
||||
// ── strongBodyTokenizer ──────────────────────────────────────────────────
|
||||
// Emits StrongBody — the content between the '*' delimiters of a Strong node.
|
||||
//
|
||||
// contextual: true — only fires when StrongBody is in the valid set, i.e.
|
||||
// inside Strong → "*" . StrongBody? "*". This state is very specific and
|
||||
// is not merged with item* by Lezer's aggressive LALR merging, so canShift
|
||||
// is a reliable guard here.
|
||||
//
|
||||
// Reads everything up to the first '*' or newline (Typst bold does not span
|
||||
// lines). A trailing '*' that is the closing delimiter is left for the
|
||||
// grammar rule to consume.
|
||||
export const strongBodyTokenizer = new ExternalTokenizer(
|
||||
(input, _stack) => {
|
||||
let hasContent = false
|
||||
while (input.next !== -1 && input.next !== STAR && input.next !== NEWLINE) {
|
||||
input.advance()
|
||||
hasContent = true
|
||||
}
|
||||
if (hasContent) input.acceptToken(StrongBody)
|
||||
},
|
||||
{ contextual: true }
|
||||
)
|
||||
|
||||
// ── emphBodyTokenizer ────────────────────────────────────────────────────
|
||||
// Emits EmphBody — the content between the '_' delimiters of an Emphasis node.
|
||||
// Same design as strongBodyTokenizer; stops at '_' or newline.
|
||||
export const emphBodyTokenizer = new ExternalTokenizer(
|
||||
(input, _stack) => {
|
||||
let hasContent = false
|
||||
while (input.next !== -1 && input.next !== UNDERSCORE && input.next !== NEWLINE) {
|
||||
input.advance()
|
||||
hasContent = true
|
||||
}
|
||||
if (hasContent) input.acceptToken(EmphBody)
|
||||
},
|
||||
{ contextual: true }
|
||||
)
|
||||
|
||||
@@ -7,6 +7,9 @@
|
||||
// rawInlineTokenizer — single-backtick raw inline content
|
||||
// codeBlockTokenizer — brace-depth tracking inside #{ ... }
|
||||
// blockCommentTokenizer — depth-tracked nested /* ... */ comments
|
||||
// codeIdentTokenizer — CodeIdent: identifier, only fires in code context
|
||||
// strongBodyTokenizer — StrongBody: content inside *...*
|
||||
// emphBodyTokenizer — EmphBody: content inside _..._
|
||||
|
||||
@top Document { item* }
|
||||
|
||||
@@ -105,16 +108,15 @@ ContentBlock { "[" item* "]" }
|
||||
InlineMath { "$" MathContent? "$" }
|
||||
|
||||
// ── Markup formatting ─────────────────────────────────────────────────────
|
||||
// Cross-nesting of Strong/Emphasis is intentionally excluded to avoid a
|
||||
// mutual-recursion cycle (Strong→Emphasis→Strong) that causes state explosion
|
||||
// in the Lezer LR automaton builder. StrongText includes '_' and EmphText
|
||||
// includes '*', so the nested delimiters are treated as plain text inside the
|
||||
// opposite construct rather than producing error nodes.
|
||||
Strong { "*" strongItem* "*" }
|
||||
strongItem { CodeExpr | InlineMath | RawInline | Label | Ref | StrongText }
|
||||
|
||||
Emphasis { "_" emphItem* "_" }
|
||||
emphItem { CodeExpr | InlineMath | RawInline | Label | Ref | EmphText }
|
||||
// Strong and Emphasis use flat external body tokens (StrongBody / EmphBody)
|
||||
// rather than recursive strongItem* / emphItem* loops. The loop approach
|
||||
// triggered LALR state merging that caused item*-level tokens (MarkupContent,
|
||||
// CodeIdent) to win over StrongText/EmphText inside the construct, so the
|
||||
// body nodes were never produced. The flat external tokens are contextual
|
||||
// (canShift only fires inside Strong/Emphasis) and reliably avoid those
|
||||
// merged states.
|
||||
Strong { "*" StrongBody? "*" }
|
||||
Emphasis { "_" EmphBody? "_" }
|
||||
|
||||
// ── Labels and references ─────────────────────────────────────────────────
|
||||
Label { "<" LabelName ">" }
|
||||
@@ -162,6 +164,24 @@ Escape { "\\" EscapeChar }
|
||||
CodeKeyword
|
||||
}
|
||||
|
||||
// CodeIdent is external so codeIdentTokenizer can apply a character-level
|
||||
// guard: it only emits when the preceding non-whitespace character is one of
|
||||
// '#', '.', '(', ',' — i.e. genuinely inside a code expression. This stops
|
||||
// the token from firing in markup body text, where LALR state merging would
|
||||
// otherwise cause the entire token (including any leading '_') to be consumed
|
||||
// as a code identifier instead of letting '_' open an Emphasis.
|
||||
@external tokens codeIdentTokenizer from "./tokens.mjs" {
|
||||
CodeIdent
|
||||
}
|
||||
|
||||
@external tokens strongBodyTokenizer from "./tokens.mjs" {
|
||||
StrongBody
|
||||
}
|
||||
|
||||
@external tokens emphBodyTokenizer from "./tokens.mjs" {
|
||||
EmphBody
|
||||
}
|
||||
|
||||
// ── Regular tokens ────────────────────────────────────────────────────────
|
||||
@tokens {
|
||||
// Horizontal whitespace only. Newlines are kept as explicit Newline items
|
||||
@@ -172,11 +192,6 @@ Escape { "\\" EscapeChar }
|
||||
// Boolean / null literals — distinct from keywords for highlighting.
|
||||
CodeBool { "true" | "false" | "none" | "auto" }
|
||||
|
||||
// General identifier: [A-Za-z_][A-Za-z0-9_-]*
|
||||
CodeIdent { identHead identTail* }
|
||||
identHead { @asciiLetter | "_" }
|
||||
identTail { @asciiLetter | @digit | "_" | "-" }
|
||||
|
||||
// Double-quoted string with backslash escapes (no single-quoted strings in Typst).
|
||||
CodeString { '"' (!["\\\n] | "\\" _)* '"' }
|
||||
|
||||
@@ -186,23 +201,14 @@ Escape { "\\" EscapeChar }
|
||||
("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")?
|
||||
}
|
||||
|
||||
// Text tokens for markup contexts; each excludes its own delimiters.
|
||||
// HeadingText, LineCommentContent, and MathContent are external tokens
|
||||
// (see above) — broad "read-to-delimiter" tokens that would otherwise
|
||||
// conflict with every other literal token in LALR-merged states.
|
||||
// '<' is excluded from StrongText/EmphText so that Label ('<' LabelName '>')
|
||||
// is recognised inside strong/emphasis rather than consumed as plain text.
|
||||
StrongText { ![\n*$#`<@\\]+ }
|
||||
EmphText { ![\n_$#`<@\\]+ }
|
||||
|
||||
// Regular markup: excludes all special-character starters plus whitespace
|
||||
// (whitespace is handled by @skip). The '/' is excluded so that '//' and
|
||||
// '/*' are not accidentally consumed as plain text.
|
||||
MarkupContent { ![\n \t=*_$#/<@`\\]+ }
|
||||
|
||||
// Label names: identifiers with optional dots/colons (e.g. <sec:intro>).
|
||||
LabelName { (identHead | @digit) (identTail | "." | ":")* }
|
||||
RefName { identHead identTail* }
|
||||
LabelName { (@asciiLetter | "_" | @digit) (@asciiLetter | @digit | "_" | "-" | "." | ":")* }
|
||||
RefName { (@asciiLetter | "_") (@asciiLetter | @digit | "_" | "-")* }
|
||||
|
||||
// Escape: any single character after backslash.
|
||||
EscapeChar { _ }
|
||||
@@ -210,19 +216,15 @@ Escape { "\\" EscapeChar }
|
||||
// Newline item — kept out of @skip so heading detection works.
|
||||
Newline { "\n" }
|
||||
|
||||
// Resolve ambiguities: more-specific tokens win over broader catch-alls.
|
||||
// EscapeChar > spaces: after '\', EscapeChar must win over the skip token
|
||||
// (both match \t; without this, '\t' would be mis-tokenized).
|
||||
// "(" > "." > "]" > text tokens: after '#' CodeIdent, callSuffix delimiters
|
||||
// must win over MarkupContent/StrongText/EmphText in merged states.
|
||||
// LineCommentContent and MathContent are external tokens — not listed here.
|
||||
// "_" added after CodeIdent: KeywordExpr { CodeKeyword CallExpr? } merges
|
||||
// the post-keyword state with markup states where "_" starts Emphasis.
|
||||
// CodeIdent wins so '#set _name(...)' is tokenised correctly; in pure markup
|
||||
// states CodeIdent is not in the valid set so "_" still opens Emphasis.
|
||||
// CodeKeyword is now an external token (codeKeywordTokenizer) and therefore
|
||||
// not listed here — it uses a peek(-1)==='#' guard to stay out of markup.
|
||||
@precedence { CodeBool CodeIdent EscapeChar "(" "." "]" "_" spaces MarkupContent StrongText EmphText }
|
||||
// Resolve ambiguities in merged states:
|
||||
// EscapeChar > spaces: after '\', EscapeChar must win over the skip token.
|
||||
// "(" > "." > "]": callSuffix delimiters must win over MarkupContent after
|
||||
// a code identifier (merged states expose these to the markup tokenizer).
|
||||
// "_" > MarkupContent: '_' must open Emphasis rather than being swallowed
|
||||
// by MarkupContent (redundant since '_' is in MarkupContent's exclusion
|
||||
// set, but kept for clarity).
|
||||
// CodeIdent and StrongText/EmphText are now external tokens — not listed.
|
||||
@precedence { CodeBool EscapeChar "(" "." "]" "_" spaces MarkupContent }
|
||||
}
|
||||
|
||||
@skip { spaces }
|
||||
|
||||
Reference in New Issue
Block a user