fix: make CodeIdent external and replace strongItem*/emphItem* with flat body tokens
Build and Deploy Verso / deploy (push) Successful in 13m36s
Build and Deploy Verso / deploy (push) Successful in 13m36s
Two LALR state-merging bugs prevented Strong/Emphasis nodes from ever being
produced (confirmed: tok-strong/tok-emphasis count = 0 in browser diagnostic).
Bug 1 — _italic_ consumed as CodeIdent:
CodeIdent was a @tokens rule with identHead = [A-Za-z_], so '_italic_' (the
entire string including both underscores) matched as one CodeIdent token.
LALR merging caused CodeIdent to be in item*'s valid set, and CodeIdent >
"_" in @precedence, so the parser never opened Emphasis.
Fix: move CodeIdent to an external tokenizer (codeIdentTokenizer) with a
character-level guard — it only fires when the nearest preceding character
that is not a space or tab is one of '#', '.', '(', ',' (genuine
code-context positions). In body text, where the peek-back lands on a
newline, a markup delimiter, or ordinary text, the tokenizer returns
without emitting, letting the "_" literal open Emphasis correctly.
Bug 2 — StrongText never produced inside Strong:
The strongItem* / emphItem* loops merged with item* states via Lezer's
aggressive LALR merging. In the merged state MarkupContent was in the
valid set (from the item* side) and MarkupContent > StrongText in
@precedence, so MarkupContent was always produced — not a valid strongItem,
leading to error recovery with no StrongText in the tree.
Fix: replace the recursive strongItem* / emphItem* loops with flat external
tokens StrongBody / EmphBody (contextual: true). These fire only in the
Strong → "*" . StrongBody? "*" and Emphasis → "_" . EmphBody? "_" states,
which are specific enough that the valid-token check (what canShift
reports) is reliable. Each reads everything up to the closing delimiter,
a newline, or end of input in one token, bypassing the LALR merging
entirely.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -73,8 +73,8 @@ export const TypstLanguage = LRLanguage.define({
|
|||||||
MathContent: t.string,
|
MathContent: t.string,
|
||||||
|
|
||||||
// Markup emphasis
|
// Markup emphasis
|
||||||
'Strong/"*" Strong/StrongText': t.strong,
|
'Strong/"*" Strong/StrongBody': t.strong,
|
||||||
'Emphasis/"_" Emphasis/EmphText': t.emphasis,
|
'Emphasis/"_" Emphasis/EmphBody': t.emphasis,
|
||||||
|
|
||||||
// Labels (<name>) and references (@name)
|
// Labels (<name>) and references (@name)
|
||||||
'Label/"<" Label/">" Label/LabelName': t.labelName,
|
'Label/"<" Label/">" Label/LabelName': t.labelName,
|
||||||
|
|||||||
@@ -13,19 +13,26 @@ import {
|
|||||||
LineCommentContent,
|
LineCommentContent,
|
||||||
MathContent,
|
MathContent,
|
||||||
CodeKeyword,
|
CodeKeyword,
|
||||||
|
CodeIdent,
|
||||||
|
StrongBody,
|
||||||
|
EmphBody,
|
||||||
} from './typst.terms.mjs'
|
} from './typst.terms.mjs'
|
||||||
|
|
||||||
const BACKTICK = 96 // `
|
const BACKTICK = 96 // `
|
||||||
const SLASH = 47 // /
|
const SLASH = 47 // /
|
||||||
const STAR = 42 // *
|
const STAR = 42 // *
|
||||||
const NEWLINE = 10 // \n
|
const NEWLINE = 10 // \n
|
||||||
const EQUALS = 61 // =
|
const EQUALS = 61 // =
|
||||||
const SPACE = 32 //
|
const SPACE = 32 //
|
||||||
const TAB = 9 // \t
|
const TAB = 9 // \t
|
||||||
const DOLLAR = 36 // $
|
const DOLLAR = 36 // $
|
||||||
const OPEN_BRACE = 123 // {
|
const OPEN_BRACE = 123 // {
|
||||||
const CLOSE_BRACE = 125 // }
|
const CLOSE_BRACE = 125 // }
|
||||||
const HASH = 35 // #
|
const HASH = 35 // #
|
||||||
|
const UNDERSCORE = 95 // _
|
||||||
|
const DOT = 46 // .
|
||||||
|
const OPEN_PAREN = 40 // (
|
||||||
|
const COMMA = 44 // ,
|
||||||
|
|
||||||
const KEYWORDS = new Set([
|
const KEYWORDS = new Set([
|
||||||
'let', 'set', 'show', 'import', 'include',
|
'let', 'set', 'show', 'import', 'include',
|
||||||
@@ -34,6 +41,13 @@ const KEYWORDS = new Set([
|
|||||||
'and', 'or', 'not', 'context',
|
'and', 'or', 'not', 'context',
|
||||||
])
|
])
|
||||||
|
|
||||||
|
const BOOLS = new Set(['true', 'false', 'none', 'auto'])
|
||||||
|
|
||||||
|
const isAlpha = ch => (ch >= 65 && ch <= 90) || (ch >= 97 && ch <= 122)
|
||||||
|
const isDigit = ch => ch >= 48 && ch <= 57
|
||||||
|
const isIdentHead = ch => isAlpha(ch) || ch === UNDERSCORE
|
||||||
|
const isIdentTail = ch => isAlpha(ch) || isDigit(ch) || ch === UNDERSCORE || ch === 45
|
||||||
|
|
||||||
// ── headingTokenizer ────────────────────────────────────────────────────
|
// ── headingTokenizer ────────────────────────────────────────────────────
|
||||||
// Emits HeadingMark — the "=+" prefix plus the trailing whitespace.
|
// Emits HeadingMark — the "=+" prefix plus the trailing whitespace.
|
||||||
// Only fires at the start of a line (pos 0, or character after '\n').
|
// Only fires at the start of a line (pos 0, or character after '\n').
|
||||||
@@ -272,15 +286,7 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
|
|||||||
let len = 0
|
let len = 0
|
||||||
while (true) {
|
while (true) {
|
||||||
const ch = input.peek(len)
|
const ch = input.peek(len)
|
||||||
if ((ch >= 65 && ch <= 90) || // A–Z
|
if (isIdentHead(ch) || (len > 0 && isIdentTail(ch))) { len++ } else { break }
|
||||||
(ch >= 97 && ch <= 122) || // a–z
|
|
||||||
(ch >= 48 && ch <= 57) || // 0–9
|
|
||||||
ch === 95 || // _
|
|
||||||
ch === 45) { // -
|
|
||||||
len++
|
|
||||||
} else {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (len === 0) return
|
if (len === 0) return
|
||||||
@@ -296,3 +302,90 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
|
|||||||
},
|
},
|
||||||
{ contextual: true }
|
{ contextual: true }
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// ── codeIdentTokenizer ───────────────────────────────────────────────────
|
||||||
|
// Emits CodeIdent — identifier tokens inside code expressions (#ident,
|
||||||
|
// #func(args), #obj.method, etc.).
|
||||||
|
//
|
||||||
|
// Moving CodeIdent from @tokens to an external tokenizer allows a
|
||||||
|
// character-level guard: we only emit when the nearest preceding non-space/tab
|
||||||
|
// character is one of '#', '.', '(', ',' — genuine code-context positions.
|
||||||
|
// This stops the token from firing in markup body text where LALR-merged
|
||||||
|
// states would otherwise cause '_italic_' to be consumed as one big
|
||||||
|
// CodeIdent (since '_' is a valid identHead) instead of opening Emphasis.
|
||||||
|
//
|
||||||
|
// Keywords and bools are excluded so codeKeywordTokenizer / CodeBool can
|
||||||
|
// handle them without conflict.
|
||||||
|
export const codeIdentTokenizer = new ExternalTokenizer(
|
||||||
|
(input, stack) => {
|
||||||
|
if (!stack.canShift(CodeIdent)) return
|
||||||
|
|
||||||
|
// Guard: only fire in code context.
|
||||||
|
// Walk back past any horizontal whitespace (@skip) to the nearest
|
||||||
|
// non-space character and check that it is a code-mode delimiter.
|
||||||
|
let back = -1
|
||||||
|
while (input.peek(back) === SPACE || input.peek(back) === TAB) back--
|
||||||
|
const prev = input.peek(back)
|
||||||
|
if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA) return
|
||||||
|
|
||||||
|
// Must start with an identifier head character.
|
||||||
|
if (!isIdentHead(input.next)) return
|
||||||
|
|
||||||
|
// Peek ahead to read the full identifier.
|
||||||
|
let len = 0
|
||||||
|
while (true) {
|
||||||
|
const ch = input.peek(len)
|
||||||
|
if (len === 0 ? isIdentHead(ch) : isIdentTail(ch)) { len++ } else { break }
|
||||||
|
}
|
||||||
|
if (len === 0) return
|
||||||
|
|
||||||
|
const chars = []
|
||||||
|
for (let i = 0; i < len; i++) chars.push(input.peek(i))
|
||||||
|
const word = String.fromCharCode(...chars)
|
||||||
|
|
||||||
|
// Let codeKeywordTokenizer handle keywords; let CodeBool handle bools.
|
||||||
|
if (KEYWORDS.has(word) || BOOLS.has(word)) return
|
||||||
|
|
||||||
|
for (let i = 0; i < len; i++) input.advance()
|
||||||
|
input.acceptToken(CodeIdent)
|
||||||
|
},
|
||||||
|
{ contextual: true }
|
||||||
|
)
|
||||||
|
|
||||||
|
// ── strongBodyTokenizer ──────────────────────────────────────────────────
|
||||||
|
// Emits StrongBody — the content between the '*' delimiters of a Strong node.
|
||||||
|
//
|
||||||
|
// contextual: true — only fires when StrongBody is in the valid set, i.e.
|
||||||
|
// inside Strong → "*" . StrongBody? "*". This state is very specific and
|
||||||
|
// is not merged with item* by Lezer's aggressive LALR merging, so canShift
|
||||||
|
// is a reliable guard here.
|
||||||
|
//
|
||||||
|
// Reads everything up to the first '*' or newline (Typst bold does not span
|
||||||
|
// lines). A trailing '*' that is the closing delimiter is left for the
|
||||||
|
// grammar rule to consume.
|
||||||
|
export const strongBodyTokenizer = new ExternalTokenizer(
|
||||||
|
(input, _stack) => {
|
||||||
|
let hasContent = false
|
||||||
|
while (input.next !== -1 && input.next !== STAR && input.next !== NEWLINE) {
|
||||||
|
input.advance()
|
||||||
|
hasContent = true
|
||||||
|
}
|
||||||
|
if (hasContent) input.acceptToken(StrongBody)
|
||||||
|
},
|
||||||
|
{ contextual: true }
|
||||||
|
)
|
||||||
|
|
||||||
|
// ── emphBodyTokenizer ────────────────────────────────────────────────────
|
||||||
|
// Emits EmphBody — the content between the '_' delimiters of an Emphasis node.
|
||||||
|
// Same design as strongBodyTokenizer; stops at '_' or newline.
|
||||||
|
export const emphBodyTokenizer = new ExternalTokenizer(
|
||||||
|
(input, _stack) => {
|
||||||
|
let hasContent = false
|
||||||
|
while (input.next !== -1 && input.next !== UNDERSCORE && input.next !== NEWLINE) {
|
||||||
|
input.advance()
|
||||||
|
hasContent = true
|
||||||
|
}
|
||||||
|
if (hasContent) input.acceptToken(EmphBody)
|
||||||
|
},
|
||||||
|
{ contextual: true }
|
||||||
|
)
|
||||||
|
|||||||
@@ -7,6 +7,9 @@
|
|||||||
// rawInlineTokenizer — single-backtick raw inline content
|
// rawInlineTokenizer — single-backtick raw inline content
|
||||||
// codeBlockTokenizer — brace-depth tracking inside #{ ... }
|
// codeBlockTokenizer — brace-depth tracking inside #{ ... }
|
||||||
// blockCommentTokenizer — depth-tracked nested /* ... */ comments
|
// blockCommentTokenizer — depth-tracked nested /* ... */ comments
|
||||||
|
// codeIdentTokenizer — CodeIdent: identifier, only fires in code context
|
||||||
|
// strongBodyTokenizer — StrongBody: content inside *...*
|
||||||
|
// emphBodyTokenizer — EmphBody: content inside _..._
|
||||||
|
|
||||||
@top Document { item* }
|
@top Document { item* }
|
||||||
|
|
||||||
@@ -105,16 +108,15 @@ ContentBlock { "[" item* "]" }
|
|||||||
InlineMath { "$" MathContent? "$" }
|
InlineMath { "$" MathContent? "$" }
|
||||||
|
|
||||||
// ── Markup formatting ─────────────────────────────────────────────────────
|
// ── Markup formatting ─────────────────────────────────────────────────────
|
||||||
// Cross-nesting of Strong/Emphasis is intentionally excluded to avoid a
|
// Strong and Emphasis use flat external body tokens (StrongBody / EmphBody)
|
||||||
// mutual-recursion cycle (Strong→Emphasis→Strong) that causes state explosion
|
// rather than recursive strongItem* / emphItem* loops. The loop approach
|
||||||
// in the Lezer LR automaton builder. StrongText includes '_' and EmphText
|
// triggered LALR state merging that caused item*-level tokens (MarkupContent,
|
||||||
// includes '*', so the nested delimiters are treated as plain text inside the
|
// CodeIdent) to win over StrongText/EmphText inside the construct, so the
|
||||||
// opposite construct rather than producing error nodes.
|
// body nodes were never produced. The flat external tokens are contextual
|
||||||
Strong { "*" strongItem* "*" }
|
// (canShift only fires inside Strong/Emphasis) and reliably avoid those
|
||||||
strongItem { CodeExpr | InlineMath | RawInline | Label | Ref | StrongText }
|
// merged states.
|
||||||
|
Strong { "*" StrongBody? "*" }
|
||||||
Emphasis { "_" emphItem* "_" }
|
Emphasis { "_" EmphBody? "_" }
|
||||||
emphItem { CodeExpr | InlineMath | RawInline | Label | Ref | EmphText }
|
|
||||||
|
|
||||||
// ── Labels and references ─────────────────────────────────────────────────
|
// ── Labels and references ─────────────────────────────────────────────────
|
||||||
Label { "<" LabelName ">" }
|
Label { "<" LabelName ">" }
|
||||||
@@ -162,6 +164,24 @@ Escape { "\\" EscapeChar }
|
|||||||
CodeKeyword
|
CodeKeyword
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CodeIdent is external so codeIdentTokenizer can apply a character-level
|
||||||
|
// guard: it only emits when the nearest preceding non-space/tab character is one of
|
||||||
|
// '#', '.', '(', ',' — i.e. genuinely inside a code expression. This stops
|
||||||
|
// the token from firing in markup body text, where LALR state merging would
|
||||||
|
// otherwise cause the entire token (including any leading '_') to be consumed
|
||||||
|
// as a code identifier instead of letting '_' open an Emphasis.
|
||||||
|
@external tokens codeIdentTokenizer from "./tokens.mjs" {
|
||||||
|
CodeIdent
|
||||||
|
}
|
||||||
|
|
||||||
|
@external tokens strongBodyTokenizer from "./tokens.mjs" {
|
||||||
|
StrongBody
|
||||||
|
}
|
||||||
|
|
||||||
|
@external tokens emphBodyTokenizer from "./tokens.mjs" {
|
||||||
|
EmphBody
|
||||||
|
}
|
||||||
|
|
||||||
// ── Regular tokens ────────────────────────────────────────────────────────
|
// ── Regular tokens ────────────────────────────────────────────────────────
|
||||||
@tokens {
|
@tokens {
|
||||||
// Horizontal whitespace only. Newlines are kept as explicit Newline items
|
// Horizontal whitespace only. Newlines are kept as explicit Newline items
|
||||||
@@ -172,11 +192,6 @@ Escape { "\\" EscapeChar }
|
|||||||
// Boolean / null literals — distinct from keywords for highlighting.
|
// Boolean / null literals — distinct from keywords for highlighting.
|
||||||
CodeBool { "true" | "false" | "none" | "auto" }
|
CodeBool { "true" | "false" | "none" | "auto" }
|
||||||
|
|
||||||
// General identifier: [A-Za-z_][A-Za-z0-9_-]*
|
|
||||||
CodeIdent { identHead identTail* }
|
|
||||||
identHead { @asciiLetter | "_" }
|
|
||||||
identTail { @asciiLetter | @digit | "_" | "-" }
|
|
||||||
|
|
||||||
// Double-quoted string with backslash escapes (no single-quoted strings in Typst).
|
// Double-quoted string with backslash escapes (no single-quoted strings in Typst).
|
||||||
CodeString { '"' (!["\\\n] | "\\" _)* '"' }
|
CodeString { '"' (!["\\\n] | "\\" _)* '"' }
|
||||||
|
|
||||||
@@ -186,23 +201,14 @@ Escape { "\\" EscapeChar }
|
|||||||
("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")?
|
("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")?
|
||||||
}
|
}
|
||||||
|
|
||||||
// Text tokens for markup contexts; each excludes its own delimiters.
|
|
||||||
// HeadingText, LineCommentContent, and MathContent are external tokens
|
|
||||||
// (see above) — broad "read-to-delimiter" tokens that would otherwise
|
|
||||||
// conflict with every other literal token in LALR-merged states.
|
|
||||||
// '<' is excluded from StrongText/EmphText so that Label ('<' LabelName '>')
|
|
||||||
// is recognised inside strong/emphasis rather than consumed as plain text.
|
|
||||||
StrongText { ![\n*$#`<@\\]+ }
|
|
||||||
EmphText { ![\n_$#`<@\\]+ }
|
|
||||||
|
|
||||||
// Regular markup: excludes all special-character starters plus whitespace
|
// Regular markup: excludes all special-character starters plus whitespace
|
||||||
// (whitespace is handled by @skip). The '/' is excluded so that '//' and
|
// (whitespace is handled by @skip). The '/' is excluded so that '//' and
|
||||||
// '/*' are not accidentally consumed as plain text.
|
// '/*' are not accidentally consumed as plain text.
|
||||||
MarkupContent { ![\n \t=*_$#/<@`\\]+ }
|
MarkupContent { ![\n \t=*_$#/<@`\\]+ }
|
||||||
|
|
||||||
// Label names: identifiers with optional dots/colons (e.g. <sec:intro>).
|
// Label names: identifiers with optional dots/colons (e.g. <sec:intro>).
|
||||||
LabelName { (identHead | @digit) (identTail | "." | ":")* }
|
LabelName { (@asciiLetter | "_" | @digit) (@asciiLetter | @digit | "_" | "-" | "." | ":")* }
|
||||||
RefName { identHead identTail* }
|
RefName { (@asciiLetter | "_") (@asciiLetter | @digit | "_" | "-")* }
|
||||||
|
|
||||||
// Escape: any single character after backslash.
|
// Escape: any single character after backslash.
|
||||||
EscapeChar { _ }
|
EscapeChar { _ }
|
||||||
@@ -210,19 +216,15 @@ Escape { "\\" EscapeChar }
|
|||||||
// Newline item — kept out of @skip so heading detection works.
|
// Newline item — kept out of @skip so heading detection works.
|
||||||
Newline { "\n" }
|
Newline { "\n" }
|
||||||
|
|
||||||
// Resolve ambiguities: more-specific tokens win over broader catch-alls.
|
// Resolve ambiguities in merged states:
|
||||||
// EscapeChar > spaces: after '\', EscapeChar must win over the skip token
|
// EscapeChar > spaces: after '\', EscapeChar must win over the skip token.
|
||||||
// (both match \t; without this, '\t' would be mis-tokenized).
|
// "(" > "." > "]": callSuffix delimiters must win over MarkupContent after
|
||||||
// "(" > "." > "]" > text tokens: after '#' CodeIdent, callSuffix delimiters
|
// a code identifier (merged states expose these to the markup tokenizer).
|
||||||
// must win over MarkupContent/StrongText/EmphText in merged states.
|
// "_" > MarkupContent: '_' must open Emphasis rather than being swallowed
|
||||||
// LineCommentContent and MathContent are external tokens — not listed here.
|
// by MarkupContent (redundant since '_' is in MarkupContent's exclusion
|
||||||
// "_" added after CodeIdent: KeywordExpr { CodeKeyword CallExpr? } merges
|
// set, but kept for clarity).
|
||||||
// the post-keyword state with markup states where "_" starts Emphasis.
|
// CodeIdent is now an external token and StrongText/EmphText were replaced by the external StrongBody/EmphBody — none are listed.
|
||||||
// CodeIdent wins so '#set _name(...)' is tokenised correctly; in pure markup
|
@precedence { CodeBool EscapeChar "(" "." "]" "_" spaces MarkupContent }
|
||||||
// states CodeIdent is not in the valid set so "_" still opens Emphasis.
|
|
||||||
// CodeKeyword is now an external token (codeKeywordTokenizer) and therefore
|
|
||||||
// not listed here — it uses a peek(-1)==='#' guard to stay out of markup.
|
|
||||||
@precedence { CodeBool CodeIdent EscapeChar "(" "." "]" "_" spaces MarkupContent StrongText EmphText }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@skip { spaces }
|
@skip { spaces }
|
||||||
|
|||||||
Reference in New Issue
Block a user