fix(typst): restore HeadingMark+HeadingTitle with character-level bleed guard
Build and Deploy Verso / deploy (push) Successful in 10m0s

The single-HeadingLine token approach caused everything after the first
heading to be unparsed. Reverting to the two-token structure but adding a
backward character scan in headingTitleTokenizer: after canShift(), walk
backward past whitespace and require '=' immediately before the current
position. Body-text positions in LALR-merged states will have a letter or
closing bracket there instead, so the tokenizer returns without accepting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
claude
2026-06-08 18:17:17 +00:00
parent 34025dc084
commit 974a9c4fb3
3 changed files with 47 additions and 37 deletions
@@ -30,8 +30,7 @@ export const TypstLanguage = LRLanguage.define({
CodeArgs: foldInside,
}),
styleTags({
// HeadingLine is the entire heading line (prefix + title) as one token.
HeadingLine: t.heading,
'HeadingMark HeadingTitle': t.heading,
// Comments
'LineComment LineCommentContent': t.comment,
@@ -2,7 +2,8 @@
import { ExternalTokenizer } from '@lezer/lr'
import {
HeadingLine,
HeadingMark,
HeadingTitle,
RawBlockOpen,
RawBlockBody,
RawBlockClose,
@@ -24,42 +25,49 @@ const DOLLAR = 36 // $
const OPEN_BRACE = 123 // {
const CLOSE_BRACE = 125 // }
// ── headingLineTokenizer ────────────────────────────────────────────────
// Emits HeadingLine — the entire heading line: "=+" markers, trailing
// space, and title text, all as one token.
//
// Using a single token per heading line eliminates the two-step
// (HeadingMark + HeadingTitle) pattern that caused LALR state-merging
// problems: having a separate HeadingTitle token created a parser state
// that "waited" after HeadingMark, and LALR merged that state into
// body-text item* states. In those merged states headingTitleTokenizer
// fired for every paragraph line, swallowing inline markup tokens.
export const headingLineTokenizer = new ExternalTokenizer(
// ── headingTokenizer ────────────────────────────────────────────────────
// Produces HeadingMark: a run of one or more '=' together with the spaces
// and tabs that follow it. The token is only offered when the run begins
// a line, i.e. at input position 0 or directly after a newline, and only
// when at least one space or tab follows the '=' run.
export const headingTokenizer = new ExternalTokenizer(
  (input, _stack) => {
    // Bail out unless we sit at a line start that opens with '='.
    const atLineStart = input.pos <= 0 || input.peek(-1) === NEWLINE
    if (!atLineStart || input.next !== EQUALS) return

    // Swallow the whole '=' run (the first '=' is already known to be there).
    do { input.advance() } while (input.next === EQUALS)

    // The marker is only a heading when blank space separates it from the title.
    const atBlank = () => input.next === SPACE || input.next === TAB
    if (!atBlank()) return

    // The separating blanks belong to the HeadingMark token as well.
    do { input.advance() } while (atBlank())

    input.acceptToken(HeadingMark)
  },
  { contextual: false }
)
// Consume the title text to end of line (stop before line comment).
// ── headingTitleTokenizer ────────────────────────────────────────────────
// Emits HeadingTitle — the title text from after HeadingMark to end of line
// (stopping before a "//" or "/*" comment opener).
//
// LALR state merging means canShift(HeadingTitle) can return true in merged
// body-text states, not only in the genuine post-HeadingMark state. A
// character-level backward scan guards against those false positives by
// re-checking the exact shape headingTokenizer consumes. Walking back from
// input.pos we require, in order:
//   1. at least one space/tab (HeadingMark always ends in whitespace),
//   2. a run of one or more '=',
//   3. start of input or a newline immediately before that run.
// Checking only for a single '=' is not enough: body text such as "x = y"
// or "a=b" also has '=' just behind the cursor, but never at line start.
//
// NOTE(review): the Lezer docs warn that lookbehind is not tracked for
// incremental reparsing beyond a small window; very long marker/whitespace
// runs could in theory be affected — confirm if that matters here.
//
// No token is emitted for an empty title; the grammar marks HeadingTitle
// as optional after HeadingMark.
export const headingTitleTokenizer = new ExternalTokenizer(
  (input, stack) => {
    if (!stack.canShift(HeadingTitle)) return

    // (1) Walk backward past the trailing whitespace of HeadingMark.
    let back = -1
    while (input.peek(back) === SPACE || input.peek(back) === TAB) back--
    if (back === -1) return // nothing blank behind us: not after a HeadingMark

    // (2) The whitespace must be preceded by the '=' marker run.
    if (input.peek(back) !== EQUALS) return
    while (input.peek(back) === EQUALS) back--

    // (3) The marker run must open the line (or the whole input).
    const runStartsInput = input.pos + back < 0
    if (!runStartsInput && input.peek(back) !== NEWLINE) return

    // Consume the title up to end of line, leaving comment openers alone.
    let hasContent = false
    while (input.next !== -1 && input.next !== NEWLINE) {
      if (input.next === SLASH &&
          (input.peek(1) === SLASH || input.peek(1) === STAR)) break
      input.advance()
      hasContent = true
    }
    if (hasContent) input.acceptToken(HeadingTitle)
  },
  { contextual: true }
)
// ── rawTokenizer ────────────────────────────────────────────────────────
@@ -1,14 +1,12 @@
// typst.grammar — Lezer LR grammar for the Typst typesetting language.
// Covers markup mode (top-level), code mode (#expr) and math mode ($...$).
// External tokenizers handle constructs requiring context-sensitive lexing:
// headingLineTokenizer — entire heading line (=+ prefix + title) as one token
// headingTokenizer — HeadingMark: the leading "=+" plus whitespace
// headingTitleTokenizer — HeadingTitle: the title text to end of line
// rawTokenizer — triple-backtick raw block open/body/close
// rawInlineTokenizer — single-backtick raw inline content
// codeBlockTokenizer — brace-depth tracking inside #{ ... }
// blockCommentTokenizer — depth-tracked nested /* ... */ comments
// Using one token for the whole line avoids LALR state-merge issues that arose
// when HeadingTitle (a separate post-HeadingMark token) was accepted by
// merged body-text parser states, causing body text to be swallowed.
@top Document { item* }
@@ -31,11 +29,12 @@ item {
}
// ── Headings ──────────────────────────────────────────────────────────────
// HeadingLine covers the entire heading line: the "=+" prefix, trailing
// space, and title text. One token per line means there is no LALR state
// that "waits" for a second heading token, so no merged body-text state
// can accidentally accept HeadingLine.
Heading { HeadingLine }
// HeadingMark: the "=+" prefix + trailing whitespace (external token).
// HeadingTitle: the rest of the line. headingTitleTokenizer uses both
// stack.canShift() AND a backward character scan to refuse to fire unless
// the raw text really shows "=+" behind the current position — preventing
// LALR-merged body-text states from accidentally consuming markup as title.
Heading { HeadingMark HeadingTitle? }
// ── Comments ──────────────────────────────────────────────────────────────
LineComment { "//" LineCommentContent? }
@@ -125,8 +124,12 @@ Ref { "@" RefName }
Escape { "\\" EscapeChar }
// ── External tokenizer declarations ──────────────────────────────────────
@external tokens headingLineTokenizer from "./tokens.mjs" {
HeadingLine
@external tokens headingTokenizer from "./tokens.mjs" {
HeadingMark
}
@external tokens headingTitleTokenizer from "./tokens.mjs" {
HeadingTitle
}
@external tokens rawTokenizer from "./tokens.mjs" {