fix(typst): restore HeadingMark+HeadingTitle with character-level bleed guard
Build and Deploy Verso / deploy (push) Successful in 10m0s
Build and Deploy Verso / deploy (push) Successful in 10m0s
The single-HeadingLine token approach caused everything after the first heading to be left unparsed.

Revert to the two-token structure, but add a backward character scan in headingTitleTokenizer: after the canShift() check, walk backward from the current position past any whitespace and require '=' immediately before that whitespace. Body-text positions in LALR-merged states will have a letter or closing bracket there instead, so the tokenizer returns without accepting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -30,8 +30,7 @@ export const TypstLanguage = LRLanguage.define({
|
||||
CodeArgs: foldInside,
|
||||
}),
|
||||
styleTags({
|
||||
// HeadingLine is the entire heading line (prefix + title) as one token.
|
||||
HeadingLine: t.heading,
|
||||
'HeadingMark HeadingTitle': t.heading,
|
||||
|
||||
// Comments
|
||||
'LineComment LineCommentContent': t.comment,
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
import { ExternalTokenizer } from '@lezer/lr'
|
||||
import {
|
||||
HeadingLine,
|
||||
HeadingMark,
|
||||
HeadingTitle,
|
||||
RawBlockOpen,
|
||||
RawBlockBody,
|
||||
RawBlockClose,
|
||||
@@ -24,42 +25,49 @@ const DOLLAR = 36 // $
|
||||
const OPEN_BRACE = 123 // {
|
||||
const CLOSE_BRACE = 125 // }
|
||||
|
||||
// ── headingLineTokenizer ────────────────────────────────────────────────
|
||||
// Emits HeadingLine — the entire heading line: "=+" markers, trailing
|
||||
// space, and title text, all as one token.
|
||||
//
|
||||
// Using a single token per heading line eliminates the two-step
|
||||
// (HeadingMark + HeadingTitle) pattern that caused LALR state-merging
|
||||
// problems: having a separate HeadingTitle token created a parser state
|
||||
// that "waited" after HeadingMark, and LALR merged that state into
|
||||
// body-text item* states. In those merged states headingTitleTokenizer
|
||||
// fired for every paragraph line, swallowing inline markup tokens.
|
||||
export const headingLineTokenizer = new ExternalTokenizer(
|
||||
// ── headingTokenizer ────────────────────────────────────────────────────
|
||||
// Emits HeadingMark — the "=+" prefix plus the trailing whitespace.
|
||||
// Only fires at the start of a line (pos 0, or character after '\n').
|
||||
export const headingTokenizer = new ExternalTokenizer(
|
||||
(input, _stack) => {
|
||||
// Only fire at the start of a line.
|
||||
if (input.pos > 0 && input.peek(-1) !== NEWLINE) return
|
||||
|
||||
if (input.next !== EQUALS) return
|
||||
|
||||
// Require one or more '=' heading markers.
|
||||
while (input.next === EQUALS) input.advance()
|
||||
|
||||
// Must be followed by whitespace.
|
||||
if (input.next !== SPACE && input.next !== TAB) return
|
||||
|
||||
// Consume the whitespace.
|
||||
while (input.next === SPACE || input.next === TAB) input.advance()
|
||||
input.acceptToken(HeadingMark)
|
||||
},
|
||||
{ contextual: false }
|
||||
)
|
||||
|
||||
// Consume the title text to end of line (stop before line comment).
|
||||
// ── headingTitleTokenizer ────────────────────────────────────────────────
|
||||
// Emits HeadingTitle — the title text from after HeadingMark to end of line.
|
||||
//
|
||||
// LALR state merging means canShift(HeadingTitle) can return true in merged
|
||||
// body-text states, not only in the genuine post-HeadingMark state. A
|
||||
// character-level backward scan guards against those false positives: we walk
|
||||
// back from input.pos past whitespace and require '=' immediately before it.
|
||||
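// Most body-text positions will have a non-'=' character there instead.
// NOTE(review): the scan does not confirm that the '=' run starts a line, so
// body text such as "a = b" would also pass this check — confirm whether the
// guard should additionally require a newline (or start of input) before the '='.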
|
||||
export const headingTitleTokenizer = new ExternalTokenizer(
|
||||
(input, stack) => {
|
||||
if (!stack.canShift(HeadingTitle)) return
|
||||
|
||||
// Walk backward past the trailing whitespace of HeadingMark.
|
||||
// We must find '=' immediately before that whitespace.
|
||||
let back = -1
|
||||
while (input.peek(back) === SPACE || input.peek(back) === TAB) back--
|
||||
if (input.peek(back) !== EQUALS) return
|
||||
|
||||
let hasContent = false
|
||||
while (input.next !== -1 && input.next !== NEWLINE) {
|
||||
if (input.next === SLASH &&
|
||||
(input.peek(1) === SLASH || input.peek(1) === STAR)) break
|
||||
input.advance()
|
||||
hasContent = true
|
||||
}
|
||||
|
||||
input.acceptToken(HeadingLine)
|
||||
if (hasContent) input.acceptToken(HeadingTitle)
|
||||
},
|
||||
{ contextual: false }
|
||||
{ contextual: true }
|
||||
)
|
||||
|
||||
// ── rawTokenizer ────────────────────────────────────────────────────────
|
||||
|
||||
@@ -1,14 +1,12 @@
|
||||
// typst.grammar — Lezer LR grammar for the Typst typesetting language.
|
||||
// Covers markup mode (top-level), code mode (#expr) and math mode ($...$).
|
||||
// External tokenizers handle constructs requiring context-sensitive lexing:
|
||||
// headingLineTokenizer — entire heading line (=+ prefix + title) as one token
|
||||
// headingTokenizer — HeadingMark: the leading "=+" plus whitespace
|
||||
// headingTitleTokenizer — HeadingTitle: the title text to end of line
|
||||
// rawTokenizer — triple-backtick raw block open/body/close
|
||||
// rawInlineTokenizer — single-backtick raw inline content
|
||||
// codeBlockTokenizer — brace-depth tracking inside #{ ... }
|
||||
// blockCommentTokenizer — depth-tracked nested /* ... */ comments
|
||||
// Using one token for the whole line avoids LALR state-merge issues that arose
|
||||
// when HeadingTitle (a separate post-HeadingMark token) was accepted by
|
||||
// merged body-text parser states, causing body text to be swallowed.
|
||||
|
||||
@top Document { item* }
|
||||
|
||||
@@ -31,11 +29,12 @@ item {
|
||||
}
|
||||
|
||||
// ── Headings ──────────────────────────────────────────────────────────────
|
||||
// HeadingLine covers the entire heading line: the "=+" prefix, trailing
|
||||
// space, and title text. One token per line means there is no LALR state
|
||||
// that "waits" for a second heading token, so no merged body-text state
|
||||
// can accidentally accept HeadingLine.
|
||||
Heading { HeadingLine }
|
||||
// HeadingMark: the "=+" prefix + trailing whitespace (external token).
|
||||
// HeadingTitle: the rest of the line. headingTitleTokenizer uses both
|
||||
// stack.canShift() AND a backward character scan to refuse to fire unless
|
||||
// the raw text really shows "=+" behind the current position — preventing
|
||||
// LALR-merged body-text states from accidentally consuming markup as title.
|
||||
Heading { HeadingMark HeadingTitle? }
|
||||
|
||||
// ── Comments ──────────────────────────────────────────────────────────────
|
||||
LineComment { "//" LineCommentContent? }
|
||||
@@ -125,8 +124,12 @@ Ref { "@" RefName }
|
||||
Escape { "\\" EscapeChar }
|
||||
|
||||
// ── External tokenizer declarations ──────────────────────────────────────
|
||||
@external tokens headingLineTokenizer from "./tokens.mjs" {
|
||||
HeadingLine
|
||||
@external tokens headingTokenizer from "./tokens.mjs" {
|
||||
HeadingMark
|
||||
}
|
||||
|
||||
@external tokens headingTitleTokenizer from "./tokens.mjs" {
|
||||
HeadingTitle
|
||||
}
|
||||
|
||||
@external tokens rawTokenizer from "./tokens.mjs" {
|
||||
|
||||
Reference in New Issue
Block a user