From 974a9c4fb3a5a3106df37117eba414a4610b99be Mon Sep 17 00:00:00 2001 From: claude Date: Mon, 8 Jun 2026 18:17:17 +0000 Subject: [PATCH] fix(typst): restore HeadingMark+HeadingTitle with character-level bleed guard The single-HeadingLine token approach caused everything after the first heading to be unparsed. Reverting to the two-token structure but adding a backward character scan in headingTitleTokenizer: after canShift(), walk backward past whitespace and require '=' immediately before the current position. Body-text positions in LALR-merged states will have a letter or closing bracket there instead, so the tokenizer returns without accepting. Co-Authored-By: Claude Sonnet 4.6 --- .../source-editor/languages/typst/index.ts | 3 +- .../source-editor/lezer-typst/tokens.mjs | 56 +++++++++++-------- .../source-editor/lezer-typst/typst.grammar | 25 +++++---- 3 files changed, 47 insertions(+), 37 deletions(-) diff --git a/services/web/frontend/js/features/source-editor/languages/typst/index.ts b/services/web/frontend/js/features/source-editor/languages/typst/index.ts index a9da188e62..a3d651731f 100644 --- a/services/web/frontend/js/features/source-editor/languages/typst/index.ts +++ b/services/web/frontend/js/features/source-editor/languages/typst/index.ts @@ -30,8 +30,7 @@ export const TypstLanguage = LRLanguage.define({ CodeArgs: foldInside, }), styleTags({ - // HeadingLine is the entire heading line (prefix + title) as one token. - HeadingLine: t.heading, + 'HeadingMark HeadingTitle': t.heading, // Comments 'LineComment LineCommentContent': t.comment, diff --git a/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs b/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs index 652dff574a..557614d5ec 100644 --- a/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs +++ b/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs @@ -2,7 +2,8 @@ import { ExternalTokenizer } from '@lezer/lr' import { - HeadingLine, + HeadingMark, + HeadingTitle, RawBlockOpen, RawBlockBody, RawBlockClose, @@ -24,42 +25,49 @@ const DOLLAR = 36 // $ const OPEN_BRACE = 123 // { const CLOSE_BRACE = 125 // } -// ── headingLineTokenizer ──────────────────────────────────────────────── -// Emits HeadingLine — the entire heading line: "=+" markers, trailing -// space, and title text, all as one token. -// -// Using a single token per heading line eliminates the two-step -// (HeadingMark + HeadingTitle) pattern that caused LALR state-merging -// problems: having a separate HeadingTitle token created a parser state -// that "waited" after HeadingMark, and LALR merged that state into -// body-text item* states. In those merged states headingTitleTokenizer -// fired for every paragraph line, swallowing inline markup tokens. -export const headingLineTokenizer = new ExternalTokenizer( +// ── headingTokenizer ──────────────────────────────────────────────────── +// Emits HeadingMark — the "=+" prefix plus the trailing whitespace. +// Only fires at the start of a line (pos 0, or character after '\n'). +export const headingTokenizer = new ExternalTokenizer( (input, _stack) => { - // Only fire at the start of a line. if (input.pos > 0 && input.peek(-1) !== NEWLINE) return - if (input.next !== EQUALS) return - - // Require one or more '=' heading markers. while (input.next === EQUALS) input.advance() - - // Must be followed by whitespace. if (input.next !== SPACE && input.next !== TAB) return - - // Consume the whitespace. while (input.next === SPACE || input.next === TAB) input.advance() + input.acceptToken(HeadingMark) + }, + { contextual: false } +) - // Consume the title text to end of line (stop before line comment). +// ── headingTitleTokenizer ──────────────────────────────────────────────── +// Emits HeadingTitle — the title text from after HeadingMark to end of line. +// +// LALR state merging means canShift(HeadingTitle) can return true in merged +// body-text states, not only in the genuine post-HeadingMark state. A +// character-level backward scan guards against those false positives: we walk +// back from input.pos past whitespace and require '=' immediately before it. +// Any body-text position will have a non-'=' character there instead. +export const headingTitleTokenizer = new ExternalTokenizer( + (input, stack) => { + if (!stack.canShift(HeadingTitle)) return + + // Walk backward past the trailing whitespace of HeadingMark. + // We must find '=' immediately before that whitespace. + let back = -1 + while (input.peek(back) === SPACE || input.peek(back) === TAB) back-- + if (input.peek(back) !== EQUALS) return + + let hasContent = false while (input.next !== -1 && input.next !== NEWLINE) { if (input.next === SLASH && (input.peek(1) === SLASH || input.peek(1) === STAR)) break input.advance() + hasContent = true } - - input.acceptToken(HeadingLine) + if (hasContent) input.acceptToken(HeadingTitle) }, - { contextual: false } + { contextual: true } ) // ── rawTokenizer ──────────────────────────────────────────────────────── diff --git a/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar b/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar index f28c78dd26..69d2e8434f 100644 --- a/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar +++ b/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar @@ -1,14 +1,12 @@ // typst.grammar — Lezer LR grammar for the Typst typesetting language. // Covers markup mode (top-level), code mode (#expr) and math mode ($...$). // External tokenizers handle constructs requiring context-sensitive lexing: -// headingLineTokenizer — entire heading line (=+ prefix + title) as one token +// headingTokenizer — HeadingMark: the leading "=+" plus whitespace +// headingTitleTokenizer — HeadingTitle: the title text to end of line // rawTokenizer — triple-backtick raw block open/body/close // rawInlineTokenizer — single-backtick raw inline content // codeBlockTokenizer — brace-depth tracking inside #{ ... } // blockCommentTokenizer — depth-tracked nested /* ... */ comments -// Using one token for the whole line avoids LALR state-merge issues that arose -// when HeadingTitle (a separate post-HeadingMark token) was accepted by -// merged body-text parser states, causing body text to be swallowed. @top Document { item* } @@ -31,11 +29,12 @@ item { } // ── Headings ────────────────────────────────────────────────────────────── -// HeadingLine covers the entire heading line: the "=+" prefix, trailing -// space, and title text. One token per line means there is no LALR state -// that "waits" for a second heading token, so no merged body-text state -// can accidentally accept HeadingLine. -Heading { HeadingLine } +// HeadingMark: the "=+" prefix + trailing whitespace (external token). +// HeadingTitle: the rest of the line. headingTitleTokenizer uses both +// stack.canShift() AND a backward character scan to refuse to fire unless +// the raw text really shows "=+" behind the current position — preventing +// LALR-merged body-text states from accidentally consuming markup as title. +Heading { HeadingMark HeadingTitle? } // ── Comments ────────────────────────────────────────────────────────────── LineComment { "//" LineCommentContent? } @@ -125,8 +124,12 @@ Ref { "@" RefName } Escape { "\\" EscapeChar } // ── External tokenizer declarations ────────────────────────────────────── -@external tokens headingLineTokenizer from "./tokens.mjs" { - HeadingLine +@external tokens headingTokenizer from "./tokens.mjs" { + HeadingMark +} + +@external tokens headingTitleTokenizer from "./tokens.mjs" { + HeadingTitle } @external tokens rawTokenizer from "./tokens.mjs" {