From 974a9c4fb3a5a3106df37117eba414a4610b99be Mon Sep 17 00:00:00 2001
From: claude <claude@verso>
Date: Mon, 8 Jun 2026 18:17:17 +0000
Subject: [PATCH] fix(typst): restore HeadingMark+HeadingTitle with
 character-level bleed guard

The single-HeadingLine token approach left everything after the first
heading unparsed. Revert to the two-token structure and add a backward
character scan in headingTitleTokenizer: after canShift() succeeds, walk
backward past spaces/tabs and require '=' immediately before that
whitespace. Body-text positions in LALR-merged states normally have a letter
or closing bracket there instead, so the tokenizer returns without accepting.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../source-editor/languages/typst/index.ts    |  3 +-
 .../source-editor/lezer-typst/tokens.mjs      | 56 +++++++++++--------
 .../source-editor/lezer-typst/typst.grammar   | 25 +++++----
 3 files changed, 47 insertions(+), 37 deletions(-)

diff --git a/services/web/frontend/js/features/source-editor/languages/typst/index.ts b/services/web/frontend/js/features/source-editor/languages/typst/index.ts
index a9da188e62..a3d651731f 100644
--- a/services/web/frontend/js/features/source-editor/languages/typst/index.ts
+++ b/services/web/frontend/js/features/source-editor/languages/typst/index.ts
@@ -30,8 +30,7 @@ export const TypstLanguage = LRLanguage.define({
         CodeArgs: foldInside,
       }),
       styleTags({
-        // HeadingLine is the entire heading line (prefix + title) as one token.
-        HeadingLine: t.heading,
+        'HeadingMark HeadingTitle': t.heading,
 
         // Comments
         'LineComment LineCommentContent': t.comment,
diff --git a/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs b/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs
index 652dff574a..557614d5ec 100644
--- a/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs
+++ b/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs
@@ -2,7 +2,8 @@
 
 import { ExternalTokenizer } from '@lezer/lr'
 import {
-  HeadingLine,
+  HeadingMark,
+  HeadingTitle,
   RawBlockOpen,
   RawBlockBody,
   RawBlockClose,
@@ -24,42 +25,49 @@ const DOLLAR    = 36  // $
 const OPEN_BRACE  = 123 // {
 const CLOSE_BRACE = 125 // }
 
-// ── headingLineTokenizer ────────────────────────────────────────────────
-// Emits HeadingLine — the entire heading line: "=+" markers, trailing
-// space, and title text, all as one token.
-//
-// Using a single token per heading line eliminates the two-step
-// (HeadingMark + HeadingTitle) pattern that caused LALR state-merging
-// problems: having a separate HeadingTitle token created a parser state
-// that "waited" after HeadingMark, and LALR merged that state into
-// body-text item* states.  In those merged states headingTitleTokenizer
-// fired for every paragraph line, swallowing inline markup tokens.
-export const headingLineTokenizer = new ExternalTokenizer(
+// ── headingTokenizer ────────────────────────────────────────────────────
+// Emits HeadingMark — one or more '=' plus the spaces/tabs that follow them.
+// Only fires at line start (pos 0 or after '\n'); needs a space/tab after '='.
+export const headingTokenizer = new ExternalTokenizer(
   (input, _stack) => {
-    // Only fire at the start of a line.
     if (input.pos > 0 && input.peek(-1) !== NEWLINE) return
-
     if (input.next !== EQUALS) return
-
-    // Require one or more '=' heading markers.
     while (input.next === EQUALS) input.advance()
-
-    // Must be followed by whitespace.
     if (input.next !== SPACE && input.next !== TAB) return
-
-    // Consume the whitespace.
     while (input.next === SPACE || input.next === TAB) input.advance()
+    input.acceptToken(HeadingMark)
+  },
+  { contextual: false }
+)
 
-    // Consume the title text to end of line (stop before line comment).
+// ── headingTitleTokenizer ────────────────────────────────────────────────
+// Emits HeadingTitle — the title text from after HeadingMark to end of line
+// (stopping before a "//" or "/*" comment opener).
+//
+// LALR state merging means canShift(HeadingTitle) can return true in merged
+// body-text states, not only in the genuine post-HeadingMark state.  To reject
+// those, scan backward and require exactly what headingTokenizer consumes:
+// a line start, one or more '=', then one or more spaces/tabs.  A bare '='
+// check could still pass after a mid-line '=' in body text (e.g. "a = b").
+export const headingTitleTokenizer = new ExternalTokenizer(
+  (input, stack) => {
+    if (!stack.canShift(HeadingTitle)) return
+
+    let back = -1
+    while (input.peek(back) === SPACE || input.peek(back) === TAB) back--
+    if (back === -1 || input.peek(back) !== EQUALS) return
+    while (input.peek(back) === EQUALS) back--
+    if (input.pos + back + 1 > 0 && input.peek(back) !== NEWLINE) return
+    let hasContent = false
     while (input.next !== -1 && input.next !== NEWLINE) {
       if (input.next === SLASH &&
           (input.peek(1) === SLASH || input.peek(1) === STAR)) break
       input.advance()
+      hasContent = true
     }
-
-    input.acceptToken(HeadingLine)
+    if (hasContent) input.acceptToken(HeadingTitle)
   },
-  { contextual: false }
+  { contextual: true }
 )
 
 // ── rawTokenizer ────────────────────────────────────────────────────────
diff --git a/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar b/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar
index f28c78dd26..69d2e8434f 100644
--- a/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar
+++ b/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar
@@ -1,14 +1,12 @@
 // typst.grammar — Lezer LR grammar for the Typst typesetting language.
 // Covers markup mode (top-level), code mode (#expr) and math mode ($...$).
 // External tokenizers handle constructs requiring context-sensitive lexing:
-//   headingLineTokenizer  — entire heading line (=+ prefix + title) as one token
+//   headingTokenizer      — HeadingMark: the leading "=+" plus whitespace
+//   headingTitleTokenizer — HeadingTitle: the title text to end of line
 //   rawTokenizer          — triple-backtick raw block open/body/close
 //   rawInlineTokenizer    — single-backtick raw inline content
 //   codeBlockTokenizer    — brace-depth tracking inside #{ ... }
 //   blockCommentTokenizer — depth-tracked nested /* ... */ comments
-// Using one token for the whole line avoids LALR state-merge issues that arose
-// when HeadingTitle (a separate post-HeadingMark token) was accepted by
-// merged body-text parser states, causing body text to be swallowed.
 
 @top Document { item* }
 
@@ -31,11 +29,12 @@ item {
 }
 
 // ── Headings ──────────────────────────────────────────────────────────────
-// HeadingLine covers the entire heading line: the "=+" prefix, trailing
-// space, and title text.  One token per line means there is no LALR state
-// that "waits" for a second heading token, so no merged body-text state
-// can accidentally accept HeadingLine.
-Heading { HeadingLine }
+// HeadingMark: the "=+" prefix + trailing whitespace (external token).
+// HeadingTitle: the rest of the line.  headingTitleTokenizer uses both
+// stack.canShift() AND a backward character scan to refuse to fire unless
+// the raw text really shows "=+" behind the current position — preventing
+// LALR-merged body-text states from accidentally consuming markup as title.
+Heading { HeadingMark HeadingTitle? }
 
 // ── Comments ──────────────────────────────────────────────────────────────
 LineComment { "//" LineCommentContent? }
@@ -125,8 +124,12 @@ Ref   { "@" RefName  }
 Escape { "\\" EscapeChar }
 
 // ── External tokenizer declarations ──────────────────────────────────────
-@external tokens headingLineTokenizer from "./tokens.mjs" {
-  HeadingLine
+@external tokens headingTokenizer from "./tokens.mjs" {
+  HeadingMark
+}
+
+@external tokens headingTitleTokenizer from "./tokens.mjs" {
+  HeadingTitle
 }
 
 @external tokens rawTokenizer from "./tokens.mjs" {