fix(typst): restore HeadingMark+HeadingTitle with character-level bleed guard

The single-HeadingLine token approach caused everything after the first heading to be left unparsed. This commit reverts to the two-token structure (HeadingMark + HeadingTitle) and adds a backward character scan to headingTitleTokenizer: once stack.canShift(HeadingTitle) succeeds, the tokenizer walks backward from the current position past any spaces and tabs and requires '=' as the first character before that whitespace. Body-text positions in LALR-merged states will normally have a letter or closing bracket there instead, so the tokenizer returns without accepting a token.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-08 18:17:17 +00:00
parent 34025dc084
commit 974a9c4fb3
3 changed files with 47 additions and 37 deletions
@@ -30,8 +30,7 @@ export const TypstLanguage = LRLanguage.define({
        CodeArgs: foldInside,
      }),
      styleTags({
-        // HeadingLine is the entire heading line (prefix + title) as one token.
-        HeadingLine: t.heading,
+        'HeadingMark HeadingTitle': t.heading,

        // Comments
        'LineComment LineCommentContent': t.comment,
@@ -2,7 +2,8 @@

 import { ExternalTokenizer } from '@lezer/lr'
 import {
-  HeadingLine,
+  HeadingMark,
+  HeadingTitle,
  RawBlockOpen,
  RawBlockBody,
  RawBlockClose,
@@ -24,42 +25,49 @@ const DOLLAR    = 36  // $
 const OPEN_BRACE  = 123 // {
 const CLOSE_BRACE = 125 // }

-// ── headingLineTokenizer ────────────────────────────────────────────────
-// Emits HeadingLine — the entire heading line: "=+" markers, trailing
-// space, and title text, all as one token.
-//
-// Using a single token per heading line eliminates the two-step
-// (HeadingMark + HeadingTitle) pattern that caused LALR state-merging
-// problems: having a separate HeadingTitle token created a parser state
-// that "waited" after HeadingMark, and LALR merged that state into
-// body-text item* states.  In those merged states headingTitleTokenizer
-// fired for every paragraph line, swallowing inline markup tokens.
-export const headingLineTokenizer = new ExternalTokenizer(
+// ── headingTokenizer ────────────────────────────────────────────────────
+// Emits HeadingMark — the "=+" prefix plus the trailing whitespace.
+// Only fires at the start of a line (pos 0, or character after '\n').
+export const headingTokenizer = new ExternalTokenizer(
  (input, _stack) => {
-    // Only fire at the start of a line.
    if (input.pos > 0 && input.peek(-1) !== NEWLINE) return
-
    if (input.next !== EQUALS) return
-
-    // Require one or more '=' heading markers.
    while (input.next === EQUALS) input.advance()
-
-    // Must be followed by whitespace.
    if (input.next !== SPACE && input.next !== TAB) return
-
-    // Consume the whitespace.
    while (input.next === SPACE || input.next === TAB) input.advance()
+    input.acceptToken(HeadingMark)
+  },
+  { contextual: false }
+)

-    // Consume the title text to end of line (stop before line comment).
+// ── headingTitleTokenizer ────────────────────────────────────────────────
+// Emits HeadingTitle — the title text from after HeadingMark to end of line.
+//
+// LALR state merging means canShift(HeadingTitle) can return true in merged
+// body-text states, not only in the genuine post-HeadingMark state.  A
+// character-level backward scan guards against those false positives: we walk
+// back from input.pos past whitespace and require '=' immediately before it.
+// Any body-text position will have a non-'=' character there instead.
+export const headingTitleTokenizer = new ExternalTokenizer(
+  (input, stack) => {
+    if (!stack.canShift(HeadingTitle)) return
+
+    // Walk backward past the trailing whitespace of HeadingMark.
+    // We must find '=' immediately before that whitespace.
+    let back = -1
+    while (input.peek(back) === SPACE || input.peek(back) === TAB) back--
+    if (input.peek(back) !== EQUALS) return
+
+    let hasContent = false
    while (input.next !== -1 && input.next !== NEWLINE) {
      if (input.next === SLASH &&
          (input.peek(1) === SLASH || input.peek(1) === STAR)) break
      input.advance()
+      hasContent = true
    }
-
-    input.acceptToken(HeadingLine)
+    if (hasContent) input.acceptToken(HeadingTitle)
  },
-  { contextual: false }
+  { contextual: true }
 )

 // ── rawTokenizer ────────────────────────────────────────────────────────
@@ -1,14 +1,12 @@
 // typst.grammar — Lezer LR grammar for the Typst typesetting language.
 // Covers markup mode (top-level), code mode (#expr) and math mode ($...$).
 // External tokenizers handle constructs requiring context-sensitive lexing:
-//   headingLineTokenizer  — entire heading line (=+ prefix + title) as one token
+//   headingTokenizer      — HeadingMark: the leading "=+" plus whitespace
+//   headingTitleTokenizer — HeadingTitle: the title text to end of line
 //   rawTokenizer          — triple-backtick raw block open/body/close
 //   rawInlineTokenizer    — single-backtick raw inline content
 //   codeBlockTokenizer    — brace-depth tracking inside #{ ... }
 //   blockCommentTokenizer — depth-tracked nested /* ... */ comments
-// Using one token for the whole line avoids LALR state-merge issues that arose
-// when HeadingTitle (a separate post-HeadingMark token) was accepted by
-// merged body-text parser states, causing body text to be swallowed.

@top Document { item* }

@@ -31,11 +29,12 @@ item {
 }

 // ── Headings ──────────────────────────────────────────────────────────────
-// HeadingLine covers the entire heading line: the "=+" prefix, trailing
-// space, and title text.  One token per line means there is no LALR state
-// that "waits" for a second heading token, so no merged body-text state
-// can accidentally accept HeadingLine.
-Heading { HeadingLine }
+// HeadingMark: the "=+" prefix + trailing whitespace (external token).
+// HeadingTitle: the rest of the line.  headingTitleTokenizer uses both
+// stack.canShift() AND a backward character scan to refuse to fire unless
+// the raw text really shows "=+" behind the current position — preventing
+// LALR-merged body-text states from accidentally consuming markup as title.
+Heading { HeadingMark HeadingTitle? }

 // ── Comments ──────────────────────────────────────────────────────────────
 LineComment { "//" LineCommentContent? }
@@ -125,8 +124,12 @@ Ref   { "@" RefName  }
 Escape { "\\" EscapeChar }

 // ── External tokenizer declarations ──────────────────────────────────────
-@external tokens headingLineTokenizer from "./tokens.mjs" {
-  HeadingLine
+@external tokens headingTokenizer from "./tokens.mjs" {
+  HeadingMark
+}
+
+@external tokens headingTitleTokenizer from "./tokens.mjs" {
+  HeadingTitle
 }

@external tokens rawTokenizer from "./tokens.mjs" {