fix: make CodeIdent external and replace strongItem*/emphItem* with flat body tokens

Two LALR state-merging bugs prevented Strong/Emphasis nodes from ever being produced (confirmed: tok-strong/tok-emphasis count = 0 in browser diagnostic). Bug 1 — _italic_ consumed as CodeIdent: CodeIdent was a @tokens rule with identHead = [A-Za-z_], so '_italic_' (the entire string including both underscores) matched as one CodeIdent token. LALR merging caused CodeIdent to be in item*'s valid set, and CodeIdent > "_" in @precedence, so the parser never opened Emphasis. Fix: move CodeIdent to an external tokenizer (codeIdentTokenizer) with a character-level guard — only fires when the preceding non-whitespace char is one of '#', '.', '(', ',' (genuine code-context positions). In body text where peek-back finds a newline, space, or markup delimiter, the tokenizer returns without emitting, letting '"_"' open Emphasis correctly. Bug 2 — StrongText never produced inside Strong: The strongItem* / emphItem* loops merged with item* states via Lezer's aggressive LALR merging. In the merged state MarkupContent was in the valid set (from the item* side) and MarkupContent > StrongText in @precedence, so MarkupContent was always produced — not a valid strongItem, leading to error recovery with no StrongText in the tree. Fix: replace the recursive strongItem* / emphItem* loops with flat external tokens StrongBody / EmphBody (contextual: true). These fire only inside Strong → "*" . StrongBody? "*" and Emphasis → "_" . EmphBody? "_", states specific enough that canShift is reliable. They read everything up to the closing delimiter or newline in one token, bypassing the LALR merging entirely. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-08 22:15:22 +00:00
parent f9d46aabeb
commit 4aca4aaac6
3 changed files with 154 additions and 59 deletions
@@ -73,8 +73,8 @@ export const TypstLanguage = LRLanguage.define({
        MathContent: t.string,

        // Markup emphasis
-        'Strong/"*" Strong/StrongText': t.strong,
-        'Emphasis/"_" Emphasis/EmphText': t.emphasis,
+        'Strong/"*" Strong/StrongBody': t.strong,
+        'Emphasis/"_" Emphasis/EmphBody': t.emphasis,

        // Labels (<name>) and references (@name)
        'Label/"<" Label/">" Label/LabelName': t.labelName,
@@ -13,19 +13,26 @@ import {
  LineCommentContent,
  MathContent,
  CodeKeyword,
+  CodeIdent,
+  StrongBody,
+  EmphBody,
 } from './typst.terms.mjs'

-const BACKTICK  = 96  // `
-const SLASH     = 47  // /
-const STAR      = 42  // *
-const NEWLINE   = 10  // \n
-const EQUALS    = 61  // =
-const SPACE     = 32  //
-const TAB       =  9  // \t
-const DOLLAR    = 36  // $
+const BACKTICK    = 96  // `
+const SLASH       = 47  // /
+const STAR        = 42  // *
+const NEWLINE     = 10  // \n
+const EQUALS      = 61  // =
+const SPACE       = 32  //
+const TAB         =  9  // \t
+const DOLLAR      = 36  // $
 const OPEN_BRACE  = 123 // {
 const CLOSE_BRACE = 125 // }
-const HASH      = 35  // #
+const HASH        = 35  // #
+const UNDERSCORE  = 95  // _
+const DOT         = 46  // .
+const OPEN_PAREN  = 40  // (
+const COMMA       = 44  // ,

 const KEYWORDS = new Set([
  'let', 'set', 'show', 'import', 'include',
@@ -34,6 +41,13 @@ const KEYWORDS = new Set([
  'and', 'or', 'not', 'context',
 ])

+const BOOLS = new Set(['true', 'false', 'none', 'auto'])
+
+const isAlpha = ch => (ch >= 65 && ch <= 90) || (ch >= 97 && ch <= 122)
+const isDigit = ch => ch >= 48 && ch <= 57
+const isIdentHead = ch => isAlpha(ch) || ch === UNDERSCORE
+const isIdentTail = ch => isAlpha(ch) || isDigit(ch) || ch === UNDERSCORE || ch === 45
+
 // ── headingTokenizer ────────────────────────────────────────────────────
 // Emits HeadingMark — the "=+" prefix plus the trailing whitespace.
 // Only fires at the start of a line (pos 0, or character after '\n').
@@ -272,15 +286,7 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
    let len = 0
    while (true) {
      const ch = input.peek(len)
-      if ((ch >= 65 && ch <= 90) ||  // A–Z
-          (ch >= 97 && ch <= 122) ||  // a–z
-          (ch >= 48 && ch <= 57) ||   // 0–9
-          ch === 95 ||                 // _
-          ch === 45) {                 // -
-        len++
-      } else {
-        break
-      }
+      if (isIdentHead(ch) || (len > 0 && isIdentTail(ch))) { len++ } else { break }
    }

    if (len === 0) return
@@ -296,3 +302,90 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
  },
  { contextual: true }
 )
+
+// ── codeIdentTokenizer ───────────────────────────────────────────────────
+// Emits CodeIdent — identifier tokens inside code expressions (#ident,
+// #func(args), #obj.method, etc.).
+//
+// Moving CodeIdent from @tokens to an external tokenizer allows a
+// character-level guard: we only emit when the preceding non-whitespace
+// character is one of '#', '.', '(', ',' — genuine code-context positions.
+// This stops the token from firing in markup body text where LALR-merged
+// states would otherwise cause '_italic_' to be consumed as one big
+// CodeIdent (since '_' is a valid identHead) instead of opening Emphasis.
+//
+// Keywords and bools are excluded so codeKeywordTokenizer / CodeBool can
+// handle them without conflict.
+export const codeIdentTokenizer = new ExternalTokenizer(
+  (input, stack) => {
+    if (!stack.canShift(CodeIdent)) return
+
+    // Guard: only fire in code context.
+    // Walk back past any horizontal whitespace (@skip) to the nearest
+    // non-space character and check that it is a code-mode delimiter.
+    let back = -1
+    while (input.peek(back) === SPACE || input.peek(back) === TAB) back--
+    const prev = input.peek(back)
+    if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA) return
+
+    // Must start with an identifier head character.
+    if (!isIdentHead(input.next)) return
+
+    // Peek ahead to read the full identifier.
+    let len = 0
+    while (true) {
+      const ch = input.peek(len)
+      if (len === 0 ? isIdentHead(ch) : isIdentTail(ch)) { len++ } else { break }
+    }
+    if (len === 0) return
+
+    const chars = []
+    for (let i = 0; i < len; i++) chars.push(input.peek(i))
+    const word = String.fromCharCode(...chars)
+
+    // Let codeKeywordTokenizer handle keywords; let CodeBool handle bools.
+    if (KEYWORDS.has(word) || BOOLS.has(word)) return
+
+    for (let i = 0; i < len; i++) input.advance()
+    input.acceptToken(CodeIdent)
+  },
+  { contextual: true }
+)
+
+// ── strongBodyTokenizer ──────────────────────────────────────────────────
+// Emits StrongBody — the content between the '*' delimiters of a Strong node.
+//
+// contextual: true — only fires when StrongBody is in the valid set, i.e.
+// inside Strong → "*" . StrongBody? "*".  This state is very specific and
+// is not merged with item* by Lezer's aggressive LALR merging, so canShift
+// is a reliable guard here.
+//
+// Reads everything up to the first '*' or newline (Typst bold does not span
+// lines).  A trailing '*' that is the closing delimiter is left for the
+// grammar rule to consume.
+export const strongBodyTokenizer = new ExternalTokenizer(
+  (input, _stack) => {
+    let hasContent = false
+    while (input.next !== -1 && input.next !== STAR && input.next !== NEWLINE) {
+      input.advance()
+      hasContent = true
+    }
+    if (hasContent) input.acceptToken(StrongBody)
+  },
+  { contextual: true }
+)
+
+// ── emphBodyTokenizer ────────────────────────────────────────────────────
+// Emits EmphBody — the content between the '_' delimiters of an Emphasis node.
+// Same design as strongBodyTokenizer; stops at '_' or newline.
+export const emphBodyTokenizer = new ExternalTokenizer(
+  (input, _stack) => {
+    let hasContent = false
+    while (input.next !== -1 && input.next !== UNDERSCORE && input.next !== NEWLINE) {
+      input.advance()
+      hasContent = true
+    }
+    if (hasContent) input.acceptToken(EmphBody)
+  },
+  { contextual: true }
+)
@@ -7,6 +7,9 @@
 //   rawInlineTokenizer    — single-backtick raw inline content
 //   codeBlockTokenizer    — brace-depth tracking inside #{ ... }
 //   blockCommentTokenizer — depth-tracked nested /* ... */ comments
+//   codeIdentTokenizer    — CodeIdent: identifier, only fires in code context
+//   strongBodyTokenizer   — StrongBody: content inside *...*
+//   emphBodyTokenizer     — EmphBody: content inside _..._

@top Document { item* }

@@ -105,16 +108,15 @@ ContentBlock { "[" item* "]" }
 InlineMath { "$" MathContent? "$" }

 // ── Markup formatting ─────────────────────────────────────────────────────
-// Cross-nesting of Strong/Emphasis is intentionally excluded to avoid a
-// mutual-recursion cycle (Strong→Emphasis→Strong) that causes state explosion
-// in the Lezer LR automaton builder.  StrongText includes '_' and EmphText
-// includes '*', so the nested delimiters are treated as plain text inside the
-// opposite construct rather than producing error nodes.
-Strong { "*" strongItem* "*" }
-strongItem { CodeExpr | InlineMath | RawInline | Label | Ref | StrongText }
-
-Emphasis { "_" emphItem* "_" }
-emphItem { CodeExpr | InlineMath | RawInline | Label | Ref | EmphText }
+// Strong and Emphasis use flat external body tokens (StrongBody / EmphBody)
+// rather than recursive strongItem* / emphItem* loops.  The loop approach
+// triggered LALR state merging that caused item*-level tokens (MarkupContent,
+// CodeIdent) to win over StrongText/EmphText inside the construct, so the
+// body nodes were never produced.  Each body token appears in the grammar
+// only directly after its opening delimiter, which is intended to keep it
+// out of those merged states.
+//
+// Trade-off: the body is a single opaque token, so CodeExpr, InlineMath,
+// RawInline, Label and Ref are no longer parsed as child nodes inside
+// Strong/Emphasis (the removed strongItem / emphItem rules allowed them).
+// The body is optional so that empty "**" / "__" still match.
+Strong   { "*" StrongBody? "*" }
+Emphasis { "_" EmphBody?   "_" }

 // ── Labels and references ─────────────────────────────────────────────────
 Label { "<" LabelName ">" }
@@ -162,6 +164,24 @@ Escape { "\\" EscapeChar }
  CodeKeyword
 }

+// CodeIdent is external so codeIdentTokenizer can apply a character-level
+// guard: it only emits when the identifier is preceded by a code-mode
+// delimiter ('#', '.', '(', ',' — see tokens.mjs for the exact whitespace
+// rules), i.e. genuinely inside a code expression.  This stops the token
+// from firing in markup body text, where LALR state merging would otherwise
+// cause the entire token (including any leading '_') to be consumed as a
+// code identifier instead of letting '_' open an Emphasis.
+@external tokens codeIdentTokenizer from "./tokens.mjs" {
+  CodeIdent
+}
+
+// StrongBody: the text between the '*' delimiters of Strong, read as one
+// token by strongBodyTokenizer.
+@external tokens strongBodyTokenizer from "./tokens.mjs" {
+  StrongBody
+}
+
+// EmphBody: the text between the '_' delimiters of Emphasis, read as one
+// token by emphBodyTokenizer.
+@external tokens emphBodyTokenizer from "./tokens.mjs" {
+  EmphBody
+}
+
 // ── Regular tokens ────────────────────────────────────────────────────────
@tokens {
  // Horizontal whitespace only.  Newlines are kept as explicit Newline items
@@ -172,11 +192,6 @@ Escape { "\\" EscapeChar }
  // Boolean / null literals — distinct from keywords for highlighting.
  CodeBool { "true" | "false" | "none" | "auto" }

-  // General identifier: [A-Za-z_][A-Za-z0-9_-]*
-  CodeIdent { identHead identTail* }
-  identHead  { @asciiLetter | "_" }
-  identTail  { @asciiLetter | @digit | "_" | "-" }
-
  // Double-quoted string with backslash escapes (no single-quoted strings in Typst).
  CodeString { '"' (!["\\\n] | "\\" _)* '"' }

@@ -186,23 +201,14 @@ Escape { "\\" EscapeChar }
    ("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")?
  }

-  // Text tokens for markup contexts; each excludes its own delimiters.
-  // HeadingText, LineCommentContent, and MathContent are external tokens
-  // (see above) — broad "read-to-delimiter" tokens that would otherwise
-  // conflict with every other literal token in LALR-merged states.
-  // '<' is excluded from StrongText/EmphText so that Label ('<' LabelName '>')
-  // is recognised inside strong/emphasis rather than consumed as plain text.
-  StrongText   { ![\n*$#`<@\\]+   }
-  EmphText     { ![\n_$#`<@\\]+   }
-
  // Regular markup: excludes all special-character starters plus whitespace
  // (whitespace is handled by @skip).  The '/' is excluded so that '//' and
  // '/*' are not accidentally consumed as plain text.
  MarkupContent { ![\n \t=*_$#/<@`\\]+ }

  // Label names: identifiers with optional dots/colons (e.g. <sec:intro>).
-  LabelName { (identHead | @digit) (identTail | "." | ":")* }
-  RefName   { identHead identTail*                           }
+  LabelName { (@asciiLetter | "_" | @digit) (@asciiLetter | @digit | "_" | "-" | "." | ":")* }
+  RefName   { (@asciiLetter | "_") (@asciiLetter | @digit | "_" | "-")*                      }

  // Escape: any single character after backslash.
  EscapeChar { _ }
@@ -210,19 +216,15 @@ Escape { "\\" EscapeChar }
  // Newline item — kept out of @skip so heading detection works.
  Newline { "\n" }

-  // Resolve ambiguities: more-specific tokens win over broader catch-alls.
-  // EscapeChar > spaces: after '\', EscapeChar must win over the skip token
-  //   (both match \t; without this, '\t' would be mis-tokenized).
-  // "(" > "." > "]" > text tokens: after '#' CodeIdent, callSuffix delimiters
-  //   must win over MarkupContent/StrongText/EmphText in merged states.
-  // LineCommentContent and MathContent are external tokens — not listed here.
-  // "_" added after CodeIdent: KeywordExpr { CodeKeyword CallExpr? } merges
-  // the post-keyword state with markup states where "_" starts Emphasis.
-  // CodeIdent wins so '#set _name(...)' is tokenised correctly; in pure markup
-  // states CodeIdent is not in the valid set so "_" still opens Emphasis.
-  // CodeKeyword is now an external token (codeKeywordTokenizer) and therefore
-  // not listed here — it uses a peek(-1)==='#' guard to stay out of markup.
-  @precedence { CodeBool CodeIdent EscapeChar "(" "." "]" "_" spaces MarkupContent StrongText EmphText }
+  // Resolve ambiguities in merged states:
+  // EscapeChar > spaces: after '\', EscapeChar must win over the skip token.
+  // "(" > "." > "]": callSuffix delimiters must win over MarkupContent after
+  //   a code identifier (merged states expose these to the markup tokenizer).
+  // "_" > MarkupContent: '_' must open Emphasis rather than being swallowed
+  //   by MarkupContent (redundant since '_' is in MarkupContent's exclusion
+  //   set, but kept for clarity).
+  // CodeIdent is now an external token, and StrongText/EmphText were removed
+  //   in favour of the external StrongBody/EmphBody tokens — none are listed.
+  @precedence { CodeBool EscapeChar "(" "." "]" "_" spaces MarkupContent }
 }

@skip { spaces }