fix: make CodeIdent external and replace strongItem*/emphItem* with flat body tokens

Two LALR state-merging bugs prevented Strong/Emphasis nodes from ever being produced (confirmed: tok-strong/tok-emphasis count = 0 in browser diagnostic). Bug 1 — _italic_ consumed as CodeIdent: CodeIdent was a @tokens rule with identHead = [A-Za-z_], so '_italic_' (the entire string including both underscores) matched as one CodeIdent token. LALR merging caused CodeIdent to be in item*'s valid set, and CodeIdent > "_" in @precedence, so the parser never opened Emphasis. Fix: move CodeIdent to an external tokenizer (codeIdentTokenizer) with a character-level guard — only fires when the preceding non-whitespace char is one of '#', '.', '(', ',' (genuine code-context positions). In body text where peek-back finds a newline, space, or markup delimiter, the tokenizer returns without emitting, letting '"_"' open Emphasis correctly. Bug 2 — StrongText never produced inside Strong: The strongItem* / emphItem* loops merged with item* states via Lezer's aggressive LALR merging. In the merged state MarkupContent was in the valid set (from the item* side) and MarkupContent > StrongText in @precedence, so MarkupContent was always produced — not a valid strongItem, leading to error recovery with no StrongText in the tree. Fix: replace the recursive strongItem* / emphItem* loops with flat external tokens StrongBody / EmphBody (contextual: true). These fire only inside Strong → "*" . StrongBody? "*" and Emphasis → "_" . EmphBody? "_", states specific enough that canShift is reliable. They read everything up to the closing delimiter or newline in one token, bypassing the LALR merging entirely. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-08 22:15:22 +00:00
parent f9d46aabeb
commit 4aca4aaac6
3 changed files with 154 additions and 59 deletions
@@ -73,8 +73,8 @@ export const TypstLanguage = LRLanguage.define({
        MathContent: t.string,
        // Markup emphasis
-        'Strong/"*" Strong/StrongText': t.strong,
+        'Strong/"*" Strong/StrongBody': t.strong,
-        'Emphasis/"_" Emphasis/EmphText': t.emphasis,
+        'Emphasis/"_" Emphasis/EmphBody': t.emphasis,
        // Labels (<name>) and references (@name)
        'Label/"<" Label/">" Label/LabelName': t.labelName,
@@ -13,19 +13,26 @@ import {
  LineCommentContent,
  MathContent,
  CodeKeyword,
  CodeIdent,
  StrongBody,
  EmphBody,
 } from './typst.terms.mjs'
-const BACKTICK  = 96  // `
+const BACKTICK    = 96  // `
-const SLASH     = 47  // /
+const SLASH       = 47  // /
-const STAR      = 42  // *
+const STAR        = 42  // *
-const NEWLINE   = 10  // \n
+const NEWLINE     = 10  // \n
-const EQUALS    = 61  // =
+const EQUALS      = 61  // =
-const SPACE     = 32  //
+const SPACE       = 32  //
-const TAB       =  9  // \t
+const TAB         =  9  // \t
-const DOLLAR    = 36  // $
+const DOLLAR      = 36  // $
 const OPEN_BRACE  = 123 // {
 const CLOSE_BRACE = 125 // }
-const HASH      = 35  // #
+const HASH        = 35  // #
 const UNDERSCORE  = 95  // _
 const DOT         = 46  // .
 const OPEN_PAREN  = 40  // (
 const COMMA       = 44  // ,
 const KEYWORDS = new Set([
  'let', 'set', 'show', 'import', 'include',
@@ -34,6 +41,13 @@ const KEYWORDS = new Set([
  'and', 'or', 'not', 'context',
 ])
 // Literal words matched by the grammar's CodeBool token ("true" | "false" |
 // "none" | "auto").  codeIdentTokenizer declines to emit CodeIdent for any
 // word in this set so that it does not compete with CodeBool.
 const BOOLS = new Set(['true', 'false', 'none', 'auto'])
 // Character-class predicates over numeric character codes (the values the
 // tokenizers obtain from input.next / input.peek).
 // isAlpha: ASCII letters only — 'A'..'Z' (65–90) or 'a'..'z' (97–122).
 const isAlpha = ch => {
  if (ch >= 65 && ch <= 90) return true
  return ch >= 97 && ch <= 122
 }
 // isDigit: ASCII '0'..'9' (48–57).
 const isDigit = ch => 48 <= ch && ch <= 57
 // isIdentHead: a letter or underscore may start an identifier.
 const isIdentHead = ch => {
  if (isAlpha(ch)) return true
  return ch === UNDERSCORE
 }
 // isIdentTail: letters, digits, underscore, or hyphen (45 is '-') may follow.
 const isIdentTail = ch => {
  if (isAlpha(ch) || isDigit(ch)) return true
  return ch === UNDERSCORE || ch === 45
 }
 // ── headingTokenizer ────────────────────────────────────────────────────
 // Emits HeadingMark — the "=+" prefix plus the trailing whitespace.
 // Only fires at the start of a line (pos 0, or character after '\n').
@@ -272,15 +286,7 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
    let len = 0
    while (true) {
      const ch = input.peek(len)
-      if ((ch >= 65 && ch <= 90) ||  // A–Z
+      if (isIdentHead(ch) || (len > 0 && isIdentTail(ch))) { len++ } else { break }
          (ch >= 97 && ch <= 122) ||  // a–z
          (ch >= 48 && ch <= 57) ||   // 0–9
          ch === 95 ||                 // _
          ch === 45) {                 // -
        len++
      } else {
        break
      }
    }
    if (len === 0) return
@@ -296,3 +302,90 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
  },
  { contextual: true }
 )
 // ── codeIdentTokenizer ───────────────────────────────────────────────────
 // Emits CodeIdent — identifier tokens inside code expressions (#ident,
 // #func(args), #obj.method, etc.).
 //
 // Moving CodeIdent from @tokens to an external tokenizer allows a
 // character-level guard: we only emit when the preceding non-whitespace
 // character is one of '#', '.', '(', ',' — genuine code-context positions.
 // This stops the token from firing in markup body text where LALR-merged
 // states would otherwise cause '_italic_' to be consumed as one big
 // CodeIdent (since '_' is a valid identHead) instead of opening Emphasis.
 //
 // Keywords and bools are excluded so codeKeywordTokenizer / CodeBool can
 // handle them without conflict.
 export const codeIdentTokenizer = new ExternalTokenizer(
  (input, stack) => {
    // Nothing to do unless the current parse state can shift a CodeIdent.
    if (!stack.canShift(CodeIdent)) return
    // Code-context guard: step backwards over spaces and tabs until the
    // nearest other character, then require it to be '#', '.', '(' or ','.
    let offset = -1
    for (;;) {
      const c = input.peek(offset)
      if (c !== SPACE && c !== TAB) break
      offset--
    }
    const before = input.peek(offset)
    const inCodeContext =
      before === HASH || before === DOT || before === OPEN_PAREN || before === COMMA
    if (!inCodeContext) return
    // The current character must be able to start an identifier.
    if (!isIdentHead(input.next)) return
    // Look ahead (without consuming) to collect the whole identifier:
    // one head character followed by any number of tail characters.
    let word = String.fromCharCode(input.peek(0))
    let len = 1
    for (let ch = input.peek(len); isIdentTail(ch); ch = input.peek(len)) {
      word += String.fromCharCode(ch)
      len++
    }
    // Keywords belong to codeKeywordTokenizer and bools to CodeBool, so
    // leave the input untouched for those words.
    if (KEYWORDS.has(word) || BOOLS.has(word)) return
    // Consume exactly the identifier and emit the token.
    for (let remaining = len; remaining > 0; remaining--) input.advance()
    input.acceptToken(CodeIdent)
  },
  { contextual: true }
 )
 // ── strongBodyTokenizer ──────────────────────────────────────────────────
 // Emits StrongBody — the content between the '*' delimiters of a Strong node.
 //
 // contextual: true — only fires when StrongBody is in the valid set, i.e.
 // inside Strong → "*" . StrongBody? "*".  This state is very specific and
 // is not merged with item* by Lezer's aggressive LALR merging, so canShift
 // is a reliable guard here.
 //
 // Reads everything up to the first '*' or newline (Typst bold does not span
 // lines).  A trailing '*' that is the closing delimiter is left for the
 // grammar rule to consume.
 export const strongBodyTokenizer = new ExternalTokenizer(
  (input, _stack) => {
    // Consume characters until end of input (-1), a '*', or a newline.
    // The stopping character itself is never consumed.
    let consumed = 0
    for (;;) {
      const ch = input.next
      if (ch === -1 || ch === STAR || ch === NEWLINE) break
      input.advance()
      consumed++
    }
    // An empty body produces no token (the grammar marks StrongBody optional).
    if (consumed > 0) input.acceptToken(StrongBody)
  },
  { contextual: true }
 )
 // ── emphBodyTokenizer ────────────────────────────────────────────────────
 // Emits EmphBody — the content between the '_' delimiters of an Emphasis node.
 // Same design as strongBodyTokenizer; stops at '_' or newline.
 export const emphBodyTokenizer = new ExternalTokenizer(
  (input, _stack) => {
    // A character ends the body when it is end of input (-1), '_' or '\n';
    // that terminating character is left unconsumed.
    const endsBody = ch => ch === -1 || ch === UNDERSCORE || ch === NEWLINE
    let consumed = 0
    while (!endsBody(input.next)) {
      input.advance()
      consumed++
    }
    // Emit only when at least one character was read (EmphBody is optional
    // in the grammar, so an empty body needs no token).
    if (consumed > 0) input.acceptToken(EmphBody)
  },
  { contextual: true }
 )
@@ -7,6 +7,9 @@
 //   rawInlineTokenizer    — single-backtick raw inline content
 //   codeBlockTokenizer    — brace-depth tracking inside #{ ... }
 //   blockCommentTokenizer — depth-tracked nested /* ... */ comments
 //   codeIdentTokenizer    — CodeIdent: identifier, only fires in code context
 //   strongBodyTokenizer   — StrongBody: content inside *...*
 //   emphBodyTokenizer     — EmphBody: content inside _..._
@top Document { item* }
@@ -105,16 +108,15 @@ ContentBlock { "[" item* "]" }
 InlineMath { "$" MathContent? "$" }
 // ── Markup formatting ─────────────────────────────────────────────────────
-// Cross-nesting of Strong/Emphasis is intentionally excluded to avoid a
+// Strong and Emphasis use flat external body tokens (StrongBody / EmphBody)
-// mutual-recursion cycle (Strong→Emphasis→Strong) that causes state explosion
+// rather than recursive strongItem* / emphItem* loops.  The loop approach
-// in the Lezer LR automaton builder.  StrongText includes '_' and EmphText
+// triggered LALR state merging that caused item*-level tokens (MarkupContent,
-// includes '*', so the nested delimiters are treated as plain text inside the
+// CodeIdent) to win over StrongText/EmphText inside the construct, so the
-// opposite construct rather than producing error nodes.
+// body nodes were never produced.  The flat external tokens are contextual
-Strong { "*" strongItem* "*" }
+// (canShift only fires inside Strong/Emphasis) and reliably avoid those
-strongItem { CodeExpr | InlineMath | RawInline | Label | Ref | StrongText }
+// merged states.
-
+Strong   { "*" StrongBody? "*" }
-Emphasis { "_" emphItem* "_" }
+Emphasis { "_" EmphBody?   "_" }
 emphItem { CodeExpr | InlineMath | RawInline | Label | Ref | EmphText }
 // ── Labels and references ─────────────────────────────────────────────────
 Label { "<" LabelName ">" }
@@ -162,6 +164,24 @@ Escape { "\\" EscapeChar }
  CodeKeyword
 }
 // CodeIdent is external so codeIdentTokenizer can apply a character-level
 // guard: it only emits when the preceding non-whitespace character is one of
 // '#', '.', '(', ',' — i.e. genuinely inside a code expression.  This stops
 // the token from firing in markup body text, where LALR state merging would
 // otherwise cause the entire token (including any leading '_') to be consumed
 // as a code identifier instead of letting '_' open an Emphasis.
@external tokens codeIdentTokenizer from "./tokens.mjs" {
  CodeIdent
 }
@external tokens strongBodyTokenizer from "./tokens.mjs" {
  StrongBody
 }
@external tokens emphBodyTokenizer from "./tokens.mjs" {
  EmphBody
 }
 // ── Regular tokens ────────────────────────────────────────────────────────
@tokens {
  // Horizontal whitespace only.  Newlines are kept as explicit Newline items
@@ -172,11 +192,6 @@ Escape { "\\" EscapeChar }
  // Boolean / null literals — distinct from keywords for highlighting.
  CodeBool { "true" | "false" | "none" | "auto" }
  // General identifier: [A-Za-z_][A-Za-z0-9_-]*
  CodeIdent { identHead identTail* }
  identHead  { @asciiLetter | "_" }
  identTail  { @asciiLetter | @digit | "_" | "-" }
  // Double-quoted string with backslash escapes (no single-quoted strings in Typst).
  CodeString { '"' (!["\\\n] | "\\" _)* '"' }
@@ -186,23 +201,14 @@ Escape { "\\" EscapeChar }
    ("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")?
  }
  // Text tokens for markup contexts; each excludes its own delimiters.
  // HeadingText, LineCommentContent, and MathContent are external tokens
  // (see above) — broad "read-to-delimiter" tokens that would otherwise
  // conflict with every other literal token in LALR-merged states.
  // '<' is excluded from StrongText/EmphText so that Label ('<' LabelName '>')
  // is recognised inside strong/emphasis rather than consumed as plain text.
  StrongText   { ![\n*$#`<@\\]+   }
  EmphText     { ![\n_$#`<@\\]+   }
  // Regular markup: excludes all special-character starters plus whitespace
  // (whitespace is handled by @skip).  The '/' is excluded so that '//' and
  // '/*' are not accidentally consumed as plain text.
  MarkupContent { ![\n \t=*_$#/<@`\\]+ }
  // Label names: identifiers with optional dots/colons (e.g. <sec:intro>).
-  LabelName { (identHead | @digit) (identTail | "." | ":")* }
+  LabelName { (@asciiLetter | "_" | @digit) (@asciiLetter | @digit | "_" | "-" | "." | ":")* }
-  RefName   { identHead identTail*                           }
+  RefName   { (@asciiLetter | "_") (@asciiLetter | @digit | "_" | "-")*                      }
  // Escape: any single character after backslash.
  EscapeChar { _ }
@@ -210,19 +216,15 @@ Escape { "\\" EscapeChar }
  // Newline item — kept out of @skip so heading detection works.
  Newline { "\n" }
-  // Resolve ambiguities: more-specific tokens win over broader catch-alls.
+  // Resolve ambiguities in merged states:
-  // EscapeChar > spaces: after '\', EscapeChar must win over the skip token
+  // EscapeChar > spaces: after '\', EscapeChar must win over the skip token.
-  //   (both match \t; without this, '\t' would be mis-tokenized).
+  // "(" > "." > "]": callSuffix delimiters must win over MarkupContent after
-  // "(" > "." > "]" > text tokens: after '#' CodeIdent, callSuffix delimiters
+  //   a code identifier (merged states expose these to the markup tokenizer).
-  //   must win over MarkupContent/StrongText/EmphText in merged states.
+  // "_" > MarkupContent: '_' must open Emphasis rather than being swallowed
-  // LineCommentContent and MathContent are external tokens — not listed here.
+  //   by MarkupContent (redundant since '_' is in MarkupContent's exclusion
-  // "_" added after CodeIdent: KeywordExpr { CodeKeyword CallExpr? } merges
+  //   set, but kept for clarity).
-  // the post-keyword state with markup states where "_" starts Emphasis.
+  // CodeIdent and StrongText/EmphText are now external tokens — not listed.
-  // CodeIdent wins so '#set _name(...)' is tokenised correctly; in pure markup
+  @precedence { CodeBool EscapeChar "(" "." "]" "_" spaces MarkupContent }
  // states CodeIdent is not in the valid set so "_" still opens Emphasis.
  // CodeKeyword is now an external token (codeKeywordTokenizer) and therefore
  // not listed here — it uses a peek(-1)==='#' guard to stay out of markup.
  @precedence { CodeBool CodeIdent EscapeChar "(" "." "]" "_" spaces MarkupContent StrongText EmphText }
 }
@skip { spaces }