typst: fix named-arg key highlighting and multi-line math

Named arg keys (columns:, align:, caption:) were appearing in black because LALR state merging broke the CodeArgs/CodeIdent path for multi-line expressions. Fix: emit a dedicated CodeArgKey token from codeIdentTokenizer (which peeks forward for ':' to pre-disambiguate), declare it in the grammar's codeArgItem rule, and map it to t.attributeName in styleTags — bypassing LALR lookahead entirely. Multi-line display math ($ ...\n... $) was consuming the rest of the document as orange text: with contextual:true, the tokenizer's backward scan could find a previous closing '$' and falsely set isDisplay=true. Fix: revert mathContentTokenizer to contextual:false with a '\n' stop (each MathContent token covers one line), and change InlineMath from MathContent? to MathContent* so @skip consumes the newlines between lines. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-09 11:25:31 +00:00
parent 2fdb155547
commit 4c6032bce0
3 changed files with 38 additions and 32 deletions
@@ -14,9 +14,8 @@ import { typstDocumentOutline } from './document-outline'
 // Note on tree structure: rules starting with a lowercase letter in the grammar
 // are inline (no tree node), so their children are promoted to the parent.
 // E.g. codeArgItem, codeValue, callSuffix, codeArgList are all inline.
-// Therefore:
-//   - The named-argument key "CodeIdent" is a *direct* child of CodeArgs.
-//   - Positional arguments that are identifiers are wrapped in CallExpr.
+// Named arg keys are emitted as CodeArgKey (not CodeIdent) by codeIdentTokenizer;
+// because codeArgItem is inline, CodeArgKey is a direct child of CodeArgs.

 export const TypstLanguage = LRLanguage.define({
  name: 'typst',
@@ -51,10 +50,10 @@ export const TypstLanguage = LRLanguage.define({

        // Identifiers:
        //   - direct child of CallExpr → function/method name
-        //   - direct child of CodeArgs → named argument key (key: value syntax)
-        //   - everywhere else          → plain variable
+        //   - CodeArgKey (named arg key, emitted by tokenizer before ':') → attributeName
+        //   - everywhere else → plain variable
        'CallExpr/CodeIdent': t.function(t.variableName),
-        'CodeArgs/CodeIdent': t.attributeName,
+        CodeArgKey: t.attributeName,
        CodeIdent: t.variableName,

        // Literals in code mode
@@ -14,6 +14,7 @@ import {
  MathContent,
  CodeKeyword,
  CodeIdent,
+  CodeArgKey,
  StrongBody,
  EmphBody,
 } from './typst.terms.mjs'
@@ -33,6 +34,7 @@ const UNDERSCORE  = 95  // _
 const DOT         = 46  // .
 const OPEN_PAREN  = 40  // (
 const COMMA       = 44  // ,
+const COLON       = 58  // :

 const KEYWORDS = new Set([
  'let', 'set', 'show', 'import', 'include',
@@ -252,36 +254,23 @@ export const lineCommentContentTokenizer = new ExternalTokenizer(
 )

 // ── mathContentTokenizer ────────────────────────────────────────────────
-// Emits MathContent — everything between the $...$ delimiters.
+// Emits MathContent — one line of content between the $...$ delimiters.
+// Stops at '$' or '\n' so each token is bounded to a single line.
 //
-// Typst distinguishes inline math ($x^2$) from display math ($ x^2 $):
-// display math has whitespace between the opening '$' and the content.
-// We detect this by scanning back to '$': if there is any whitespace
-// between '$' and the current position (i.e. @skip consumed it), the
-// tokenizer allows newlines so multi-line display math works.  Inline math
-// keeps the newline stop, preventing a lone '$' from consuming the rest of
-// the document.
-//
-// contextual: true — only fires inside InlineMath after '$', never in
-// body text.  The '$' token appears nowhere else in the grammar so the
-// post-'$' state does not merge with item* states.
+// The grammar uses MathContent* (not MathContent?) so multi-line display
+// math ($ ... \n ... $) is handled by multiple MathContent tokens, one per
+// line, with @skip consuming the newlines in between.  Because no single
+// token can cross a newline, a stray '$' never yields one document-long token.
 export const mathContentTokenizer = new ExternalTokenizer(
  (input, _stack) => {
-    // Scan back to the opening '$', detecting display vs inline math.
-    let back = -1
-    while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back--
-    if (input.peek(back) !== DOLLAR) return
-    const isDisplay = back < -1  // whitespace between '$' and current position
-
    let hasContent = false
-    while (input.next !== -1 && input.next !== DOLLAR) {
-      if (!isDisplay && input.next === NEWLINE) break
+    while (input.next !== -1 && input.next !== DOLLAR && input.next !== NEWLINE) {
      input.advance()
      hasContent = true
    }
    if (hasContent) input.acceptToken(MathContent)
  },
-  { contextual: true }
+  { contextual: false }
 )

 // ── codeKeywordTokenizer ─────────────────────────────────────────────────
@@ -335,7 +324,9 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
 // handle them without conflict.
 export const codeIdentTokenizer = new ExternalTokenizer(
  (input, stack) => {
-    if (!stack.canShift(CodeIdent)) return
+    const couldBeKey   = stack.canShift(CodeArgKey)
+    const couldBeIdent = stack.canShift(CodeIdent)
+    if (!couldBeKey && !couldBeIdent) return

    // Guard: only fire in code context.
    // Walk back past whitespace to the nearest non-space character.
@@ -368,8 +359,18 @@ export const codeIdentTokenizer = new ExternalTokenizer(
    // Let codeKeywordTokenizer handle keywords; let CodeBool handle bools.
    if (KEYWORDS.has(word) || BOOLS.has(word)) return

+    // Emit CodeArgKey when this identifier is a named arg key (followed by ':').
+    // Pre-disambiguating here avoids relying on LALR lookahead to choose between
+    // codeArgItem alternatives, which is fragile under Lezer's state merging.
+    let isArgKey = false
+    if (couldBeKey) {
+      let afterLen = len
+      while (input.peek(afterLen) === SPACE || input.peek(afterLen) === TAB) afterLen++
+      isArgKey = (input.peek(afterLen) === COLON)
+    }
+
    for (let i = 0; i < len; i++) input.advance()
-    input.acceptToken(CodeIdent)
+    input.acceptToken(isArgKey ? CodeArgKey : CodeIdent)
  },
  { contextual: true }
 )
@@ -83,7 +83,7 @@ callSuffix {
 CodeArgs    { "(" codeArgList? ")" }
 codeArgList { codeArgItem ("," codeArgItem)* ","? }
 codeArgItem {
-  CodeIdent ":" codeValue |
+  CodeArgKey ":" codeValue |
  codeValue
 }

@@ -104,7 +104,9 @@ ContentBlock { "[" item* "]" }

 // ── Math ──────────────────────────────────────────────────────────────────
 // Both inline ($x^2$) and display ($ x^2 $) math use the same node type.
-InlineMath { "$" MathContent? "$" }
+// MathContent* (not ?) allows multi-line display math: each line becomes one
+// MathContent token (stopping at '\n'), and @skip consumes the newlines between.
+InlineMath { "$" MathContent* "$" }

 // ── Markup formatting ─────────────────────────────────────────────────────
 // Strong and Emphasis use flat external body tokens (StrongBody / EmphBody)
@@ -169,8 +171,12 @@ Escape { "\\" EscapeChar }
 // the token from firing in markup body text, where LALR state merging would
 // otherwise cause the entire token (including any leading '_') to be consumed
 // as a code identifier instead of letting '_' open an Emphasis.
+// CodeArgKey is emitted by the same tokenizer when an identifier is followed by
+// ':' (optionally after spaces or tabs) — the tokenizer pre-disambiguates named
+// arg keys so the LALR parser need not pick a codeArgItem alternative on lookahead.
@external tokens codeIdentTokenizer from "./tokens.mjs" {
-  CodeIdent
+  CodeIdent,
+  CodeArgKey
 }

@external tokens strongBodyTokenizer from "./tokens.mjs" {