fix(typst): highlight keywords/idents inside #{} code blocks

Replace the opaque CodeBlockBody external tokenizer with grammar-parsed codeStatement* so that keywords (show, let, set, …) and identifiers inside #{ } code blocks receive proper Lezer nodes and are highlighted. Key grammar changes: - CodeBlock { "{" codeStatement* "}" } — structured, not opaque - codeStatement uses two explicit alternatives for keyword lines: CodeKeyword !kw callOrValueAndBody (grabs the subject eagerly) CodeKeyword keywordBody? (bare keyword or body-only form) The !kw cut-point gives shift prec kw > 0 over the unannotated reduce, resolving the LALR merge ambiguity without @left/@right on kw. - callOrValue { FuncExpr | CodeIdent | CodeString } — replaces CallExpr { CodeIdent !call callSuffix* }. The * quantifier annotated both shift and reduce with !call, making them a same-prec tie that @right could not reliably resolve in merged states. Using FuncExpr (required callSuffixes) + bare CodeIdent makes the tie strict (call > 0 for FuncExpr shift vs 0 for bare-ident reduce), then @right handles only the extension-of-callSuffixes case (shift = call<<2, FuncExpr reduce = call<<2 - 1 via @right encoding). - KeywordExpr gets the same two-alternative structure as codeStatement so nested show/set/let inside a code block (e.g. show sel: set text) also parse without LALR state-merge conflicts. - CallExpr removed; its role is split between FuncExpr (has args/chain) and bare CodeIdent (no args). Styling updated: CodeExpr/CodeIdent replaces CallExpr/CodeIdent for bare #ident function-style highlights. - codeKeywordTokenizer and codeIdentTokenizer already accept keywords / identifiers after { and ; (added in previous commit) — consistent with the new grammar. Parse results: #{ show strong: link.with(url); body } → CodeKeyword "show", CodeIdent "strong", FuncExpr "link.with(url)", CodeIdent "body" — all properly highlighted, no ⚠ errors. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-09 21:47:21 +00:00
parent 056d9a7f47
commit 0656ddfe52
5 changed files with 132 additions and 113 deletions
@@ -49,11 +49,11 @@ export const TypstLanguage = LRLanguage.define({
        CodeBool: t.atom,

        // Identifiers:
-        //   CallExpr/CodeIdent  — top-level #func or after keywords (#set text) → function style
-        //   FuncExpr/CodeIdent  — func call inside a value expr (has args/method) → function style
+        //   CodeExpr/CodeIdent  — bare #func (no args) → function style
+        //   FuncExpr/CodeIdent  — func call with args/method (#func(...), link.with(url)) → function style
        //   CodeArgKey          — named arg key (tokenizer pre-disambiguates on ':') → attributeName
        //   CodeIdent           — plain variable/constant reference (e.g. 'left', 'center') → variable
-        'CallExpr/CodeIdent': t.function(t.variableName),
+        'CodeExpr/CodeIdent': t.function(t.variableName),
        'FuncExpr/CodeIdent': t.function(t.variableName),
        CodeArgKey: t.attributeName,
        CodeIdent: t.variableName,
@@ -8,7 +8,6 @@ import {
  RawBlockBody,
  RawBlockClose,
  RawInlineContent,
-  CodeBlockBody,
  BlockCommentBody,
  LineCommentContent,
  MathContent,
@@ -35,6 +34,7 @@ const DOT         = 46  // .
 const OPEN_PAREN  = 40  // (
 const COMMA       = 44  // ,
 const COLON       = 58  // :
+const SEMICOLON   = 59  // ;
 const OPEN_ANGLE  = 60  // <
 const CLOSE_ANGLE = 62  // >

@@ -188,36 +188,6 @@ export const rawInlineTokenizer = new ExternalTokenizer(
  { contextual: false }
 )

-// ── codeBlockTokenizer ──────────────────────────────────────────────────
-// Emits CodeBlockBody — the interior of a #{ ... } code block.
-// Tracks brace nesting depth so that inner braces (e.g. #{ f({ x }) })
-// are included in the body rather than closing the outer block.
-export const codeBlockTokenizer = new ExternalTokenizer(
-  (input, _stack) => {
-    // The opening '{' has already been consumed by the grammar rule.
-    let depth = 1
-    let hasContent = false
-    while (input.next !== -1) {
-      const ch = input.next
-      if (ch === OPEN_BRACE) {
-        depth++
-        input.advance()
-        hasContent = true
-      } else if (ch === CLOSE_BRACE) {
-        if (depth === 1) break  // leave this '}' for the grammar rule
-        depth--
-        input.advance()
-        hasContent = true
-      } else {
-        input.advance()
-        hasContent = true
-      }
-    }
-    if (hasContent) input.acceptToken(CodeBlockBody)
-  },
-  { contextual: false }
-)
-
 // ── blockCommentTokenizer ───────────────────────────────────────────────
 // Emits BlockCommentBody — the interior of a /* ... */ comment.
 // Typst supports nested block comments (/* /* inner */ outer */), so this
@@ -298,12 +268,12 @@ export const mathContentTokenizer = new ExternalTokenizer(
 export const codeKeywordTokenizer = new ExternalTokenizer(
  (input, stack) => {
    if (!stack.canShift(CodeKeyword)) return
-    // Valid positions: immediately after '#' (normal #set, #show) or after ':'
-    // (show-body: '#show sel: set text(...)').  Walk back past optional whitespace.
+    // Valid positions: after '#', ':', '{' (code block start), or ';'.
+    // Walk back past optional whitespace.
    let back = -1
    while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back--
    const kwPrev = input.peek(back)
-    if (kwPrev !== HASH && kwPrev !== COLON) return
+    if (kwPrev !== HASH && kwPrev !== COLON && kwPrev !== OPEN_BRACE && kwPrev !== SEMICOLON) return

    // Peek ahead to read the full identifier without advancing.
    let len = 0
@@ -355,16 +325,24 @@ export const codeIdentTokenizer = new ExternalTokenizer(
    const prev = input.peek(back)

    if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA && prev !== EQUALS && prev !== COLON) {
-      // May be after a keyword chain like '#set text' or (in show body) 'set body':
-      // scan back through the preceding identifier word, skip whitespace, and
-      // verify '#' or ':' precedes it.  Accepting ':' lets multi-word chains
-      // like '#show sel: set text' find ':' before 'set'.
-      if (!isIdentTail(prev)) return
-      let b = back
-      while (isIdentTail(input.peek(b))) b--
-      while (input.peek(b) === SPACE || input.peek(b) === TAB || input.peek(b) === NEWLINE) b--
-      const chainEnd = input.peek(b)
-      if (chainEnd !== HASH && chainEnd !== COLON) return
+      if (!isIdentTail(prev)) {
+        // prev is not an identifier character, so it is some other delimiter
+        // (e.g. ')' after a function call, '{' at block start, '}' after a nested
+        // block, or ';' between statements).  These are valid statement-start
+        // positions inside a CodeBlock's codeStatement* list.  Trust canShift —
+        // it's reliable in the grammar-parsed code-block states.
+        if (!couldBeIdent) return
+      } else {
+        // prev looks like the tail of a preceding word — scan back to find '#' or ':'.
+        // Accepting ':' lets multi-word chains like 'show sel: set text' work.
+        let b = back
+        while (isIdentTail(input.peek(b))) b--
+        while (input.peek(b) === SPACE || input.peek(b) === TAB || input.peek(b) === NEWLINE) b--
+        const chainEnd = input.peek(b)
+        if (chainEnd !== HASH && chainEnd !== COLON) {
+          // Could be second+ statement in a code block (e.g. after 'let x = 1').
+          if (!couldBeIdent) return
+        }
+      }
    }

    // In arg-delimiter positions ('(' or ',') we may emit CodeArgKey regardless
@@ -5,7 +5,6 @@
 //   headingTitleTokenizer — HeadingTitle: the title text to end of line
 //   rawTokenizer          — triple-backtick raw block open/body/close
 //   rawInlineTokenizer    — single-backtick raw inline content
-//   codeBlockTokenizer    — brace-depth tracking inside #{ ... }
 //   blockCommentTokenizer — depth-tracked nested /* ... */ comments
 //   codeIdentTokenizer    — CodeIdent: identifier, only fires in code context
 //   strongBodyTokenizer   — StrongBody: content inside *...*
@@ -62,10 +61,16 @@ RawInline { "`" RawInlineContent? "`" }
 //   #[ ... ]          — content block (re-parses as markup items)
 CodeExpr { "#" codeExprBody }

+// codeExprBody: forms valid after '#' in markup, or after ':' in a
+// keywordBody (the '=' form of keywordBody takes a codeValue instead).
+// FuncExpr handles ident+callSuffix(s); bare CodeIdent handles
+// a plain variable reference (#x).  No CallExpr with callSuffix* here — that
+// *-quantifier makes both shift and reduce carry !call precedence (a tie that
+// @right cannot resolve reliably once codeStatement* state-merging is in play).
 codeExprBody {
  KeywordExpr |
  AtomExpr |
-  CallExpr |
+  FuncExpr   |
+  CodeIdent  |
  CodeBlock |
  ContentBlock
 }
@@ -73,18 +78,59 @@ codeExprBody {
 // callOrValue covers the subject of a keyword expression (#set text, #show link,
 // #import "pkg", #let name).  keywordBody is exclusive: ':' for show-rule bodies
 // and '=' for let-binding values (a keyword expression never has both).
-KeywordExpr { CodeKeyword callOrValue? keywordBody? }
-callOrValue { CallExpr | CodeString }
+// Two precedences:
+//   call @right — prefer extending callSuffixes (FuncExpr) over completing the
+//           FuncExpr and letting '(' start a new statement.  The `!call` marker
+//           encodes the shift as (call << 2) and the FuncExpr reduce as
+//           (call << 2) - 1 (due to @right); shift > reduce, so callSuffix
+//           chains are greedily extended.  Without @right both actions have
+//           the same numeric precedence and the conflict is unresolved.
+//   kw   — prefer CodeKeyword !kw callOrValueAndBody over CodeKeyword keywordBody?
+//           when an identifier follows the keyword.  shift = kw << 2, reduce
+//           (second alternative) = 0; kw > 0, no @right needed.
+@precedence { call @right, kw }
+
+// KeywordExpr: used in markup-level code (#show, #let, #set …) AND nested
+// inside codeExprBody (e.g. the RHS after ':' in a show-rule).
+// Same two-alternative structure as codeStatement: the !kw on the first
+// alternative gives the shift prec kw > 0 over the unannotated reduce of the
+// second alternative (prec 0).  This avoids the call-vs-call tie that arises
+// from the old `callOrValue?` optional pattern.
+KeywordExpr {
+  CodeKeyword !kw callOrValueAndBody |
+  CodeKeyword keywordBody?
+}
+
+// callOrValue: FuncExpr for "ident(args)" / "ident.method", bare CodeIdent for
+// a plain name, CodeString for string subjects like #import "pkg".
+// FuncExpr requires at least one callSuffix, so at [CodeIdent ·] seeing '(':
+//   SHIFT (start callSuffixes, prec call) vs REDUCE bare CodeIdent (prec 0).
+//   call > 0 → shift wins cleanly.
+callOrValue { FuncExpr | CodeIdent | CodeString }
 keywordBody { ":" codeExprBody | "=" codeValue }
 AtomExpr    { CodeBool    }

-// CallExpr allows zero suffixes — used at top level (#x) and after keywords
-// (#set text(...)) where even a bare identifier is valid as a named reference.
-CallExpr { CodeIdent callSuffix* }
-// FuncExpr requires at least one suffix — used inside codeValue so that
-// 'table(...)' gets tok-function while plain identifiers like 'left'/'center'
-// get tok-variableName instead of being false-positively styled as functions.
-FuncExpr { CodeIdent callSuffix+ }
+// codeStatement is the unit inside a CodeBlock's brace body.
+// Two explicit alternatives for the keyword case avoid the LALR ambiguity
+// that arises from codeStatement* state merging when the keyword's subject is
+// written as an optional callOrValue?.
+// The !kw annotation on the first alternative (shift callOrValueAndBody) has
+// higher precedence than the bare reduce of the second alternative (prec 0),
+// so 'show strong: …' grabs 'strong' as callOrValue rather than completing
+// the keyword statement early with no subject.
+codeStatement {
+  CodeKeyword !kw callOrValueAndBody |
+  CodeKeyword keywordBody? |
+  codeValue |
+  ";"
+}
+callOrValueAndBody { callOrValue keywordBody? }
+
+// FuncExpr: identifier followed by one-or-more call suffixes.
+// callSuffixes uses explicit left-recursion (not +) so the !call annotation
+// on the recursive extension point gives the shift prec call vs the unannotated
+// reduce of codeValue → FuncExpr (prec 0) — shift wins, no @right tie.
+callSuffixes { callSuffix | callSuffixes !call callSuffix }
+FuncExpr { CodeIdent !call callSuffixes }
 callSuffix {
  CodeArgs |
  "." CodeIdent |
@@ -114,8 +160,9 @@ codeValue {
 // Reuses codeArgList so named-key entries like (auto, 1fr) work too.
 CodeArray { "(" codeArgList? ")" }

-// CodeBlockBody depth-tracks braces so #{ let x = { 1 } } parses correctly.
-CodeBlock    { "{" CodeBlockBody? "}" }
+// CodeBlock parses its content as a codeStatement* list so that keywords
+// (show, let, set…) and identifiers inside braces receive proper highlighting.
+CodeBlock    { "{" codeStatement* "}" }
 // ContentBlock re-enters markup mode, allowing #[*bold* text].
 ContentBlock { "[" item* "]" }

@@ -162,10 +209,6 @@ Escape { "\\" EscapeChar }
  RawInlineContent
 }

-@external tokens codeBlockTokenizer from "./tokens.mjs" {
-  CodeBlockBody
-}
-
@external tokens blockCommentTokenizer from "./tokens.mjs" {
  BlockCommentBody
 }
@@ -6,42 +6,40 @@ export const
  RawBlockBody = 4,
  RawBlockClose = 5,
  RawInlineContent = 6,
-  CodeBlockBody = 7,
-  BlockCommentBody = 8,
-  LineCommentContent = 9,
-  MathContent = 10,
-  CodeKeyword = 11,
-  CodeIdent = 12,
-  CodeArgKey = 13,
-  StrongBody = 14,
-  EmphBody = 15,
-  Document = 16,
-  Heading = 17,
-  LineComment = 18,
-  BlockComment = 19,
-  RawBlock = 20,
-  RawInline = 21,
-  CodeExpr = 22,
-  KeywordExpr = 23,
-  CallExpr = 24,
-  CodeArgs = 25,
-  CodeString = 26,
-  CodeNumber = 27,
-  CodeBool = 28,
-  FuncExpr = 29,
-  ContentBlock = 30,
-  CodeBlock = 31,
-  InlineMath = 32,
-  CodeArray = 33,
-  AtomExpr = 34,
-  Strong = 35,
-  Emphasis = 36,
-  Label = 37,
-  LabelName = 38,
-  Ref = 39,
-  RefName = 40,
-  Escape = 41,
-  EscapeChar = 42,
-  URL = 43,
-  MarkupContent = 44,
-  ClosingSquare = 45
+  BlockCommentBody = 7,
+  LineCommentContent = 8,
+  MathContent = 9,
+  CodeKeyword = 10,
+  CodeIdent = 11,
+  CodeArgKey = 12,
+  StrongBody = 13,
+  EmphBody = 14,
+  Document = 15,
+  Heading = 16,
+  LineComment = 17,
+  BlockComment = 18,
+  RawBlock = 19,
+  RawInline = 20,
+  CodeExpr = 21,
+  KeywordExpr = 22,
+  FuncExpr = 23,
+  CodeArgs = 24,
+  CodeString = 25,
+  CodeNumber = 26,
+  CodeBool = 27,
+  ContentBlock = 28,
+  CodeBlock = 29,
+  InlineMath = 30,
+  CodeArray = 31,
+  AtomExpr = 32,
+  Strong = 33,
+  Emphasis = 34,
+  Label = 35,
+  LabelName = 36,
+  Ref = 37,
+  RefName = 38,
+  Escape = 39,
+  EscapeChar = 40,
+  URL = 41,
+  MarkupContent = 42,
+  ClosingSquare = 43