fix(typst): highlight keywords/idents inside #{} code blocks

Replace the opaque CodeBlockBody external tokenizer with grammar-parsed codeStatement* so that keywords (show, let, set, …) and identifiers inside #{ } code blocks receive proper Lezer nodes and are highlighted.

Key grammar changes:

- CodeBlock { "{" codeStatement* "}" } — structured, not opaque.

- codeStatement uses two explicit alternatives for keyword lines:
    CodeKeyword !kw callOrValueAndBody   (grabs the subject eagerly)
    CodeKeyword keywordBody?             (bare keyword or body-only form)
  The !kw cut-point gives the shift prec kw > 0 over the unannotated reduce, resolving the LALR merge ambiguity without @left/@right on kw.

- callOrValue { FuncExpr | CodeIdent | CodeString } — replaces CallExpr { CodeIdent !call callSuffix* }. The * quantifier annotated both shift and reduce with !call, making them a same-prec tie that @right could not reliably resolve in merged states. Using FuncExpr (required callSuffixes) + bare CodeIdent replaces that tie with a strict ordering (call > 0 for the FuncExpr shift vs 0 for the bare-ident reduce); @right then handles only the extension-of-callSuffixes case (shift = call<<2, FuncExpr reduce = call<<2 - 1 via the @right encoding).

- KeywordExpr gets the same two-alternative structure as codeStatement so nested show/set/let inside a code block (e.g. show sel: set text) also parse without LALR state-merge conflicts.

- CallExpr removed; its role is split between FuncExpr (has args/chain) and bare CodeIdent (no args). Styling updated: CodeExpr/CodeIdent replaces CallExpr/CodeIdent for bare #ident function-style highlights.

- codeKeywordTokenizer and codeIdentTokenizer already accept keywords / identifiers after { and ; (added in the previous commit) — consistent with the new grammar.

Parse results: #{ show strong: link.with(url); body } → CodeKeyword "show", CodeIdent "strong", FuncExpr "link.with(url)", CodeIdent "body" — all properly highlighted, no ⚠ errors.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-09 21:47:21 +00:00
parent 056d9a7f47
commit 0656ddfe52
5 changed files with 132 additions and 113 deletions
@@ -49,11 +49,11 @@ export const TypstLanguage = LRLanguage.define({
        CodeBool: t.atom,
        // Identifiers:
-        //   CallExpr/CodeIdent  — top-level #func or after keywords (#set text) → function style
+        //   CodeExpr/CodeIdent  — bare #func (no args) → function style
-        //   FuncExpr/CodeIdent  — func call inside a value expr (has args/method) → function style
+        //   FuncExpr/CodeIdent  — func call with args/method (#func(...), link.with(url)) → function style
        //   CodeArgKey          — named arg key (tokenizer pre-disambiguates on ':') → attributeName
        //   CodeIdent           — plain variable/constant reference (e.g. 'left', 'center') → variable
-        'CallExpr/CodeIdent': t.function(t.variableName),
+        'CodeExpr/CodeIdent': t.function(t.variableName),
        'FuncExpr/CodeIdent': t.function(t.variableName),
        CodeArgKey: t.attributeName,
        CodeIdent: t.variableName,
@@ -8,7 +8,6 @@ import {
  RawBlockBody,
  RawBlockClose,
  RawInlineContent,
  CodeBlockBody,
  BlockCommentBody,
  LineCommentContent,
  MathContent,
@@ -35,6 +34,7 @@ const DOT         = 46  // .
 const OPEN_PAREN  = 40  // (
 const COMMA       = 44  // ,
 const COLON       = 58  // :
 const SEMICOLON   = 59  // ;
 const OPEN_ANGLE  = 60  // <
 const CLOSE_ANGLE = 62  // >
@@ -188,36 +188,6 @@ export const rawInlineTokenizer = new ExternalTokenizer(
  { contextual: false }
 )
 // ── codeBlockTokenizer ──────────────────────────────────────────────────
 // Emits CodeBlockBody — the interior of a #{ ... } code block.
 // Tracks brace nesting depth so that inner braces (e.g. #{ f({ x }) })
 // are included in the body rather than closing the outer block.
 export const codeBlockTokenizer = new ExternalTokenizer(
  (input, _stack) => {
    // The grammar rule has already consumed the opening '{', so scanning
    // starts one brace level deep.
    let openBraces = 1
    let consumed = 0
    for (let ch = input.next; ch !== -1; ch = input.next) {
      if (ch === CLOSE_BRACE) {
        // The brace that closes the outermost level is not consumed; it is
        // left in the input for the grammar rule to match.
        if (openBraces === 1) break
        openBraces -= 1
      } else if (ch === OPEN_BRACE) {
        openBraces += 1
      }
      // Every other character (and every nested brace) belongs to the body.
      input.advance()
      consumed += 1
    }
    // Emit a token only when at least one character was consumed.
    if (consumed > 0) input.acceptToken(CodeBlockBody)
  },
  { contextual: false }
 )
 // ── blockCommentTokenizer ───────────────────────────────────────────────
 // Emits BlockCommentBody — the interior of a /* ... */ comment.
 // Typst supports nested block comments (/* /* inner */ outer */), so this
@@ -298,12 +268,12 @@ export const mathContentTokenizer = new ExternalTokenizer(
 export const codeKeywordTokenizer = new ExternalTokenizer(
  (input, stack) => {
    if (!stack.canShift(CodeKeyword)) return
-    // Valid positions: immediately after '#' (normal #set, #show) or after ':'
+    // Valid positions: after '#', ':', '{' (code block start), or ';'.
-    // (show-body: '#show sel: set text(...)').  Walk back past optional whitespace.
+    // Walk back past optional whitespace.
    let back = -1
    while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back--
    const kwPrev = input.peek(back)
-    if (kwPrev !== HASH && kwPrev !== COLON) return
+    if (kwPrev !== HASH && kwPrev !== COLON && kwPrev !== OPEN_BRACE && kwPrev !== SEMICOLON) return
    // Peek ahead to read the full identifier without advancing.
    let len = 0
@@ -355,16 +325,24 @@ export const codeIdentTokenizer = new ExternalTokenizer(
    const prev = input.peek(back)
    if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA && prev !== EQUALS && prev !== COLON) {
-      // May be after a keyword chain like '#set text' or (in show body) 'set body':
+      if (!isIdentTail(prev)) {
-      // scan back through the preceding identifier word, skip whitespace, and
+        // prev is a structural delimiter (e.g. ')' after a function call, '{' at
-      // verify '#' or ':' precedes it.  Accepting ':' lets multi-word chains
+        // block start, '}' after a nested block).  These are valid statement-start
-      // like '#show sel: set text' find ':' before 'set'.
+        // positions inside a CodeBlock's codeStatement* list.  Trust canShift —
-      if (!isIdentTail(prev)) return
+        // it's reliable in the grammar-parsed code-block states.
-      let b = back
+        if (!couldBeIdent) return
-      while (isIdentTail(input.peek(b))) b--
+      } else {
-      while (input.peek(b) === SPACE || input.peek(b) === TAB || input.peek(b) === NEWLINE) b--
+        // prev looks like the tail of a preceding word — scan back to find '#' or ':'.
-      const chainEnd = input.peek(b)
+        // Accepting ':' lets multi-word chains like 'show sel: set text' work.
-      if (chainEnd !== HASH && chainEnd !== COLON) return
+        let b = back
        while (isIdentTail(input.peek(b))) b--
        while (input.peek(b) === SPACE || input.peek(b) === TAB || input.peek(b) === NEWLINE) b--
        const chainEnd = input.peek(b)
        if (chainEnd !== HASH && chainEnd !== COLON) {
          // Could be second+ statement in a code block (e.g. after 'let x = 1').
          if (!couldBeIdent) return
        }
      }
    }
    // In arg-delimiter positions ('(' or ',') we may emit CodeArgKey regardless
@@ -5,7 +5,6 @@
 //   headingTitleTokenizer — HeadingTitle: the title text to end of line
 //   rawTokenizer          — triple-backtick raw block open/body/close
 //   rawInlineTokenizer    — single-backtick raw inline content
 //   codeBlockTokenizer    — brace-depth tracking inside #{ ... }
 //   blockCommentTokenizer — depth-tracked nested /* ... */ comments
 //   codeIdentTokenizer    — CodeIdent: identifier, only fires in code context
 //   strongBodyTokenizer   — StrongBody: content inside *...*
@@ -62,10 +61,16 @@ RawInline { "`" RawInlineContent? "`" }
 //   #[ ... ]          — content block (re-parses as markup items)
 CodeExpr { "#" codeExprBody }
 // codeExprBody: forms valid after '#' in markup, or after ':' / '=' in a
 // keyword-body.  FuncExpr handles ident+callSuffix(s); bare CodeIdent handles
 // a plain variable reference (#x).  No CallExpr with callSuffix* here — that
 // *-quantifier makes both shift and reduce carry !call precedence (a tie that
 // @right cannot resolve reliably once codeStatement* state-merging is in play).
 codeExprBody {
  KeywordExpr |
  AtomExpr |
-  CallExpr |
+  FuncExpr   |
  CodeIdent  |
  CodeBlock |
  ContentBlock
 }
@@ -73,18 +78,59 @@ codeExprBody {
 // callOrValue covers the subject of a keyword expression (#set text, #show link,
 // #import "pkg", #let name).  keywordBody is exclusive: ':' for show-rule bodies
 // and '=' for let-binding values (a keyword expression never has both).
-KeywordExpr { CodeKeyword callOrValue? keywordBody? }
+// Two precedences:
-callOrValue { CallExpr | CodeString }
+//   call @right — prefer extending callSuffixes (FuncExpr) over completing the
 //           FuncExpr and letting '(' start a new statement.  The `!call` marker
 //           encodes the shift as (call << 2) and the FuncExpr reduce as
 //           (call << 2) - 1 (due to @right); shift > reduce, so callSuffix
 //           chains are greedily extended.  Without @right both actions have
 //           the same numeric precedence and the conflict is unresolved.
 //   kw   — prefer CodeKeyword !kw callOrValueAndBody over CodeKeyword keywordBody?
 //           when an identifier follows the keyword.  shift = kw << 2, reduce
 //           (second alternative) = 0; kw > 0, no @right needed.
@precedence { call @right, kw }
 // KeywordExpr: used in markup-level code (#show, #let, #set …) AND nested
 // inside codeExprBody (e.g. the RHS after ':' in a show-rule).
 // Same two-alternative structure as codeStatement: the !kw on the first
 // alternative gives the shift prec kw > 0 over the unannotated reduce of the
 // second alternative (prec 0).  This avoids the call-vs-call tie that arises
 // from the old `callOrValue?` optional pattern.
 KeywordExpr {
  CodeKeyword !kw callOrValueAndBody |
  CodeKeyword keywordBody?
 }
 // callOrValue: FuncExpr for "ident(args)" / "ident.method", bare CodeIdent for
 // a plain name, CodeString for string subjects like #import "pkg".
 // FuncExpr requires at least one callSuffix, so at [CodeIdent ·] seeing '(':
 //   SHIFT (start callSuffixes, prec call) vs REDUCE bare CodeIdent (prec 0).
 //   call > 0 → shift wins cleanly.
 callOrValue { FuncExpr | CodeIdent | CodeString }
 keywordBody { ":" codeExprBody | "=" codeValue }
 AtomExpr    { CodeBool    }
-// CallExpr allows zero suffixes — used at top level (#x) and after keywords
+// codeStatement is the unit inside a CodeBlock's brace body.
-// (#set text(...)) where even a bare identifier is valid as a named reference.
+// Two explicit alternatives for the keyword case avoid the LALR ambiguity
-CallExpr { CodeIdent callSuffix* }
+// that arises from codeStatement* merging when callOrValue? is optional.
-// FuncExpr requires at least one suffix — used inside codeValue so that
+// The !kw annotation on the first alternative (shift callOrValueAndBody) has
-// 'table(...)' gets tok-function while plain identifiers like 'left'/'center'
+// higher precedence than the bare reduce of the second alternative (prec 0),
-// get tok-variableName instead of being false-positively styled as functions.
+// so 'show strong: …' grabs 'strong' as callOrValue rather than completing
-FuncExpr { CodeIdent callSuffix+ }
+// the keyword statement early with an empty callOrValue.
 codeStatement {
  CodeKeyword !kw callOrValueAndBody |
  CodeKeyword keywordBody? |
  codeValue |
  ";"
 }
 callOrValueAndBody { callOrValue keywordBody? }
 // FuncExpr: identifier followed by one-or-more call suffixes.
 // callSuffixes uses explicit left-recursion (not +) so the !call annotation
 // on the recursive extension point gives the shift prec call vs the unannotated
 // reduce of codeValue → FuncExpr (prec 0) — shift wins, no @right tie.
 callSuffixes { callSuffix | callSuffixes !call callSuffix }
 FuncExpr { CodeIdent !call callSuffixes }
 callSuffix {
  CodeArgs |
  "." CodeIdent |
@@ -114,8 +160,9 @@ codeValue {
 // Reuses codeArgList so named-key entries like (auto, 1fr) work too.
 CodeArray { "(" codeArgList? ")" }
-// CodeBlockBody depth-tracks braces so #{ let x = { 1 } } parses correctly.
+// CodeBlock parses its content as a codeStatement* list so that keywords
-CodeBlock    { "{" CodeBlockBody? "}" }
+// (show, let, set…) and identifiers inside braces receive proper highlighting.
 CodeBlock    { "{" codeStatement* "}" }
 // ContentBlock re-enters markup mode, allowing #[*bold* text].
 ContentBlock { "[" item* "]" }
@@ -162,10 +209,6 @@ Escape { "\\" EscapeChar }
  RawInlineContent
 }
@external tokens codeBlockTokenizer from "./tokens.mjs" {
  CodeBlockBody
 }
@external tokens blockCommentTokenizer from "./tokens.mjs" {
  BlockCommentBody
 }
@@ -6,42 +6,40 @@ export const
  RawBlockBody = 4,
  RawBlockClose = 5,
  RawInlineContent = 6,
-  CodeBlockBody = 7,
+  BlockCommentBody = 7,
-  BlockCommentBody = 8,
+  LineCommentContent = 8,
-  LineCommentContent = 9,
+  MathContent = 9,
-  MathContent = 10,
+  CodeKeyword = 10,
-  CodeKeyword = 11,
+  CodeIdent = 11,
-  CodeIdent = 12,
+  CodeArgKey = 12,
-  CodeArgKey = 13,
+  StrongBody = 13,
-  StrongBody = 14,
+  EmphBody = 14,
-  EmphBody = 15,
+  Document = 15,
-  Document = 16,
+  Heading = 16,
-  Heading = 17,
+  LineComment = 17,
-  LineComment = 18,
+  BlockComment = 18,
-  BlockComment = 19,
+  RawBlock = 19,
-  RawBlock = 20,
+  RawInline = 20,
-  RawInline = 21,
+  CodeExpr = 21,
-  CodeExpr = 22,
+  KeywordExpr = 22,
-  KeywordExpr = 23,
+  FuncExpr = 23,
-  CallExpr = 24,
+  CodeArgs = 24,
-  CodeArgs = 25,
+  CodeString = 25,
-  CodeString = 26,
+  CodeNumber = 26,
-  CodeNumber = 27,
+  CodeBool = 27,
-  CodeBool = 28,
+  ContentBlock = 28,
-  FuncExpr = 29,
+  CodeBlock = 29,
-  ContentBlock = 30,
+  InlineMath = 30,
-  CodeBlock = 31,
+  CodeArray = 31,
-  InlineMath = 32,
+  AtomExpr = 32,
-  CodeArray = 33,
+  Strong = 33,
-  AtomExpr = 34,
+  Emphasis = 34,
-  Strong = 35,
+  Label = 35,
-  Emphasis = 36,
+  LabelName = 36,
-  Label = 37,
+  Ref = 37,
-  LabelName = 38,
+  RefName = 38,
-  Ref = 39,
+  Escape = 39,
-  RefName = 40,
+  EscapeChar = 40,
-  Escape = 41,
+  URL = 41,
-  EscapeChar = 42,
+  MarkupContent = 42,
-  URL = 43,
+  ClosingSquare = 43
  MarkupContent = 44,
  ClosingSquare = 45