feat(typst): parse show-rule bodies, let-value bindings, and content-block call args

Three grammar gaps caused large blocks of code to be left unhighlighted:

1. KeywordExpr now accepts an exclusive keywordBody: in '#show sel: body' the body is parsed after ':', and in '#let name = value' the value is parsed after '='. callOrValue extends the subject to include CodeString so that '#import "pkg"' highlights the path.

2. ContentBlock is added to callSuffix so that '#func("arg")[content]' and '#next-step("url")[...]' parse their trailing content block as code rather than falling back to markup.

3. Tokenizer: COLON is added as a valid predecessor so that identifiers (e.g. 'blue' in 'fill: blue') and keywords (e.g. 'set' in '#show link: set text(...)') are recognised after ':'. EQUALS was already added in the previous commit. The ident-chain backward scan now also skips whitespace before testing for '#' or ':', enabling 'text' in 'set text' to trace back to '#' through the keyword gap.

@precedence is updated with CodeString, '[', and ':' to resolve overlapping-token conflicts with MarkupContent in merged states.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-09 14:57:21 +00:00
parent 47cf84f20b
commit d7ca7b194d
2 changed files with 27 additions and 12 deletions
@@ -285,8 +285,12 @@ export const mathContentTokenizer = new ExternalTokenizer(
 export const codeKeywordTokenizer = new ExternalTokenizer(
  (input, stack) => {
    if (!stack.canShift(CodeKeyword)) return
-    // Only fire right after '#'; any other predecessor means we are in body text.
-    if (input.peek(-1) !== HASH) return
+    // Valid positions: after '#' (normal #set, #show) or after ':'
+    // (show-body: '#show sel: set text(...)').  In both cases any whitespace
+    // between that character and the keyword is walked back past first.
+    let back = -1
+    while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back--
+    const kwPrev = input.peek(back)
+    if (kwPrev !== HASH && kwPrev !== COLON) return

    // Peek ahead to read the full identifier without advancing.
    let len = 0
@@ -337,18 +341,23 @@ export const codeIdentTokenizer = new ExternalTokenizer(
    while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back--
    const prev = input.peek(back)

-    if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA) {
-      // May be after a keyword like '#set' or '#show': scan back through the
-      // keyword word itself and check that '#' immediately precedes it.
+    if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA && prev !== EQUALS && prev !== COLON) {
+      // May be after a keyword chain like '#set text' or (in show body) 'set body':
+      // scan back through the preceding identifier word, skip whitespace, and
+      // verify '#' or ':' precedes it.  Accepting ':' lets multi-word chains
+      // like '#show sel: set text' find ':' before 'set'.
      if (!isIdentTail(prev)) return
      let b = back
      while (isIdentTail(input.peek(b))) b--
-      if (input.peek(b) !== HASH) return
+      while (input.peek(b) === SPACE || input.peek(b) === TAB || input.peek(b) === NEWLINE) b--
+      const chainEnd = input.peek(b)
+      if (chainEnd !== HASH && chainEnd !== COLON) return
    }

    // In arg-delimiter positions ('(' or ',') we may emit CodeArgKey regardless
    // of canShift(CodeIdent) — LALR merging can suppress canShift(CodeIdent)
    // after a complex first argument (e.g. figure(table(...), caption: ...)).
+    // ':' and '=' are value positions, NOT arg-key positions.
    const couldBeArgKey = prev === OPEN_PAREN || prev === COMMA
    if (!couldBeIdent && !couldBeArgKey) return

@@ -68,10 +68,12 @@ codeExprBody {
  ContentBlock
 }

-// CallExpr? covers '#set text(size: 12pt)', '#show heading: ...', etc.
-// The optional CallExpr is only shifted when the next token is CodeIdent,
-// so there is no shift/reduce conflict with other items that follow keywords.
-KeywordExpr { CodeKeyword CallExpr? }
+// callOrValue covers the subject of a keyword expression (#set text, #show link,
+// #import "pkg", #let name).  keywordBody is exclusive: ':' for show-rule bodies
+// and '=' for let-binding values (a keyword expression never has both).
+KeywordExpr { CodeKeyword callOrValue? keywordBody? }
+callOrValue { CallExpr | CodeString }
+keywordBody { ":" codeExprBody | "=" codeValue }
 AtomExpr    { CodeBool    }

 // CallExpr allows zero suffixes — used at top level (#x) and after keywords
@@ -83,7 +85,8 @@ CallExpr { CodeIdent callSuffix* }
 FuncExpr { CodeIdent callSuffix+ }
 callSuffix {
  CodeArgs |
-  "." CodeIdent
+  "." CodeIdent |
+  ContentBlock
 }

 CodeArgs    { "(" codeArgList? ")" }
@@ -235,7 +238,10 @@ Escape { "\\" EscapeChar }
  //   by MarkupContent (redundant since '_' is in MarkupContent's exclusion
  //   set, but kept for clarity).
  // CodeIdent and StrongText/EmphText are now external tokens — not listed.
-  @precedence { CodeBool EscapeChar "(" "." "]" "_" spaces MarkupContent }
+  // "["  > MarkupContent: ContentBlock callSuffix wins in merged code/markup states.
+  // CodeString > MarkupContent: '"' starts a string literal after a keyword.
+  // ":"  > MarkupContent: keywordBody ':' wins over markup colon in code states.
+  @precedence { CodeBool EscapeChar CodeString "[" ":" "(" "." "]" "_" spaces MarkupContent }
 }

@skip { spaces }