feat(typst): parse show-rule bodies, let-value bindings, and content-block call args
Build and Deploy Verso / deploy (push) Successful in 14m13s

Three grammar gaps caused large blocks of code to be unhighlighted:

1. KeywordExpr now accepts an optional keywordBody with two mutually
   exclusive forms: ':' introduces a show-rule body ('#show sel: body') and
   '=' introduces a let-binding value ('#let name = value').  callOrValue
   extends the keyword subject to include CodeString so that the path in
   '#import "pkg"' is highlighted.

2. ContentBlock added to callSuffix so '#func("arg")[content]' and
   '#next-step("url")[...]' parse their trailing content block as code
   rather than falling back to markup.

3. Tokenizer: COLON added as a valid predecessor so identifiers (e.g. 'blue'
   in 'fill: blue') and keywords (e.g. 'set' in '#show link: set text(...)')
   are recognised after ':'.  EQUALS already added in the previous commit.
   The ident-chain backward scan now steps back over the preceding word and
   then over any whitespace before testing for '#' or ':', so 'text' in
   '#show link: set text' traces back through 'set' to the ':'.  @precedence
   now lists CodeString, '[', and ':' ahead of MarkupContent to resolve
   overlapping-token conflicts in merged code/markup states.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
claude
2026-06-09 14:57:21 +00:00
parent 47cf84f20b
commit d7ca7b194d
2 changed files with 27 additions and 12 deletions
@@ -285,8 +285,12 @@ export const mathContentTokenizer = new ExternalTokenizer(
export const codeKeywordTokenizer = new ExternalTokenizer(
(input, stack) => {
if (!stack.canShift(CodeKeyword)) return
// Only fire right after '#'; any other predecessor means we are in body text.
if (input.peek(-1) !== HASH) return
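// Valid positions: after '#' (normal '#set', '#show') or after ':' in a
// show-rule body ('#show sel: set text(...)'). Whitespace between that
// character and the keyword is skipped before the check.
// NOTE(review): the skip applies to '#' as well as ':', so '# set' is also
// accepted here — confirm that is intended.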
let back = -1
while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back--
const kwPrev = input.peek(back)
if (kwPrev !== HASH && kwPrev !== COLON) return
// Peek ahead to read the full identifier without advancing.
let len = 0
@@ -337,18 +341,23 @@ export const codeIdentTokenizer = new ExternalTokenizer(
while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back--
const prev = input.peek(back)
if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA) {
// May be after a keyword like '#set' or '#show': scan back through the
// keyword word itself and check that '#' immediately precedes it.
if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA && prev !== EQUALS && prev !== COLON) {
// May follow a keyword, as in '#set text' or, inside a show-rule body,
// ': set text'. Scan back over the preceding identifier word, then over any
// whitespace, and require '#' or ':' immediately before that. Accepting ':'
// lets 'text' in '#show sel: set text' trace back through 'set' to the ':'.
if (!isIdentTail(prev)) return
let b = back
while (isIdentTail(input.peek(b))) b--
if (input.peek(b) !== HASH) return
while (input.peek(b) === SPACE || input.peek(b) === TAB || input.peek(b) === NEWLINE) b--
const chainEnd = input.peek(b)
if (chainEnd !== HASH && chainEnd !== COLON) return
}
// In arg-delimiter positions ('(' or ',') we may emit CodeArgKey regardless
// of canShift(CodeIdent) — LALR merging can suppress canShift(CodeIdent)
// after a complex first argument (e.g. figure(table(...), caption: ...)).
// ':' and '=' are value positions, NOT arg-key positions.
const couldBeArgKey = prev === OPEN_PAREN || prev === COMMA
if (!couldBeIdent && !couldBeArgKey) return
@@ -68,10 +68,12 @@ codeExprBody {
ContentBlock
}
// CallExpr? covers '#set text(size: 12pt)', '#show heading: ...', etc.
// The optional CallExpr is only shifted when the next token is CodeIdent,
// so there is no shift/reduce conflict with other items that follow keywords.
KeywordExpr { CodeKeyword CallExpr? }
// callOrValue covers the subject of a keyword expression (#set text, #show link,
// #import "pkg", #let name). keywordBody is exclusive: ':' for show-rule bodies
// and '=' for let-binding values (a keyword expression never has both).
KeywordExpr { CodeKeyword callOrValue? keywordBody? }
callOrValue { CallExpr | CodeString }
keywordBody { ":" codeExprBody | "=" codeValue }
AtomExpr { CodeBool }
// CallExpr allows zero suffixes — used at top level (#x) and after keywords
@@ -83,7 +85,8 @@ CallExpr { CodeIdent callSuffix* }
FuncExpr { CodeIdent callSuffix+ }
callSuffix {
CodeArgs |
"." CodeIdent
"." CodeIdent |
ContentBlock
}
CodeArgs { "(" codeArgList? ")" }
@@ -235,7 +238,10 @@ Escape { "\\" EscapeChar }
// by MarkupContent (redundant since '_' is in MarkupContent's exclusion
// set, but kept for clarity).
// CodeIdent and StrongText/EmphText are now external tokens — not listed.
@precedence { CodeBool EscapeChar "(" "." "]" "_" spaces MarkupContent }
// "[" > MarkupContent: ContentBlock callSuffix wins in merged code/markup states.
// CodeString > MarkupContent: '"' starts a string literal after a keyword.
// ":" > MarkupContent: keywordBody ':' wins over markup colon in code states.
@precedence { CodeBool EscapeChar CodeString "[" ":" "(" "." "]" "_" spaces MarkupContent }
}
@skip { spaces }