fix(typst): highlight keywords/idents inside #{} code blocks
Build and Deploy Verso / deploy (push) Successful in 9m32s

Replace the opaque CodeBlockBody external tokenizer with grammar-parsed
codeStatement* so that keywords (show, let, set, …) and identifiers
inside #{ } code blocks receive proper Lezer nodes and are highlighted.

Key grammar changes:
- CodeBlock { "{" codeStatement* "}" } — structured, not opaque
- codeStatement uses two explicit alternatives for keyword lines:
    CodeKeyword !kw callOrValueAndBody  (grabs the subject eagerly)
    CodeKeyword keywordBody?            (bare keyword or body-only form)
  The !kw marker gives the shift precedence kw (> 0), which beats the
  unannotated reduce (precedence 0) and so resolves the LALR state-merge
  ambiguity without needing @left/@right on kw.
- callOrValue { FuncExpr | CodeIdent | CodeString } — replaces CallExpr
  { CodeIdent !call callSuffix* }.  The * quantifier annotated both
  shift and reduce with !call, leaving them tied at the same precedence,
  which @right could not reliably resolve in merged states.  Using
  FuncExpr (callSuffixes required) + bare CodeIdent breaks the tie
  (call > 0 for the FuncExpr shift vs 0 for the bare-ident reduce);
  @right then only has to handle the extension-of-callSuffixes case
  (shift = call<<2, FuncExpr reduce = call<<2 - 1 via @right encoding).
- KeywordExpr gets the same two-alternative structure as codeStatement
  so nested show/set/let inside a code block (e.g. show sel: set text)
  also parse without LALR state-merge conflicts.
- CallExpr removed; its role is split between FuncExpr (has args/chain)
  and bare CodeIdent (no args).  Styling updated: CodeExpr/CodeIdent
  replaces CallExpr/CodeIdent for bare #ident function-style highlights.
- codeKeywordTokenizer and codeIdentTokenizer accept keywords /
  identifiers after { and ; (the tokens.mjs hunks in this diff extend
  their look-behind checks accordingly) — consistent with the new grammar.

Parse results:
  #{ show strong: link.with(url); body }
  → CodeKeyword "show", CodeIdent "strong", FuncExpr "link.with(url)",
    CodeIdent "body" — all properly highlighted, no ⚠ errors.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
claude
2026-06-09 21:47:21 +00:00
parent 056d9a7f47
commit 0656ddfe52
5 changed files with 132 additions and 113 deletions
@@ -49,11 +49,11 @@ export const TypstLanguage = LRLanguage.define({
CodeBool: t.atom,
// Identifiers:
// CallExpr/CodeIdent — top-level #func or after keywords (#set text) → function style
// FuncExpr/CodeIdent — func call inside a value expr (has args/method) → function style
// CodeExpr/CodeIdent — bare #func (no args) → function style
// FuncExpr/CodeIdent — func call with args/method (#func(...), link.with(url)) → function style
// CodeArgKey — named arg key (tokenizer pre-disambiguates on ':') → attributeName
// CodeIdent — plain variable/constant reference (e.g. 'left', 'center') → variable
'CallExpr/CodeIdent': t.function(t.variableName),
'CodeExpr/CodeIdent': t.function(t.variableName),
'FuncExpr/CodeIdent': t.function(t.variableName),
CodeArgKey: t.attributeName,
CodeIdent: t.variableName,
@@ -8,7 +8,6 @@ import {
RawBlockBody,
RawBlockClose,
RawInlineContent,
CodeBlockBody,
BlockCommentBody,
LineCommentContent,
MathContent,
@@ -35,6 +34,7 @@ const DOT = 46 // .
const OPEN_PAREN = 40 // (
const COMMA = 44 // ,
const COLON = 58 // :
const SEMICOLON = 59 // ;
const OPEN_ANGLE = 60 // <
const CLOSE_ANGLE = 62 // >
@@ -188,36 +188,6 @@ export const rawInlineTokenizer = new ExternalTokenizer(
{ contextual: false }
)
// ── codeBlockTokenizer ──────────────────────────────────────────────────
// Produces CodeBlockBody: everything between the braces of a #{ ... } block.
// Nested braces are counted so that an inner pair (e.g. #{ f({ x }) }) stays
// part of the body instead of terminating the outer block.
export const codeBlockTokenizer = new ExternalTokenizer(
  (input, _stack) => {
    // The grammar rule has already consumed the opening '{', so nesting
    // starts with one unclosed brace.
    let openBraces = 1
    let consumedAny = false
    for (let ch = input.next; ch !== -1; ch = input.next) {
      if (ch === CLOSE_BRACE) {
        // The brace that balances the opener is left for the grammar rule.
        if (openBraces === 1) break
        openBraces -= 1
      } else if (ch === OPEN_BRACE) {
        openBraces += 1
      }
      input.advance()
      consumedAny = true
    }
    // An empty body yields no token at all.
    if (consumedAny) input.acceptToken(CodeBlockBody)
  },
  { contextual: false }
)
// ── blockCommentTokenizer ───────────────────────────────────────────────
// Emits BlockCommentBody — the interior of a /* ... */ comment.
// Typst supports nested block comments (/* /* inner */ outer */), so this
@@ -298,12 +268,12 @@ export const mathContentTokenizer = new ExternalTokenizer(
export const codeKeywordTokenizer = new ExternalTokenizer(
(input, stack) => {
if (!stack.canShift(CodeKeyword)) return
// Valid positions: immediately after '#' (normal #set, #show) or after ':'
// (show-body: '#show sel: set text(...)'). Walk back past optional whitespace.
// Valid positions: after '#', ':', '{' (code block start), or ';'.
// Walk back past optional whitespace.
let back = -1
while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back--
const kwPrev = input.peek(back)
if (kwPrev !== HASH && kwPrev !== COLON) return
if (kwPrev !== HASH && kwPrev !== COLON && kwPrev !== OPEN_BRACE && kwPrev !== SEMICOLON) return
// Peek ahead to read the full identifier without advancing.
let len = 0
@@ -355,16 +325,24 @@ export const codeIdentTokenizer = new ExternalTokenizer(
const prev = input.peek(back)
if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA && prev !== EQUALS && prev !== COLON) {
// May be after a keyword chain like '#set text' or (in show body) 'set body':
// scan back through the preceding identifier word, skip whitespace, and
// verify '#' or ':' precedes it. Accepting ':' lets multi-word chains
// like '#show sel: set text' find ':' before 'set'.
if (!isIdentTail(prev)) return
let b = back
while (isIdentTail(input.peek(b))) b--
while (input.peek(b) === SPACE || input.peek(b) === TAB || input.peek(b) === NEWLINE) b--
const chainEnd = input.peek(b)
if (chainEnd !== HASH && chainEnd !== COLON) return
if (!isIdentTail(prev)) {
// prev is a structural delimiter (e.g. ')' after a function call, '{' at
// block start, '}' after a nested block). These are valid statement-start
// positions inside a CodeBlock's codeStatement* list. Trust canShift —
// it's reliable in the grammar-parsed code-block states.
if (!couldBeIdent) return
} else {
// prev looks like the tail of a preceding word — scan back to find '#' or ':'.
// Accepting ':' lets multi-word chains like 'show sel: set text' work.
let b = back
while (isIdentTail(input.peek(b))) b--
while (input.peek(b) === SPACE || input.peek(b) === TAB || input.peek(b) === NEWLINE) b--
const chainEnd = input.peek(b)
if (chainEnd !== HASH && chainEnd !== COLON) {
// Could be second+ statement in a code block (e.g. after 'let x = 1').
if (!couldBeIdent) return
}
}
}
// In arg-delimiter positions ('(' or ',') we may emit CodeArgKey regardless
@@ -5,7 +5,6 @@
// headingTitleTokenizer — HeadingTitle: the title text to end of line
// rawTokenizer — triple-backtick raw block open/body/close
// rawInlineTokenizer — single-backtick raw inline content
// codeBlockTokenizer — brace-depth tracking inside #{ ... }
// blockCommentTokenizer — depth-tracked nested /* ... */ comments
// codeIdentTokenizer — CodeIdent: identifier, only fires in code context
// strongBodyTokenizer — StrongBody: content inside *...*
@@ -62,10 +61,16 @@ RawInline { "`" RawInlineContent? "`" }
// #[ ... ] — content block (re-parses as markup items)
CodeExpr { "#" codeExprBody }
// codeExprBody: forms valid after '#' in markup, or after ':' / '=' in a
// keyword-body. FuncExpr handles ident+callSuffix(s); bare CodeIdent handles
// a plain variable reference (#x). No CallExpr with callSuffix* here — that
// *-quantifier makes both shift and reduce carry !call precedence (a tie that
// @right cannot resolve reliably once codeStatement* state-merging is in play).
codeExprBody {
KeywordExpr |
AtomExpr |
CallExpr |
FuncExpr |
CodeIdent |
CodeBlock |
ContentBlock
}
@@ -73,18 +78,59 @@ codeExprBody {
// callOrValue covers the subject of a keyword expression (#set text, #show link,
// #import "pkg", #let name). keywordBody is exclusive: ':' for show-rule bodies
// and '=' for let-binding values (a keyword expression never has both).
KeywordExpr { CodeKeyword callOrValue? keywordBody? }
callOrValue { CallExpr | CodeString }
// Two precedences:
// call @right — prefer extending callSuffixes (FuncExpr) over completing the
// FuncExpr and letting '(' start a new statement. The `!call` marker
// encodes the shift as (call << 2) and the FuncExpr reduce as
// (call << 2) - 1 (due to @right); shift > reduce, so callSuffix
// chains are greedily extended. Without @right both actions have
// the same numeric precedence and the conflict is unresolved.
// kw — prefer CodeKeyword !kw callOrValueAndBody over CodeKeyword keywordBody?
// when an identifier follows the keyword. shift = kw << 2, reduce
// (second alternative) = 0; kw > 0, no @right needed.
@precedence { call @right, kw }
// KeywordExpr: used in markup-level code (#show, #let, #set …) AND nested
// inside codeExprBody (e.g. the RHS after ':' in a show-rule).
// Same two-alternative structure as codeStatement: the !kw on the first
// alternative gives the shift prec kw > 0 over the unannotated reduce of the
// second alternative (prec 0). This avoids the call-vs-call tie that arises
// from the old `callOrValue?` optional pattern.
KeywordExpr {
CodeKeyword !kw callOrValueAndBody |
CodeKeyword keywordBody?
}
// callOrValue: FuncExpr for "ident(args)" / "ident.method", bare CodeIdent for
// a plain name, CodeString for string subjects like #import "pkg".
// FuncExpr requires at least one callSuffix, so at [CodeIdent ·] seeing '(':
// SHIFT (start callSuffixes, prec call) vs REDUCE bare CodeIdent (prec 0).
// call > 0 → shift wins cleanly.
callOrValue { FuncExpr | CodeIdent | CodeString }
keywordBody { ":" codeExprBody | "=" codeValue }
AtomExpr { CodeBool }
// CallExpr allows zero suffixes — used at top level (#x) and after keywords
// (#set text(...)) where even a bare identifier is valid as a named reference.
CallExpr { CodeIdent callSuffix* }
// FuncExpr requires at least one suffix — used inside codeValue so that
// 'table(...)' gets tok-function while plain identifiers like 'left'/'center'
// get tok-variableName instead of being false-positively styled as functions.
FuncExpr { CodeIdent callSuffix+ }
// codeStatement is the unit inside a CodeBlock's brace body.
// Two explicit alternatives for the keyword case avoid the LALR ambiguity
// that arises from codeStatement* state merging when the subject is written
// as an optional `callOrValue?`.
// The !kw annotation on the first alternative (shift callOrValueAndBody) has
// higher precedence than the bare reduce of the second alternative (prec 0),
// so 'show strong: …' grabs 'strong' as callOrValue rather than completing
// the keyword statement early with no callOrValue.
codeStatement {
CodeKeyword !kw callOrValueAndBody |
CodeKeyword keywordBody? |
codeValue |
";"
}
callOrValueAndBody { callOrValue keywordBody? }
// FuncExpr: identifier followed by one-or-more call suffixes.
// callSuffixes uses explicit left-recursion (not +) so the !call annotation
// on the recursive extension point gives the shift prec call vs the unannotated
// reduce of codeValue → FuncExpr (prec 0) — shift wins, no @right tie.
callSuffixes { callSuffix | callSuffixes !call callSuffix }
FuncExpr { CodeIdent !call callSuffixes }
callSuffix {
CodeArgs |
"." CodeIdent |
@@ -114,8 +160,9 @@ codeValue {
// Reuses codeArgList so named-key entries like (auto, 1fr) work too.
CodeArray { "(" codeArgList? ")" }
// CodeBlockBody depth-tracks braces so #{ let x = { 1 } } parses correctly.
CodeBlock { "{" CodeBlockBody? "}" }
// CodeBlock parses its content as a codeStatement* list so that keywords
// (show, let, set…) and identifiers inside braces receive proper highlighting.
CodeBlock { "{" codeStatement* "}" }
// ContentBlock re-enters markup mode, allowing #[*bold* text].
ContentBlock { "[" item* "]" }
@@ -162,10 +209,6 @@ Escape { "\\" EscapeChar }
RawInlineContent
}
@external tokens codeBlockTokenizer from "./tokens.mjs" {
CodeBlockBody
}
@external tokens blockCommentTokenizer from "./tokens.mjs" {
BlockCommentBody
}
File diff suppressed because one or more lines are too long
@@ -6,42 +6,40 @@ export const
RawBlockBody = 4,
RawBlockClose = 5,
RawInlineContent = 6,
CodeBlockBody = 7,
BlockCommentBody = 8,
LineCommentContent = 9,
MathContent = 10,
CodeKeyword = 11,
CodeIdent = 12,
CodeArgKey = 13,
StrongBody = 14,
EmphBody = 15,
Document = 16,
Heading = 17,
LineComment = 18,
BlockComment = 19,
RawBlock = 20,
RawInline = 21,
CodeExpr = 22,
KeywordExpr = 23,
CallExpr = 24,
CodeArgs = 25,
CodeString = 26,
CodeNumber = 27,
CodeBool = 28,
FuncExpr = 29,
ContentBlock = 30,
CodeBlock = 31,
InlineMath = 32,
CodeArray = 33,
AtomExpr = 34,
Strong = 35,
Emphasis = 36,
Label = 37,
LabelName = 38,
Ref = 39,
RefName = 40,
Escape = 41,
EscapeChar = 42,
URL = 43,
MarkupContent = 44,
ClosingSquare = 45
BlockCommentBody = 7,
LineCommentContent = 8,
MathContent = 9,
CodeKeyword = 10,
CodeIdent = 11,
CodeArgKey = 12,
StrongBody = 13,
EmphBody = 14,
Document = 15,
Heading = 16,
LineComment = 17,
BlockComment = 18,
RawBlock = 19,
RawInline = 20,
CodeExpr = 21,
KeywordExpr = 22,
FuncExpr = 23,
CodeArgs = 24,
CodeString = 25,
CodeNumber = 26,
CodeBool = 27,
ContentBlock = 28,
CodeBlock = 29,
InlineMath = 30,
CodeArray = 31,
AtomExpr = 32,
Strong = 33,
Emphasis = 34,
Label = 35,
LabelName = 36,
Ref = 37,
RefName = 38,
Escape = 39,
EscapeChar = 40,
URL = 41,
MarkupContent = 42,
ClosingSquare = 43