fix(typst): highlight keywords/idents inside #{} code blocks
Build and Deploy Verso / deploy (push) Successful in 9m32s

Replace the opaque CodeBlockBody external tokenizer with grammar-parsed
codeStatement* so that keywords (show, let, set, …) and identifiers
inside #{ } code blocks receive proper Lezer nodes and are highlighted.

Key grammar changes:
- CodeBlock { "{" codeStatement* "}" } — structured, not opaque
- codeStatement uses two explicit alternatives for keyword lines:
    CodeKeyword !kw callOrValueAndBody  (grabs the subject eagerly)
    CodeKeyword keywordBody?            (bare keyword or body-only form)
  The !kw marker gives the shift precedence kw (> 0), which beats the
  unannotated reduce (precedence 0) and resolves the LALR state-merge
  ambiguity without needing @left/@right on kw.
- callOrValue { FuncExpr | CodeIdent | CodeString } — replaces CallExpr
  { CodeIdent !call callSuffix* }.  The * quantifier annotated both
  shift and reduce with !call, making them a same-prec tie that @right
  could not reliably resolve in merged states.  Using FuncExpr (which
  requires at least one callSuffix) + bare CodeIdent breaks the tie
  (call > 0 for the FuncExpr shift vs 0 for the bare-ident reduce);
  @right then handles only the case of extending callSuffixes
  (shift = call<<2, FuncExpr reduce = call<<2 - 1 via the @right encoding).
- KeywordExpr gets the same two-alternative structure as codeStatement
  so nested show/set/let inside a code block (e.g. show sel: set text)
  also parse without LALR state-merge conflicts.
- CallExpr removed; its role is split between FuncExpr (has args/chain)
  and bare CodeIdent (no args).  Styling updated: CodeExpr/CodeIdent
  replaces CallExpr/CodeIdent for bare #ident function-style highlights.
- codeKeywordTokenizer now accepts keywords after { and ; (in addition
  to # and :), and codeIdentTokenizer accepts identifiers at statement
  starts inside a code block — consistent with the new grammar.

Parse results:
  #{ show strong: link.with(url); body }
  → CodeKeyword "show", CodeIdent "strong", FuncExpr "link.with(url)",
    CodeIdent "body" — all properly highlighted, no ⚠ errors.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
claude
2026-06-09 21:47:21 +00:00
parent 056d9a7f47
commit 0656ddfe52
5 changed files with 132 additions and 113 deletions
@@ -49,11 +49,11 @@ export const TypstLanguage = LRLanguage.define({
CodeBool: t.atom, CodeBool: t.atom,
// Identifiers: // Identifiers:
// CallExpr/CodeIdent — top-level #func or after keywords (#set text) → function style // CodeExpr/CodeIdent — bare #func (no args) → function style
// FuncExpr/CodeIdent — func call inside a value expr (has args/method) → function style // FuncExpr/CodeIdent — func call with args/method (#func(...), link.with(url)) → function style
// CodeArgKey — named arg key (tokenizer pre-disambiguates on ':') → attributeName // CodeArgKey — named arg key (tokenizer pre-disambiguates on ':') → attributeName
// CodeIdent — plain variable/constant reference (e.g. 'left', 'center') → variable // CodeIdent — plain variable/constant reference (e.g. 'left', 'center') → variable
'CallExpr/CodeIdent': t.function(t.variableName), 'CodeExpr/CodeIdent': t.function(t.variableName),
'FuncExpr/CodeIdent': t.function(t.variableName), 'FuncExpr/CodeIdent': t.function(t.variableName),
CodeArgKey: t.attributeName, CodeArgKey: t.attributeName,
CodeIdent: t.variableName, CodeIdent: t.variableName,
@@ -8,7 +8,6 @@ import {
RawBlockBody, RawBlockBody,
RawBlockClose, RawBlockClose,
RawInlineContent, RawInlineContent,
CodeBlockBody,
BlockCommentBody, BlockCommentBody,
LineCommentContent, LineCommentContent,
MathContent, MathContent,
@@ -35,6 +34,7 @@ const DOT = 46 // .
const OPEN_PAREN = 40 // ( const OPEN_PAREN = 40 // (
const COMMA = 44 // , const COMMA = 44 // ,
const COLON = 58 // : const COLON = 58 // :
const SEMICOLON = 59 // ;
const OPEN_ANGLE = 60 // < const OPEN_ANGLE = 60 // <
const CLOSE_ANGLE = 62 // > const CLOSE_ANGLE = 62 // >
@@ -188,36 +188,6 @@ export const rawInlineTokenizer = new ExternalTokenizer(
{ contextual: false } { contextual: false }
) )
// ── codeBlockTokenizer ──────────────────────────────────────────────────
// Emits CodeBlockBody — the interior of a #{ ... } code block.
// Tracks brace nesting depth so that inner braces (e.g. #{ f({ x }) })
// are included in the body rather than closing the outer block.
//
// Behaviour, as implemented below:
//   - Scanning starts just after the opening '{' with depth = 1.
//   - Every '{' increments depth; every '}' at depth > 1 decrements it.
//   - The scan stops at the '}' seen at depth 1 (not consumed), or when
//     input.next is -1.
//   - A token is accepted only if at least one character was consumed, so
//     an empty block '{}' produces no CodeBlockBody.
//   - No string-literal or comment handling is done here: every brace
//     character is counted, wherever it appears.
// The second argument (_stack) is unused.
export const codeBlockTokenizer = new ExternalTokenizer(
(input, _stack) => {
// The opening '{' has already been consumed by the grammar rule.
let depth = 1
// Set once any character has been consumed; gates acceptToken below.
let hasContent = false
// input.next === -1 ends the scan (presumably end of input — confirm against the Lezer InputStream docs).
while (input.next !== -1) {
const ch = input.next
if (ch === OPEN_BRACE) {
// Nested '{': part of the body, one level deeper.
depth++
input.advance()
hasContent = true
} else if (ch === CLOSE_BRACE) {
if (depth === 1) break // leave this '}' for the grammar rule
// '}' closing a nested brace: part of the body, one level shallower.
depth--
input.advance()
hasContent = true
} else {
// Any other character is consumed as body text.
input.advance()
hasContent = true
}
}
// Emit nothing for an empty interior.
if (hasContent) input.acceptToken(CodeBlockBody)
},
{ contextual: false }
)
// ── blockCommentTokenizer ─────────────────────────────────────────────── // ── blockCommentTokenizer ───────────────────────────────────────────────
// Emits BlockCommentBody — the interior of a /* ... */ comment. // Emits BlockCommentBody — the interior of a /* ... */ comment.
// Typst supports nested block comments (/* /* inner */ outer */), so this // Typst supports nested block comments (/* /* inner */ outer */), so this
@@ -298,12 +268,12 @@ export const mathContentTokenizer = new ExternalTokenizer(
export const codeKeywordTokenizer = new ExternalTokenizer( export const codeKeywordTokenizer = new ExternalTokenizer(
(input, stack) => { (input, stack) => {
if (!stack.canShift(CodeKeyword)) return if (!stack.canShift(CodeKeyword)) return
// Valid positions: immediately after '#' (normal #set, #show) or after ':' // Valid positions: after '#', ':', '{' (code block start), or ';'.
// (show-body: '#show sel: set text(...)'). Walk back past optional whitespace. // Walk back past optional whitespace.
let back = -1 let back = -1
while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back-- while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back--
const kwPrev = input.peek(back) const kwPrev = input.peek(back)
if (kwPrev !== HASH && kwPrev !== COLON) return if (kwPrev !== HASH && kwPrev !== COLON && kwPrev !== OPEN_BRACE && kwPrev !== SEMICOLON) return
// Peek ahead to read the full identifier without advancing. // Peek ahead to read the full identifier without advancing.
let len = 0 let len = 0
@@ -355,16 +325,24 @@ export const codeIdentTokenizer = new ExternalTokenizer(
const prev = input.peek(back) const prev = input.peek(back)
if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA && prev !== EQUALS && prev !== COLON) { if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA && prev !== EQUALS && prev !== COLON) {
// May be after a keyword chain like '#set text' or (in show body) 'set body': if (!isIdentTail(prev)) {
// scan back through the preceding identifier word, skip whitespace, and // prev is a structural delimiter (e.g. ')' after a function call, '{' at
// verify '#' or ':' precedes it. Accepting ':' lets multi-word chains // block start, '}' after a nested block). These are valid statement-start
// like '#show sel: set text' find ':' before 'set'. // positions inside a CodeBlock's codeStatement* list. Trust canShift —
if (!isIdentTail(prev)) return // it's reliable in the grammar-parsed code-block states.
let b = back if (!couldBeIdent) return
while (isIdentTail(input.peek(b))) b-- } else {
while (input.peek(b) === SPACE || input.peek(b) === TAB || input.peek(b) === NEWLINE) b-- // prev looks like the tail of a preceding word — scan back to find '#' or ':'.
const chainEnd = input.peek(b) // Accepting ':' lets multi-word chains like 'show sel: set text' work.
if (chainEnd !== HASH && chainEnd !== COLON) return let b = back
while (isIdentTail(input.peek(b))) b--
while (input.peek(b) === SPACE || input.peek(b) === TAB || input.peek(b) === NEWLINE) b--
const chainEnd = input.peek(b)
if (chainEnd !== HASH && chainEnd !== COLON) {
// Could be second+ statement in a code block (e.g. after 'let x = 1').
if (!couldBeIdent) return
}
}
} }
// In arg-delimiter positions ('(' or ',') we may emit CodeArgKey regardless // In arg-delimiter positions ('(' or ',') we may emit CodeArgKey regardless
@@ -5,7 +5,6 @@
// headingTitleTokenizer — HeadingTitle: the title text to end of line // headingTitleTokenizer — HeadingTitle: the title text to end of line
// rawTokenizer — triple-backtick raw block open/body/close // rawTokenizer — triple-backtick raw block open/body/close
// rawInlineTokenizer — single-backtick raw inline content // rawInlineTokenizer — single-backtick raw inline content
// codeBlockTokenizer — brace-depth tracking inside #{ ... }
// blockCommentTokenizer — depth-tracked nested /* ... */ comments // blockCommentTokenizer — depth-tracked nested /* ... */ comments
// codeIdentTokenizer — CodeIdent: identifier, only fires in code context // codeIdentTokenizer — CodeIdent: identifier, only fires in code context
// strongBodyTokenizer — StrongBody: content inside *...* // strongBodyTokenizer — StrongBody: content inside *...*
@@ -62,10 +61,16 @@ RawInline { "`" RawInlineContent? "`" }
// #[ ... ] — content block (re-parses as markup items) // #[ ... ] — content block (re-parses as markup items)
CodeExpr { "#" codeExprBody } CodeExpr { "#" codeExprBody }
// codeExprBody: forms valid after '#' in markup, or after ':' / '=' in a
// keyword-body. FuncExpr handles ident+callSuffix(s); bare CodeIdent handles
// a plain variable reference (#x). No CallExpr with callSuffix* here — that
// *-quantifier makes both shift and reduce carry !call precedence (a tie that
// @right cannot resolve reliably once codeStatement* state-merging is in play).
codeExprBody { codeExprBody {
KeywordExpr | KeywordExpr |
AtomExpr | AtomExpr |
CallExpr | FuncExpr |
CodeIdent |
CodeBlock | CodeBlock |
ContentBlock ContentBlock
} }
@@ -73,18 +78,59 @@ codeExprBody {
// callOrValue covers the subject of a keyword expression (#set text, #show link, // callOrValue covers the subject of a keyword expression (#set text, #show link,
// #import "pkg", #let name). keywordBody is exclusive: ':' for show-rule bodies // #import "pkg", #let name). keywordBody is exclusive: ':' for show-rule bodies
// and '=' for let-binding values (a keyword expression never has both). // and '=' for let-binding values (a keyword expression never has both).
KeywordExpr { CodeKeyword callOrValue? keywordBody? } // Two precedences:
callOrValue { CallExpr | CodeString } // call @right — prefer extending callSuffixes (FuncExpr) over completing the
// FuncExpr and letting '(' start a new statement. The `!call` marker
// encodes the shift as (call << 2) and the FuncExpr reduce as
// (call << 2) - 1 (due to @right); shift > reduce, so callSuffix
// chains are greedily extended. Without @right both actions have
// the same numeric precedence and the conflict is unresolved.
// kw — prefer CodeKeyword !kw callOrValueAndBody over CodeKeyword keywordBody?
// when an identifier follows the keyword. shift = kw << 2, reduce
// (second alternative) = 0; kw > 0, no @right needed.
@precedence { call @right, kw }
// KeywordExpr: used in markup-level code (#show, #let, #set …) AND nested
// inside codeExprBody (e.g. the RHS after ':' in a show-rule).
// Same two-alternative structure as codeStatement: the !kw on the first
// alternative gives the shift prec kw > 0 over the unannotated reduce of the
// second alternative (prec 0). This avoids the call-vs-call tie that arises
// from the old `callOrValue?` optional pattern.
KeywordExpr {
CodeKeyword !kw callOrValueAndBody |
CodeKeyword keywordBody?
}
// callOrValue: FuncExpr for "ident(args)" / "ident.method", bare CodeIdent for
// a plain name, CodeString for string subjects like #import "pkg".
// FuncExpr requires at least one callSuffix, so at [CodeIdent ·] seeing '(':
// SHIFT (start callSuffixes, prec call) vs REDUCE bare CodeIdent (prec 0).
// call > 0 → shift wins cleanly.
callOrValue { FuncExpr | CodeIdent | CodeString }
keywordBody { ":" codeExprBody | "=" codeValue } keywordBody { ":" codeExprBody | "=" codeValue }
AtomExpr { CodeBool } AtomExpr { CodeBool }
// CallExpr allows zero suffixes — used at top level (#x) and after keywords // codeStatement is the unit inside a CodeBlock's brace body.
// (#set text(...)) where even a bare identifier is valid as a named reference. // Two explicit alternatives for the keyword case avoid the LALR ambiguity
CallExpr { CodeIdent callSuffix* } // that arises from codeStatement* merging when callOrValue? is optional.
// FuncExpr requires at least one suffix — used inside codeValue so that // The !kw annotation on the first alternative (shift callOrValueAndBody) has
// 'table(...)' gets tok-function while plain identifiers like 'left'/'center' // higher precedence than the bare reduce of the second alternative (prec 0),
// get tok-variableName instead of being false-positively styled as functions. // so 'show strong: …' grabs 'strong' as callOrValue rather than completing
FuncExpr { CodeIdent callSuffix+ } // KeywordExpr early with empty callOrValue.
codeStatement {
CodeKeyword !kw callOrValueAndBody |
CodeKeyword keywordBody? |
codeValue |
";"
}
callOrValueAndBody { callOrValue keywordBody? }
// FuncExpr: identifier followed by one-or-more call suffixes.
// callSuffixes uses explicit left-recursion (not +) so the !call annotation
// on the recursive extension point gives the shift prec call vs the unannotated
// reduce of codeValue → FuncExpr (prec 0) — shift wins, no @right tie.
callSuffixes { callSuffix | callSuffixes !call callSuffix }
FuncExpr { CodeIdent !call callSuffixes }
callSuffix { callSuffix {
CodeArgs | CodeArgs |
"." CodeIdent | "." CodeIdent |
@@ -114,8 +160,9 @@ codeValue {
// Reuses codeArgList so named-key entries like (auto, 1fr) work too. // Reuses codeArgList so named-key entries like (auto, 1fr) work too.
CodeArray { "(" codeArgList? ")" } CodeArray { "(" codeArgList? ")" }
// CodeBlockBody depth-tracks braces so #{ let x = { 1 } } parses correctly. // CodeBlock parses its content as a codeStatement* list so that keywords
CodeBlock { "{" CodeBlockBody? "}" } // (show, let, set…) and identifiers inside braces receive proper highlighting.
CodeBlock { "{" codeStatement* "}" }
// ContentBlock re-enters markup mode, allowing #[*bold* text]. // ContentBlock re-enters markup mode, allowing #[*bold* text].
ContentBlock { "[" item* "]" } ContentBlock { "[" item* "]" }
@@ -162,10 +209,6 @@ Escape { "\\" EscapeChar }
RawInlineContent RawInlineContent
} }
@external tokens codeBlockTokenizer from "./tokens.mjs" {
CodeBlockBody
}
@external tokens blockCommentTokenizer from "./tokens.mjs" { @external tokens blockCommentTokenizer from "./tokens.mjs" {
BlockCommentBody BlockCommentBody
} }
File diff suppressed because one or more lines are too long
@@ -6,42 +6,40 @@ export const
RawBlockBody = 4, RawBlockBody = 4,
RawBlockClose = 5, RawBlockClose = 5,
RawInlineContent = 6, RawInlineContent = 6,
CodeBlockBody = 7, BlockCommentBody = 7,
BlockCommentBody = 8, LineCommentContent = 8,
LineCommentContent = 9, MathContent = 9,
MathContent = 10, CodeKeyword = 10,
CodeKeyword = 11, CodeIdent = 11,
CodeIdent = 12, CodeArgKey = 12,
CodeArgKey = 13, StrongBody = 13,
StrongBody = 14, EmphBody = 14,
EmphBody = 15, Document = 15,
Document = 16, Heading = 16,
Heading = 17, LineComment = 17,
LineComment = 18, BlockComment = 18,
BlockComment = 19, RawBlock = 19,
RawBlock = 20, RawInline = 20,
RawInline = 21, CodeExpr = 21,
CodeExpr = 22, KeywordExpr = 22,
KeywordExpr = 23, FuncExpr = 23,
CallExpr = 24, CodeArgs = 24,
CodeArgs = 25, CodeString = 25,
CodeString = 26, CodeNumber = 26,
CodeNumber = 27, CodeBool = 27,
CodeBool = 28, ContentBlock = 28,
FuncExpr = 29, CodeBlock = 29,
ContentBlock = 30, InlineMath = 30,
CodeBlock = 31, CodeArray = 31,
InlineMath = 32, AtomExpr = 32,
CodeArray = 33, Strong = 33,
AtomExpr = 34, Emphasis = 34,
Strong = 35, Label = 35,
Emphasis = 36, LabelName = 36,
Label = 37, Ref = 37,
LabelName = 38, RefName = 38,
Ref = 39, Escape = 39,
RefName = 40, EscapeChar = 40,
Escape = 41, URL = 41,
EscapeChar = 42, MarkupContent = 42,
URL = 43, ClosingSquare = 43
MarkupContent = 44,
ClosingSquare = 45