typst: fix named-arg key highlighting and multi-line math
Build and Deploy Verso / deploy (push) Successful in 13m41s

Named arg keys (columns:, align:, caption:) were appearing in black
because LALR state merging broke the CodeArgs/CodeIdent path for
multi-line expressions.  Fix: emit a dedicated CodeArgKey token from
codeIdentTokenizer (forward-peek for ':' to pre-disambiguate), declare
it in the grammar's codeArgItem rule, and map it to t.attributeName in
styleTags — bypassing LALR lookahead entirely.

Multi-line display math ($ ...\n... $) was consuming the rest of the
document as orange text when contextual:true caused a backward scan to
find a previous closing '$' and falsely set isDisplay=true.  Fix:
revert mathContentTokenizer to contextual:false with '\n' stop (each
MathContent token covers one line), and change InlineMath to
MathContent* so @skip consumes the newlines between lines.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
claude
2026-06-09 11:25:31 +00:00
parent 2fdb155547
commit 4c6032bce0
3 changed files with 38 additions and 32 deletions
@@ -14,9 +14,8 @@ import { typstDocumentOutline } from './document-outline'
// Note on tree structure: rules starting with a lowercase letter in the grammar
// are inline (no tree node), so their children are promoted to the parent.
// E.g. codeArgItem, codeValue, callSuffix, codeArgList are all inline.
// Therefore:
// - The named-argument key "CodeIdent" is a *direct* child of CodeArgs.
// - Positional arguments that are identifiers are wrapped in CallExpr.
// Named-argument keys are emitted by codeIdentTokenizer as CodeArgKey (not
// CodeIdent). Because codeArgItem and codeArgList are inline rules, CodeArgKey
// appears as a direct child of CodeArgs, alongside the other argument children.
export const TypstLanguage = LRLanguage.define({
name: 'typst',
@@ -51,10 +50,10 @@ export const TypstLanguage = LRLanguage.define({
// Identifiers:
// - direct child of CallExpr → function/method name
// - direct child of CodeArgs → named argument key (key: value syntax)
// - everywhere else → plain variable
// - CodeArgKey (named arg key, emitted by tokenizer before ':') → attributeName
// - everywhere else → plain variable
'CallExpr/CodeIdent': t.function(t.variableName),
'CodeArgs/CodeIdent': t.attributeName,
CodeArgKey: t.attributeName,
CodeIdent: t.variableName,
// Literals in code mode
@@ -14,6 +14,7 @@ import {
MathContent,
CodeKeyword,
CodeIdent,
CodeArgKey,
StrongBody,
EmphBody,
} from './typst.terms.mjs'
@@ -33,6 +34,7 @@ const UNDERSCORE = 95 // _
const DOT = 46 // .
const OPEN_PAREN = 40 // (
const COMMA = 44 // ,
const COLON = 58 // :
const KEYWORDS = new Set([
'let', 'set', 'show', 'import', 'include',
@@ -252,36 +254,23 @@ export const lineCommentContentTokenizer = new ExternalTokenizer(
)
// ── mathContentTokenizer ────────────────────────────────────────────────
// Emits MathContent — everything between the $...$ delimiters.
// Emits MathContent — one line of content between the $...$ delimiters.
// Stops at '$' or '\n' so each token is bounded to a single line.
//
// Typst distinguishes inline math ($x^2$) from display math ($ x^2 $):
// display math has whitespace between the opening '$' and the content.
// We detect this by scanning back to '$': if there is any whitespace
// between '$' and the current position (i.e. @skip consumed it), the
// tokenizer allows newlines so multi-line display math works. Inline math
// keeps the newline stop, preventing a lone '$' from consuming the rest of
// the document.
//
// contextual: true — only fires inside InlineMath after '$', never in
// body text. The '$' token appears nowhere else in the grammar so the
// post-'$' state does not merge with item* states.
// The grammar uses MathContent* (not MathContent?) so multi-line display
// math ($ ... \n ... $) is handled by multiple MathContent tokens, one per
// line, with @skip consuming the newlines in between. This keeps each
// token short and prevents a stray '$' from consuming the whole document.
export const mathContentTokenizer = new ExternalTokenizer(
(input, _stack) => {
// Scan back to the opening '$', detecting display vs inline math.
let back = -1
while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back--
if (input.peek(back) !== DOLLAR) return
const isDisplay = back < -1 // whitespace between '$' and current position
let hasContent = false
while (input.next !== -1 && input.next !== DOLLAR) {
if (!isDisplay && input.next === NEWLINE) break
while (input.next !== -1 && input.next !== DOLLAR && input.next !== NEWLINE) {
input.advance()
hasContent = true
}
if (hasContent) input.acceptToken(MathContent)
},
{ contextual: true }
{ contextual: false }
)
// ── codeKeywordTokenizer ─────────────────────────────────────────────────
@@ -335,7 +324,9 @@ export const codeKeywordTokenizer = new ExternalTokenizer(
// handle them without conflict.
export const codeIdentTokenizer = new ExternalTokenizer(
(input, stack) => {
if (!stack.canShift(CodeIdent)) return
const couldBeKey = stack.canShift(CodeArgKey)
const couldBeIdent = stack.canShift(CodeIdent)
if (!couldBeKey && !couldBeIdent) return
// Guard: only fire in code context.
// Walk back past whitespace to the nearest non-space character.
@@ -368,8 +359,18 @@ export const codeIdentTokenizer = new ExternalTokenizer(
// Let codeKeywordTokenizer handle keywords; let CodeBool handle bools.
if (KEYWORDS.has(word) || BOOLS.has(word)) return
// Emit CodeArgKey when this identifier is a named arg key (followed by ':').
// Pre-disambiguating here avoids relying on LALR lookahead to choose between
// codeArgItem alternatives, which is fragile under Lezer's state merging.
let isArgKey = false
if (couldBeKey) {
let afterLen = len
while (input.peek(afterLen) === SPACE || input.peek(afterLen) === TAB) afterLen++
isArgKey = (input.peek(afterLen) === COLON)
}
for (let i = 0; i < len; i++) input.advance()
input.acceptToken(CodeIdent)
input.acceptToken(isArgKey ? CodeArgKey : CodeIdent)
},
{ contextual: true }
)
@@ -83,7 +83,7 @@ callSuffix {
CodeArgs { "(" codeArgList? ")" }
codeArgList { codeArgItem ("," codeArgItem)* ","? }
codeArgItem {
CodeIdent ":" codeValue |
CodeArgKey ":" codeValue |
codeValue
}
@@ -104,7 +104,9 @@ ContentBlock { "[" item* "]" }
// ── Math ──────────────────────────────────────────────────────────────────
// Both inline ($x^2$) and display ($ x^2 $) math use the same node type.
InlineMath { "$" MathContent? "$" }
// MathContent* (not ?) allows multi-line display math: each line becomes one
// MathContent token (stopping at '\n'), and @skip consumes the newlines between.
InlineMath { "$" MathContent* "$" }
// ── Markup formatting ─────────────────────────────────────────────────────
// Strong and Emphasis use flat external body tokens (StrongBody / EmphBody)
@@ -169,8 +171,12 @@ Escape { "\\" EscapeChar }
// the token from firing in markup body text, where LALR state merging would
// otherwise cause the entire token (including any leading '_') to be consumed
// as a code identifier instead of letting '_' open an Emphasis.
// CodeArgKey is emitted by the same tokenizer when an identifier is followed
// by ':' (optionally after spaces or tabs) — the tokenizer pre-disambiguates
// named arg keys so the LALR parser does not need to choose between
// codeArgItem alternatives on lookahead.
@external tokens codeIdentTokenizer from "./tokens.mjs" {
CodeIdent
CodeIdent,
CodeArgKey
}
@external tokens strongBodyTokenizer from "./tokens.mjs" {