diff --git a/services/web/frontend/js/features/source-editor/languages/typst/index.ts b/services/web/frontend/js/features/source-editor/languages/typst/index.ts index 8d0847deac..ba3f3d7014 100644 --- a/services/web/frontend/js/features/source-editor/languages/typst/index.ts +++ b/services/web/frontend/js/features/source-editor/languages/typst/index.ts @@ -14,9 +14,8 @@ import { typstDocumentOutline } from './document-outline' // Note on tree structure: rules starting with a lowercase letter in the grammar // are inline (no tree node), so their children are promoted to the parent. // E.g. codeArgItem, codeValue, callSuffix, codeArgList are all inline. -// Therefore: -// - The named-argument key "CodeIdent" is a *direct* child of CodeArgs. -// - Positional arguments that are identifiers are wrapped in CallExpr. +// Named arg keys emit CodeArgKey (not CodeIdent) via codeIdentTokenizer, +// so CodeArgKey appears at the same level as other codeArgItem children. export const TypstLanguage = LRLanguage.define({ name: 'typst', @@ -51,10 +50,10 @@ export const TypstLanguage = LRLanguage.define({ // Identifiers: // - direct child of CallExpr → function/method name - // - direct child of CodeArgs → named argument key (key: value syntax) - // - everywhere else → plain variable + // - CodeArgKey (named arg key, emitted by tokenizer before ':') → attributeName + // - everywhere else → plain variable 'CallExpr/CodeIdent': t.function(t.variableName), - 'CodeArgs/CodeIdent': t.attributeName, + CodeArgKey: t.attributeName, CodeIdent: t.variableName, // Literals in code mode diff --git a/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs b/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs index d9ca805889..23e276d672 100644 --- a/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs +++ b/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs @@ -14,6 +14,7 @@ import { MathContent, CodeKeyword, CodeIdent, + 
CodeArgKey, StrongBody, EmphBody, } from './typst.terms.mjs' @@ -33,6 +34,7 @@ const UNDERSCORE = 95 // _ const DOT = 46 // . const OPEN_PAREN = 40 // ( const COMMA = 44 // , +const COLON = 58 // : const KEYWORDS = new Set([ 'let', 'set', 'show', 'import', 'include', @@ -252,36 +254,23 @@ export const lineCommentContentTokenizer = new ExternalTokenizer( ) // ── mathContentTokenizer ──────────────────────────────────────────────── -// Emits MathContent — everything between the $...$ delimiters. +// Emits MathContent — one line of content between the $...$ delimiters. +// Stops at '$' or '\n' so each token is bounded to a single line. // -// Typst distinguishes inline math ($x^2$) from display math ($ x^2 $): -// display math has whitespace between the opening '$' and the content. -// We detect this by scanning back to '$': if there is any whitespace -// between '$' and the current position (i.e. @skip consumed it), the -// tokenizer allows newlines so multi-line display math works. Inline math -// keeps the newline stop, preventing a lone '$' from consuming the rest of -// the document. -// -// contextual: true — only fires inside InlineMath after '$', never in -// body text. The '$' token appears nowhere else in the grammar so the -// post-'$' state does not merge with item* states. +// The grammar uses MathContent* (not MathContent?) so multi-line display +// math ($ ... \n ... $) is handled by multiple MathContent tokens, one per +// line, with @skip consuming the newlines in between. This keeps each +// token short and prevents a stray '$' from consuming the whole document. export const mathContentTokenizer = new ExternalTokenizer( (input, _stack) => { - // Scan back to the opening '$', detecting display vs inline math. 
- let back = -1 - while (input.peek(back) === SPACE || input.peek(back) === TAB || input.peek(back) === NEWLINE) back-- - if (input.peek(back) !== DOLLAR) return - const isDisplay = back < -1 // whitespace between '$' and current position - let hasContent = false - while (input.next !== -1 && input.next !== DOLLAR) { - if (!isDisplay && input.next === NEWLINE) break + while (input.next !== -1 && input.next !== DOLLAR && input.next !== NEWLINE) { input.advance() hasContent = true } if (hasContent) input.acceptToken(MathContent) }, - { contextual: true } + { contextual: false } ) // ── codeKeywordTokenizer ───────────────────────────────────────────────── @@ -335,7 +324,9 @@ export const codeKeywordTokenizer = new ExternalTokenizer( // handle them without conflict. export const codeIdentTokenizer = new ExternalTokenizer( (input, stack) => { - if (!stack.canShift(CodeIdent)) return + const couldBeKey = stack.canShift(CodeArgKey) + const couldBeIdent = stack.canShift(CodeIdent) + if (!couldBeKey && !couldBeIdent) return // Guard: only fire in code context. // Walk back past whitespace to the nearest non-space character. @@ -368,8 +359,18 @@ export const codeIdentTokenizer = new ExternalTokenizer( // Let codeKeywordTokenizer handle keywords; let CodeBool handle bools. if (KEYWORDS.has(word) || BOOLS.has(word)) return + // Emit CodeArgKey when this identifier is a named arg key (followed by ':'). + // Pre-disambiguating here avoids relying on LALR lookahead to choose between + // codeArgItem alternatives, which is fragile under Lezer's state merging. + let isArgKey = false + if (couldBeKey) { + let afterLen = len + while (input.peek(afterLen) === SPACE || input.peek(afterLen) === TAB) afterLen++ + isArgKey = (input.peek(afterLen) === COLON) + } + for (let i = 0; i < len; i++) input.advance() - input.acceptToken(CodeIdent) + input.acceptToken(isArgKey ? 
CodeArgKey : CodeIdent) }, { contextual: true } ) diff --git a/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar b/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar index b22373b547..52309e3324 100644 --- a/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar +++ b/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar @@ -83,7 +83,7 @@ callSuffix { CodeArgs { "(" codeArgList? ")" } codeArgList { codeArgItem ("," codeArgItem)* ","? } codeArgItem { - CodeIdent ":" codeValue | + CodeArgKey ":" codeValue | codeValue } @@ -104,7 +104,9 @@ ContentBlock { "[" item* "]" } // ── Math ────────────────────────────────────────────────────────────────── // Both inline ($x^2$) and display ($ x^2 $) math use the same node type. -InlineMath { "$" MathContent? "$" } +// MathContent* (not ?) allows multi-line display math: each line becomes one +// MathContent token (stopping at '\n'), and @skip consumes the newlines between. +InlineMath { "$" MathContent* "$" } // ── Markup formatting ───────────────────────────────────────────────────── // Strong and Emphasis use flat external body tokens (StrongBody / EmphBody) @@ -169,8 +171,12 @@ Escape { "\\" EscapeChar } // the token from firing in markup body text, where LALR state merging would // otherwise cause the entire token (including any leading '_') to be consumed // as a code identifier instead of letting '_' open an Emphasis. +// CodeArgKey is emitted by the same tokenizer when an identifier is followed +// by ':' (optionally after spaces/tabs) — the tokenizer pre-disambiguates named +// arg keys so the LALR parser need not choose codeArgItem alternatives on lookahead. @external tokens codeIdentTokenizer from "./tokens.mjs" { - CodeIdent + CodeIdent, + CodeArgKey } @external tokens strongBodyTokenizer from "./tokens.mjs" {