diff --git a/services/web/frontend/js/features/source-editor/languages/typst/index.ts b/services/web/frontend/js/features/source-editor/languages/typst/index.ts index a3d651731f..8d0847deac 100644 --- a/services/web/frontend/js/features/source-editor/languages/typst/index.ts +++ b/services/web/frontend/js/features/source-editor/languages/typst/index.ts @@ -73,8 +73,8 @@ export const TypstLanguage = LRLanguage.define({ MathContent: t.string, // Markup emphasis - 'Strong/"*" Strong/StrongText': t.strong, - 'Emphasis/"_" Emphasis/EmphText': t.emphasis, + 'Strong/"*" Strong/StrongBody': t.strong, + 'Emphasis/"_" Emphasis/EmphBody': t.emphasis, // Labels () and references (@name) 'Label/"<" Label/">" Label/LabelName': t.labelName, diff --git a/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs b/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs index 90fb687339..e3a5f1436f 100644 --- a/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs +++ b/services/web/frontend/js/features/source-editor/lezer-typst/tokens.mjs @@ -13,19 +13,26 @@ import { LineCommentContent, MathContent, CodeKeyword, + CodeIdent, + StrongBody, + EmphBody, } from './typst.terms.mjs' -const BACKTICK = 96 // ` -const SLASH = 47 // / -const STAR = 42 // * -const NEWLINE = 10 // \n -const EQUALS = 61 // = -const SPACE = 32 // -const TAB = 9 // \t -const DOLLAR = 36 // $ +const BACKTICK = 96 // ` +const SLASH = 47 // / +const STAR = 42 // * +const NEWLINE = 10 // \n +const EQUALS = 61 // = +const SPACE = 32 // +const TAB = 9 // \t +const DOLLAR = 36 // $ const OPEN_BRACE = 123 // { const CLOSE_BRACE = 125 // } -const HASH = 35 // # +const HASH = 35 // # +const UNDERSCORE = 95 // _ +const DOT = 46 // . 
+const OPEN_PAREN = 40 // ( +const COMMA = 44 // , const KEYWORDS = new Set([ 'let', 'set', 'show', 'import', 'include', @@ -34,6 +41,13 @@ const KEYWORDS = new Set([ 'and', 'or', 'not', 'context', ]) +const BOOLS = new Set(['true', 'false', 'none', 'auto']) + +const isAlpha = ch => (ch >= 65 && ch <= 90) || (ch >= 97 && ch <= 122) +const isDigit = ch => ch >= 48 && ch <= 57 +const isIdentHead = ch => isAlpha(ch) || ch === UNDERSCORE +const isIdentTail = ch => isAlpha(ch) || isDigit(ch) || ch === UNDERSCORE || ch === 45 // 45 = '-' + // ── headingTokenizer ──────────────────────────────────────────────────── // Emits HeadingMark — the "=+" prefix plus the trailing whitespace. // Only fires at the start of a line (pos 0, or character after '\n'). @@ -272,15 +286,7 @@ export const codeKeywordTokenizer = new ExternalTokenizer( let len = 0 while (true) { const ch = input.peek(len) - if ((ch >= 65 && ch <= 90) || // A–Z - (ch >= 97 && ch <= 122) || // a–z - (ch >= 48 && ch <= 57) || // 0–9 - ch === 95 || // _ - ch === 45) { // - - len++ - } else { - break - } + if (isIdentHead(ch) || (len > 0 && isIdentTail(ch))) { len++ } else { break } } if (len === 0) return @@ -296,3 +302,90 @@ export const codeKeywordTokenizer = new ExternalTokenizer( }, { contextual: true } ) + +// ── codeIdentTokenizer ─────────────────────────────────────────────────── +// Emits CodeIdent — identifier tokens inside code expressions (#ident, +// #func(args), #obj.method, etc.). +// +// Moving CodeIdent from @tokens to an external tokenizer allows a +// character-level guard: we only emit when the nearest preceding character +// other than space/tab is one of '#', '.', '(', ',' — code-context positions. +// This stops the token from firing in markup body text where LALR-merged +// states would otherwise cause '_italic_' to be consumed as one big +// CodeIdent (since '_' is a valid identHead) instead of opening Emphasis.
+// +// Words found in KEYWORDS or BOOLS are rejected here, leaving them to +// codeKeywordTokenizer and the CodeBool token respectively. +export const codeIdentTokenizer = new ExternalTokenizer( + (input, stack) => { + if (!stack.canShift(CodeIdent)) return + + // Guard: step back over spaces and tabs to the nearest other character + // and bail out unless it is '#', '.', '(' or ','. NOTE(review): an + // identifier after a keyword, ':' or '=' (e.g. `text` in `#set text(..)`) is preceded by none of these and is rejected — confirm this is intended. + let back = -1 + while (input.peek(back) === SPACE || input.peek(back) === TAB) back-- + const prev = input.peek(back) + if (prev !== HASH && prev !== DOT && prev !== OPEN_PAREN && prev !== COMMA) return + + // Bail out unless the current character can start an identifier. + if (!isIdentHead(input.next)) return + + // Measure the identifier with peek() before advancing: head rule at offset 0, tail rule afterwards. + let len = 0 + while (true) { + const ch = input.peek(len) + if (len === 0 ? isIdentHead(ch) : isIdentTail(ch)) { len++ } else { break } + } + if (len === 0) return + + const chars = [] + for (let i = 0; i < len; i++) chars.push(input.peek(i)) + const word = String.fromCharCode(...chars) + + // Emit nothing for keywords (left to codeKeywordTokenizer) or bools (left to CodeBool). + if (KEYWORDS.has(word) || BOOLS.has(word)) return + + for (let i = 0; i < len; i++) input.advance() + input.acceptToken(CodeIdent) + }, + { contextual: true } +) + +// ── strongBodyTokenizer ────────────────────────────────────────────────── +// Emits StrongBody — the content between the '*' delimiters of a Strong node. +// +// Intended to run only where StrongBody is valid, i.e. inside +// Strong → "*" . StrongBody? "*". The callback makes no stack.canShift +// check of its own (its stack argument is unused). NOTE(review): confirm +// that `contextual: true` provides the restriction this design relies on. +// +// Consumes characters up to (not including) the first '*', newline or end of +// input (bold is assumed not to span lines) and emits only a non-empty run. +// Any '#', '$', '`', '<' or '@' before that point becomes part of the body.
+export const strongBodyTokenizer = new ExternalTokenizer( + (input, _stack) => { + let hasContent = false + while (input.next !== -1 && input.next !== STAR && input.next !== NEWLINE) { + input.advance() + hasContent = true + } + if (hasContent) input.acceptToken(StrongBody) + }, + { contextual: true } +) + +// ── emphBodyTokenizer ──────────────────────────────────────────────────── +// Emits EmphBody — the text between the '_' delimiters of an Emphasis node. +// Mirrors strongBodyTokenizer: advances to the first '_', newline or end of input and accepts only a non-empty run. +export const emphBodyTokenizer = new ExternalTokenizer( + (input, _stack) => { + let hasContent = false + while (input.next !== -1 && input.next !== UNDERSCORE && input.next !== NEWLINE) { + input.advance() + hasContent = true + } + if (hasContent) input.acceptToken(EmphBody) + }, + { contextual: true } +) diff --git a/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar b/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar index 9c51eee536..3473f5006f 100644 --- a/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar +++ b/services/web/frontend/js/features/source-editor/lezer-typst/typst.grammar @@ -7,6 +7,9 @@ // rawInlineTokenizer — single-backtick raw inline content // codeBlockTokenizer — brace-depth tracking inside #{ ... } // blockCommentTokenizer — depth-tracked nested /* ... */ comments +// codeIdentTokenizer — CodeIdent: identifier, only fires in code context +// strongBodyTokenizer — StrongBody: content inside *...* +// emphBodyTokenizer — EmphBody: content inside _..._ @top Document { item* } @@ -105,16 +108,15 @@ ContentBlock { "[" item* "]" } InlineMath { "$" MathContent? "$" } // ── Markup formatting ───────────────────────────────────────────────────── -// Cross-nesting of Strong/Emphasis is intentionally excluded to avoid a -// mutual-recursion cycle (Strong→Emphasis→Strong) that causes state explosion -// in the Lezer LR automaton builder.
StrongText includes '_' and EmphText -// includes '*', so the nested delimiters are treated as plain text inside the -// opposite construct rather than producing error nodes. -Strong { "*" strongItem* "*" } -strongItem { CodeExpr | InlineMath | RawInline | Label | Ref | StrongText } - -Emphasis { "_" emphItem* "_" } -emphItem { CodeExpr | InlineMath | RawInline | Label | Ref | EmphText } +// Strong and Emphasis use flat external body tokens (StrongBody / EmphBody) +// rather than recursive strongItem* / emphItem* loops. The loop approach +// triggered LALR state merging that caused item*-level tokens (MarkupContent, +// CodeIdent) to win over StrongText/EmphText inside the construct, so the +// body nodes were never produced. The flat external tokens are contextual +// (canShift only fires inside Strong/Emphasis) and reliably avoid those +// merged states. +Strong { "*" StrongBody? "*" } +Emphasis { "_" EmphBody? "_" } // ── Labels and references ───────────────────────────────────────────────── Label { "<" LabelName ">" } @@ -162,6 +164,24 @@ Escape { "\\" EscapeChar } CodeKeyword } +// CodeIdent is external so codeIdentTokenizer can apply a character-level +// guard: it only emits when the preceding non-whitespace character is one of +// '#', '.', '(', ',' — i.e. genuinely inside a code expression. This stops +// the token from firing in markup body text, where LALR state merging would +// otherwise cause the entire token (including any leading '_') to be consumed +// as a code identifier instead of letting '_' open an Emphasis. +@external tokens codeIdentTokenizer from "./tokens.mjs" { + CodeIdent +} + +@external tokens strongBodyTokenizer from "./tokens.mjs" { + StrongBody +} + +@external tokens emphBodyTokenizer from "./tokens.mjs" { + EmphBody +} + // ── Regular tokens ──────────────────────────────────────────────────────── @tokens { // Horizontal whitespace only. 
Newlines are kept as explicit Newline items @@ -172,11 +192,6 @@ Escape { "\\" EscapeChar } // Boolean / null literals — distinct from keywords for highlighting. CodeBool { "true" | "false" | "none" | "auto" } - // General identifier: [A-Za-z_][A-Za-z0-9_-]* - CodeIdent { identHead identTail* } - identHead { @asciiLetter | "_" } - identTail { @asciiLetter | @digit | "_" | "-" } - // Double-quoted string with backslash escapes (no single-quoted strings in Typst). CodeString { '"' (!["\\\n] | "\\" _)* '"' } @@ -186,23 +201,14 @@ Escape { "\\" EscapeChar } ("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")? } - // Text tokens for markup contexts; each excludes its own delimiters. - // HeadingText, LineCommentContent, and MathContent are external tokens - // (see above) — broad "read-to-delimiter" tokens that would otherwise - // conflict with every other literal token in LALR-merged states. - // '<' is excluded from StrongText/EmphText so that Label ('<' LabelName '>') - // is recognised inside strong/emphasis rather than consumed as plain text. - StrongText { ![\n*$#`<@\\]+ } - EmphText { ![\n_$#`<@\\]+ } - // Regular markup: excludes all special-character starters plus whitespace // (whitespace is handled by @skip). The '/' is excluded so that '//' and // '/*' are not accidentally consumed as plain text. MarkupContent { ![\n \t=*_$#/<@`\\]+ } // Label names: identifiers with optional dots/colons (e.g. ). - LabelName { (identHead | @digit) (identTail | "." | ":")* } - RefName { identHead identTail* } + LabelName { (@asciiLetter | "_" | @digit) (@asciiLetter | @digit | "_" | "-" | "." | ":")* } + RefName { (@asciiLetter | "_") (@asciiLetter | @digit | "_" | "-")* } // Escape: any single character after backslash. EscapeChar { _ } @@ -210,19 +216,15 @@ Escape { "\\" EscapeChar } // Newline item — kept out of @skip so heading detection works. Newline { "\n" } - // Resolve ambiguities: more-specific tokens win over broader catch-alls. 
- // EscapeChar > spaces: after '\', EscapeChar must win over the skip token - // (both match \t; without this, '\t' would be mis-tokenized). - // "(" > "." > "]" > text tokens: after '#' CodeIdent, callSuffix delimiters - // must win over MarkupContent/StrongText/EmphText in merged states. - // LineCommentContent and MathContent are external tokens — not listed here. - // "_" added after CodeIdent: KeywordExpr { CodeKeyword CallExpr? } merges - // the post-keyword state with markup states where "_" starts Emphasis. - // CodeIdent wins so '#set _name(...)' is tokenised correctly; in pure markup - // states CodeIdent is not in the valid set so "_" still opens Emphasis. - // CodeKeyword is now an external token (codeKeywordTokenizer) and therefore - // not listed here — it uses a peek(-1)==='#' guard to stay out of markup. - @precedence { CodeBool CodeIdent EscapeChar "(" "." "]" "_" spaces MarkupContent StrongText EmphText } + // Resolve ambiguities in merged states: + // EscapeChar > spaces: after '\', EscapeChar must win over the skip token. + // "(" > "." > "]": callSuffix delimiters must win over MarkupContent after + // a code identifier (merged states expose these to the markup tokenizer). + // "_" > MarkupContent: '_' must open Emphasis rather than being swallowed + // by MarkupContent (redundant since '_' is in MarkupContent's exclusion + // set, but kept for clarity). + // CodeIdent and StrongText/EmphText are now external tokens — not listed. + @precedence { CodeBool EscapeChar "(" "." "]" "_" spaces MarkupContent } } @skip { spaces }