lezer-typst: convert LineCommentContent and MathContent to external tokenizers

Both tokens are "read until delimiter" catchalls that match almost every non-newline character, causing buildTokenGroups conflicts with every other literal token in LALR-merged states. Moving them to ExternalTokenizer (the same pattern already used for HeadingTitle, RawBlockBody, etc.) makes them context-isolated: the LR state machine only calls them when those tokens are actually valid, so they never participate in the static token-group overlap check. Also exclude '<' from StrongText/EmphText so Label ('<' LabelName '>') is recognised inside strong/emphasis spans rather than being consumed as plain text. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-08 07:56:20 +00:00
parent e21f7cc0d5
commit 1dcd6e24f4
2 changed files with 55 additions and 14 deletions
@@ -10,6 +10,8 @@ import {
  RawInlineContent,
  CodeBlockBody,
  BlockCommentBody,
+  LineCommentContent,
+  MathContent,
 } from './typst.terms.mjs'

 const BACKTICK  = 96  // `
@@ -19,6 +21,7 @@ const NEWLINE   = 10  // \n
 const EQUALS    = 61  // =
 const SPACE     = 32  //
 const TAB       =  9  // \t
+const DOLLAR    = 36  // $
 const OPEN_BRACE  = 123 // {
 const CLOSE_BRACE = 125 // }

@@ -191,3 +194,36 @@ export const blockCommentTokenizer = new ExternalTokenizer(
  },
  { contextual: false }
 )
+
+// ── lineCommentContentTokenizer ─────────────────────────────────────────
+// Emits LineCommentContent — the rest of the line; the newline is not consumed.
+// External rather than a @tokens rule because ![\n]+ conflicts with every
+// non-newline literal token in LALR-merged states.  External tokenizers are
+// context-isolated: only called when the LR state expects this token.
+export const lineCommentContentTokenizer = new ExternalTokenizer(
+  (input, _stack) => {  // _stack is unused: the scan depends only on the input
+    let hasContent = false  // becomes true once at least one character is consumed
+    while (input.next !== -1 && input.next !== NEWLINE) {  // -1 marks end of input
+      input.advance()
+      hasContent = true
+    }
+    if (hasContent) input.acceptToken(LineCommentContent)  // never emit an empty token
+  },
+  { contextual: false }
+)
+
+// ── mathContentTokenizer ────────────────────────────────────────────────
+// Emits MathContent — a run of characters up to the next '$', newline, or EOF.
+// External rather than a @tokens rule for the same reason as LineCommentContent:
+// ![$\n]+ overlaps with spaces, '<', '@', and other literals in merged states.
+export const mathContentTokenizer = new ExternalTokenizer(
+  (input, _stack) => {  // _stack is unused: the scan depends only on the input
+    let hasContent = false  // becomes true once at least one character is consumed
+    while (input.next !== -1 && input.next !== DOLLAR && input.next !== NEWLINE) {  // -1 marks end of input
+      input.advance()
+      hasContent = true
+    }
+    if (hasContent) input.acceptToken(MathContent)  // never emit an empty token; a closing '$' is not checked here
+  },
+  { contextual: false }
+)
@@ -148,6 +148,14 @@ Escape { "\\" EscapeChar }
  BlockCommentBody
 }

+@external tokens lineCommentContentTokenizer from "./tokens.mjs" {
+  LineCommentContent  // everything from the current position to end of line
+}
+
+@external tokens mathContentTokenizer from "./tokens.mjs" {
+  MathContent  // everything up to the next '$' or newline
+}
+
 // ── Regular tokens ────────────────────────────────────────────────────────
@tokens {
  // Horizontal whitespace only.  Newlines are kept as explicit Newline items
@@ -181,16 +189,14 @@ Escape { "\\" EscapeChar }
    ("pt" | "mm" | "cm" | "in" | "em" | "rem" | "fr" | "deg" | "rad" | "%")?
  }

-  // Comment content — everything to end of line.
-  LineCommentContent { ![\n]+ }
-
-  // Math content — everything between the $ delimiters (no crossing newlines).
-  MathContent { ![$\n]+ }
-
  // Text tokens for markup contexts; each excludes its own delimiters.
-  // HeadingText is gone: HeadingTitle is now an external token (see above).
-  StrongText   { ![\n*$#`@\\]+    }
-  EmphText     { ![\n_$#`@\\]+    }
+  // HeadingText, LineCommentContent, and MathContent are external tokens
+  // (see above) — broad "read-to-delimiter" tokens that would otherwise
+  // conflict with every other literal token in LALR-merged states.
+  // '<' is excluded from StrongText/EmphText so that Label ('<' LabelName '>')
+  // is recognised inside strong/emphasis rather than consumed as plain text.
+  StrongText   { ![\n*$#`<@\\]+   }
+  EmphText     { ![\n_$#`<@\\]+   }

  // Regular markup: excludes all special-character starters plus whitespace
  // (whitespace is handled by @skip).  The '/' is excluded so that '//' and
@@ -210,11 +216,10 @@ Escape { "\\" EscapeChar }
  // Resolve ambiguities: more-specific tokens win over broader catch-alls.
  // EscapeChar > spaces: after '\', EscapeChar must win over the skip token
  //   (both match \t; without this, '\t' would be mis-tokenized).
-  // "(" > "." > text tokens: after '#' CodeIdent, callSuffix delimiters must win
-  //   over MarkupContent/StrongText/EmphText in the LALR-merged state.
-  // "]" > LineCommentContent: inside #[...], ']' closes the ContentBlock even
-  //   if it appears after '//' (comment does not "protect" the bracket).
-  @precedence { CodeKeyword CodeBool CodeIdent EscapeChar "(" "." "]" spaces MarkupContent StrongText EmphText LineCommentContent }
+  // "(" > "." > "]" > text tokens: after '#' CodeIdent, callSuffix delimiters
+  //   must win over MarkupContent/StrongText/EmphText in merged states.
+  // LineCommentContent and MathContent are external tokens — not listed here.
+  @precedence { CodeKeyword CodeBool CodeIdent EscapeChar "(" "." "]" spaces MarkupContent StrongText EmphText }
 }

@skip { spaces }