tinymist_world/parser/
semantic_tokens.rs

1//! From <https://github.com/nvarner/typst-lsp/blob/cc7bad9bd9764bfea783f2fab415cb3061fd8bff/src/server/semantic_tokens/mod.rs>
2
3use strum::IntoEnumIterator;
4use typst::syntax::{LinkedNode, Source, SyntaxKind, ast};
5
6use super::modifier_set::ModifierSet;
7use super::typst_tokens::{Modifier, TokenType};
8
9/// The legend of the semantic tokens.
10#[derive(serde::Deserialize, serde::Serialize, Debug, Clone)]
11pub struct SemanticTokensLegend {
12    /// The token types.
13    #[serde(rename = "tokenTypes")]
14    pub token_types: Vec<String>,
15    /// The token modifiers.
16    #[serde(rename = "tokenModifiers")]
17    pub token_modifiers: Vec<String>,
18}
19
20/// Gets the legend of the semantic tokens.
21pub fn get_semantic_tokens_legend() -> SemanticTokensLegend {
22    SemanticTokensLegend {
23        token_types: TokenType::iter()
24            .map(|e| {
25                let e: &'static str = e.into();
26
27                e.to_owned()
28            })
29            .collect(),
30        token_modifiers: Modifier::iter()
31            .map(|e| {
32                let e: &'static str = e.into();
33
34                e.to_owned()
35            })
36            .collect(),
37    }
38}
39
/// How byte offsets are converted into line/character positions.
#[derive(Debug, Clone, Copy)]
pub enum OffsetEncoding {
    /// Positions are measured in UTF-8 units.
    Utf8,
    /// Positions are measured in UTF-16 code units (the LSP default).
    Utf16,
}
48
/// Gets the full semantic tokens.
///
/// Tokens coming out of [`tokenize_tree`] carry a raw *absolute* byte offset
/// packed across two fields: the high 32 bits in `delta_line` and the low 32
/// bits in `delta_start_character` (see [`SemanticToken::new`]). This
/// function unpacks that offset, converts it to a line/character position in
/// the requested `encoding`, and then rewrites both fields as LSP-style
/// deltas relative to the previous token.
pub fn get_semantic_tokens_full(source: &Source, encoding: OffsetEncoding) -> Vec<SemanticToken> {
    let root = LinkedNode::new(source.root());
    // Tokens are produced in document order, so positions are monotonically
    // non-decreasing and the delta subtractions below cannot underflow.
    let mut full = tokenize_tree(&root, ModifierSet::empty());

    // (line, character) of the previous token; deltas are taken against it.
    let mut init = (0, 0);
    for token in full.iter_mut() {
        // resolve offset to position
        let offset = ((token.delta_line as u64) << 32) | token.delta_start_character as u64;
        let position = (match encoding {
            OffsetEncoding::Utf8 => offset_to_position_utf8,
            OffsetEncoding::Utf16 => offset_to_position_utf16,
        })(offset as usize, source);
        token.delta_line = position.0;
        token.delta_start_character = position.1;

        // Delta-encode per the LSP spec: the start character is relative to
        // the previous token only when both tokens are on the same line.
        let next = (token.delta_line, token.delta_start_character);
        token.delta_line -= init.0;
        if token.delta_line == 0 {
            token.delta_start_character -= init.1;
        }
        init = next;
    }

    full
}
75
76/// Tokenizes a single node.
77fn tokenize_single_node(node: &LinkedNode, modifiers: ModifierSet) -> Option<SemanticToken> {
78    let is_leaf = node.children().next().is_none();
79
80    token_from_node(node)
81        .or_else(|| is_leaf.then_some(TokenType::Text))
82        .map(|token_type| SemanticToken::new(token_type, modifiers, node))
83}
84
85/// Tokenizes a node and its children.
86fn tokenize_tree(root: &LinkedNode<'_>, parent_modifiers: ModifierSet) -> Vec<SemanticToken> {
87    let modifiers = parent_modifiers | modifiers_from_node(root);
88
89    let token = tokenize_single_node(root, modifiers).into_iter();
90    let children = root
91        .children()
92        .flat_map(move |child| tokenize_tree(&child, modifiers));
93    token.chain(children).collect()
94}
95
/// A single semantic token in the LSP delta encoding.
///
/// Note: before [`get_semantic_tokens_full`] resolves positions, the two
/// `delta_*` fields temporarily hold a packed absolute byte offset.
#[derive(Debug, Clone, Copy)]
pub struct SemanticToken {
    /// Line delta relative to the previous token.
    pub delta_line: u32,
    /// Start-character delta relative to the previous token.
    pub delta_start_character: u32,
    /// Token length, measured in UTF-16 code units.
    pub length: u32,
    /// Numeric token type (index into the legend's token types).
    pub token_type: u32,
    /// Token-modifier bitset (indices into the legend's modifiers).
    pub token_modifiers: u32,
}
110
impl SemanticToken {
    /// Creates a new semantic token for `node`.
    ///
    /// The node's *absolute* byte offset is temporarily packed into the two
    /// delta fields — high 32 bits into `delta_line`, low 32 bits into
    /// `delta_start_character` — and is later unpacked and converted into
    /// real deltas by `get_semantic_tokens_full`.
    fn new(token_type: TokenType, modifiers: ModifierSet, node: &LinkedNode) -> Self {
        // `into_text` consumes the syntax node, hence the clone.
        let source = node.get().clone().into_text();

        let raw_position = node.offset() as u64;
        let raw_position = ((raw_position >> 32) as u32, raw_position as u32);

        Self {
            token_type: token_type as u32,
            token_modifiers: modifiers.bitset(),
            delta_line: raw_position.0,
            delta_start_character: raw_position.1,
            // NOTE(review): length is always measured in UTF-16 code units,
            // even when positions are later resolved with
            // `OffsetEncoding::Utf8` — confirm this matches the client's
            // negotiated encoding.
            length: source.chars().map(char::len_utf16).sum::<usize>() as u32,
        }
    }
}
128
129/// Determines the [`Modifier`]s to be applied to a node and all its children.
130///
131/// Returns `ModifierSet::empty()` if the node is not a valid node.
132///
133/// Note that this does not recurse up, so calling it on a child node may not
134/// return a modifier that should be applied to it due to a parent.
135fn modifiers_from_node(node: &LinkedNode) -> ModifierSet {
136    match node.kind() {
137        SyntaxKind::Emph => ModifierSet::new(&[Modifier::Emph]),
138        SyntaxKind::Strong => ModifierSet::new(&[Modifier::Strong]),
139        SyntaxKind::Math | SyntaxKind::Equation => ModifierSet::new(&[Modifier::Math]),
140        _ => ModifierSet::empty(),
141    }
142}
143
/// Determines the best [`TokenType`] for an entire node and its children, if
/// any. If there is no single `TokenType`, or none better than `Text`, returns
/// `None`.
///
/// In tokenization, returning `Some` stops recursion, while returning `None`
/// continues and attempts to tokenize each of `node`'s children. If there are
/// no children, `Text` is taken as the default.
fn token_from_node(node: &LinkedNode) -> Option<TokenType> {
    use SyntaxKind::*;

    match node.kind() {
        // `*` is markup punctuation in `*strong*` but a wildcard operator in
        // `import foo: *`; disambiguate via the parent node. The guarded
        // arms must precede the unguarded operator/punctuation arms below.
        Star if node.parent_kind() == Some(Strong) => Some(TokenType::Punctuation),
        Star if node.parent_kind() == Some(ModuleImport) => Some(TokenType::Operator),

        // Likewise `_` is emphasis punctuation in `_emph_` but the subscript
        // operator in math attachments.
        Underscore if node.parent_kind() == Some(Emph) => Some(TokenType::Punctuation),
        Underscore if node.parent_kind() == Some(MathAttach) => Some(TokenType::Operator),

        // Identifiers are classified as function vs. interpolated by context.
        MathIdent | Ident => Some(token_from_ident(node)),
        // A `#` inherits the token type of the expression it introduces.
        Hash => token_from_hashtag(node),

        LeftBrace | RightBrace | LeftBracket | RightBracket | LeftParen | RightParen | Comma
        | Semicolon | Colon => Some(TokenType::Punctuation),
        Linebreak | Escape | Shorthand => Some(TokenType::Escape),
        Link => Some(TokenType::Link),
        Raw => Some(TokenType::Raw),
        Label => Some(TokenType::Label),
        RefMarker => Some(TokenType::Ref),
        Heading | HeadingMarker => Some(TokenType::Heading),
        ListMarker | EnumMarker | TermMarker => Some(TokenType::ListMarker),
        MathAlignPoint | Plus | Minus | Slash | Hat | Dot | Eq | EqEq | ExclEq | Lt | LtEq | Gt
        | GtEq | PlusEq | HyphEq | StarEq | SlashEq | Dots | Arrow | Not | And | Or => {
            Some(TokenType::Operator)
        }
        Dollar => Some(TokenType::Delimiter),
        None | Auto | Let | Show | If | Else | For | In | While | Break | Continue | Return
        | Import | Include | As | Set => Some(TokenType::Keyword),
        Bool => Some(TokenType::Bool),
        Int | Float | Numeric => Some(TokenType::Number),
        Str => Some(TokenType::String),
        LineComment | BlockComment => Some(TokenType::Comment),
        Error => Some(TokenType::Error),

        // Disambiguate from `SyntaxKind::None`
        _ => Option::None,
    }
}
190
191/// Checks if the identifier is a function.
192///
193/// TODO: differentiate also using tokens in scope, not just context
194fn is_function_ident(ident: &LinkedNode) -> bool {
195    let Some(next) = ident.next_leaf() else {
196        return false;
197    };
198    let function_call = matches!(next.kind(), SyntaxKind::LeftParen)
199        && matches!(
200            next.parent_kind(),
201            Some(SyntaxKind::Args | SyntaxKind::Params)
202        );
203    let function_content = matches!(next.kind(), SyntaxKind::LeftBracket)
204        && matches!(next.parent_kind(), Some(SyntaxKind::ContentBlock));
205    function_call || function_content
206}
207
208/// Gets the token type from an identifier.
209fn token_from_ident(ident: &LinkedNode) -> TokenType {
210    if is_function_ident(ident) {
211        TokenType::Function
212    } else {
213        TokenType::Interpolated
214    }
215}
216
217/// Gets the expression following a hashtag.
218fn get_expr_following_hashtag<'a>(hashtag: &LinkedNode<'a>) -> Option<LinkedNode<'a>> {
219    hashtag
220        .next_sibling()
221        .filter(|next| next.cast::<ast::Expr>().is_some_and(|expr| expr.hash()))
222        .and_then(|node| node.leftmost_leaf())
223}
224
225/// Gets the token type from a hashtag.
226fn token_from_hashtag(hashtag: &LinkedNode) -> Option<TokenType> {
227    get_expr_following_hashtag(hashtag)
228        .as_ref()
229        .and_then(token_from_node)
230}
231
232/// Converts an offset to a position in UTF-8.
233fn offset_to_position_utf8(typst_offset: usize, typst_source: &Source) -> (u32, u32) {
234    let line_index = typst_source.byte_to_line(typst_offset).unwrap();
235    let column_index = typst_source.byte_to_column(typst_offset).unwrap();
236
237    (line_index as u32, column_index as u32)
238}
239
240/// Converts an offset to a position in UTF-16.
241fn offset_to_position_utf16(typst_offset: usize, typst_source: &Source) -> (u32, u32) {
242    let line_index = typst_source.byte_to_line(typst_offset).unwrap();
243
244    let lsp_line = line_index as u32;
245
246    // See the implementation of `lsp_to_typst::position_to_offset` for discussion
247    // relevant to this function.
248
249    // TODO: Typst's `Source` could easily provide an implementation of the method
250    // we   need here. Submit a PR to `typst` to add it, then update
251    // this if/when merged.
252
253    let utf16_offset = typst_source.byte_to_utf16(typst_offset).unwrap();
254
255    let byte_line_offset = typst_source.line_to_byte(line_index).unwrap();
256    let utf16_line_offset = typst_source.byte_to_utf16(byte_line_offset).unwrap();
257
258    let utf16_column_offset = utf16_offset - utf16_line_offset;
259    let lsp_column = utf16_column_offset;
260
261    (lsp_line, lsp_column as u32)
262}