typlite/parser/
core.rs

1//! HTML parser core, containing main structures and general parsing logic
2
3use std::sync::Arc;
4
5use cmark_writer::ast::{CustomNode, HtmlAttribute, HtmlElement as CmarkHtmlElement, Node};
6use cmark_writer::{CommonMarkWriter, WriteResult};
7use ecow::EcoString;
8use tinymist_project::LspWorld;
9use typst::html::{HtmlElement, HtmlNode, tag};
10
11use crate::Result;
12use crate::TypliteFeat;
13use crate::attributes::{AlertsAttr, HeadingAttr, RawAttr, TypliteAttrsParser, md_attr};
14use crate::common::{AlertNode, CenterNode, VerbatimNode};
15use crate::tags::md_tag;
16
17use super::{list::ListParser, table::TableParser};
18
19/// HTML to AST parser implementation
20pub struct HtmlToAstParser {
21    pub asset_counter: usize,
22    pub feat: TypliteFeat,
23    pub world: Arc<LspWorld>,
24    pub list_level: usize,
25    pub blocks: Vec<Node>,
26    pub inline_buffer: Vec<Node>,
27}
28
29impl HtmlToAstParser {
30    pub fn new(feat: TypliteFeat, world: &Arc<LspWorld>) -> Self {
31        Self {
32            feat,
33            world: world.clone(),
34            asset_counter: 0,
35            list_level: 0,
36            blocks: Vec::new(),
37            inline_buffer: Vec::new(),
38        }
39    }
40
41    pub fn convert_element(&mut self, element: &HtmlElement) -> Result<()> {
42        match element.tag {
43            tag::head => Ok(()),
44
45            tag::html | tag::body | md_tag::doc => {
46                self.convert_children(element)?;
47                Ok(())
48            }
49
50            tag::p | tag::span | tag::div => {
51                self.convert_children(element)?;
52                Ok(())
53            }
54
55            tag::strong | md_tag::strong => self.convert_strong(element),
56            tag::em | md_tag::emph => self.convert_emphasis(element),
57
58            tag::br => {
59                self.inline_buffer.push(Node::HardBreak);
60                Ok(())
61            }
62
63            tag::ol => {
64                self.flush_inline_buffer();
65                let items = ListParser::convert_list(self, element);
66                self.blocks.push(Node::OrderedList {
67                    start: 1,
68                    items: items?,
69                });
70                Ok(())
71            }
72
73            tag::ul => {
74                self.flush_inline_buffer();
75                let items = ListParser::convert_list(self, element);
76                self.blocks.push(Node::UnorderedList(items?));
77                Ok(())
78            }
79
80            md_tag::parbreak => {
81                self.flush_inline_buffer();
82                Ok(())
83            }
84
85            md_tag::heading => {
86                self.flush_inline_buffer();
87                let attrs = HeadingAttr::parse(&element.attrs)?;
88                self.convert_children(element)?;
89                self.flush_inline_buffer_as_block(|content| {
90                    Node::heading(attrs.level as u8 + 1, content)
91                });
92                Ok(())
93            }
94
95            md_tag::raw => {
96                let attrs = RawAttr::parse(&element.attrs)?;
97                if attrs.block {
98                    self.flush_inline_buffer();
99                    self.blocks
100                        .push(Node::code_block(Some(attrs.lang), attrs.text));
101                } else {
102                    self.inline_buffer.push(Node::InlineCode(attrs.text));
103                }
104                Ok(())
105            }
106
107            md_tag::quote => {
108                let prev_blocks = std::mem::take(&mut self.blocks);
109                self.flush_inline_buffer();
110                self.convert_children(element)?;
111                let content = Node::Paragraph(std::mem::take(&mut self.inline_buffer));
112                let mut quote = std::mem::take(&mut self.blocks);
113                quote.push(content);
114                self.blocks.clear();
115                self.blocks.extend(prev_blocks);
116                self.blocks.push(Node::BlockQuote(quote));
117                Ok(())
118            }
119
120            md_tag::figure => self.convert_figure(element),
121            md_tag::highlight => self.convert_highlight(element),
122            md_tag::strike => self.convert_strikethrough(element),
123            md_tag::link => self.convert_link(element),
124            md_tag::image => self.convert_image(element),
125
126            md_tag::linebreak => {
127                self.inline_buffer.push(Node::HardBreak);
128                Ok(())
129            }
130
131            md_tag::source => {
132                let src = self.convert_source(element);
133                self.inline_buffer.push(src);
134                Ok(())
135            }
136
137            md_tag::table | md_tag::grid => {
138                self.flush_inline_buffer();
139                if let Some(table) = TableParser::convert_table(self, element)? {
140                    self.blocks.push(table);
141                }
142                Ok(())
143            }
144
145            md_tag::idoc => {
146                let src = self.convert_idoc(element);
147                self.inline_buffer.push(src);
148                Ok(())
149            }
150
151            md_tag::math_equation_inline | md_tag::math_equation_block => {
152                if element.tag == md_tag::math_equation_block {
153                    self.flush_inline_buffer();
154                    self.convert_children(element)?;
155                    let content = std::mem::take(&mut self.inline_buffer);
156                    self.blocks
157                        .push(Node::Custom(Box::new(CenterNode::new(content))));
158                } else {
159                    self.convert_children(element)?;
160                }
161                Ok(())
162            }
163
164            md_tag::alerts => {
165                self.flush_inline_buffer();
166                let attrs = AlertsAttr::parse(&element.attrs)?;
167                let prev_blocks = std::mem::take(&mut self.blocks);
168                self.flush_inline_buffer();
169                self.convert_children(element)?;
170                let content = Node::Paragraph(std::mem::take(&mut self.inline_buffer));
171                let mut quote = std::mem::take(&mut self.blocks);
172                quote.push(content);
173                self.blocks.clear();
174                self.blocks.extend(prev_blocks);
175                self.blocks.push(Node::Custom(Box::new(AlertNode {
176                    content: quote,
177                    class: attrs.class,
178                })));
179                Ok(())
180            }
181
182            md_tag::verbatim => {
183                self.inline_buffer.push(Node::Custom(Box::new(VerbatimNode {
184                    content: element
185                        .attrs
186                        .0
187                        .iter()
188                        .find(|(name, _)| *name == md_attr::src)
189                        .map(|(_, value)| value.clone())
190                        .unwrap_or_default(),
191                })));
192                Ok(())
193            }
194
195            _ => {
196                let tag_name = element.tag.resolve().to_string();
197
198                if !tag_name.starts_with("m1") {
199                    let html_element = self.create_html_element(element)?;
200                    self.inline_buffer.push(html_element);
201                } else {
202                    self.convert_children(element)?;
203                }
204                Ok(())
205            }
206        }
207    }
208
209    /// Create a CommonMark HTML element from the given HTML element    
210    pub(crate) fn create_html_element(&mut self, element: &HtmlElement) -> Result<Node> {
211        let attributes = element
212            .attrs
213            .0
214            .iter()
215            .map(|(name, value)| HtmlAttribute {
216                name: name.resolve().to_string().into(),
217                value: value.clone(),
218            })
219            .collect();
220
221        let mut children = Vec::new();
222        self.convert_children_into(&mut children, element)?;
223
224        Ok(Node::HtmlElement(CmarkHtmlElement {
225            tag: element.tag.resolve().to_string().into(),
226            attributes,
227            children,
228            self_closing: element.children.is_empty(),
229        }))
230    }
231
232    pub fn flush_inline_buffer(&mut self) {
233        if !self.inline_buffer.is_empty() {
234            self.blocks
235                .push(Node::Paragraph(std::mem::take(&mut self.inline_buffer)));
236        }
237    }
238
239    pub fn flush_inline_buffer_as_block(&mut self, make_block: impl FnOnce(Vec<Node>) -> Node) {
240        if !self.inline_buffer.is_empty() {
241            self.blocks
242                .push(make_block(std::mem::take(&mut self.inline_buffer)));
243        }
244    }
245
246    pub fn convert_children(&mut self, element: &HtmlElement) -> Result<()> {
247        for child in &element.children {
248            match child {
249                HtmlNode::Text(text, _) => {
250                    self.inline_buffer.push(Node::Text(text.clone()));
251                }
252                HtmlNode::Element(element) => {
253                    self.convert_element(element)?;
254                }
255                HtmlNode::Frame(frame) => {
256                    let res = self.convert_frame(frame);
257                    self.inline_buffer.push(res);
258                }
259                HtmlNode::Tag(..) => {}
260            }
261        }
262        Ok(())
263    }
264
265    pub fn convert_children_into(
266        &mut self,
267        target: &mut Vec<Node>,
268        element: &HtmlElement,
269    ) -> Result<()> {
270        let prev_buffer = std::mem::take(&mut self.inline_buffer);
271        self.convert_children(element)?;
272        target.append(&mut self.inline_buffer);
273        self.inline_buffer = prev_buffer;
274        Ok(())
275    }
276}
277
278#[derive(Debug, Clone)]
279pub(crate) struct Comment(pub EcoString);
280
281impl CustomNode for Comment {
282    fn as_any(&self) -> &dyn std::any::Any {
283        self
284    }
285
286    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
287        self
288    }
289
290    fn write(&self, writer: &mut CommonMarkWriter) -> WriteResult<()> {
291        writer.write_str("<!-- ")?;
292        writer.write_str(&self.0)?;
293        writer.write_str(" -->")?;
294        Ok(())
295    }
296
297    fn clone_box(&self) -> Box<dyn CustomNode> {
298        Box::new(self.clone())
299    }
300
301    fn eq_box(&self, other: &dyn CustomNode) -> bool {
302        if let Some(other) = other.as_any().downcast_ref::<Comment>() {
303            self.0 == other.0
304        } else {
305            false
306        }
307    }
308
309    fn is_block(&self) -> bool {
310        false
311    }
312}
313
314impl HtmlToAstParser {
315    pub fn is_block_element(element: &HtmlElement) -> bool {
316        matches!(
317            element.tag,
318            tag::p
319                | tag::div
320                | tag::blockquote
321                | tag::h1
322                | tag::h2
323                | tag::h3
324                | tag::h4
325                | tag::h5
326                | tag::h6
327                | tag::hr
328                | tag::pre
329                | tag::table
330                | tag::section
331                | tag::article
332                | tag::header
333                | tag::footer
334                | tag::main
335                | tag::aside
336                | tag::nav
337                | tag::ul
338                | tag::ol
339                | md_tag::heading
340                | md_tag::quote
341                | md_tag::raw
342                | md_tag::parbreak
343                | md_tag::table
344                | md_tag::grid
345                | md_tag::figure
346        )
347    }
348
349    pub fn process_list_item_element(&mut self, element: &HtmlElement) -> Result<Vec<Node>> {
350        if element.tag == tag::ul || element.tag == tag::ol {
351            let items = super::list::ListParser::convert_list(self, element)?;
352            if element.tag == tag::ul {
353                return Ok(vec![Node::UnorderedList(items)]);
354            } else {
355                return Ok(vec![Node::OrderedList { start: 1, items }]);
356            }
357        }
358
359        let prev_blocks = std::mem::take(&mut self.blocks);
360        let prev_buffer = std::mem::take(&mut self.inline_buffer);
361
362        self.convert_element(element)?;
363        let mut result = Vec::new();
364
365        if !self.blocks.is_empty() {
366            result.extend(std::mem::take(&mut self.blocks));
367        } else if !self.inline_buffer.is_empty() {
368            if Self::is_block_element(element) {
369                result.push(Node::Paragraph(std::mem::take(&mut self.inline_buffer)));
370            } else {
371                result = std::mem::take(&mut self.inline_buffer);
372            }
373        }
374
375        self.blocks = prev_blocks;
376        self.inline_buffer = prev_buffer;
377
378        Ok(result)
379    }
380
381    pub fn parse(mut self, root: &HtmlElement) -> Result<Node> {
382        self.blocks.clear();
383        self.inline_buffer.clear();
384
385        self.convert_element(root)?;
386        self.flush_inline_buffer();
387
388        Ok(Node::Document(self.blocks))
389    }
390}