typlite/parser/
core.rs

1//! HTML parser core, containing main structures and general parsing logic
2
3use std::sync::Arc;
4
5use typst::diag::SourceDiagnostic;
6use typst_syntax::Span;
7
8use cmark_writer::WriteResult;
9use cmark_writer::ast::{CustomNode, HtmlAttribute, HtmlElement as CmarkHtmlElement, Node};
10use cmark_writer::writer::InlineWriterProxy;
11use ecow::EcoString;
12use tinymist_project::LspWorld;
13use typst_html::{HtmlElement, HtmlNode, tag};
14
15use crate::Result;
16use crate::TypliteFeat;
17use crate::attributes::{AlertsAttr, HeadingAttr, RawAttr, TypliteAttrsParser, md_attr};
18use crate::common::{AlertNode, CenterNode, VerbatimNode};
19use crate::diagnostics::WarningCollector;
20use crate::tags::md_tag;
21
22use super::{list::ListParser, table::TableParser};
23
/// HTML to AST parser implementation
///
/// Conversion output accumulates in two places: `inline_buffer` collects inline nodes that
/// have not been wrapped yet, and `blocks` collects finished block-level nodes.
pub struct HtmlToAstParser {
    /// Counter initialised to zero by `new`. It is not read by the code visible here;
    /// presumably it numbers exported assets (TODO confirm against the media converters).
    pub asset_counter: usize,
    /// Feature configuration. `warn_at` consults its `wrap_info` to remap diagnostic spans.
    pub feat: TypliteFeat,
    /// Shared world handle, handed to `WrapInfo::remap_span` when remapping spans.
    pub world: Arc<LspWorld>,
    /// Initialised to zero by `new`. Presumably the current list nesting depth maintained by
    /// the list parser (TODO confirm).
    pub list_level: usize,
    /// Block-level nodes produced so far; `parse` wraps them into the final document node.
    pub blocks: Vec<Node>,
    /// Inline nodes waiting to be wrapped into a block (see `flush_inline_buffer`).
    pub inline_buffer: Vec<Node>,
    /// Collector receiving the warning diagnostics created by `warn_at`.
    pub(crate) warnings: WarningCollector,
}
34
35impl HtmlToAstParser {
36    pub(crate) fn new(
37        feat: TypliteFeat,
38        world: &Arc<LspWorld>,
39        warnings: WarningCollector,
40    ) -> Self {
41        Self {
42            feat,
43            world: world.clone(),
44            asset_counter: 0,
45            list_level: 0,
46            blocks: Vec::new(),
47            inline_buffer: Vec::new(),
48            warnings,
49        }
50    }
51
52    pub fn convert_element(&mut self, element: &HtmlElement) -> Result<()> {
53        match element.tag {
54            tag::head => Ok(()),
55
56            tag::html | tag::body | md_tag::doc => {
57                self.convert_children(element)?;
58                Ok(())
59            }
60
61            tag::p | tag::span | tag::div => {
62                self.convert_children(element)?;
63                Ok(())
64            }
65
66            tag::strong | md_tag::strong => self.convert_strong(element),
67            tag::em | md_tag::emph => self.convert_emphasis(element),
68            tag::mark => self.convert_highlight(element),
69            tag::s => self.convert_strikethrough(element),
70
71            tag::br => {
72                self.inline_buffer.push(Node::HardBreak);
73                Ok(())
74            }
75
76            tag::ol => {
77                self.flush_inline_buffer();
78                let items = ListParser::convert_list(self, element);
79                self.blocks.push(Node::OrderedList {
80                    start: 1,
81                    items: items?,
82                });
83                Ok(())
84            }
85
86            tag::ul => {
87                self.flush_inline_buffer();
88                let items = ListParser::convert_list(self, element);
89                self.blocks.push(Node::UnorderedList(items?));
90                Ok(())
91            }
92
93            md_tag::parbreak => {
94                self.flush_inline_buffer();
95                Ok(())
96            }
97
98            md_tag::heading => {
99                self.flush_inline_buffer();
100                let attrs = HeadingAttr::parse(&element.attrs)?;
101                self.convert_children(element)?;
102                self.flush_inline_buffer_as_block(|content| {
103                    Node::heading(attrs.level as u8 + 1, content)
104                });
105                Ok(())
106            }
107
108            md_tag::raw => {
109                let attrs = RawAttr::parse(&element.attrs)?;
110                if attrs.block {
111                    self.flush_inline_buffer();
112                    self.blocks
113                        .push(Node::code_block(Some(attrs.lang), attrs.text));
114                } else {
115                    self.inline_buffer.push(Node::InlineCode(attrs.text));
116                }
117                Ok(())
118            }
119
120            md_tag::quote => {
121                let prev_blocks = std::mem::take(&mut self.blocks);
122                self.flush_inline_buffer();
123                self.convert_children(element)?;
124                let content = Node::Paragraph(std::mem::take(&mut self.inline_buffer));
125                let mut quote = std::mem::take(&mut self.blocks);
126                quote.push(content);
127                self.blocks.clear();
128                self.blocks.extend(prev_blocks);
129                self.blocks.push(Node::BlockQuote(quote));
130                Ok(())
131            }
132
133            md_tag::figure => self.convert_figure(element),
134            md_tag::link => self.convert_link(element),
135            md_tag::image => self.convert_image(element),
136
137            md_tag::linebreak => {
138                self.inline_buffer.push(Node::HardBreak);
139                Ok(())
140            }
141
142            md_tag::source => {
143                let src = self.convert_source(element);
144                self.inline_buffer.push(src);
145                Ok(())
146            }
147
148            md_tag::table | md_tag::grid => {
149                self.flush_inline_buffer();
150                if let Some(table) = TableParser::convert_table(self, element)? {
151                    self.blocks.push(table);
152                }
153                Ok(())
154            }
155
156            md_tag::idoc => {
157                let src = self.convert_idoc(element);
158                self.inline_buffer.push(src);
159                Ok(())
160            }
161
162            md_tag::math_equation_inline | md_tag::math_equation_block => {
163                if element.tag == md_tag::math_equation_block {
164                    self.flush_inline_buffer();
165                    self.convert_children(element)?;
166                    let content = std::mem::take(&mut self.inline_buffer);
167                    self.blocks
168                        .push(Node::Custom(Box::new(CenterNode::new(content))));
169                } else {
170                    self.convert_children(element)?;
171                }
172                Ok(())
173            }
174
175            md_tag::alerts => {
176                self.flush_inline_buffer();
177                let attrs = AlertsAttr::parse(&element.attrs)?;
178                let prev_blocks = std::mem::take(&mut self.blocks);
179                self.flush_inline_buffer();
180                self.convert_children(element)?;
181                let content = Node::Paragraph(std::mem::take(&mut self.inline_buffer));
182                let mut quote = std::mem::take(&mut self.blocks);
183                quote.push(content);
184                self.blocks.clear();
185                self.blocks.extend(prev_blocks);
186                self.blocks.push(Node::Custom(Box::new(AlertNode {
187                    content: quote,
188                    class: attrs.class,
189                })));
190                Ok(())
191            }
192
193            md_tag::verbatim => {
194                self.inline_buffer.push(Node::Custom(Box::new(VerbatimNode {
195                    content: element
196                        .attrs
197                        .0
198                        .iter()
199                        .find(|(name, _)| *name == md_attr::src)
200                        .map(|(_, value)| value.clone())
201                        .unwrap_or_default(),
202                })));
203                Ok(())
204            }
205
206            _ => {
207                let tag_name = element.tag.resolve().to_string();
208
209                if !tag_name.starts_with("m1") {
210                    // self.warn_at(
211                    //     Some(element.span),
212                    //     eco_format!(
213                    //         "unsupported HTML element `<{tag_name}>`; exported as raw HTML"
214                    //     ),
215                    // );
216                    let html_element = self.create_html_element(element)?;
217                    self.inline_buffer.push(html_element);
218                } else {
219                    self.convert_children(element)?;
220                }
221                Ok(())
222            }
223        }
224    }
225
226    /// Create a CommonMark HTML element from the given HTML element    
227    pub(crate) fn create_html_element(&mut self, element: &HtmlElement) -> Result<Node> {
228        let attributes = element
229            .attrs
230            .0
231            .iter()
232            .map(|(name, value)| HtmlAttribute {
233                name: name.resolve().to_string().into(),
234                value: value.clone(),
235            })
236            .collect();
237
238        let (inline_nodes, block_nodes) = self.capture_children(element)?;
239
240        let mut children = Vec::new();
241        if !inline_nodes.is_empty() {
242            children.extend(inline_nodes);
243        }
244        children.extend(block_nodes);
245
246        Ok(Node::HtmlElement(CmarkHtmlElement {
247            tag: element.tag.resolve().to_string().into(),
248            attributes,
249            children,
250            self_closing: element.children.is_empty(),
251        }))
252    }
253
254    pub fn flush_inline_buffer(&mut self) {
255        if !self.inline_buffer.is_empty() {
256            self.blocks
257                .push(Node::Paragraph(std::mem::take(&mut self.inline_buffer)));
258        }
259    }
260
261    pub fn flush_inline_buffer_as_block(&mut self, make_block: impl FnOnce(Vec<Node>) -> Node) {
262        if !self.inline_buffer.is_empty() {
263            self.blocks
264                .push(make_block(std::mem::take(&mut self.inline_buffer)));
265        }
266    }
267
268    pub fn convert_children(&mut self, element: &HtmlElement) -> Result<()> {
269        for child in &element.children {
270            match child {
271                HtmlNode::Text(text, _) => {
272                    self.inline_buffer.push(Node::Text(text.clone()));
273                }
274                HtmlNode::Element(element) => {
275                    self.convert_element(element)?;
276                }
277                HtmlNode::Frame(frame) => {
278                    let res = self.convert_frame(&frame.inner);
279                    self.inline_buffer.push(res);
280                }
281                HtmlNode::Tag(..) => {}
282            }
283        }
284        Ok(())
285    }
286
287    pub fn convert_children_into(
288        &mut self,
289        target: &mut Vec<Node>,
290        element: &HtmlElement,
291    ) -> Result<()> {
292        let prev_buffer = std::mem::take(&mut self.inline_buffer);
293        self.convert_children(element)?;
294        target.append(&mut self.inline_buffer);
295        self.inline_buffer = prev_buffer;
296        Ok(())
297    }
298
299    /// Convert element children while capturing both inline and block outputs.
300    pub fn capture_children(&mut self, element: &HtmlElement) -> Result<(Vec<Node>, Vec<Node>)> {
301        let prev_buffer = std::mem::take(&mut self.inline_buffer);
302        let prev_blocks = std::mem::take(&mut self.blocks);
303
304        self.convert_children(element)?;
305
306        let inline = std::mem::take(&mut self.inline_buffer);
307        let blocks = std::mem::take(&mut self.blocks);
308
309        self.inline_buffer = prev_buffer;
310        self.blocks = prev_blocks;
311
312        Ok((inline, blocks))
313    }
314
315    pub(crate) fn warn_at(&mut self, span: Option<Span>, message: EcoString) {
316        let span = span.unwrap_or_else(Span::detached);
317        let span = self
318            .feat
319            .wrap_info
320            .as_ref()
321            .and_then(|info| self.remap_span_from_wrapper(span, info))
322            .unwrap_or(span);
323
324        let diag = SourceDiagnostic::warning(span, message);
325        self.warnings.extend(std::iter::once(diag));
326    }
327
328    fn remap_span_from_wrapper(&self, span: Span, info: &crate::WrapInfo) -> Option<Span> {
329        info.remap_span(self.world.as_ref(), span)
330    }
331}
332
333#[derive(Debug, Clone)]
334pub(crate) struct Comment(pub EcoString);
335
336impl CustomNode for Comment {
337    fn as_any(&self) -> &dyn std::any::Any {
338        self
339    }
340
341    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
342        self
343    }
344
345    fn write_inline(&self, writer: &mut InlineWriterProxy) -> WriteResult<()> {
346        writer.write_str("<!-- ")?;
347        writer.write_str(&self.0)?;
348        writer.write_str(" -->")?;
349        Ok(())
350    }
351
352    fn clone_box(&self) -> Box<dyn CustomNode> {
353        Box::new(self.clone())
354    }
355
356    fn eq_box(&self, other: &dyn CustomNode) -> bool {
357        if let Some(other) = other.as_any().downcast_ref::<Comment>() {
358            self.0 == other.0
359        } else {
360            false
361        }
362    }
363
364    fn is_block(&self) -> bool {
365        false
366    }
367}
368
369impl HtmlToAstParser {
370    pub fn is_block_element(element: &HtmlElement) -> bool {
371        matches!(
372            element.tag,
373            tag::p
374                | tag::div
375                | tag::blockquote
376                | tag::h1
377                | tag::h2
378                | tag::h3
379                | tag::h4
380                | tag::h5
381                | tag::h6
382                | tag::hr
383                | tag::pre
384                | tag::table
385                | tag::section
386                | tag::article
387                | tag::header
388                | tag::footer
389                | tag::main
390                | tag::aside
391                | tag::nav
392                | tag::ul
393                | tag::ol
394                | md_tag::heading
395                | md_tag::quote
396                | md_tag::raw
397                | md_tag::parbreak
398                | md_tag::table
399                | md_tag::grid
400                | md_tag::figure
401        )
402    }
403
404    pub fn process_list_item_element(&mut self, element: &HtmlElement) -> Result<Vec<Node>> {
405        if element.tag == tag::ul || element.tag == tag::ol {
406            let items = super::list::ListParser::convert_list(self, element)?;
407            if element.tag == tag::ul {
408                return Ok(vec![Node::UnorderedList(items)]);
409            } else {
410                return Ok(vec![Node::OrderedList { start: 1, items }]);
411            }
412        }
413
414        let prev_blocks = std::mem::take(&mut self.blocks);
415        let prev_buffer = std::mem::take(&mut self.inline_buffer);
416
417        self.convert_element(element)?;
418        let mut result = Vec::new();
419
420        if !self.blocks.is_empty() {
421            result.extend(std::mem::take(&mut self.blocks));
422        } else if !self.inline_buffer.is_empty() {
423            if Self::is_block_element(element) {
424                result.push(Node::Paragraph(std::mem::take(&mut self.inline_buffer)));
425            } else {
426                result = std::mem::take(&mut self.inline_buffer);
427            }
428        }
429
430        self.blocks = prev_blocks;
431        self.inline_buffer = prev_buffer;
432
433        Ok(result)
434    }
435
436    pub fn parse(mut self, root: &HtmlElement) -> Result<Node> {
437        self.blocks.clear();
438        self.inline_buffer.clear();
439
440        self.convert_element(root)?;
441        self.flush_inline_buffer();
442
443        Ok(Node::Document(self.blocks))
444    }
445}