typlite/parser/
core.rs

1//! HTML parser core, containing main structures and general parsing logic
2
3use std::sync::Arc;
4
5use typst::diag::SourceDiagnostic;
6use typst_syntax::Span;
7
8use cmark_writer::WriteResult;
9use cmark_writer::ast::{CustomNode, HtmlAttribute, HtmlElement as CmarkHtmlElement, Node};
10use cmark_writer::writer::InlineWriterProxy;
11use ecow::EcoString;
12use tinymist_project::LspWorld;
13use typst_html::{HtmlElement, HtmlNode, tag};
14
15use crate::Result;
16use crate::TypliteFeat;
17use crate::attributes::{AlertsAttr, HeadingAttr, RawAttr, TypliteAttrsParser, VerbatimAttr};
18use crate::common::{AlertNode, BlockVerbatimNode, CenterNode, VerbatimNode};
19use crate::diagnostics::WarningCollector;
20use crate::tags::md_tag;
21
22use super::{list::ListParser, table::TableParser};
23
24/// HTML to AST parser implementation
25pub struct HtmlToAstParser {
26    pub asset_counter: usize,
27    pub feat: TypliteFeat,
28    pub world: Arc<LspWorld>,
29    pub list_level: usize,
30    pub blocks: Vec<Node>,
31    pub inline_buffer: Vec<Node>,
32    pub(crate) warnings: WarningCollector,
33}
34
35impl HtmlToAstParser {
36    pub(crate) fn new(
37        feat: TypliteFeat,
38        world: &Arc<LspWorld>,
39        warnings: WarningCollector,
40    ) -> Self {
41        Self {
42            feat,
43            world: world.clone(),
44            asset_counter: 0,
45            list_level: 0,
46            blocks: Vec::new(),
47            inline_buffer: Vec::new(),
48            warnings,
49        }
50    }
51
52    pub fn convert_element(&mut self, element: &HtmlElement) -> Result<()> {
53        match element.tag {
54            tag::head => Ok(()),
55
56            tag::html | tag::body | md_tag::doc => {
57                self.convert_children(element)?;
58                Ok(())
59            }
60
61            tag::p | tag::span | tag::div => {
62                self.convert_children(element)?;
63                Ok(())
64            }
65
66            tag::strong | md_tag::strong => self.convert_strong(element),
67            tag::em | md_tag::emph => self.convert_emphasis(element),
68            tag::mark => self.convert_highlight(element),
69            tag::s => self.convert_strikethrough(element),
70
71            tag::br => {
72                self.inline_buffer.push(Node::HardBreak);
73                Ok(())
74            }
75
76            tag::ol => {
77                self.flush_inline_buffer();
78                let items = ListParser::convert_list(self, element);
79                self.blocks.push(Node::OrderedList {
80                    start: 1,
81                    items: items?,
82                });
83                Ok(())
84            }
85
86            tag::ul => {
87                self.flush_inline_buffer();
88                let items = ListParser::convert_list(self, element);
89                self.blocks.push(Node::UnorderedList(items?));
90                Ok(())
91            }
92
93            md_tag::parbreak => {
94                self.flush_inline_buffer();
95                Ok(())
96            }
97
98            md_tag::heading => {
99                self.flush_inline_buffer();
100                let attrs = HeadingAttr::parse(&element.attrs)?;
101                self.convert_children(element)?;
102                self.flush_inline_buffer_as_block(|content| {
103                    Node::heading(attrs.level as u8 + 1, content)
104                });
105                Ok(())
106            }
107
108            md_tag::raw => {
109                let attrs = RawAttr::parse(&element.attrs)?;
110                if attrs.block {
111                    self.flush_inline_buffer();
112                    self.blocks
113                        .push(Node::code_block(Some(attrs.lang), attrs.text));
114                } else {
115                    self.inline_buffer.push(Node::InlineCode(attrs.text));
116                }
117                Ok(())
118            }
119
120            md_tag::quote => {
121                let prev_blocks = std::mem::take(&mut self.blocks);
122                self.flush_inline_buffer();
123                self.convert_children(element)?;
124                let content = Node::Paragraph(std::mem::take(&mut self.inline_buffer));
125                let mut quote = std::mem::take(&mut self.blocks);
126                quote.push(content);
127                self.blocks.clear();
128                self.blocks.extend(prev_blocks);
129                self.blocks.push(Node::BlockQuote(quote));
130                Ok(())
131            }
132
133            md_tag::figure => self.convert_figure(element),
134            md_tag::link => self.convert_link(element),
135            md_tag::image => self.convert_image(element),
136
137            md_tag::linebreak => {
138                self.inline_buffer.push(Node::HardBreak);
139                Ok(())
140            }
141
142            md_tag::source => {
143                let src = self.convert_source(element);
144                self.inline_buffer.push(src);
145                Ok(())
146            }
147
148            md_tag::table | md_tag::grid => {
149                self.flush_inline_buffer();
150                if let Some(table) = TableParser::convert_table(self, element)? {
151                    self.blocks.push(table);
152                }
153                Ok(())
154            }
155
156            md_tag::idoc => {
157                let src = self.convert_idoc(element);
158                self.inline_buffer.push(src);
159                Ok(())
160            }
161
162            md_tag::math_equation_inline | md_tag::math_equation_block => {
163                if element.tag == md_tag::math_equation_block {
164                    self.flush_inline_buffer();
165                    self.convert_children(element)?;
166                    let content = std::mem::take(&mut self.inline_buffer);
167                    self.blocks
168                        .push(Node::Custom(Box::new(CenterNode::new(content))));
169                } else {
170                    self.convert_children(element)?;
171                }
172                Ok(())
173            }
174
175            md_tag::alerts => {
176                self.flush_inline_buffer();
177                let attrs = AlertsAttr::parse(&element.attrs)?;
178                let prev_blocks = std::mem::take(&mut self.blocks);
179                self.flush_inline_buffer();
180                self.convert_children(element)?;
181                let content = Node::Paragraph(std::mem::take(&mut self.inline_buffer));
182                let mut quote = std::mem::take(&mut self.blocks);
183                quote.push(content);
184                self.blocks.clear();
185                self.blocks.extend(prev_blocks);
186                self.blocks.push(Node::Custom(Box::new(AlertNode {
187                    content: quote,
188                    class: attrs.class,
189                })));
190                Ok(())
191            }
192
193            md_tag::verbatim => {
194                let attrs = VerbatimAttr::parse(&element.attrs)?;
195                if attrs.block {
196                    self.flush_inline_buffer();
197                    self.blocks.push(Node::Custom(Box::new(BlockVerbatimNode {
198                        content: attrs.src,
199                    })));
200                } else {
201                    self.inline_buffer
202                        .push(Node::Custom(Box::new(VerbatimNode { content: attrs.src })));
203                }
204                Ok(())
205            }
206
207            _ => {
208                let tag_name = element.tag.resolve().to_string();
209
210                if !tag_name.starts_with("m1") {
211                    // self.warn_at(
212                    //     Some(element.span),
213                    //     eco_format!(
214                    //         "unsupported HTML element `<{tag_name}>`; exported as raw HTML"
215                    //     ),
216                    // );
217                    let html_element = self.create_html_element(element)?;
218                    self.inline_buffer.push(html_element);
219                } else {
220                    self.convert_children(element)?;
221                }
222                Ok(())
223            }
224        }
225    }
226
227    /// Create a CommonMark HTML element from the given HTML element    
228    pub(crate) fn create_html_element(&mut self, element: &HtmlElement) -> Result<Node> {
229        let attributes = element
230            .attrs
231            .0
232            .iter()
233            .map(|(name, value)| HtmlAttribute {
234                name: name.resolve().to_string().into(),
235                value: value.clone(),
236            })
237            .collect();
238
239        let (inline_nodes, block_nodes) = self.capture_children(element)?;
240
241        let mut children = Vec::new();
242        if !inline_nodes.is_empty() {
243            children.extend(inline_nodes);
244        }
245        children.extend(block_nodes);
246
247        Ok(Node::HtmlElement(CmarkHtmlElement {
248            tag: element.tag.resolve().to_string().into(),
249            attributes,
250            children,
251            self_closing: element.children.is_empty(),
252        }))
253    }
254
255    pub fn flush_inline_buffer(&mut self) {
256        if !self.inline_buffer.is_empty() {
257            self.blocks
258                .push(Node::Paragraph(std::mem::take(&mut self.inline_buffer)));
259        }
260    }
261
262    pub fn flush_inline_buffer_as_block(&mut self, make_block: impl FnOnce(Vec<Node>) -> Node) {
263        if !self.inline_buffer.is_empty() {
264            self.blocks
265                .push(make_block(std::mem::take(&mut self.inline_buffer)));
266        }
267    }
268
269    pub fn convert_children(&mut self, element: &HtmlElement) -> Result<()> {
270        for child in &element.children {
271            match child {
272                HtmlNode::Text(text, _) => {
273                    self.inline_buffer.push(Node::Text(text.clone()));
274                }
275                HtmlNode::Element(element) => {
276                    self.convert_element(element)?;
277                }
278                HtmlNode::Frame(frame) => {
279                    let res = self.convert_frame(&frame.inner);
280                    self.inline_buffer.push(res);
281                }
282                HtmlNode::Tag(..) => {}
283            }
284        }
285        Ok(())
286    }
287
288    pub fn convert_children_into(
289        &mut self,
290        target: &mut Vec<Node>,
291        element: &HtmlElement,
292    ) -> Result<()> {
293        let prev_buffer = std::mem::take(&mut self.inline_buffer);
294        self.convert_children(element)?;
295        target.append(&mut self.inline_buffer);
296        self.inline_buffer = prev_buffer;
297        Ok(())
298    }
299
300    /// Convert element children while capturing both inline and block outputs.
301    pub fn capture_children(&mut self, element: &HtmlElement) -> Result<(Vec<Node>, Vec<Node>)> {
302        let prev_buffer = std::mem::take(&mut self.inline_buffer);
303        let prev_blocks = std::mem::take(&mut self.blocks);
304
305        self.convert_children(element)?;
306
307        let inline = std::mem::take(&mut self.inline_buffer);
308        let blocks = std::mem::take(&mut self.blocks);
309
310        self.inline_buffer = prev_buffer;
311        self.blocks = prev_blocks;
312
313        Ok((inline, blocks))
314    }
315
316    pub(crate) fn warn_at(&mut self, span: Option<Span>, message: EcoString) {
317        let span = span.unwrap_or_else(Span::detached);
318        let span = self
319            .feat
320            .wrap_info
321            .as_ref()
322            .and_then(|info| self.remap_span_from_wrapper(span, info))
323            .unwrap_or(span);
324
325        let diag = SourceDiagnostic::warning(span, message);
326        self.warnings.extend(std::iter::once(diag));
327    }
328
329    fn remap_span_from_wrapper(&self, span: Span, info: &crate::WrapInfo) -> Option<Span> {
330        info.remap_span(self.world.as_ref(), span)
331    }
332}
333
334#[derive(Debug, Clone)]
335pub(crate) struct Comment(pub EcoString);
336
337impl CustomNode for Comment {
338    fn as_any(&self) -> &dyn std::any::Any {
339        self
340    }
341
342    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
343        self
344    }
345
346    fn write_inline(&self, writer: &mut InlineWriterProxy) -> WriteResult<()> {
347        writer.write_str("<!-- ")?;
348        writer.write_str(&self.0)?;
349        writer.write_str(" -->")?;
350        Ok(())
351    }
352
353    fn clone_box(&self) -> Box<dyn CustomNode> {
354        Box::new(self.clone())
355    }
356
357    fn eq_box(&self, other: &dyn CustomNode) -> bool {
358        if let Some(other) = other.as_any().downcast_ref::<Comment>() {
359            self.0 == other.0
360        } else {
361            false
362        }
363    }
364
365    fn is_block(&self) -> bool {
366        false
367    }
368}
369
370impl HtmlToAstParser {
371    pub fn is_block_element(element: &HtmlElement) -> bool {
372        matches!(
373            element.tag,
374            tag::p
375                | tag::div
376                | tag::blockquote
377                | tag::h1
378                | tag::h2
379                | tag::h3
380                | tag::h4
381                | tag::h5
382                | tag::h6
383                | tag::hr
384                | tag::pre
385                | tag::table
386                | tag::section
387                | tag::article
388                | tag::header
389                | tag::footer
390                | tag::main
391                | tag::aside
392                | tag::nav
393                | tag::ul
394                | tag::ol
395                | md_tag::heading
396                | md_tag::quote
397                | md_tag::raw
398                | md_tag::parbreak
399                | md_tag::table
400                | md_tag::grid
401                | md_tag::figure
402        ) || (element.tag == md_tag::verbatim && Self::is_verbatim_block(element))
403    }
404
405    fn is_verbatim_block(element: &HtmlElement) -> bool {
406        VerbatimAttr::parse(&element.attrs)
407            .map(|attrs| attrs.block)
408            .unwrap_or(false)
409    }
410
411    pub fn process_list_item_element(&mut self, element: &HtmlElement) -> Result<Vec<Node>> {
412        if element.tag == tag::ul || element.tag == tag::ol {
413            let items = super::list::ListParser::convert_list(self, element)?;
414            if element.tag == tag::ul {
415                return Ok(vec![Node::UnorderedList(items)]);
416            } else {
417                return Ok(vec![Node::OrderedList { start: 1, items }]);
418            }
419        }
420
421        let prev_blocks = std::mem::take(&mut self.blocks);
422        let prev_buffer = std::mem::take(&mut self.inline_buffer);
423
424        self.convert_element(element)?;
425        let mut result = Vec::new();
426
427        if !self.blocks.is_empty() {
428            result.extend(std::mem::take(&mut self.blocks));
429        } else if !self.inline_buffer.is_empty() {
430            if Self::is_block_element(element) {
431                result.push(Node::Paragraph(std::mem::take(&mut self.inline_buffer)));
432            } else {
433                result = std::mem::take(&mut self.inline_buffer);
434            }
435        }
436
437        self.blocks = prev_blocks;
438        self.inline_buffer = prev_buffer;
439
440        Ok(result)
441    }
442
443    pub fn parse(mut self, root: &HtmlElement) -> Result<Node> {
444        self.blocks.clear();
445        self.inline_buffer.clear();
446
447        self.convert_element(root)?;
448        self.flush_inline_buffer();
449
450        Ok(Node::Document(self.blocks))
451    }
452}