1use std::sync::Arc;
4
5use typst::diag::SourceDiagnostic;
6use typst_syntax::Span;
7
8use cmark_writer::WriteResult;
9use cmark_writer::ast::{CustomNode, HtmlAttribute, HtmlElement as CmarkHtmlElement, Node};
10use cmark_writer::writer::InlineWriterProxy;
11use ecow::EcoString;
12use tinymist_project::LspWorld;
13use typst_html::{HtmlElement, HtmlNode, tag};
14
15use crate::Result;
16use crate::TypliteFeat;
17use crate::attributes::{AlertsAttr, HeadingAttr, RawAttr, TypliteAttrsParser, md_attr};
18use crate::common::{AlertNode, CenterNode, VerbatimNode};
19use crate::diagnostics::WarningCollector;
20use crate::tags::md_tag;
21
22use super::{list::ListParser, table::TableParser};
23
/// Stateful converter that walks a Typst HTML tree and accumulates Markdown
/// AST nodes.
///
/// Output is gathered in two stages: inline nodes are staged in
/// `inline_buffer` and, when flushed, wrapped into a block that is appended to
/// `blocks`.
pub struct HtmlToAstParser {
    /// Counter initialised to `0` by `new`.
    /// NOTE(review): presumably numbers exported assets — its consumer is not
    /// visible in this part of the file; confirm.
    pub asset_counter: usize,
    /// Conversion feature flags; `wrap_info` is consulted when remapping
    /// warning spans.
    pub feat: TypliteFeat,
    /// Shared handle to the world, used when remapping spans.
    pub world: Arc<LspWorld>,
    /// Counter initialised to `0` by `new`.
    /// NOTE(review): presumably the current list nesting depth — maintained
    /// outside this part of the file; confirm.
    pub list_level: usize,
    /// Block-level nodes produced so far, in document order.
    pub blocks: Vec<Node>,
    /// Inline nodes waiting to be wrapped into a paragraph (or other block).
    pub inline_buffer: Vec<Node>,
    /// Sink for the warning diagnostics emitted through `warn_at`.
    pub(crate) warnings: WarningCollector,
}
34
35impl HtmlToAstParser {
36 pub(crate) fn new(
37 feat: TypliteFeat,
38 world: &Arc<LspWorld>,
39 warnings: WarningCollector,
40 ) -> Self {
41 Self {
42 feat,
43 world: world.clone(),
44 asset_counter: 0,
45 list_level: 0,
46 blocks: Vec::new(),
47 inline_buffer: Vec::new(),
48 warnings,
49 }
50 }
51
52 pub fn convert_element(&mut self, element: &HtmlElement) -> Result<()> {
53 match element.tag {
54 tag::head => Ok(()),
55
56 tag::html | tag::body | md_tag::doc => {
57 self.convert_children(element)?;
58 Ok(())
59 }
60
61 tag::p | tag::span | tag::div => {
62 self.convert_children(element)?;
63 Ok(())
64 }
65
66 tag::strong | md_tag::strong => self.convert_strong(element),
67 tag::em | md_tag::emph => self.convert_emphasis(element),
68 tag::mark => self.convert_highlight(element),
69 tag::s => self.convert_strikethrough(element),
70
71 tag::br => {
72 self.inline_buffer.push(Node::HardBreak);
73 Ok(())
74 }
75
76 tag::ol => {
77 self.flush_inline_buffer();
78 let items = ListParser::convert_list(self, element);
79 self.blocks.push(Node::OrderedList {
80 start: 1,
81 items: items?,
82 });
83 Ok(())
84 }
85
86 tag::ul => {
87 self.flush_inline_buffer();
88 let items = ListParser::convert_list(self, element);
89 self.blocks.push(Node::UnorderedList(items?));
90 Ok(())
91 }
92
93 md_tag::parbreak => {
94 self.flush_inline_buffer();
95 Ok(())
96 }
97
98 md_tag::heading => {
99 self.flush_inline_buffer();
100 let attrs = HeadingAttr::parse(&element.attrs)?;
101 self.convert_children(element)?;
102 self.flush_inline_buffer_as_block(|content| {
103 Node::heading(attrs.level as u8 + 1, content)
104 });
105 Ok(())
106 }
107
108 md_tag::raw => {
109 let attrs = RawAttr::parse(&element.attrs)?;
110 if attrs.block {
111 self.flush_inline_buffer();
112 self.blocks
113 .push(Node::code_block(Some(attrs.lang), attrs.text));
114 } else {
115 self.inline_buffer.push(Node::InlineCode(attrs.text));
116 }
117 Ok(())
118 }
119
120 md_tag::quote => {
121 let prev_blocks = std::mem::take(&mut self.blocks);
122 self.flush_inline_buffer();
123 self.convert_children(element)?;
124 let content = Node::Paragraph(std::mem::take(&mut self.inline_buffer));
125 let mut quote = std::mem::take(&mut self.blocks);
126 quote.push(content);
127 self.blocks.clear();
128 self.blocks.extend(prev_blocks);
129 self.blocks.push(Node::BlockQuote(quote));
130 Ok(())
131 }
132
133 md_tag::figure => self.convert_figure(element),
134 md_tag::link => self.convert_link(element),
135 md_tag::image => self.convert_image(element),
136
137 md_tag::linebreak => {
138 self.inline_buffer.push(Node::HardBreak);
139 Ok(())
140 }
141
142 md_tag::source => {
143 let src = self.convert_source(element);
144 self.inline_buffer.push(src);
145 Ok(())
146 }
147
148 md_tag::table | md_tag::grid => {
149 self.flush_inline_buffer();
150 if let Some(table) = TableParser::convert_table(self, element)? {
151 self.blocks.push(table);
152 }
153 Ok(())
154 }
155
156 md_tag::idoc => {
157 let src = self.convert_idoc(element);
158 self.inline_buffer.push(src);
159 Ok(())
160 }
161
162 md_tag::math_equation_inline | md_tag::math_equation_block => {
163 if element.tag == md_tag::math_equation_block {
164 self.flush_inline_buffer();
165 self.convert_children(element)?;
166 let content = std::mem::take(&mut self.inline_buffer);
167 self.blocks
168 .push(Node::Custom(Box::new(CenterNode::new(content))));
169 } else {
170 self.convert_children(element)?;
171 }
172 Ok(())
173 }
174
175 md_tag::alerts => {
176 self.flush_inline_buffer();
177 let attrs = AlertsAttr::parse(&element.attrs)?;
178 let prev_blocks = std::mem::take(&mut self.blocks);
179 self.flush_inline_buffer();
180 self.convert_children(element)?;
181 let content = Node::Paragraph(std::mem::take(&mut self.inline_buffer));
182 let mut quote = std::mem::take(&mut self.blocks);
183 quote.push(content);
184 self.blocks.clear();
185 self.blocks.extend(prev_blocks);
186 self.blocks.push(Node::Custom(Box::new(AlertNode {
187 content: quote,
188 class: attrs.class,
189 })));
190 Ok(())
191 }
192
193 md_tag::verbatim => {
194 self.inline_buffer.push(Node::Custom(Box::new(VerbatimNode {
195 content: element
196 .attrs
197 .0
198 .iter()
199 .find(|(name, _)| *name == md_attr::src)
200 .map(|(_, value)| value.clone())
201 .unwrap_or_default(),
202 })));
203 Ok(())
204 }
205
206 _ => {
207 let tag_name = element.tag.resolve().to_string();
208
209 if !tag_name.starts_with("m1") {
210 let html_element = self.create_html_element(element)?;
217 self.inline_buffer.push(html_element);
218 } else {
219 self.convert_children(element)?;
220 }
221 Ok(())
222 }
223 }
224 }
225
226 pub(crate) fn create_html_element(&mut self, element: &HtmlElement) -> Result<Node> {
228 let attributes = element
229 .attrs
230 .0
231 .iter()
232 .map(|(name, value)| HtmlAttribute {
233 name: name.resolve().to_string().into(),
234 value: value.clone(),
235 })
236 .collect();
237
238 let (inline_nodes, block_nodes) = self.capture_children(element)?;
239
240 let mut children = Vec::new();
241 if !inline_nodes.is_empty() {
242 children.extend(inline_nodes);
243 }
244 children.extend(block_nodes);
245
246 Ok(Node::HtmlElement(CmarkHtmlElement {
247 tag: element.tag.resolve().to_string().into(),
248 attributes,
249 children,
250 self_closing: element.children.is_empty(),
251 }))
252 }
253
254 pub fn flush_inline_buffer(&mut self) {
255 if !self.inline_buffer.is_empty() {
256 self.blocks
257 .push(Node::Paragraph(std::mem::take(&mut self.inline_buffer)));
258 }
259 }
260
261 pub fn flush_inline_buffer_as_block(&mut self, make_block: impl FnOnce(Vec<Node>) -> Node) {
262 if !self.inline_buffer.is_empty() {
263 self.blocks
264 .push(make_block(std::mem::take(&mut self.inline_buffer)));
265 }
266 }
267
268 pub fn convert_children(&mut self, element: &HtmlElement) -> Result<()> {
269 for child in &element.children {
270 match child {
271 HtmlNode::Text(text, _) => {
272 self.inline_buffer.push(Node::Text(text.clone()));
273 }
274 HtmlNode::Element(element) => {
275 self.convert_element(element)?;
276 }
277 HtmlNode::Frame(frame) => {
278 let res = self.convert_frame(&frame.inner);
279 self.inline_buffer.push(res);
280 }
281 HtmlNode::Tag(..) => {}
282 }
283 }
284 Ok(())
285 }
286
287 pub fn convert_children_into(
288 &mut self,
289 target: &mut Vec<Node>,
290 element: &HtmlElement,
291 ) -> Result<()> {
292 let prev_buffer = std::mem::take(&mut self.inline_buffer);
293 self.convert_children(element)?;
294 target.append(&mut self.inline_buffer);
295 self.inline_buffer = prev_buffer;
296 Ok(())
297 }
298
299 pub fn capture_children(&mut self, element: &HtmlElement) -> Result<(Vec<Node>, Vec<Node>)> {
301 let prev_buffer = std::mem::take(&mut self.inline_buffer);
302 let prev_blocks = std::mem::take(&mut self.blocks);
303
304 self.convert_children(element)?;
305
306 let inline = std::mem::take(&mut self.inline_buffer);
307 let blocks = std::mem::take(&mut self.blocks);
308
309 self.inline_buffer = prev_buffer;
310 self.blocks = prev_blocks;
311
312 Ok((inline, blocks))
313 }
314
315 pub(crate) fn warn_at(&mut self, span: Option<Span>, message: EcoString) {
316 let span = span.unwrap_or_else(Span::detached);
317 let span = self
318 .feat
319 .wrap_info
320 .as_ref()
321 .and_then(|info| self.remap_span_from_wrapper(span, info))
322 .unwrap_or(span);
323
324 let diag = SourceDiagnostic::warning(span, message);
325 self.warnings.extend(std::iter::once(diag));
326 }
327
328 fn remap_span_from_wrapper(&self, span: Span, info: &crate::WrapInfo) -> Option<Span> {
329 info.remap_span(self.world.as_ref(), span)
330 }
331}
332
/// Custom AST node holding the text of an HTML comment.
///
/// Its `CustomNode` implementation writes the text inline between `<!-- ` and
/// ` -->`.
#[derive(Debug, Clone)]
pub(crate) struct Comment(pub EcoString);
335
336impl CustomNode for Comment {
337 fn as_any(&self) -> &dyn std::any::Any {
338 self
339 }
340
341 fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
342 self
343 }
344
345 fn write_inline(&self, writer: &mut InlineWriterProxy) -> WriteResult<()> {
346 writer.write_str("<!-- ")?;
347 writer.write_str(&self.0)?;
348 writer.write_str(" -->")?;
349 Ok(())
350 }
351
352 fn clone_box(&self) -> Box<dyn CustomNode> {
353 Box::new(self.clone())
354 }
355
356 fn eq_box(&self, other: &dyn CustomNode) -> bool {
357 if let Some(other) = other.as_any().downcast_ref::<Comment>() {
358 self.0 == other.0
359 } else {
360 false
361 }
362 }
363
364 fn is_block(&self) -> bool {
365 false
366 }
367}
368
369impl HtmlToAstParser {
370 pub fn is_block_element(element: &HtmlElement) -> bool {
371 matches!(
372 element.tag,
373 tag::p
374 | tag::div
375 | tag::blockquote
376 | tag::h1
377 | tag::h2
378 | tag::h3
379 | tag::h4
380 | tag::h5
381 | tag::h6
382 | tag::hr
383 | tag::pre
384 | tag::table
385 | tag::section
386 | tag::article
387 | tag::header
388 | tag::footer
389 | tag::main
390 | tag::aside
391 | tag::nav
392 | tag::ul
393 | tag::ol
394 | md_tag::heading
395 | md_tag::quote
396 | md_tag::raw
397 | md_tag::parbreak
398 | md_tag::table
399 | md_tag::grid
400 | md_tag::figure
401 )
402 }
403
404 pub fn process_list_item_element(&mut self, element: &HtmlElement) -> Result<Vec<Node>> {
405 if element.tag == tag::ul || element.tag == tag::ol {
406 let items = super::list::ListParser::convert_list(self, element)?;
407 if element.tag == tag::ul {
408 return Ok(vec![Node::UnorderedList(items)]);
409 } else {
410 return Ok(vec![Node::OrderedList { start: 1, items }]);
411 }
412 }
413
414 let prev_blocks = std::mem::take(&mut self.blocks);
415 let prev_buffer = std::mem::take(&mut self.inline_buffer);
416
417 self.convert_element(element)?;
418 let mut result = Vec::new();
419
420 if !self.blocks.is_empty() {
421 result.extend(std::mem::take(&mut self.blocks));
422 } else if !self.inline_buffer.is_empty() {
423 if Self::is_block_element(element) {
424 result.push(Node::Paragraph(std::mem::take(&mut self.inline_buffer)));
425 } else {
426 result = std::mem::take(&mut self.inline_buffer);
427 }
428 }
429
430 self.blocks = prev_blocks;
431 self.inline_buffer = prev_buffer;
432
433 Ok(result)
434 }
435
436 pub fn parse(mut self, root: &HtmlElement) -> Result<Node> {
437 self.blocks.clear();
438 self.inline_buffer.clear();
439
440 self.convert_element(root)?;
441 self.flush_inline_buffer();
442
443 Ok(Node::Document(self.blocks))
444 }
445}