typlite/writer/docx/
writer.rs

1//! DOCX document writer implementation
2
3use base64::Engine;
4use cmark_writer::ast::{ListItem, Node};
5use docx_rs::*;
6use ecow::EcoString;
7use log::{debug, warn};
8use std::fs;
9use std::io::Cursor;
10
11use crate::Result;
12use crate::common::{
13    CenterNode, FigureNode, FormatWriter, HighlightNode, InlineNode, VerbatimNode,
14};
15
16use super::image_processor::DocxImageProcessor;
17use super::numbering::DocxNumbering;
18use super::styles::DocxStyles;
19
/// DOCX writer that generates DOCX directly from AST (without intermediate
/// representation)
pub struct DocxWriter {
    /// Style definitions; installed into the document by `generate_docx`.
    styles: DocxStyles,
    /// Produces the numbering definitions used by ordered and unordered lists.
    numbering: DocxNumbering,
    /// Current list nesting depth: incremented while a list is being
    /// processed, 0 outside of any list.
    list_level: usize,
    /// Reset to 0 by `write_vec`; no other use is visible in this file.
    list_numbering_count: usize,
    /// Turns image bytes / data URLs into document or run content.
    image_processor: DocxImageProcessor,
}
29
30impl Default for DocxWriter {
31    fn default() -> Self {
32        Self::new()
33    }
34}
35
36impl DocxWriter {
37    pub fn new() -> Self {
38        Self {
39            styles: DocxStyles::new(),
40            numbering: DocxNumbering::new(),
41            list_level: 0,
42            list_numbering_count: 0,
43            image_processor: DocxImageProcessor::new(),
44        }
45    }
46
47    /// Process image node
48    fn process_image(&self, docx: Docx, url: &str, alt_nodes: &[Node]) -> Result<Docx> {
49        // Build alt text
50        let alt_text = if !alt_nodes.is_empty() {
51            let mut text = String::new();
52            for node in alt_nodes {
53                if let Node::Text(content) = node {
54                    text.push_str(content);
55                }
56            }
57            Some(text)
58        } else {
59            None
60        };
61
62        // Try reading image file
63        if let Ok(img_data) = fs::read(url) {
64            Ok(self
65                .image_processor
66                .process_image_data(docx, &img_data, alt_text.as_deref(), None))
67        } else {
68            let placeholder = format!("[Image not found: {url}]");
69            let para = Paragraph::new().add_run(Run::new().add_text(placeholder));
70            Ok(docx.add_paragraph(para))
71        }
72    }
73
74    /// Process figure node (image with caption)
75    fn process_figure(&mut self, mut docx: Docx, figure_node: &FigureNode) -> Result<Docx> {
76        // First handle the figure body (typically an image)
77        match &*figure_node.body {
78            Node::Paragraph(content) => {
79                for node in content {
80                    if let Node::Image {
81                        url,
82                        title: _,
83                        alt: _,
84                    } = node
85                    {
86                        // Process the image
87                        if let Ok(img_data) = fs::read(url.as_str()) {
88                            let alt_text = figure_node.caption.clone();
89                            // Add the image with caption
90                            docx = self.image_processor.process_image_data(
91                                docx,
92                                &img_data,
93                                Some(&alt_text),
94                                None,
95                            );
96
97                            // Add caption as a separate paragraph with Caption style
98                            if !figure_node.caption.is_empty() {
99                                let caption_text = format!("Figure: {}", figure_node.caption);
100                                let caption_para = Paragraph::new()
101                                    .style("Caption")
102                                    .add_run(Run::new().add_text(caption_text));
103                                docx = docx.add_paragraph(caption_para);
104                            }
105                        } else {
106                            // Image not found, show placeholder
107                            let placeholder = format!("[Image not found: {url}]");
108                            let para = Paragraph::new().add_run(Run::new().add_text(placeholder));
109                            docx = docx.add_paragraph(para);
110
111                            // Still add caption
112                            if !figure_node.caption.is_empty() {
113                                let caption_para = Paragraph::new()
114                                    .style("Caption")
115                                    .add_run(Run::new().add_text(&figure_node.caption));
116                                docx = docx.add_paragraph(caption_para);
117                            }
118                        }
119                    } else {
120                        // Handle non-image content
121                        let mut para = Paragraph::new();
122                        let run = Run::new();
123                        let run = self.process_inline_to_run(run, node)?;
124                        if !run.children.is_empty() {
125                            para = para.add_run(run);
126                            docx = docx.add_paragraph(para);
127                        }
128
129                        // Add caption as a separate paragraph
130                        if !figure_node.caption.is_empty() {
131                            let caption_para = Paragraph::new()
132                                .style("Caption")
133                                .add_run(Run::new().add_text(&figure_node.caption));
134                            docx = docx.add_paragraph(caption_para);
135                        }
136                    }
137                }
138            }
139            // Handle other content types within figure
140            _ => {
141                // Process the content using standard node processing
142                docx = self.process_node(docx, &figure_node.body)?;
143
144                // Add caption as a separate paragraph
145                if !figure_node.caption.is_empty() {
146                    let caption_para = Paragraph::new()
147                        .style("Caption")
148                        .add_run(Run::new().add_text(&figure_node.caption));
149                    docx = docx.add_paragraph(caption_para);
150                }
151            }
152        }
153
154        Ok(docx)
155    }
156
157    /// Process inline element and add to Run
158    fn process_inline_to_run(&self, mut run: Run, node: &Node) -> Result<Run> {
159        match node {
160            Node::Text(text) => {
161                run = run.add_text(text);
162            }
163            Node::Strong(content) => {
164                run = run.style("Strong");
165                for child in content {
166                    run = self.process_inline_to_run(run, child)?;
167                }
168            }
169            Node::Emphasis(content) => {
170                run = run.style("Emphasis");
171                for child in content {
172                    run = self.process_inline_to_run(run, child)?;
173                }
174            }
175            Node::Strikethrough(content) => {
176                run = run.strike();
177                for child in content {
178                    run = self.process_inline_to_run(run, child)?;
179                }
180            }
181            Node::Link {
182                url: _,
183                title: _,
184                content,
185            } => {
186                // Hyperlinks need to be processed at paragraph level, only handle content here
187                run = run.style("Hyperlink");
188                for child in content {
189                    run = self.process_inline_to_run(run, child)?;
190                }
191            }
192            Node::Image {
193                url,
194                title: _,
195                alt: _,
196            } => {
197                if let Ok(img_data) = fs::read(url.as_str()) {
198                    run = self.image_processor.process_inline_image(run, &img_data)?;
199                } else {
200                    run = run.add_text(format!("[Image not found: {url}]"));
201                }
202            }
203            Node::HtmlElement(element) => {
204                // Handle special HTML elements
205                if element.tag == "img" && element.self_closing {
206                    let is_typst_block = element
207                        .attributes
208                        .iter()
209                        .any(|a| a.name == "alt" && a.value == "typst-block");
210
211                    let src = element
212                        .attributes
213                        .iter()
214                        .find(|a| a.name == "src")
215                        .map(|a| a.value.as_str())
216                        .unwrap_or("");
217
218                    if src.starts_with("data:image/") {
219                        run = self.image_processor.process_data_url_image(
220                            run,
221                            src,
222                            is_typst_block,
223                        )?;
224                    }
225                } else {
226                    // Standard element content processing
227                    for child in &element.children {
228                        run = self.process_inline_to_run(run, child)?;
229                    }
230                }
231            }
232            Node::InlineCode(code) => {
233                run = run.style("CodeInline").add_text(code);
234            }
235            Node::HardBreak => {
236                run = run.add_break(BreakType::TextWrapping);
237            }
238            Node::SoftBreak => {
239                run = run.add_text(" ");
240            }
241            node if node.is_custom_type::<HighlightNode>() => {
242                let highlight_node = node.as_custom_type::<HighlightNode>().unwrap();
243                run = run.highlight("yellow");
244                for child in &highlight_node.content {
245                    run = self.process_inline_to_run(run, child)?;
246                }
247            }
248            node if node.is_custom_type::<InlineNode>() => {
249                let inline_node = node.as_custom_type::<InlineNode>().unwrap();
250                for child in &inline_node.content {
251                    run = self.process_inline_to_run(run, child)?;
252                }
253            }
254            node if node.is_custom_type::<VerbatimNode>() => {
255                let node = node.as_custom_type::<VerbatimNode>().unwrap();
256                warn!(
257                    "ignoring `m1verbatim` content in DOCX export: {:?}",
258                    node.content
259                );
260            }
261            // Other inline element types
262            _ => {
263                debug!("unhandled inline node in DOCX export: {node:?}");
264            }
265        }
266
267        Ok(run)
268    }
269
270    /// Process paragraph and add to document
271    fn process_paragraph(
272        &self,
273        mut docx: Docx,
274        content: &[Node],
275        style: Option<&str>,
276    ) -> Result<Docx> {
277        let mut para = Paragraph::new();
278
279        // Apply style
280        if let Some(style_name) = style {
281            para = para.style(style_name);
282        }
283
284        // Extract all link nodes
285        let mut links = Vec::new();
286        for (i, node) in content.iter().enumerate() {
287            if let Node::Link {
288                url,
289                title: _,
290                content: _,
291            } = node
292            {
293                links.push((i, url.clone()));
294            }
295        }
296
297        // If no links, process paragraph normally
298        if links.is_empty() {
299            // Process paragraph content
300            for node in content {
301                let run = Run::new();
302                let run = self.process_inline_to_run(run, node)?;
303                if !run.children.is_empty() {
304                    para = para.add_run(run);
305                }
306            }
307        } else {
308            // If links exist, we need to process in segments
309            let mut last_idx = 0;
310            for (idx, url) in links {
311                // Process content before the link
312                for item in content.iter().take(idx).skip(last_idx) {
313                    let run = Run::new();
314                    let run = self.process_inline_to_run(run, item)?;
315                    if !run.children.is_empty() {
316                        para = para.add_run(run);
317                    }
318                }
319
320                // Process link
321                if let Node::Link {
322                    url: _,
323                    title: _,
324                    content: link_content,
325                } = &content[idx]
326                {
327                    let mut hyperlink_run = Run::new().style("Hyperlink");
328                    for child in link_content {
329                        hyperlink_run = self.process_inline_to_run(hyperlink_run, child)?;
330                    }
331
332                    // Create and add hyperlink
333                    if !hyperlink_run.children.is_empty() {
334                        let hyperlink =
335                            Hyperlink::new(&url, HyperlinkType::External).add_run(hyperlink_run);
336                        para = para.add_hyperlink(hyperlink);
337                    }
338                }
339
340                last_idx = idx + 1;
341            }
342
343            // Process content after the last link
344            for item in content.iter().skip(last_idx) {
345                let run = Run::new();
346                let run = self.process_inline_to_run(run, item)?;
347                if !run.children.is_empty() {
348                    para = para.add_run(run);
349                }
350            }
351        }
352
353        // Only add when paragraph has content
354        if !para.children.is_empty() {
355            docx = docx.add_paragraph(para);
356        }
357
358        Ok(docx)
359    }
360
361    /// Process node and add to document
362    fn process_node(&mut self, mut docx: Docx, node: &Node) -> Result<Docx> {
363        match node {
364            Node::Document(blocks) => {
365                for block in blocks {
366                    docx = self.process_node(docx, block)?;
367                }
368            }
369            Node::Paragraph(content) => {
370                docx = self.process_paragraph(docx, content, None)?;
371            }
372            Node::Heading {
373                level,
374                content,
375                heading_type: _,
376            } => {
377                // Determine heading style name
378                let style_name = match level {
379                    1 => "Heading1",
380                    2 => "Heading2",
381                    3 => "Heading3",
382                    4 => "Heading4",
383                    5 => "Heading5",
384                    _ => "Heading6",
385                };
386
387                docx = self.process_paragraph(docx, content, Some(style_name))?;
388            }
389            Node::BlockQuote(content) => {
390                for block in content {
391                    if let Node::Paragraph(inline) = block {
392                        docx = self.process_paragraph(docx, inline, Some("Blockquote"))?;
393                    } else {
394                        docx = self.process_node(docx, block)?;
395                    }
396                }
397            }
398            Node::CodeBlock {
399                language,
400                content,
401                block_type: _,
402            } => {
403                // Add language information
404                if let Some(lang) = language
405                    && !lang.is_empty()
406                {
407                    let lang_para = Paragraph::new()
408                        .style("CodeBlock")
409                        .add_run(Run::new().add_text(lang));
410                    docx = docx.add_paragraph(lang_para);
411                }
412
413                // Process code line by line, preserving line breaks
414                let lines: Vec<&str> = content.split('\n').collect();
415                for line in lines {
416                    let code_para = Paragraph::new()
417                        .style("CodeBlock")
418                        .add_run(Run::new().add_text(line));
419                    docx = docx.add_paragraph(code_para);
420                }
421            }
422            Node::OrderedList { start: _, items } => {
423                docx = self.process_ordered_list(docx, items)?;
424            }
425            Node::UnorderedList(items) => {
426                docx = self.process_unordered_list(docx, items)?;
427            }
428            Node::Table {
429                headers,
430                rows,
431                alignments: _,
432            } => {
433                docx = self.process_table(docx, headers, rows)?;
434            }
435            Node::Image { url, title: _, alt } => {
436                docx = self.process_image(docx, url, alt)?;
437            }
438            node if node.is_custom_type::<FigureNode>() => {
439                let figure_node = node.as_custom_type::<FigureNode>().unwrap();
440                docx = self.process_figure(docx, figure_node)?;
441            }
442            node if node.is_custom_type::<CenterNode>() => {
443                let center_node = node.as_custom_type::<CenterNode>().unwrap();
444                // Handle regular node but with center alignment
445                match &center_node.node {
446                    Node::Paragraph(content) => {
447                        docx = self.process_paragraph(docx, content, None)?;
448                        // Get the last paragraph and center it
449                        if let Some(DocumentChild::Paragraph(para)) =
450                            docx.document.children.last_mut()
451                        {
452                            para.property = para.property.clone().align(AlignmentType::Center);
453                        }
454                    }
455                    Node::HtmlElement(element) => {
456                        let start_idx = docx.document.children.len();
457                        for child in &element.children {
458                            docx = self.process_node(docx, child)?;
459                        }
460                        for child in docx.document.children.iter_mut().skip(start_idx) {
461                            if let DocumentChild::Paragraph(para) = child {
462                                para.property = para.property.clone().align(AlignmentType::Center);
463                            }
464                        }
465                    }
466                    other => {
467                        docx = self.process_node(docx, other)?;
468                        // Get the last element and center it if it's a paragraph
469                        if let Some(DocumentChild::Paragraph(para)) =
470                            docx.document.children.last_mut()
471                        {
472                            para.property = para.property.clone().align(AlignmentType::Center);
473                        }
474                    }
475                }
476            }
477            node if node.is_custom_type::<crate::common::ExternalFrameNode>() => {
478                let external_frame = node
479                    .as_custom_type::<crate::common::ExternalFrameNode>()
480                    .unwrap();
481                let data = base64::engine::general_purpose::STANDARD
482                    .decode(&external_frame.svg)
483                    .map_err(|e| format!("Failed to decode SVG data: {e}"))?;
484
485                docx = self.image_processor.process_image_data(
486                    docx,
487                    &data,
488                    Some(&external_frame.alt_text),
489                    None,
490                );
491            }
492            node if node.is_custom_type::<HighlightNode>() => {
493                let highlight_node = node.as_custom_type::<HighlightNode>().unwrap();
494                // Handle HighlightNode at block level (convert to paragraph)
495                let mut para = Paragraph::new();
496                let mut run = Run::new().highlight("yellow");
497
498                for child in &highlight_node.content {
499                    run = self.process_inline_to_run(run, child)?;
500                }
501
502                if !run.children.is_empty() {
503                    para = para.add_run(run);
504                    docx = docx.add_paragraph(para);
505                }
506            }
507            node if node.is_custom_type::<InlineNode>() => {
508                let inline_node = node.as_custom_type::<InlineNode>().unwrap();
509                // Handle InlineNode at block level (convert to paragraph)
510                let mut para = Paragraph::new();
511                let mut run = Run::new();
512
513                for child in &inline_node.content {
514                    run = self.process_inline_to_run(run, child)?;
515                }
516
517                if !run.children.is_empty() {
518                    para = para.add_run(run);
519                    docx = docx.add_paragraph(para);
520                }
521            }
522            Node::ThematicBreak => {
523                // Add horizontal line as specially formatted paragraph
524                let hr_para = Paragraph::new()
525                    .style("HorizontalLine")
526                    .add_run(Run::new().add_text(""));
527                docx = docx.add_paragraph(hr_para);
528            }
529            // Inline elements should not be processed here individually
530            _ => {}
531        }
532
533        Ok(docx)
534    }
535
536    /// Process ordered list
537    fn process_ordered_list(&mut self, mut docx: Docx, items: &[ListItem]) -> Result<Docx> {
538        // Enter deeper list level
539        self.list_level += 1;
540        let current_level = self.list_level - 1;
541
542        // Create new ordered list numbering definition
543        let (doc, num_id) = self.numbering.create_ordered_numbering(docx);
544        docx = doc;
545
546        // Process list items
547        for item in items {
548            if let ListItem::Ordered { content, .. } = item {
549                docx = self.process_list_item_content(docx, content, num_id, current_level)?;
550            }
551        }
552
553        // Exit list level
554        self.list_level -= 1;
555        Ok(docx)
556    }
557
558    /// Process unordered list
559    fn process_unordered_list(&mut self, mut docx: Docx, items: &[ListItem]) -> Result<Docx> {
560        // Enter deeper list level
561        self.list_level += 1;
562        let current_level = self.list_level - 1;
563
564        // Create new unordered list numbering definition
565        let (doc, num_id) = self.numbering.create_unordered_numbering(docx);
566        docx = doc;
567
568        // Process list items
569        for item in items {
570            if let ListItem::Unordered { content } = item {
571                docx = self.process_list_item_content(docx, content, num_id, current_level)?;
572            }
573        }
574
575        // Exit list level
576        self.list_level -= 1;
577        Ok(docx)
578    }
579
580    /// Helper function to process list item content
581    fn process_list_item_content(
582        &mut self,
583        mut docx: Docx,
584        content: &[Node],
585        num_id: usize,
586        level: usize,
587    ) -> Result<Docx> {
588        // If content is empty, add empty paragraph
589        if content.is_empty() {
590            let empty_para = Paragraph::new()
591                .numbering(NumberingId::new(num_id), IndentLevel::new(level))
592                .add_run(Run::new().add_text(""));
593            return Ok(docx.add_paragraph(empty_para));
594        }
595
596        // Process content
597        for block in content {
598            match block {
599                Node::Paragraph(inline) => {
600                    let mut para = Paragraph::new()
601                        .numbering(NumberingId::new(num_id), IndentLevel::new(level));
602
603                    // Process paragraph content
604                    for node in inline {
605                        let run = Run::new();
606                        let run = self.process_inline_to_run(run, node)?;
607                        if !run.children.is_empty() {
608                            para = para.add_run(run);
609                        }
610                    }
611
612                    docx = docx.add_paragraph(para);
613                }
614                // Recursively process nested lists
615                Node::OrderedList { start: _, items: _ } | Node::UnorderedList(_) => {
616                    docx = self.process_node(docx, block)?;
617                }
618                _ => {
619                    docx = self.process_node(docx, block)?;
620                }
621            }
622        }
623
624        Ok(docx)
625    }
626
627    /// Process table
628    fn process_table(&self, mut docx: Docx, headers: &[Node], rows: &[Vec<Node>]) -> Result<Docx> {
629        let mut table = Table::new(vec![]).style("Table");
630
631        // Process table headers
632        if !headers.is_empty() {
633            let mut cells = Vec::new();
634
635            for header_node in headers {
636                let mut table_cell = TableCell::new();
637                let mut para = Paragraph::new();
638
639                let run = Run::new();
640                let run = self.process_inline_to_run(run, header_node)?;
641                if !run.children.is_empty() {
642                    para = para.add_run(run);
643                }
644
645                if !para.children.is_empty() {
646                    table_cell = table_cell.add_paragraph(para);
647                }
648
649                cells.push(table_cell);
650            }
651
652            if !cells.is_empty() {
653                let header_row = TableRow::new(cells);
654                table = table.add_row(header_row);
655            }
656        }
657
658        // Process table rows
659        for row in rows {
660            let mut cells = Vec::new();
661
662            for cell_node in row {
663                let mut table_cell = TableCell::new();
664                let mut para = Paragraph::new();
665
666                let run = Run::new();
667                let run = self.process_inline_to_run(run, cell_node)?;
668                if !run.children.is_empty() {
669                    para = para.add_run(run);
670                }
671
672                if !para.children.is_empty() {
673                    table_cell = table_cell.add_paragraph(para);
674                }
675
676                cells.push(table_cell);
677            }
678
679            if !cells.is_empty() {
680                let data_row = TableRow::new(cells);
681                table = table.add_row(data_row);
682            }
683        }
684
685        // Add table to document
686        docx = docx.add_table(table);
687
688        Ok(docx)
689    }
690
691    /// Generate DOCX document
692    pub fn generate_docx(&mut self, doc: &Node) -> Result<Vec<u8>> {
693        // Create DOCX document and initialize styles
694        let mut docx = Docx::new();
695        docx = self.styles.initialize_styles(docx);
696
697        // Process document content
698        docx = self.process_node(docx, doc)?;
699
700        // Initialize numbering definitions
701        docx = self.numbering.initialize_numbering(docx);
702
703        // Build and pack document
704        let docx_built = docx.build();
705        let mut buffer = Vec::new();
706        docx_built
707            .pack(&mut Cursor::new(&mut buffer))
708            .map_err(|e| format!("Failed to pack DOCX: {e}"))?;
709
710        Ok(buffer)
711    }
712}
713
714impl FormatWriter for DocxWriter {
715    fn write_vec(&mut self, document: &Node) -> Result<Vec<u8>> {
716        self.list_level = 0;
717        self.list_numbering_count = 0;
718        self.generate_docx(document)
719    }
720
721    fn write_eco(&mut self, _document: &Node, _output: &mut EcoString) -> Result<()> {
722        Err("DOCX format does not support EcoString output".into())
723    }
724}