typlite/writer/docx/
writer.rs

1//! DOCX document writer implementation
2
3use base64::Engine;
4use cmark_writer::ast::{ListItem, Node};
5use docx_rs::*;
6use ecow::EcoString;
7use log::debug;
8use std::fs;
9use std::io::Cursor;
10
11use crate::Result;
12use crate::common::{
13    BlockVerbatimNode, CenterNode, FigureNode, FormatWriter, HighlightNode, InlineNode,
14    VerbatimNode,
15};
16
17use super::image_processor::DocxImageProcessor;
18use super::numbering::DocxNumbering;
19use super::styles::DocxStyles;
20
/// DOCX writer that generates DOCX directly from AST (without intermediate
/// representation)
pub struct DocxWriter {
    /// Style definitions registered on the document before content is added.
    styles: DocxStyles,
    /// Source of numbering definitions; a new one is requested for each list.
    numbering: DocxNumbering,
    /// Current list nesting depth: incremented on entering a list and
    /// decremented on leaving it (0 outside any list).
    list_level: usize,
    /// Set to zero in `new` and at the start of `write_vec`; not otherwise
    /// read in this file.
    list_numbering_count: usize,
    /// Helper that embeds image data into the document or into a run.
    image_processor: DocxImageProcessor,
}
30
impl Default for DocxWriter {
    /// Equivalent to [`DocxWriter::new`].
    fn default() -> Self {
        Self::new()
    }
}
36
37impl DocxWriter {
    /// Create a writer with freshly constructed style, numbering and image
    /// helpers, and with both list counters set to zero.
    pub fn new() -> Self {
        Self {
            styles: DocxStyles::new(),
            numbering: DocxNumbering::new(),
            list_level: 0,
            list_numbering_count: 0,
            image_processor: DocxImageProcessor::new(),
        }
    }
47
48    /// Process image node
49    fn process_image(&self, docx: Docx, url: &str, alt_nodes: &[Node]) -> Result<Docx> {
50        // Build alt text
51        let alt_text = if !alt_nodes.is_empty() {
52            let mut text = String::new();
53            for node in alt_nodes {
54                if let Node::Text(content) = node {
55                    text.push_str(content);
56                }
57            }
58            Some(text)
59        } else {
60            None
61        };
62
63        // Try reading image file
64        if let Ok(img_data) = fs::read(url) {
65            Ok(self
66                .image_processor
67                .process_image_data(docx, &img_data, alt_text.as_deref(), None))
68        } else {
69            let placeholder = format!("[Image not found: {url}]");
70            let para = Paragraph::new().add_run(Run::new().add_text(placeholder));
71            Ok(docx.add_paragraph(para))
72        }
73    }
74
75    /// Process figure node (image with caption)
76    fn process_figure(&mut self, mut docx: Docx, figure_node: &FigureNode) -> Result<Docx> {
77        // First handle the figure body (typically an image)
78        match &*figure_node.body {
79            Node::Paragraph(content) => {
80                for node in content {
81                    if let Node::Image {
82                        url,
83                        title: _,
84                        alt: _,
85                    } = node
86                    {
87                        // Process the image
88                        if let Ok(img_data) = fs::read(url.as_str()) {
89                            let alt_text = figure_node.caption.clone();
90                            // Add the image with caption
91                            docx = self.image_processor.process_image_data(
92                                docx,
93                                &img_data,
94                                Some(&alt_text),
95                                None,
96                            );
97
98                            // Add caption as a separate paragraph with Caption style
99                            if !figure_node.caption.is_empty() {
100                                let caption_text = format!("Figure: {}", figure_node.caption);
101                                let caption_para = Paragraph::new()
102                                    .style("Caption")
103                                    .add_run(Run::new().add_text(caption_text));
104                                docx = docx.add_paragraph(caption_para);
105                            }
106                        } else {
107                            // Image not found, show placeholder
108                            let placeholder = format!("[Image not found: {url}]");
109                            let para = Paragraph::new().add_run(Run::new().add_text(placeholder));
110                            docx = docx.add_paragraph(para);
111
112                            // Still add caption
113                            if !figure_node.caption.is_empty() {
114                                let caption_para = Paragraph::new()
115                                    .style("Caption")
116                                    .add_run(Run::new().add_text(&figure_node.caption));
117                                docx = docx.add_paragraph(caption_para);
118                            }
119                        }
120                    } else {
121                        // Handle non-image content
122                        let mut para = Paragraph::new();
123                        let run = Run::new();
124                        let run = self.process_inline_to_run(run, node)?;
125                        if !run.children.is_empty() {
126                            para = para.add_run(run);
127                            docx = docx.add_paragraph(para);
128                        }
129
130                        // Add caption as a separate paragraph
131                        if !figure_node.caption.is_empty() {
132                            let caption_para = Paragraph::new()
133                                .style("Caption")
134                                .add_run(Run::new().add_text(&figure_node.caption));
135                            docx = docx.add_paragraph(caption_para);
136                        }
137                    }
138                }
139            }
140            // Handle other content types within figure
141            _ => {
142                // Process the content using standard node processing
143                docx = self.process_node(docx, &figure_node.body)?;
144
145                // Add caption as a separate paragraph
146                if !figure_node.caption.is_empty() {
147                    let caption_para = Paragraph::new()
148                        .style("Caption")
149                        .add_run(Run::new().add_text(&figure_node.caption));
150                    docx = docx.add_paragraph(caption_para);
151                }
152            }
153        }
154
155        Ok(docx)
156    }
157
158    /// Process inline element and add to Run
159    fn process_inline_to_run(&self, mut run: Run, node: &Node) -> Result<Run> {
160        match node {
161            Node::Text(text) => {
162                run = run.add_text(text);
163            }
164            Node::Strong(content) => {
165                run = run.style("Strong");
166                for child in content {
167                    run = self.process_inline_to_run(run, child)?;
168                }
169            }
170            Node::Emphasis(content) => {
171                run = run.style("Emphasis");
172                for child in content {
173                    run = self.process_inline_to_run(run, child)?;
174                }
175            }
176            Node::Strikethrough(content) => {
177                run = run.strike();
178                for child in content {
179                    run = self.process_inline_to_run(run, child)?;
180                }
181            }
182            Node::Link {
183                url: _,
184                title: _,
185                content,
186            } => {
187                // Hyperlinks need to be processed at paragraph level, only handle content here
188                run = run.style("Hyperlink");
189                for child in content {
190                    run = self.process_inline_to_run(run, child)?;
191                }
192            }
193            Node::Image {
194                url,
195                title: _,
196                alt: _,
197            } => {
198                if let Ok(img_data) = fs::read(url.as_str()) {
199                    run = self.image_processor.process_inline_image(run, &img_data)?;
200                } else {
201                    run = run.add_text(format!("[Image not found: {url}]"));
202                }
203            }
204            Node::HtmlElement(element) => {
205                // Handle special HTML elements
206                if element.tag == "img" && element.self_closing {
207                    let is_typst_block = element
208                        .attributes
209                        .iter()
210                        .any(|a| a.name == "alt" && a.value == "typst-block");
211
212                    let src = element
213                        .attributes
214                        .iter()
215                        .find(|a| a.name == "src")
216                        .map(|a| a.value.as_str())
217                        .unwrap_or("");
218
219                    if src.starts_with("data:image/") {
220                        run = self.image_processor.process_data_url_image(
221                            run,
222                            src,
223                            is_typst_block,
224                        )?;
225                    }
226                } else {
227                    // Standard element content processing
228                    for child in &element.children {
229                        run = self.process_inline_to_run(run, child)?;
230                    }
231                }
232            }
233            Node::InlineCode(code) => {
234                run = run.style("CodeInline").add_text(code);
235            }
236            Node::HardBreak => {
237                run = run.add_break(BreakType::TextWrapping);
238            }
239            Node::SoftBreak => {
240                run = run.add_text(" ");
241            }
242            node if node.is_custom_type::<HighlightNode>() => {
243                let highlight_node = node.as_custom_type::<HighlightNode>().unwrap();
244                run = run.highlight("yellow");
245                for child in &highlight_node.content {
246                    run = self.process_inline_to_run(run, child)?;
247                }
248            }
249            node if node.is_custom_type::<InlineNode>() => {
250                let inline_node = node.as_custom_type::<InlineNode>().unwrap();
251                for child in &inline_node.content {
252                    run = self.process_inline_to_run(run, child)?;
253                }
254            }
255            node if node.is_custom_type::<VerbatimNode>() => {
256                let node = node.as_custom_type::<VerbatimNode>().unwrap();
257                run = run.style("CodeInline").add_text(&node.content);
258            }
259            // Other inline element types
260            _ => {
261                debug!("unhandled inline node in DOCX export: {node:?}");
262            }
263        }
264
265        Ok(run)
266    }
267
268    /// Process paragraph and add to document
269    fn process_paragraph(
270        &self,
271        mut docx: Docx,
272        content: &[Node],
273        style: Option<&str>,
274    ) -> Result<Docx> {
275        let mut para = Paragraph::new();
276
277        // Apply style
278        if let Some(style_name) = style {
279            para = para.style(style_name);
280        }
281
282        // Extract all link nodes
283        let mut links = Vec::new();
284        for (i, node) in content.iter().enumerate() {
285            if let Node::Link {
286                url,
287                title: _,
288                content: _,
289            } = node
290            {
291                links.push((i, url.clone()));
292            }
293        }
294
295        // If no links, process paragraph normally
296        if links.is_empty() {
297            // Process paragraph content
298            for node in content {
299                let run = Run::new();
300                let run = self.process_inline_to_run(run, node)?;
301                if !run.children.is_empty() {
302                    para = para.add_run(run);
303                }
304            }
305        } else {
306            // If links exist, we need to process in segments
307            let mut last_idx = 0;
308            for (idx, url) in links {
309                // Process content before the link
310                for item in content.iter().take(idx).skip(last_idx) {
311                    let run = Run::new();
312                    let run = self.process_inline_to_run(run, item)?;
313                    if !run.children.is_empty() {
314                        para = para.add_run(run);
315                    }
316                }
317
318                // Process link
319                if let Node::Link {
320                    url: _,
321                    title: _,
322                    content: link_content,
323                } = &content[idx]
324                {
325                    let mut hyperlink_run = Run::new().style("Hyperlink");
326                    for child in link_content {
327                        hyperlink_run = self.process_inline_to_run(hyperlink_run, child)?;
328                    }
329
330                    // Create and add hyperlink
331                    if !hyperlink_run.children.is_empty() {
332                        let hyperlink =
333                            Hyperlink::new(&url, HyperlinkType::External).add_run(hyperlink_run);
334                        para = para.add_hyperlink(hyperlink);
335                    }
336                }
337
338                last_idx = idx + 1;
339            }
340
341            // Process content after the last link
342            for item in content.iter().skip(last_idx) {
343                let run = Run::new();
344                let run = self.process_inline_to_run(run, item)?;
345                if !run.children.is_empty() {
346                    para = para.add_run(run);
347                }
348            }
349        }
350
351        // Only add when paragraph has content
352        if !para.children.is_empty() {
353            docx = docx.add_paragraph(para);
354        }
355
356        Ok(docx)
357    }
358
359    /// Process node and add to document
360    fn process_node(&mut self, mut docx: Docx, node: &Node) -> Result<Docx> {
361        match node {
362            Node::Document(blocks) => {
363                for block in blocks {
364                    docx = self.process_node(docx, block)?;
365                }
366            }
367            Node::Paragraph(content) => {
368                docx = self.process_paragraph(docx, content, None)?;
369            }
370            Node::Heading {
371                level,
372                content,
373                heading_type: _,
374            } => {
375                // Determine heading style name
376                let style_name = match level {
377                    1 => "Heading1",
378                    2 => "Heading2",
379                    3 => "Heading3",
380                    4 => "Heading4",
381                    5 => "Heading5",
382                    _ => "Heading6",
383                };
384
385                docx = self.process_paragraph(docx, content, Some(style_name))?;
386            }
387            Node::BlockQuote(content) => {
388                for block in content {
389                    if let Node::Paragraph(inline) = block {
390                        docx = self.process_paragraph(docx, inline, Some("Blockquote"))?;
391                    } else {
392                        docx = self.process_node(docx, block)?;
393                    }
394                }
395            }
396            Node::CodeBlock {
397                language,
398                content,
399                block_type: _,
400            } => {
401                // Add language information
402                if let Some(lang) = language
403                    && !lang.is_empty()
404                {
405                    let lang_para = Paragraph::new()
406                        .style("CodeBlock")
407                        .add_run(Run::new().add_text(lang));
408                    docx = docx.add_paragraph(lang_para);
409                }
410
411                // Process code line by line, preserving line breaks
412                let lines: Vec<&str> = content.split('\n').collect();
413                for line in lines {
414                    let code_para = Paragraph::new()
415                        .style("CodeBlock")
416                        .add_run(Run::new().add_text(line));
417                    docx = docx.add_paragraph(code_para);
418                }
419            }
420            Node::OrderedList { start: _, items } => {
421                docx = self.process_ordered_list(docx, items)?;
422            }
423            Node::UnorderedList(items) => {
424                docx = self.process_unordered_list(docx, items)?;
425            }
426            Node::Table {
427                headers,
428                rows,
429                alignments: _,
430            } => {
431                docx = self.process_table(docx, headers, rows)?;
432            }
433            Node::Image { url, title: _, alt } => {
434                docx = self.process_image(docx, url, alt)?;
435            }
436            node if node.is_custom_type::<FigureNode>() => {
437                let figure_node = node.as_custom_type::<FigureNode>().unwrap();
438                docx = self.process_figure(docx, figure_node)?;
439            }
440            node if node.is_custom_type::<CenterNode>() => {
441                let center_node = node.as_custom_type::<CenterNode>().unwrap();
442                // Handle regular node but with center alignment
443                match &center_node.node {
444                    Node::Paragraph(content) => {
445                        docx = self.process_paragraph(docx, content, None)?;
446                        // Get the last paragraph and center it
447                        if let Some(DocumentChild::Paragraph(para)) =
448                            docx.document.children.last_mut()
449                        {
450                            para.property = para.property.clone().align(AlignmentType::Center);
451                        }
452                    }
453                    Node::HtmlElement(element) => {
454                        let start_idx = docx.document.children.len();
455                        for child in &element.children {
456                            docx = self.process_node(docx, child)?;
457                        }
458                        for child in docx.document.children.iter_mut().skip(start_idx) {
459                            if let DocumentChild::Paragraph(para) = child {
460                                para.property = para.property.clone().align(AlignmentType::Center);
461                            }
462                        }
463                    }
464                    other => {
465                        docx = self.process_node(docx, other)?;
466                        // Get the last element and center it if it's a paragraph
467                        if let Some(DocumentChild::Paragraph(para)) =
468                            docx.document.children.last_mut()
469                        {
470                            para.property = para.property.clone().align(AlignmentType::Center);
471                        }
472                    }
473                }
474            }
475            node if node.is_custom_type::<crate::common::ExternalFrameNode>() => {
476                let external_frame = node
477                    .as_custom_type::<crate::common::ExternalFrameNode>()
478                    .unwrap();
479                let data = base64::engine::general_purpose::STANDARD
480                    .decode(&external_frame.svg)
481                    .map_err(|e| format!("Failed to decode SVG data: {e}"))?;
482
483                docx = self.image_processor.process_image_data(
484                    docx,
485                    &data,
486                    Some(&external_frame.alt_text),
487                    None,
488                );
489            }
490            node if node.is_custom_type::<HighlightNode>() => {
491                let highlight_node = node.as_custom_type::<HighlightNode>().unwrap();
492                // Handle HighlightNode at block level (convert to paragraph)
493                let mut para = Paragraph::new();
494                let mut run = Run::new().highlight("yellow");
495
496                for child in &highlight_node.content {
497                    run = self.process_inline_to_run(run, child)?;
498                }
499
500                if !run.children.is_empty() {
501                    para = para.add_run(run);
502                    docx = docx.add_paragraph(para);
503                }
504            }
505            node if node.is_custom_type::<BlockVerbatimNode>() => {
506                let block_node = node.as_custom_type::<BlockVerbatimNode>().unwrap();
507                for line in block_node.content.split('\n') {
508                    let para = Paragraph::new()
509                        .style("CodeBlock")
510                        .add_run(Run::new().add_text(line));
511                    docx = docx.add_paragraph(para);
512                }
513            }
514            node if node.is_custom_type::<InlineNode>() => {
515                let inline_node = node.as_custom_type::<InlineNode>().unwrap();
516                // Handle InlineNode at block level (convert to paragraph)
517                let mut para = Paragraph::new();
518                let mut run = Run::new();
519
520                for child in &inline_node.content {
521                    run = self.process_inline_to_run(run, child)?;
522                }
523
524                if !run.children.is_empty() {
525                    para = para.add_run(run);
526                    docx = docx.add_paragraph(para);
527                }
528            }
529            Node::ThematicBreak => {
530                // Add horizontal line as specially formatted paragraph
531                let hr_para = Paragraph::new()
532                    .style("HorizontalLine")
533                    .add_run(Run::new().add_text(""));
534                docx = docx.add_paragraph(hr_para);
535            }
536            // Inline elements should not be processed here individually
537            _ => {}
538        }
539
540        Ok(docx)
541    }
542
543    /// Process ordered list
544    fn process_ordered_list(&mut self, mut docx: Docx, items: &[ListItem]) -> Result<Docx> {
545        // Enter deeper list level
546        self.list_level += 1;
547        let current_level = self.list_level - 1;
548
549        // Create new ordered list numbering definition
550        let (doc, num_id) = self.numbering.create_ordered_numbering(docx);
551        docx = doc;
552
553        // Process list items
554        for item in items {
555            if let ListItem::Ordered { content, .. } = item {
556                docx = self.process_list_item_content(docx, content, num_id, current_level)?;
557            }
558        }
559
560        // Exit list level
561        self.list_level -= 1;
562        Ok(docx)
563    }
564
565    /// Process unordered list
566    fn process_unordered_list(&mut self, mut docx: Docx, items: &[ListItem]) -> Result<Docx> {
567        // Enter deeper list level
568        self.list_level += 1;
569        let current_level = self.list_level - 1;
570
571        // Create new unordered list numbering definition
572        let (doc, num_id) = self.numbering.create_unordered_numbering(docx);
573        docx = doc;
574
575        // Process list items
576        for item in items {
577            if let ListItem::Unordered { content } = item {
578                docx = self.process_list_item_content(docx, content, num_id, current_level)?;
579            }
580        }
581
582        // Exit list level
583        self.list_level -= 1;
584        Ok(docx)
585    }
586
587    /// Helper function to process list item content
588    fn process_list_item_content(
589        &mut self,
590        mut docx: Docx,
591        content: &[Node],
592        num_id: usize,
593        level: usize,
594    ) -> Result<Docx> {
595        // If content is empty, add empty paragraph
596        if content.is_empty() {
597            let empty_para = Paragraph::new()
598                .numbering(NumberingId::new(num_id), IndentLevel::new(level))
599                .add_run(Run::new().add_text(""));
600            return Ok(docx.add_paragraph(empty_para));
601        }
602
603        // Process content
604        for block in content {
605            match block {
606                Node::Paragraph(inline) => {
607                    let mut para = Paragraph::new()
608                        .numbering(NumberingId::new(num_id), IndentLevel::new(level));
609
610                    // Process paragraph content
611                    for node in inline {
612                        let run = Run::new();
613                        let run = self.process_inline_to_run(run, node)?;
614                        if !run.children.is_empty() {
615                            para = para.add_run(run);
616                        }
617                    }
618
619                    docx = docx.add_paragraph(para);
620                }
621                // Recursively process nested lists
622                Node::OrderedList { start: _, items: _ } | Node::UnorderedList(_) => {
623                    docx = self.process_node(docx, block)?;
624                }
625                _ => {
626                    docx = self.process_node(docx, block)?;
627                }
628            }
629        }
630
631        Ok(docx)
632    }
633
634    /// Process table
635    fn process_table(&self, mut docx: Docx, headers: &[Node], rows: &[Vec<Node>]) -> Result<Docx> {
636        let mut table = Table::new(vec![]).style("Table");
637
638        // Process table headers
639        if !headers.is_empty() {
640            let mut cells = Vec::new();
641
642            for header_node in headers {
643                let mut table_cell = TableCell::new();
644                let mut para = Paragraph::new();
645
646                let run = Run::new();
647                let run = self.process_inline_to_run(run, header_node)?;
648                if !run.children.is_empty() {
649                    para = para.add_run(run);
650                }
651
652                if !para.children.is_empty() {
653                    table_cell = table_cell.add_paragraph(para);
654                }
655
656                cells.push(table_cell);
657            }
658
659            if !cells.is_empty() {
660                let header_row = TableRow::new(cells);
661                table = table.add_row(header_row);
662            }
663        }
664
665        // Process table rows
666        for row in rows {
667            let mut cells = Vec::new();
668
669            for cell_node in row {
670                let mut table_cell = TableCell::new();
671                let mut para = Paragraph::new();
672
673                let run = Run::new();
674                let run = self.process_inline_to_run(run, cell_node)?;
675                if !run.children.is_empty() {
676                    para = para.add_run(run);
677                }
678
679                if !para.children.is_empty() {
680                    table_cell = table_cell.add_paragraph(para);
681                }
682
683                cells.push(table_cell);
684            }
685
686            if !cells.is_empty() {
687                let data_row = TableRow::new(cells);
688                table = table.add_row(data_row);
689            }
690        }
691
692        // Add table to document
693        docx = docx.add_table(table);
694
695        Ok(docx)
696    }
697
698    /// Generate DOCX document
699    pub fn generate_docx(&mut self, doc: &Node) -> Result<Vec<u8>> {
700        // Create DOCX document and initialize styles
701        let mut docx = Docx::new();
702        docx = self.styles.initialize_styles(docx);
703
704        // Process document content
705        docx = self.process_node(docx, doc)?;
706
707        // Initialize numbering definitions
708        docx = self.numbering.initialize_numbering(docx);
709
710        // Build and pack document
711        let docx_built = docx.build();
712        let mut buffer = Vec::new();
713        docx_built
714            .pack(&mut Cursor::new(&mut buffer))
715            .map_err(|e| format!("Failed to pack DOCX: {e}"))?;
716
717        Ok(buffer)
718    }
719}
720
impl FormatWriter for DocxWriter {
    /// Render `document` to DOCX bytes.
    ///
    /// Both list counters are reset first so that state left over from an
    /// earlier call cannot carry into this one.
    fn write_vec(&mut self, document: &Node) -> Result<Vec<u8>> {
        self.list_level = 0;
        self.list_numbering_count = 0;
        self.generate_docx(document)
    }

    /// Always returns an error: this writer does not produce `EcoString`
    /// output, and both arguments are ignored.
    fn write_eco(&mut self, _document: &Node, _output: &mut EcoString) -> Result<()> {
        Err("DOCX format does not support EcoString output".into())
    }
}