Skip to content

Docx

Setup

Parse a Microsoft Word (.docx) file into a structured Document using DocumentPipeline.

DocxParser uses python-docx to traverse the document body, extracting paragraphs (with heading / list semantic types), tables, and inline images from body, headers, and footers.

Install the dependency before running with a real file:

Bash
pip install python-docx
python quick_start_docx.py

The example uses a mock so it runs without any .docx file on disk.

Python
import io
import json
from unittest.mock import MagicMock, patch

from sayou.document.models import Document, Page, TableElement, TextElement
from sayou.document.pipeline import DocumentPipeline

Parse Paragraphs and Tables

DocxParser maps paragraph styles to semantic types stored in element.raw_attributes:

Style semantic_type extra key
Heading 19 "heading" heading_level
List Bullet / Number "list" list_level
Normal / other (none)

Tables become TableElement objects with: - .data — 2-D list of strings (LLM-friendly) - .cells — list of TableCell with is_header, row_span, col_span

Python
def _make_para(text, style_name="Normal", style_id="Normal"):
    para = MagicMock()
    para.text = text
    para.style.name = style_name
    para.style.style_id = style_id
    from docx.enum.style import WD_STYLE_TYPE

    para.style.type = WD_STYLE_TYPE.PARAGRAPH
    para._p = MagicMock()
    para._p.pPr = None
    run = MagicMock()
    run.text = text
    run._element.xml = "<w:r/>"
    para.runs = [run]
    return para


def _make_table(rows):
    tbl = MagicMock()
    mock_rows = []
    for row_data in rows:
        row = MagicMock()
        row.cells = [MagicMock(text=c) for c in row_data]
        mock_rows.append(row)
    tbl.rows = mock_rows
    return tbl


with patch("sayou.document.parser.docx_parser.DocxDocument") as MockDoc, patch(
    "sayou.document.parser.docx_parser.Paragraph"
) as MockPara, patch("sayou.document.parser.docx_parser.Table") as MockTable:

    mock_doc = MagicMock()
    mock_doc.sections = []

    paras = [
        _make_para("Sayou Fabric Overview", "Heading 1", "Heading1"),
        _make_para("Introduction", "Heading 2", "Heading2"),
        _make_para("Architecture", "Heading 2", "Heading2"),
        _make_para("First bullet", "List Bullet", "ListBullet"),
        _make_para("Second bullet", "List Bullet", "ListBullet"),
        _make_para("Plain paragraph describing the system."),
    ]
    tables = [
        _make_table(
            [
                ["Library", "Purpose", "Output"],
                ["Connector", "Data collection", "SayouPacket"],
                ["Document", "File parsing", "Document"],
                ["Refinery", "Content normalization", "SayouBlock"],
                ["Chunking", "Text splitting", "SayouChunk"],
            ]
        )
    ]

    class _FakeBody:
        def __iter__(self_inner):
            for p in list(paras):
                m = MagicMock()
                m.tag = "p"
                m.tag.endswith = lambda s: s == "p"
                yield m
            for t in list(tables):
                m = MagicMock()
                m.tag = "tbl"
                m.tag.endswith = lambda s: s == "tbl"
                yield m

    mock_doc.element.body = _FakeBody()
    MockDoc.return_value = mock_doc
    para_q = list(paras)
    table_q = list(tables)
    MockPara.side_effect = lambda e, d: para_q.pop(0) if para_q else MagicMock()
    MockTable.side_effect = lambda e, d: table_q.pop(0) if table_q else MagicMock()

    doc = DocumentPipeline.process(b"PK\x03\x04", "overview.docx")

page = doc.pages[0]
text_elems = [e for e in page.elements if isinstance(e, TextElement)]
table_elems = [e for e in page.elements if isinstance(e, TableElement)]

print("=== Parse Paragraphs and Tables ===")
print(f"  doc_type     : {doc.doc_type}")
print(f"  text elements: {len(text_elems)}")
print(f"  table elements:{len(table_elems)}")
for e in text_elems:
    stype = e.raw_attributes.get("semantic_type", "text")
    level = e.raw_attributes.get("heading_level", "")
    tag = f"{stype}({level})" if level else stype
    print(f"    [{tag:12s}] {e.text!r}")

Inspect Table Structure

The first row's cells have is_header=True (heuristic: first row = header). Each TableCell also has row_span and col_span (currently 1; merge detection is planned for v0.1.0).

Python
if table_elems:
    tbl = table_elems[0]
    print("\n=== Inspect Table Structure ===")
    print(f"  Rows: {len(tbl.data)}  Cols: {len(tbl.data[0])}")
    print(f"  Header row: {tbl.data[0]}")
    for row_cells in tbl.cells[:3]:
        for cell in row_cells:
            mark = " [H]" if cell.is_header else ""
            print(f"    {cell.text:25s}{mark}")
        print()

Save Results

Document.model_dump() serialises the full element tree to a plain dict.

Python
output = doc.model_dump()
with open("docx_result.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2, ensure_ascii=False)

print("Saved Document to 'docx_result.json'")