Unverified Commit c932e65b authored by Maxime Lefrançois's avatar Maxime Lefrançois
Browse files

support rowspans and colspans in tables

parent afabd38a
Loading
Loading
Loading
Loading
+51 −6
Original line number Diff line number Diff line
@@ -258,7 +258,6 @@ def cell_is_header(cell):
            return True
    return False


def extract_hyperlink(hyperlink: Hyperlink, ctx: RunContext):
    if hyperlink.url:
        href = hyperlink.url
@@ -401,7 +400,7 @@ class TS2MDExtractor:
        md = self.extract_clause("3.2\tSymbols")
        Path(self.out_folder, "symbols.md").write_text(md, "utf-8")

        md = self.extract_clause("3.3\tAbbreviations")
        md = self.extract_clause("\tAbbreviations")
        Path(self.out_folder, "abbreviations.md").write_text(md, "utf-8")

        i = 4
@@ -1001,7 +1000,7 @@ class TS2MDExtractor:
        image_paragraph = get_prev_block(self.document, paragraph)
        return self.get_paragraph_image_info(image_paragraph, i + 1)

    def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.MD):
    def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.HTML):
        """
        Generate HTML for a python-docx Table.
        - If the paragraph just above has style TH and matches 'Table N: ...',
@@ -1026,7 +1025,7 @@ class TS2MDExtractor:
            + '"'
        )

        lines = []
        lines:List[str] = []
        lines.append(
            f"""<table id="{html.escape(cap_id)}" data-docx-preferred-width={preferred_width}>"""
        )
@@ -1034,16 +1033,62 @@ class TS2MDExtractor:
        if cap_id and cap_text:
            lines.append(f"  <caption>{cap_text}</caption>")

        seen = set()
        for r_idx, row in enumerate(table.rows):
            lines.append("  <tr>")
            use_th = r_idx == 0 and first_row_has_headers
            for cell in row.cells:
            cells = row.cells
            if r_idx == 0:
                rowspans = len(cells) * [1]
            for i, cell in enumerate(row.cells):
                if cell._tc in seen:
                    rowspans[i] += 1
                    continue  # skip duplicate reference
                seen.add(cell._tc)
                
                colspan_attr = ""
                rowspan_attr = ""

                tcPr = cell._tc.tcPr
                if tcPr is not None:
                    # skip continued cells (vMerge)
                    vMerge = tcPr.find(qn("w:vMerge"))
                    if vMerge is not None:
                        val = vMerge.get(qn("w:val"))
                        if val == "restart":
                            marker = f' rowspan="TODO{i}"'
                            for j, line in enumerate(lines[-1::-1]):
                                if marker in line:
                                    newmarker = '' if rowspans[i]==1 else f' rowspan="{rowspans[i]}"'
                                    lines[-1-j] = lines[-1-j].replace(marker, newmarker, 1)
                                    break
                            rowspans[i] = 1
                            rowspan_attr = marker

                    # Handle colspan (gridSpan)
                    gridSpan = tcPr.find(qn("w:gridSpan"))
                    colspan_attr = ""
                    if gridSpan is not None:
                        colspan_attr = f' colspan="{gridSpan.get(qn("w:val"))}"'


                tag = "th" if (use_th and cell_is_header(cell)) else "td"
                align_attr = cell_align_style(cell)
                cell_html = self.cell_text_html(cell)
                lines.append(f"    <{tag}{align_attr}>{cell_html}</{tag}>")
                
                lines.append(f"    <{tag}{align_attr}{colspan_attr}{rowspan_attr}>{cell_html}</{tag}>")
            lines.append("  </tr>")

        # finalize rowspans
        for i, rowspan in enumerate(rowspans):
            marker = f' rowspan="TODO{i}"'
            for j, line in enumerate(lines[-1::-1]):
                if marker in line:
                    newmarker = '' if rowspan==1 else f' rowspan="{rowspan}"'
                    lines[-1-j] = lines[-1-j].replace(marker, newmarker, 1)
                    break


        lines.append("</table>")
        return "\n".join(lines)

+79 −10
Original line number Diff line number Diff line
@@ -213,6 +213,33 @@ class TSGenerator:
        self.description(OWL.bottomDataProperty, OWL_GRAPH)
        self.description(OWL.topDataProperty, OWL_GRAPH)


    @staticmethod
    def _add_gridspan(cell: _Cell, val: int):
        """
        Make ``cell`` span ``val`` grid columns by setting ``w:gridSpan``.

        An existing ``w:gridSpan`` child of the cell's ``w:tcPr`` is reused
        and its ``w:val`` overwritten. A new one is inserted at the position
        the ``CT_TcPr`` sequence expects it (after ``w:cnfStyle`` / ``w:tcW``,
        before every other property such as ``w:vMerge``) instead of being
        appended, so calling this after a vertical merge has been set still
        yields properties in schema order.

        :param cell: the python-docx cell to modify.
        :param val: number of grid columns the cell should span.
        """
        tcPr = cell._tc.get_or_add_tcPr()
        grid_span = tcPr.find(qn("w:gridSpan"))
        if grid_span is None:
            grid_span = OxmlElement("w:gridSpan")
            # Only these two properties may precede w:gridSpan in w:tcPr.
            predecessors = (qn("w:cnfStyle"), qn("w:tcW"))
            successor = next(
                (child for child in tcPr if child.tag not in predecessors),
                None,
            )
            if successor is None:
                tcPr.append(grid_span)
            else:
                successor.addprevious(grid_span)
        grid_span.set(qn("w:val"), str(val))

    @staticmethod
    def _add_vmerge(cell: _Cell, kind: str):  # kind in {"restart","continue"}
        """
        Mark ``cell`` as part of a vertical merge by setting ``w:vMerge``.

        An existing ``w:vMerge`` child of the cell's ``w:tcPr`` is reused and
        its ``w:val`` overwritten. A new one is inserted at the position the
        ``CT_TcPr`` sequence expects it (after ``w:cnfStyle``, ``w:tcW``,
        ``w:gridSpan`` and ``w:hMerge``, before any later property) instead
        of being appended after whatever the cell already carries.

        :param cell: the python-docx cell to modify.
        :param kind: value written to ``w:val``; ``"restart"`` starts a
            merged region, ``"continue"`` extends the one above.
        """
        tcPr = cell._tc.get_or_add_tcPr()
        v_merge = tcPr.find(qn("w:vMerge"))
        if v_merge is None:
            v_merge = OxmlElement("w:vMerge")
            # Properties allowed to precede w:vMerge in w:tcPr.
            predecessors = (
                qn("w:cnfStyle"),
                qn("w:tcW"),
                qn("w:gridSpan"),
                qn("w:hMerge"),
            )
            successor = next(
                (child for child in tcPr if child.tag not in predecessors),
                None,
            )
            if successor is None:
                tcPr.append(v_merge)
            else:
                successor.addprevious(v_merge)
        v_merge.set(qn("w:val"), kind)

    @staticmethod
    def _add_vmerge_restart(cell: _Cell):
        """Flag ``cell`` as the first (top) cell of a vertical merge."""
        TSGenerator._add_vmerge(cell, kind="restart")

    @staticmethod
    def _add_vmerge_continue(cell):
        """Flag ``cell`` as continuing the vertical merge started above it."""
        TSGenerator._add_vmerge(cell, kind="continue")
        
    def _compute_next_bookmark_id(self) -> int:
        max_id = 0
        for bookmark in self.document.element.xpath("//w:bookmarkStart"):
@@ -980,7 +1007,7 @@ class TSGenerator:
        self.delete_section("User defined")
        self.insert_soup_for_file("abstract")
        self.insert_soup_for_file("description")
        self.insert_soup_for_file("examples")
        self.insert_soup_for_file("examples", mandatory=False)
        self._is_appendix = True
        self.insert_soup_for_file("annexes", mandatory=False)
        self.describe_ontology()
@@ -1460,8 +1487,9 @@ class TSGenerator:
            for col, pref in zip(tbl.columns, preferred_width):
                col.width = compute_length(pref, text_width)

        spans = len(tbl.columns) * [(0,1)] # remaining_vertical, colspan
        for tr in rows:
            self.insert_soup_tr(tr, tbl, styling)
            spans = self.insert_soup_tr(tr, spans, tbl, styling)
        self._cursor = tbl

    def insert_soup_caption(self, el: Tag, styling: Callable = None) -> None:
@@ -1474,18 +1502,59 @@ class TSGenerator:
        with Bookmark(self, id):
            self.insert_soup_children(el, styling)

    def insert_soup_tr(self, el: Tag, spans: List[tuple], table: Table, styling: Callable = None) -> List[tuple]:
        """
        Add a <tr> (table row) to a python-docx Table, applying the
        ``colspan`` / ``rowspan`` attributes of its <td> / <th> children.

        ``spans`` carries the merge state from one row to the next. It holds
        one ``(remaining_rows, colspan)`` tuple per grid column:
        ``remaining_rows`` is the number of upcoming rows in which that
        column is still covered by a rowspan opened above, and ``colspan`` is
        the width of that merged region when the column is its left edge
        (1 otherwise).

        :param el: the <tr> element; its <td> / <th> descendants are used.
        :param spans: per-column merge state; updated in place.
        :param table: table that receives the new row.
        :param styling: forwarded unchanged to ``insert_soup_th`` /
            ``insert_soup_td``.
        :return: the same ``spans`` list, updated for the next row.
        """
        tds = list(el.find_all(["td", "th"]))
        td_id = 0

        cells = table.add_row().cells
        n_cols = len(cells)
        cell_idx = 0

        def merge_right(start: int, width: int) -> None:
            # Turn cells[start] into a cell spanning `width` grid columns and
            # drop the cells it absorbs. `cells` is a snapshot taken before
            # any removal, so each absorbed cell is addressed by its own
            # index; removing the same neighbour repeatedly would hit an
            # element that is already detached.
            self._add_gridspan(cells[start], width)
            for offset in range(1, width):
                absorbed = cells[start + offset]._tc
                absorbed.getparent().remove(absorbed)

        while cell_idx < n_cols:
            remaining, colspan = spans[cell_idx]
            if remaining > 0:
                # Column still covered by a rowspan opened in an earlier row:
                # no <td> is consumed, the cell only continues the merge.
                spans[cell_idx] = (remaining - 1, colspan)
                if colspan > 1:
                    # gridSpan is written before vMerge so the two end up in
                    # the order the w:tcPr sequence requires.
                    merge_right(cell_idx, colspan)
                self._add_vmerge_continue(cells[cell_idx])
                # Advance by one only: the absorbed columns carry their own
                # (remaining, 1) entries that must be decremented as well.
                cell_idx += 1
                continue

            if td_id >= len(tds):
                # Fewer HTML cells than grid columns: leave this cell empty
                # but keep scanning so later continuation columns are still
                # processed.
                cell_idx += 1
                continue

            td = tds[td_id]

            # Clamp to [1, columns left]: a colspan of 0 would never advance
            # cell_idx, and an over-wide one would index past the row.
            colspan = max(1, int(td.get("colspan", "1")))
            colspan = min(colspan, n_cols - cell_idx)
            if colspan > 1:
                merge_right(cell_idx, colspan)

            rowspan = int(td.get("rowspan", "1"))
            if rowspan > 1:
                self._add_vmerge_restart(cells[cell_idx])
                spans[cell_idx] = (rowspan - 1, colspan)
                for covered in range(cell_idx + 1, cell_idx + colspan):
                    spans[covered] = (rowspan - 1, 1)

            strip_outer_text(td)
            if td.name == "th":
                self.insert_soup_th(td, cells[cell_idx], styling)
            else:
                self.insert_soup_td(td, cells[cell_idx], styling)

            td_id += 1
            cell_idx += colspan

        return spans

    def insert_soup_th(self, el: Tag, cell: _Cell, styling: Callable = None) -> None:
        """