Loading src/saref_pypeline/docgen/ts2md_extractor.py +51 −6 Original line number Diff line number Diff line Loading @@ -258,7 +258,6 @@ def cell_is_header(cell): return True return False def extract_hyperlink(hyperlink: Hyperlink, ctx: RunContext): if hyperlink.url: href = hyperlink.url Loading Loading @@ -401,7 +400,7 @@ class TS2MDExtractor: md = self.extract_clause("3.2\tSymbols") Path(self.out_folder, "symbols.md").write_text(md, "utf-8") md = self.extract_clause("3.3\tAbbreviations") md = self.extract_clause("\tAbbreviations") Path(self.out_folder, "abbreviations.md").write_text(md, "utf-8") i = 4 Loading Loading @@ -1001,7 +1000,7 @@ class TS2MDExtractor: image_paragraph = get_prev_block(self.document, paragraph) return self.get_paragraph_image_info(image_paragraph, i + 1) def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.MD): def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.HTML): """ Generate HTML for a python-docx Table. - If the paragraph just above has style TH and matches 'Table N: ...', Loading @@ -1026,7 +1025,7 @@ class TS2MDExtractor: + '"' ) lines = [] lines:List[str] = [] lines.append( f"""<table id="{html.escape(cap_id)}" data-docx-preferred-width={preferred_width}>""" ) Loading @@ -1034,16 +1033,62 @@ class TS2MDExtractor: if cap_id and cap_text: lines.append(f" <caption>{cap_text}</caption>") seen = set() for r_idx, row in enumerate(table.rows): lines.append(" <tr>") use_th = r_idx == 0 and first_row_has_headers for cell in row.cells: cells = row.cells if r_idx == 0: rowspans = len(cells) * [1] for i, cell in enumerate(row.cells): if cell._tc in seen: rowspans[i] += 1 continue # skip duplicate reference seen.add(cell._tc) colspan_attr = "" rowspan_attr = "" tcPr = cell._tc.tcPr if tcPr is not None: # skip continued cells (vMerge) vMerge = tcPr.find(qn("w:vMerge")) if vMerge is not None: val = vMerge.get(qn("w:val")) if val == "restart": marker = f' rowspan="TODO{i}"' for j, line in enumerate(lines[-1::-1]): if marker in line: newmarker = '' if rowspans[i]==1 else f' rowspan="{rowspans[i]}"' lines[-1-j] = lines[-1-j].replace(marker, newmarker, 1) break rowspans[i] = 1 rowspan_attr = marker # Handle colspan (gridSpan) gridSpan = tcPr.find(qn("w:gridSpan")) colspan_attr = "" if gridSpan is not None: colspan_attr = f' colspan="{gridSpan.get(qn("w:val"))}"' tag = "th" if (use_th and cell_is_header(cell)) else "td" align_attr = cell_align_style(cell) cell_html = self.cell_text_html(cell) lines.append(f" <{tag}{align_attr}>{cell_html}</{tag}>") lines.append(f" <{tag}{align_attr}{colspan_attr}{rowspan_attr}>{cell_html}</{tag}>") lines.append(" </tr>") # finalize rowspans for i, rowspan in enumerate(rowspans): marker = f' rowspan="TODO{i}"' for j, line in enumerate(lines[-1::-1]): if marker in line: newmarker = '' if rowspan==1 else f' rowspan="{rowspan}"' lines[-1-j] = lines[-1-j].replace(marker, newmarker, 1) break lines.append("</table>") return "\n".join(lines) Loading src/saref_pypeline/docgen/ts_generator.py +79 −10 Original line number Diff line number Diff line Loading @@ -213,6 +213,33 @@ class TSGenerator: self.description(OWL.bottomDataProperty, OWL_GRAPH) self.description(OWL.topDataProperty, OWL_GRAPH) @staticmethod def _add_gridspan(cell:_Cell, val:int): tcPr = cell._tc.get_or_add_tcPr() vMerge = tcPr.find(qn("w:gridSpan")) if vMerge is None: vMerge = OxmlElement("w:gridSpan") tcPr.append(vMerge) vMerge.set(qn("w:val"), str(val)) @staticmethod def _add_vmerge(cell:_Cell, kind:str): # kind in {"restart","continue"} tcPr = cell._tc.get_or_add_tcPr() vMerge = tcPr.find(qn("w:vMerge")) if vMerge is None: vMerge = OxmlElement("w:vMerge") tcPr.append(vMerge) vMerge.set(qn("w:val"), kind) @staticmethod def _add_vmerge_restart(cell:_Cell): TSGenerator._add_vmerge(cell, "restart") @staticmethod def _add_vmerge_continue(cell): TSGenerator._add_vmerge(cell, "continue") def _compute_next_bookmark_id(self) -> int: max_id = 0 for bookmark in self.document.element.xpath("//w:bookmarkStart"): Loading Loading @@ -980,7 +1007,7 @@ class TSGenerator: self.delete_section("User defined") self.insert_soup_for_file("abstract") self.insert_soup_for_file("description") self.insert_soup_for_file("examples") self.insert_soup_for_file("examples", mandatory=False) self._is_appendix = True self.insert_soup_for_file("annexes", mandatory=False) self.describe_ontology() Loading Loading @@ -1460,8 +1487,9 @@ class TSGenerator: for col, pref in zip(tbl.columns, preferred_width): col.width = compute_length(pref, text_width) spans = len(tbl.columns) * [(0,1)] # remaining_vertical, colspan for tr in rows: self.insert_soup_tr(tr, tbl, styling) spans = self.insert_soup_tr(tr, spans, tbl, styling) self._cursor = tbl def insert_soup_caption(self, el: Tag, styling: Callable = None) -> None: Loading @@ -1474,18 +1502,59 @@ class TSGenerator: with Bookmark(self, id): self.insert_soup_children(el, styling) def insert_soup_tr(self, el: Tag, table: Table, styling: Callable = None) -> None: def insert_soup_tr(self, el: Tag, spans:List[int], table: Table, styling: Callable = None) -> List[int]: """ Add a <tr> (table row) to a python-docx Table. """ cells = el.find_all(["td", "th"]) row_cells = table.add_row().cells for i, cell in enumerate(cells): strip_outer_text(cell) if cell.name == "th": self.insert_soup_th(cell, row_cells[i], styling) tds = list(el.find_all(["td", "th"])) td_id = 0 row = table.add_row() cells = row.cells cell_idx = 0 while cell_idx < len(cells): # if rowspans[cell_idx][0] > 0: this cell is a continuation: # skip, decrement rowspans[i][0], increment cell_idx, continue rowspan, colspan = spans[cell_idx] if rowspan > 0: spans[cell_idx] = (rowspan - 1, colspan) self._add_vmerge_continue(cells[cell_idx]) # there may also be a colspan if colspan > 1: self._add_gridspan(cells[cell_idx], colspan) for i in range(colspan-1): to_remove = cells[cell_idx+1]._tc to_remove.getparent().remove(to_remove) cell_idx += 1 continue td = tds[td_id] colspan = int(td.get("colspan", "1")) if colspan > 1: self._add_gridspan(cells[cell_idx], colspan) for i in range(colspan-1): to_remove = cells[cell_idx+1]._tc to_remove.getparent().remove(to_remove) rowspan = int(td.get("rowspan", "1")) if rowspan > 1: self._add_vmerge_restart(cells[cell_idx]) spans[cell_idx] = (rowspan-1, colspan) for i in range(cell_idx+1, cell_idx+colspan): spans[i] = (rowspan-1, 1) strip_outer_text(td) if td.name == "th": self.insert_soup_th(td, cells[cell_idx], styling) else: self.insert_soup_td(cell, row_cells[i], styling) self.insert_soup_td(td, cells[cell_idx], styling) td_id += 1 cell_idx += colspan return spans def insert_soup_th(self, el: Tag, cell: _Cell, styling: Callable = None) -> None: """ Loading Loading
src/saref_pypeline/docgen/ts2md_extractor.py +51 −6 Original line number Diff line number Diff line Loading @@ -258,7 +258,6 @@ def cell_is_header(cell): return True return False def extract_hyperlink(hyperlink: Hyperlink, ctx: RunContext): if hyperlink.url: href = hyperlink.url Loading Loading @@ -401,7 +400,7 @@ class TS2MDExtractor: md = self.extract_clause("3.2\tSymbols") Path(self.out_folder, "symbols.md").write_text(md, "utf-8") md = self.extract_clause("3.3\tAbbreviations") md = self.extract_clause("\tAbbreviations") Path(self.out_folder, "abbreviations.md").write_text(md, "utf-8") i = 4 Loading Loading @@ -1001,7 +1000,7 @@ class TS2MDExtractor: image_paragraph = get_prev_block(self.document, paragraph) return self.get_paragraph_image_info(image_paragraph, i + 1) def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.MD): def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.HTML): """ Generate HTML for a python-docx Table. - If the paragraph just above has style TH and matches 'Table N: ...', Loading @@ -1026,7 +1025,7 @@ class TS2MDExtractor: + '"' ) lines = [] lines:List[str] = [] lines.append( f"""<table id="{html.escape(cap_id)}" data-docx-preferred-width={preferred_width}>""" ) Loading @@ -1034,16 +1033,62 @@ class TS2MDExtractor: if cap_id and cap_text: lines.append(f" <caption>{cap_text}</caption>") seen = set() for r_idx, row in enumerate(table.rows): lines.append(" <tr>") use_th = r_idx == 0 and first_row_has_headers for cell in row.cells: cells = row.cells if r_idx == 0: rowspans = len(cells) * [1] for i, cell in enumerate(row.cells): if cell._tc in seen: rowspans[i] += 1 continue # skip duplicate reference seen.add(cell._tc) colspan_attr = "" rowspan_attr = "" tcPr = cell._tc.tcPr if tcPr is not None: # skip continued cells (vMerge) vMerge = tcPr.find(qn("w:vMerge")) if vMerge is not None: val = vMerge.get(qn("w:val")) if val == "restart": marker = f' rowspan="TODO{i}"' for j, line in enumerate(lines[-1::-1]): if marker in line: newmarker = '' if rowspans[i]==1 else f' rowspan="{rowspans[i]}"' lines[-1-j] = lines[-1-j].replace(marker, newmarker, 1) break rowspans[i] = 1 rowspan_attr = marker # Handle colspan (gridSpan) gridSpan = tcPr.find(qn("w:gridSpan")) colspan_attr = "" if gridSpan is not None: colspan_attr = f' colspan="{gridSpan.get(qn("w:val"))}"' tag = "th" if (use_th and cell_is_header(cell)) else "td" align_attr = cell_align_style(cell) cell_html = self.cell_text_html(cell) lines.append(f" <{tag}{align_attr}>{cell_html}</{tag}>") lines.append(f" <{tag}{align_attr}{colspan_attr}{rowspan_attr}>{cell_html}</{tag}>") lines.append(" </tr>") # finalize rowspans for i, rowspan in enumerate(rowspans): marker = f' rowspan="TODO{i}"' for j, line in enumerate(lines[-1::-1]): if marker in line: newmarker = '' if rowspan==1 else f' rowspan="{rowspan}"' lines[-1-j] = lines[-1-j].replace(marker, newmarker, 1) break lines.append("</table>") return "\n".join(lines) Loading
src/saref_pypeline/docgen/ts_generator.py +79 −10 Original line number Diff line number Diff line Loading @@ -213,6 +213,33 @@ class TSGenerator: self.description(OWL.bottomDataProperty, OWL_GRAPH) self.description(OWL.topDataProperty, OWL_GRAPH) @staticmethod def _add_gridspan(cell:_Cell, val:int): tcPr = cell._tc.get_or_add_tcPr() vMerge = tcPr.find(qn("w:gridSpan")) if vMerge is None: vMerge = OxmlElement("w:gridSpan") tcPr.append(vMerge) vMerge.set(qn("w:val"), str(val)) @staticmethod def _add_vmerge(cell:_Cell, kind:str): # kind in {"restart","continue"} tcPr = cell._tc.get_or_add_tcPr() vMerge = tcPr.find(qn("w:vMerge")) if vMerge is None: vMerge = OxmlElement("w:vMerge") tcPr.append(vMerge) vMerge.set(qn("w:val"), kind) @staticmethod def _add_vmerge_restart(cell:_Cell): TSGenerator._add_vmerge(cell, "restart") @staticmethod def _add_vmerge_continue(cell): TSGenerator._add_vmerge(cell, "continue") def _compute_next_bookmark_id(self) -> int: max_id = 0 for bookmark in self.document.element.xpath("//w:bookmarkStart"): Loading Loading @@ -980,7 +1007,7 @@ class TSGenerator: self.delete_section("User defined") self.insert_soup_for_file("abstract") self.insert_soup_for_file("description") self.insert_soup_for_file("examples") self.insert_soup_for_file("examples", mandatory=False) self._is_appendix = True self.insert_soup_for_file("annexes", mandatory=False) self.describe_ontology() Loading Loading @@ -1460,8 +1487,9 @@ class TSGenerator: for col, pref in zip(tbl.columns, preferred_width): col.width = compute_length(pref, text_width) spans = len(tbl.columns) * [(0,1)] # remaining_vertical, colspan for tr in rows: self.insert_soup_tr(tr, tbl, styling) spans = self.insert_soup_tr(tr, spans, tbl, styling) self._cursor = tbl def insert_soup_caption(self, el: Tag, styling: Callable = None) -> None: Loading @@ -1474,18 +1502,59 @@ class TSGenerator: with Bookmark(self, id): self.insert_soup_children(el, styling) def insert_soup_tr(self, el: Tag, table: Table, styling: Callable = None) -> None: def insert_soup_tr(self, el: Tag, spans:List[int], table: Table, styling: Callable = None) -> List[int]: """ Add a <tr> (table row) to a python-docx Table. """ cells = el.find_all(["td", "th"]) row_cells = table.add_row().cells for i, cell in enumerate(cells): strip_outer_text(cell) if cell.name == "th": self.insert_soup_th(cell, row_cells[i], styling) tds = list(el.find_all(["td", "th"])) td_id = 0 row = table.add_row() cells = row.cells cell_idx = 0 while cell_idx < len(cells): # if rowspans[cell_idx][0] > 0: this cell is a continuation: # skip, decrement rowspans[i][0], increment cell_idx, continue rowspan, colspan = spans[cell_idx] if rowspan > 0: spans[cell_idx] = (rowspan - 1, colspan) self._add_vmerge_continue(cells[cell_idx]) # there may also be a colspan if colspan > 1: self._add_gridspan(cells[cell_idx], colspan) for i in range(colspan-1): to_remove = cells[cell_idx+1]._tc to_remove.getparent().remove(to_remove) cell_idx += 1 continue td = tds[td_id] colspan = int(td.get("colspan", "1")) if colspan > 1: self._add_gridspan(cells[cell_idx], colspan) for i in range(colspan-1): to_remove = cells[cell_idx+1]._tc to_remove.getparent().remove(to_remove) rowspan = int(td.get("rowspan", "1")) if rowspan > 1: self._add_vmerge_restart(cells[cell_idx]) spans[cell_idx] = (rowspan-1, colspan) for i in range(cell_idx+1, cell_idx+colspan): spans[i] = (rowspan-1, 1) strip_outer_text(td) if td.name == "th": self.insert_soup_th(td, cells[cell_idx], styling) else: self.insert_soup_td(cell, row_cells[i], styling) self.insert_soup_td(td, cells[cell_idx], styling) td_id += 1 cell_idx += colspan return spans def insert_soup_th(self, el: Tag, cell: _Cell, styling: Callable = None) -> None: """ Loading