support rowspans and colspans in tables (c932e65b) · Commits · SAREF / saref-pypeline

src/saref_pypeline/docgen/ts2md_extractor.py

+51 −6

Original line number	Diff line number	Diff line
		@@ -258,7 +258,6 @@ def cell_is_header(cell):
		return True
		return False


		def extract_hyperlink(hyperlink: Hyperlink, ctx: RunContext):
		if hyperlink.url:
		href = hyperlink.url
		@@ -401,7 +400,7 @@ class TS2MDExtractor:
		md = self.extract_clause("3.2\tSymbols")
		Path(self.out_folder, "symbols.md").write_text(md, "utf-8")

		md = self.extract_clause("3.3\tAbbreviations")
		md = self.extract_clause("\tAbbreviations")
		Path(self.out_folder, "abbreviations.md").write_text(md, "utf-8")

		i = 4
		@@ -1001,7 +1000,7 @@ class TS2MDExtractor:
		image_paragraph = get_prev_block(self.document, paragraph)
		return self.get_paragraph_image_info(image_paragraph, i + 1)

		def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.MD):
		def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.HTML):
		"""
		Generate HTML for a python-docx Table.
		- If the paragraph just above has style TH and matches 'Table N: ...',
		@@ -1026,7 +1025,7 @@ class TS2MDExtractor:
		+ '"'
		)

		lines = []
		lines:List[str] = []
		lines.append(
		f"""<table id="{html.escape(cap_id)}" data-docx-preferred-width={preferred_width}>"""
		)
		@@ -1034,16 +1033,62 @@ class TS2MDExtractor:
		if cap_id and cap_text:
		lines.append(f" <caption>{cap_text}</caption>")

		seen = set()
		for r_idx, row in enumerate(table.rows):
		lines.append(" <tr>")
		use_th = r_idx == 0 and first_row_has_headers
		for cell in row.cells:
		cells = row.cells
		if r_idx == 0:
		rowspans = len(cells) * [1]
		for i, cell in enumerate(row.cells):
		if cell._tc in seen:
		rowspans[i] += 1
		continue # skip duplicate reference
		seen.add(cell._tc)

		colspan_attr = ""
		rowspan_attr = ""

		tcPr = cell._tc.tcPr
		if tcPr is not None:
		# skip continued cells (vMerge)
		vMerge = tcPr.find(qn("w:vMerge"))
		if vMerge is not None:
		val = vMerge.get(qn("w:val"))
		if val == "restart":
		marker = f' rowspan="TODO{i}"'
		for j, line in enumerate(lines[-1::-1]):
		if marker in line:
		newmarker = '' if rowspans[i]==1 else f' rowspan="{rowspans[i]}"'
		lines[-1-j] = lines[-1-j].replace(marker, newmarker, 1)
		break
		rowspans[i] = 1
		rowspan_attr = marker

		# Handle colspan (gridSpan)
		gridSpan = tcPr.find(qn("w:gridSpan"))
		colspan_attr = ""
		if gridSpan is not None:
		colspan_attr = f' colspan="{gridSpan.get(qn("w:val"))}"'


		tag = "th" if (use_th and cell_is_header(cell)) else "td"
		align_attr = cell_align_style(cell)
		cell_html = self.cell_text_html(cell)
		lines.append(f" <{tag}{align_attr}>{cell_html}</{tag}>")

		lines.append(f" <{tag}{align_attr}{colspan_attr}{rowspan_attr}>{cell_html}</{tag}>")
		lines.append(" </tr>")

		# finalize rowspans
		for i, rowspan in enumerate(rowspans):
		marker = f' rowspan="TODO{i}"'
		for j, line in enumerate(lines[-1::-1]):
		if marker in line:
		newmarker = '' if rowspan==1 else f' rowspan="{rowspan}"'
		lines[-1-j] = lines[-1-j].replace(marker, newmarker, 1)
		break


		lines.append("</table>")
		return "\n".join(lines)

src/saref_pypeline/docgen/ts_generator.py

+79 −10

Original line number	Diff line number	Diff line
		@@ -213,6 +213,33 @@ class TSGenerator:
		self.description(OWL.bottomDataProperty, OWL_GRAPH)
		self.description(OWL.topDataProperty, OWL_GRAPH)


		@staticmethod
		def _add_gridspan(cell: _Cell, val: int):
		    """Set the horizontal span (``w:gridSpan``) of a table cell.

		    Creates the ``w:gridSpan`` child of the cell's ``w:tcPr`` when it is
		    missing, then writes ``val`` into its ``w:val`` attribute.

		    :param cell: the python-docx cell whose properties are updated.
		    :param val: number of grid columns the cell should span.
		    """
		    tcPr = cell._tc.get_or_add_tcPr()
		    grid_span = tcPr.find(qn("w:gridSpan"))
		    if grid_span is None:
		        grid_span = OxmlElement("w:gridSpan")
		        # The children of w:tcPr form an ordered sequence in which only
		        # w:cnfStyle and w:tcW may precede w:gridSpan. Appending at the
		        # end would put it after an existing w:vMerge (which callers may
		        # add first), so insert right after those leading elements.
		        leading_tags = (qn("w:cnfStyle"), qn("w:tcW"))
		        position = sum(1 for child in tcPr if child.tag in leading_tags)
		        tcPr.insert(position, grid_span)
		    grid_span.set(qn("w:val"), str(val))

		@staticmethod
		def _add_vmerge(cell:_Cell, kind:str): # kind in {"restart","continue"}
		    """Flag a table cell as part of a vertical merge.

		    Ensures the cell's ``w:tcPr`` has a ``w:vMerge`` child (appending one
		    if none exists) and sets its ``w:val`` attribute to ``kind``.

		    :param cell: the python-docx cell whose properties are updated.
		    :param kind: value written to ``w:val``.
		    """
		    cell_props = cell._tc.get_or_add_tcPr()
		    existing = cell_props.find(qn("w:vMerge"))
		    if existing is not None:
		        merge_el = existing
		    else:
		        merge_el = OxmlElement("w:vMerge")
		        cell_props.append(merge_el)
		    merge_el.set(qn("w:val"), kind)

		@staticmethod
		def _add_vmerge_restart(cell:_Cell):
		    """Mark ``cell`` as the first cell of a vertical merge (``w:val="restart"``)."""
		    TSGenerator._add_vmerge(cell, kind="restart")

		@staticmethod
		def _add_vmerge_continue(cell: _Cell):
		    """Mark ``cell`` as a continuation of a vertical merge (``w:val="continue"``).

		    :param cell: the python-docx cell whose properties are updated.
		    """
		    TSGenerator._add_vmerge(cell, "continue")

		def _compute_next_bookmark_id(self) -> int:
		max_id = 0
		for bookmark in self.document.element.xpath("//w:bookmarkStart"):
		@@ -980,7 +1007,7 @@ class TSGenerator:
		self.delete_section("User defined")
		self.insert_soup_for_file("abstract")
		self.insert_soup_for_file("description")
		self.insert_soup_for_file("examples")
		self.insert_soup_for_file("examples", mandatory=False)
		self._is_appendix = True
		self.insert_soup_for_file("annexes", mandatory=False)
		self.describe_ontology()
		@@ -1460,8 +1487,9 @@ class TSGenerator:
		for col, pref in zip(tbl.columns, preferred_width):
		col.width = compute_length(pref, text_width)

		spans = len(tbl.columns) * [(0,1)] # remaining_vertical, colspan
		for tr in rows:
		self.insert_soup_tr(tr, tbl, styling)
		spans = self.insert_soup_tr(tr, spans, tbl, styling)
		self._cursor = tbl

		def insert_soup_caption(self, el: Tag, styling: Callable = None) -> None:
		@@ -1474,18 +1502,59 @@ class TSGenerator:
		with Bookmark(self, id):
		self.insert_soup_children(el, styling)

		@staticmethod
		def _span_columns(cells, cell_idx: int, colspan: int) -> None:
		    """Make ``cells[cell_idx]`` span ``colspan`` grid columns.

		    Sets the gridSpan on the cell and detaches the ``colspan - 1`` cells
		    to its right from the row, since the spanning cell now covers them.
		    """
		    TSGenerator._add_gridspan(cells[cell_idx], colspan)
		    # ``cells`` is a snapshot taken before any removal, so each covered
		    # neighbour has its own index; removing the same entry repeatedly
		    # would fail on the second pass because it is already detached.
		    for offset in range(1, colspan):
		        covered_tc = cells[cell_idx + offset]._tc
		        covered_tc.getparent().remove(covered_tc)

		def insert_soup_tr(self, el: Tag, spans: List[tuple], table: Table, styling: Callable = None) -> List[tuple]:
		    """
		    Add a <tr> (table row) to a python-docx Table.

		    ``spans`` holds one ``(remaining_rows, colspan)`` tuple per grid
		    column. ``remaining_rows`` counts how many upcoming rows are still
		    covered by a rowspan started in an earlier row. The list is updated
		    in place and also returned, so the caller can pass it to the next row.

		    A row with fewer <td>/<th> elements than free grid columns leaves
		    the remaining cells empty.

		    :param el: the <tr> element.
		    :param spans: per-column span state carried over from previous rows.
		    :param table: table receiving the new row.
		    :param styling: forwarded to the cell insertion methods.
		    :return: the updated ``spans`` list.
		    """
		    tds = list(el.find_all(["td", "th"]))
		    td_id = 0

		    row = table.add_row()
		    cells = row.cells
		    cell_idx = 0

		    while cell_idx < len(cells):
		        rowspan, colspan = spans[cell_idx]
		        if rowspan > 0:
		            # Column still covered by a rowspan from a previous row:
		            # emit a continuation cell and consume no <td>.
		            spans[cell_idx] = (rowspan - 1, colspan)
		            self._add_vmerge_continue(cells[cell_idx])
		            # there may also be a colspan
		            if colspan > 1:
		                self._span_columns(cells, cell_idx, colspan)
		            # Advance by one only: the covered columns carry their own
		            # counters in ``spans`` and are decremented on their own turn.
		            cell_idx += 1
		            continue

		        if td_id >= len(tds):
		            # Fewer source cells than grid columns: nothing left to insert.
		            break

		        td = tds[td_id]

		        # Keep colspan within [1, columns left] so the cursor always
		        # advances and never indexes past the row.
		        colspan = int(td.get("colspan", "1"))
		        colspan = max(1, min(colspan, len(cells) - cell_idx))
		        if colspan > 1:
		            self._span_columns(cells, cell_idx, colspan)

		        rowspan = int(td.get("rowspan", "1"))
		        if rowspan > 1:
		            self._add_vmerge_restart(cells[cell_idx])
		            spans[cell_idx] = (rowspan - 1, colspan)
		            for covered in range(cell_idx + 1, cell_idx + colspan):
		                spans[covered] = (rowspan - 1, 1)

		        strip_outer_text(td)
		        if td.name == "th":
		            self.insert_soup_th(td, cells[cell_idx], styling)
		        else:
		            self.insert_soup_td(td, cells[cell_idx], styling)

		        td_id += 1
		        cell_idx += colspan

		    return spans

		def insert_soup_th(self, el: Tag, cell: _Cell, styling: Callable = None) -> None:
		"""