roundtrip TS->md->TS for auto, city, ehaw (afabd38a) · Commits · SAREF / saref-pypeline

src/saref_pypeline/main.py

+4 −0

Original line number	Diff line number	Diff line
		@@ -158,6 +158,10 @@ def main():
		sys.exit()

		kwargs = vars(args)
		if not os.path.isdir(args.directory):
		print(f"Error: directory {args.directory} does not exist", file=sys.stderr)
		sys.exit(-2)

		pipeline = SAREFPipeline(**kwargs)
		# with Profile() as profile:

src/saref_pypeline/docgen/site_manager.py

+8 −3

Original line number	Diff line number	Diff line
		@@ -137,7 +137,9 @@ class SiteManager:

		# html documentation
		html = docgen.render_ontology_documentation(project_version.ontology)
		Path(target_dir, project_version.ontology.name + ".html").write_text(html)
		dest = Path(target_dir, project_version.ontology.name + ".html")
		dest.write_text(html)
		logger.debug(f"HTML page for {project_version.ontology}: {dest}")

		if project_version.examples:
		os.makedirs(os.path.join(target_dir, "example"), exist_ok=True)
		@@ -196,8 +198,11 @@ class SiteManager:
		m, t, e = (x.zfill(2) for x in docxgen.context[WK_FIELD.mte].split("."))
		document_name = f"ts_{odd}{ddd}{part}v{m}{t}{e}_{project_version.ontology.name}_{time_formatted}.docx"
		os.makedirs(self.ts_dir, exist_ok=True)
		document.save(os.path.join(self.ts_dir, document_name))
		# os.system(f'cmd.exe /C start "{self.ts_dir}/{document_name}"')
		dest = os.path.join(self.ts_dir, document_name)
		document.save(dest)
		logger.info(f"TS for {project_version} generated at {dest}")

		os.system(f'cmd.exe /C start "{self.ts_dir}/{document_name}"')

		def extract_from_ts(self, project_version: SAREFProjectVersion):
		project = project_version.project

src/saref_pypeline/docgen/ts2md_extractor.py

+96 −42

Original line number	Diff line number	Diff line
		@@ -4,12 +4,12 @@ import os
		import re
		from pathlib import Path
		from functools import cache, cached_property, lru_cache
		import shutil
		from typing import Any, Callable, Dict, Generator, List, Tuple
		from copy import deepcopy
		from datetime import datetime
		from functools import cache
		import zipfile
		from lxml import etree
		from dotenv import dotenv_values
		import requests
		import platform
		@@ -46,6 +46,7 @@ from docx.enum.table import (
		WD_CELL_VERTICAL_ALIGNMENT,
		)
		from docx.oxml import OxmlElement
		from docx.oxml.xmlchemy import BaseOxmlElement
		from docx.oxml.table import CT_Tbl
		from docx.oxml.text.paragraph import CT_P
		from docx.table import _Cell, Table
		@@ -53,7 +54,8 @@ from docx.text.paragraph import Paragraph
		from docx.text.hyperlink import Hyperlink
		from docx.shape import InlineShape

		from saref_pypeline.docgen.utils import OWL_GRAPH, print_admonition
		from saref_pypeline._logging import TRACE_LEVEL
		from saref_pypeline.docgen.utils import OWL_GRAPH, print_admonition, with_flags, pprint_xml
		from saref_pypeline.docgen.docxutils import iter_block_items, get_prev_block
		from saref_pypeline.entities import (
		SAREFCore,
		@@ -101,14 +103,6 @@ class RunContext:
		for markup in Markup.__members__.values():
		setattr(self, markup.style, None)

		def pprint_xml(entity):
		    """
		    Print an indented XML dump of a python-docx object or raw element.

		    When ``entity`` exposes an ``_element`` attribute that element is
		    serialised; otherwise ``entity`` itself is passed to the serialiser.
		    The text is written to standard output; nothing is returned.
		    """
		    element = getattr(entity, "_element", entity)
		    # encoding="unicode" asks lxml for text rather than encoded bytes.
		    print(etree.tostring(element, pretty_print=True, encoding="unicode"))


		def open_url(url: str) -> None:
		"""
		@@ -368,12 +362,19 @@ class TS2MDExtractor:
		if not os.path.isdir(self.doc_folder):
		self.doc_folder = os.path.join(self.pipeline.sources_dir, "ts")

		# self.out_folder = os.path.join(
		# self.pipeline.target_dir,
		# "ts",
		# self.project.name,
		# os.path.splitext(filename)[0],
		# )

		self.out_folder = os.path.join(
		self.pipeline.target_dir,
		"ts",
		self.project.name,
		self.pipeline.directory,
		"documentation",
		os.path.splitext(filename)[0],
		)
		shutil.rmtree(self.out_folder, ignore_errors=True)

		self.file_path = os.path.join(self.doc_folder, filename)
		if not os.path.isfile(self.file_path):
		@@ -383,6 +384,8 @@ class TS2MDExtractor:
		if not confirm:
		return

		logger.log(TRACE_LEVEL, f"Extracting from TS {self.project_version} with file {self.file_path}" )

		self.extract_figures()

		self.document = Document(self.file_path)
		@@ -411,6 +414,8 @@ class TS2MDExtractor:
		Path(self.out_folder, f"annex_{chr(code_char)}.md").write_text(md, "utf-8")
		code_char += 1

		logger.debug(f"Extraction complete: {self.out_folder}")

		def get_docx_url(self) -> Tuple[str, str]:
		"""
		Compute the ETSI TS docx download URL from work item metadata.
		@@ -562,7 +567,7 @@ class TS2MDExtractor:

		def extract_paragraph(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		) -> str:
		try:
		style = P_STYLE(paragraph.style.name)
		except:
		@@ -589,14 +594,24 @@ class TS2MDExtractor:
		# keep track of spaces, as md markup needs to be right before/after non-space characters
		# invariant: last character of last item in ctx.content is a non-blank character

		for child in paragraph.iter_inner_content():
		if isinstance(child, Hyperlink):
		# We cannot use python-docx method iter_inner_content, as it only considers runs and hyperlinks.
		# For example elements <fldSimple> are ignored
		# Special elements like <bookmarkStart>, <bookmarkEnd> are ignored.
		for _c in paragraph._p.xpath("./w:r \| ./w:hyperlink \| ./w:fldSimple"):
		_c:BaseOxmlElement
		if _c.tag == qn("w:r"):
		child = Run(_c, paragraph)
		extract_run(child, ctx)
		elif _c.tag == qn("w:hyperlink"):
		child = Hyperlink(_c, paragraph)
		if self.extract_hyperlink_necessary(child):
		extract_hyperlink(child, ctx)
		else:
		for run in child.runs:
		extract_run(run, ctx)
		elif isinstance(child, Run):
		elif _c.tag == qn("w:fldSimple"):
		for _r in _c.xpath("./w:r", namespaces=_c.nsmap):
		child = Run(_r, paragraph)
		extract_run(child, ctx)

		for markup in [Markup.STRONG, Markup.EM, Markup.SUP, Markup.CODE]:
		@@ -618,7 +633,7 @@ class TS2MDExtractor:
		md = match.group(2)
		elif match := re.match(r"^Annex.?\n(.)", md):
		md = match.group(1)
		return prefix + md + "\n"
		return "\n" + prefix + md + "\n"

		def extract_Heading_1(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		@@ -702,38 +717,44 @@ class TS2MDExtractor:
		level: int,
		extract_format: ExtractFormat = ExtractFormat.MD,
		):
		md = self.extract_inner_content(paragraph)
		return print_admonition(f"indent-{level}", "", md)
		content = self.extract_inner_content(paragraph, extract_format)
		if extract_format == ExtractFormat.MD:
		return print_admonition(f"indent-{level}", "", content)
		else:
		return f"""<li data-docx-pstyle="B{level}">{content}</li>"""

		def extract_B1(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Indent 1"""
		return self.extract_B(paragraph, 1)
		# exception if in table
		if paragraph._element.getparent().tag == qn("w:tc"):
		return self.extract_TB1(paragraph, extract_format)
		return self.extract_B(paragraph, 1, extract_format)

		def extract_B2(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Indent 2"""
		return self.extract_B(paragraph, 2)
		return self.extract_B(paragraph, 2, extract_format)

		def extract_B3(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Indent 3"""
		return self.extract_B(paragraph, 3)
		return self.extract_B(paragraph, 3, extract_format)

		def extract_B4(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Indent 4"""
		return self.extract_B(paragraph, 4)
		return self.extract_B(paragraph, 4, extract_format)

		def extract_B5(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Indent 5"""
		return self.extract_B(paragraph, 5)
		return self.extract_B(paragraph, 5, extract_format)

		def extract_B_plus(
		self,
		@@ -742,26 +763,29 @@ class TS2MDExtractor:
		extract_format: ExtractFormat = ExtractFormat.MD,
		):
		"""Bulleted indent 1 (round bullets)"""
		md = self.extract_inner_content(paragraph)
		md = self.extract_inner_content(paragraph, extract_format)
		return level * " " + f"* {md}"

		def extract_B1_plus(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Bulleted indent 1 (round bullets)"""
		return self.extract_B_plus(paragraph, 0)
		# exception if in table
		if paragraph._element.getparent().tag == qn("w:tc"):
		return self.extract_TB1(paragraph, extract_format)
		return self.extract_B_plus(paragraph, 0, extract_format)

		def extract_B2_plus(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Bulleted indent 2 (dashes)"""
		return self.extract_B_plus(paragraph, 4)
		return self.extract_B_plus(paragraph, 4, extract_format)

		def extract_B3_plus(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Bulleted indent 3 (square bullets)"""
		return self.extract_B_plus(paragraph, 8)
		return self.extract_B_plus(paragraph, 8, extract_format)

		def extract_BN(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		@@ -784,7 +808,10 @@ class TS2MDExtractor:
		):
		"""Standard paragraph, Definition"""
		md = self.extract_inner_content(paragraph, extract_format)
		if extract_format == ExtractFormat.MD:
		return f"\n{md}\n"
		else:
		return f"<p>{md}</p>"

		def extract_TT(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		@@ -827,7 +854,7 @@ class TS2MDExtractor:
		<figcaption>{label}: {caption}</figcaption>
		</figure>\n"""
		else:
		return "{md}"
		return f"{md}"

		def extract_FL(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		@@ -878,19 +905,21 @@ class TS2MDExtractor:
		):
		"""List in tables Level 1"""
		html = self.extract_inner_content(paragraph, extract_format)
		return f"\n<li>{html}</li>"
		return f"""<li data-docx-pstyle="TB1">{html}</li>"""

		def extract_TB2(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
		):
		"""List in tables Level 2"""
		return self.extract_TB1(paragraph, extract_format)
		html = self.extract_inner_content(paragraph, extract_format)
		return f"""<li data-docx-pstyle="TB2">{html}</li>"""

		def extract_TAN(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
		):
		"""Note in table => use "tab" between "item/number" and "text"."""
		return self.extract_NO(paragraph, extract_format)
		html = self.extract_inner_content(paragraph, extract_format)
		return f"""\n<li data-docx-pstyle="TAN">{html}</li>"""

		# ---------------------------------------
		# For tables
		@@ -905,14 +934,13 @@ class TS2MDExtractor:
		return "Table_id_unknown", "label_unknown"
		prev = get_prev_block(self.document, table)
		if isinstance(prev, Paragraph) and getattr(prev.style, "name", None) == "TH":
		m = CAPTION_TABLE_RE.match(prev.text or "")
		md = self.extract_inner_content(prev)
		m = CAPTION_TABLE_RE.match(md)
		if m:
		num = m.group(1)
		title = m.group(2) or ""
		cap_id = f"Table_{num}"
		# Use the full visible text as caption: "Table N: Title"
		visible = prev.text.strip()
		return cap_id, visible
		return cap_id, md
		return self.caption_from_prev_paragraph(prev, i + 1)

		def cell_text_html(self, cell):
		@@ -920,9 +948,19 @@ class TS2MDExtractor:
		Join paragraphs in a cell with <br>. Preserve tabs and basic HTML escaping.
		"""
		parts = []
		for p in cell.paragraphs:
		parts.append(self.extract_paragraph(p, ExtractFormat.HTML))
		return "<br>".join(parts)
		for first, last, p in with_flags(cell.paragraphs):
		html = self.extract_paragraph(p, ExtractFormat.HTML)
		if first and last and html.startswith("<p>"):
		html = html[3:-4]

		if html.startswith("<li") and (first or not parts[-1].startswith("<li")):
		parts.append("<ul>")
		if not first and parts[-1].startswith("<li") and not html.startswith("<li"):
		parts.append("</ul>")
		parts.append(html)
		if last and html.startswith("<li"):
		parts.append("</ul>")
		return "\n".join(parts)

		def get_paragraph_image_info(self, paragraph: Paragraph, i: int = 0):
		"""
		@@ -1008,3 +1046,19 @@ class TS2MDExtractor:

		lines.append("</table>")
		return "\n".join(lines)

		# ---------------------------------------
		# For some special cases in SAREF TSs
		# ---------------------------------------

		def extract_List_Paragraph(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """
		    Render a "List Paragraph" as a bullet item indented by its list level.

		    The level is read from the paragraph's numbering properties
		    (``w:pPr/w:numPr/w:ilvl/@w:val``). When that attribute is absent or
		    is not an integer, level 1 is used.

		    :param paragraph: the paragraph to extract.
		    :param extract_format: output format forwarded to ``extract_Normal``.
		    :return: ``4 * level`` spaces, then ``"* "``, then the stripped
		        result of ``extract_Normal``.
		    """
		    try:
		        level = int(paragraph._element.xpath("w:pPr/w:numPr/w:ilvl/@w:val")[0])
		    except (IndexError, ValueError):
		        # IndexError: no w:ilvl on this paragraph; ValueError: non-numeric value.
		        level = 1

		    # Four spaces of indentation per list level, then the bullet marker.
		    indent = level * 4 * " "
		    return indent + "* " + self.extract_Normal(paragraph, extract_format).strip()

src/saref_pypeline/docgen/ts_generator.py

+107 −42

Original line number	Diff line number	Diff line
		@@ -46,6 +46,7 @@ from saref_pypeline.docgen.utils import (
		EntityDescription,
		with_flags,
		materialize_links,
		pprint_xml
		)
		from saref_pypeline.entities import (
		SAREFCore,
		@@ -290,6 +291,12 @@ class TSGenerator:
		self._cursor = self._cursor._parent
		elif isinstance(self._cursor, Table):
		self.new_paragraph()
		elif isinstance(self._cursor, _Cell):
		new_p = OxmlElement("w:p")
		self._cursor._element.append(new_p)
		paragraph = Paragraph(new_p, self._cursor)
		self._cursor = paragraph


		def ensure_cursor_run(self) -> None:
		"""
		@@ -306,10 +313,10 @@ class TSGenerator:

		def ensure_pstyle(self, style: P_STYLE, styling: Callable = None) -> Callable:
		def more_styling(el=None):
		if isinstance(self._cursor, Paragraph):
		self._cursor.style = style
		if styling:
		styling(el)
		if isinstance(self._cursor, Paragraph):
		self._cursor.style = style

		return more_styling

		@@ -317,19 +324,19 @@ class TSGenerator:
		self, style_run: Callable[[Run], None], styling: Callable = None
		) -> Callable:
		def more_styling(el=None):
		if isinstance(self._cursor, Run):
		style_run(self._cursor)
		if styling:
		styling(el)
		if isinstance(self._cursor, Run):
		style_run(self._cursor)

		return more_styling

		def ensure_cstyle(self, style: C_STYLE, styling: Callable = None) -> Callable:
		def more_styling(el=None):
		if isinstance(self._cursor, Run):
		self._cursor.style = style
		if styling:
		styling(el)
		if isinstance(self._cursor, Run):
		self._cursor.style = style

		return more_styling

		@@ -915,7 +922,12 @@ class TSGenerator:
		for p in soup.find_all("p"):
		p.attrs["class"] = "Normal"
		for li in soup.find_all("li"):
		li.string = re.sub(r" : ", "\t", li.string, 1)
		for desc in li.descendants:
		if isinstance(desc, NavigableString):
		new_text, n = re.subn(r"\s*:\s+", "\t", str(desc), 1)
		if n: # replaced once
		desc.replace_with(new_text)
		break
		li["data-docx-pstyle"] = P_STYLE.EW
		self.insert_soup(soup)

		@@ -970,12 +982,13 @@ class TSGenerator:
		self.insert_soup_for_file("description")
		self.insert_soup_for_file("examples")
		self._is_appendix = True
		self.insert_soup_for_file("annexes")
		self.insert_soup_for_file("annexes", mandatory=False)
		self.describe_ontology()

		def insert_soup_for_file(self, file: str):
		def insert_soup_for_file(self, file: str, mandatory=True):
		soup = self.get_soup(file)
		if not soup:
		if mandatory:
		self.new_paragraph()
		self.new_run(
		f"File documentation/{file}.md does not exist.", style=C_STYLE.Guidance
		@@ -1027,7 +1040,7 @@ class TSGenerator:
		data = html_path.read_text(encoding="utf-8")
		elif md_path.exists():
		data = markdown(
		md_path.read_text(encoding="utf-8"), extensions=["extra", "codehilite"]
		md_path.read_text(encoding="utf-8"), extensions=["extra", "admonition", "codehilite"]
		)

		if not data:
		@@ -1047,6 +1060,8 @@ class TSGenerator:
		if el == "\n":
		return
		self.ensure_cursor_paragraph()
		if styling:
		styling(el)
		self.new_run(el)
		if styling:
		styling(el)
		@@ -1074,20 +1089,56 @@ class TSGenerator:
		for child in el.children:
		self.insert_soup(child, styling)

		def insert_soup_div(self, el: Tag, styling: Callable = None) -> None:
		    """
		    Insert the content of a ``<div>`` element into the document.

		    Only a few special cases of admonition are supported. For a div whose
		    class list contains ``admonition``:

		    * the children of the ``p.admonition-title`` paragraph (if any) are
		      moved to the front of the first body paragraph; when there is no
		      body paragraph the title paragraph itself is inserted;
		    * that first paragraph is styled ``P_STYLE.NO`` when the title text
		      contains "NOTE", and ``P_STYLE.EX`` otherwise;
		    * every following body paragraph is styled ``P_STYLE.EW``.

		    Any other div starts a new paragraph, applies ``styling`` and inserts
		    the div's children.

		    :param el: the ``<div>`` tag to insert.
		    :param styling: optional callable applied to the created content.
		    """
		    # el.get("class") is None when the div has no class attribute;
		    # fall back to an empty list so the membership test cannot fail.
		    classes = el.get("class") or []
		    if "admonition" not in classes:
		        self.new_paragraph()
		        if styling:
		            styling(el)
		        self.insert_soup_children(el, styling)
		        return

		    title = el.find("p", class_="admonition-title")
		    body_parts = [p for p in el.find_all("p") if p is not title]
		    if body_parts:
		        first = body_parts[0]
		        if title is not None:
		            # Merge the title inline at the start of the first body paragraph.
		            first.insert(0, *list(title.children))
		    else:
		        first = title
		    if first is None:
		        # Admonition with neither title nor body: nothing to insert.
		        return

		    is_note = title is not None and "NOTE" in title.getText()
		    style = P_STYLE.NO if is_note else P_STYLE.EX
		    self.insert_soup_p(first, self.ensure_pstyle(style))
		    for extra in body_parts[1:]:
		        self.insert_soup_p(extra, self.ensure_pstyle(P_STYLE.EW, styling))



		def insert_soup_br(self, el: Tag, styling: Callable = None) -> None:
		self.ensure_cursor_paragraph()
		self.new_paragraph()
		if styling:
		styling()

		def insert_soup_p(self, el: Tag, styling: Callable = None) -> None:
		self.ensure_cursor_paragraph()
		if any(parent.name == "table" for parent in el.parents) \
		and len(self._cursor._p.getparent().xpath("./w:p")) == 1 \
		and len(self._cursor._p.getparent().xpath("./w:p/w:r")) == 0:
		self.ensure_cursor_paragraph() # do nothing
		else:
		self.new_paragraph()

		if styling:
		styling(el)

		add_colon = False
		add_tab = False
		if el.getText().startswith("NOTE"):
		self._cursor.style = P_STYLE.NO
		add_colon = True
		add_tab = True
		elif el.getText().startswith("EXAMPLE"):
		self._cursor.style = P_STYLE.EX
		add_colon = True
		add_tab = True

		if add_colon:
		if add_tab:
		for child in el.children:
		if isinstance(child, NavigableString) and ":" in child.text:
		child.replace_with(re.sub(r":\w*", ":\t", child.text, 1))
		@@ -1149,18 +1200,24 @@ class TSGenerator:

		def insert_soup_list(self, el: Tag, styling: Callable = None) -> None:
		for li in el.find_all("li", recursive=False):
		self.insert_soup_li(li, styling)

		def insert_soup_li(self, li: Tag, styling: Callable = None) -> None:
		self.new_paragraph()
		if styling:
		styling(li)
		if style := li.get("data-docx-pstyle", None):
		self._cursor.style = style
		style = self._cursor.style
		if "data-docx-pstyle" in li.attrs:
		style = li.get("data-docx-pstyle")
		styling = self.ensure_pstyle(style, styling)

		for child in li.children:
		if isinstance(child, Tag) and child.name == "ul":
		sub_p_style = SUB_UL_STYLE.get(style, None)
		sub_p_style = SUB_UL_STYLE.get(style.name, None)
		more_styling = self.ensure_pstyle(sub_p_style, styling)
		self.insert_soup_list(child, more_styling)
		elif isinstance(child, Tag) and child.name == "ol":
		sub_p_style = SUB_OL_STYLE.get(style, None)
		sub_p_style = SUB_OL_STYLE.get(style.name, None)
		more_styling = self.ensure_pstyle(sub_p_style, styling)
		self.insert_soup_list(child, more_styling)
		else:
		@@ -1336,6 +1393,7 @@ class TSGenerator:
		self._cursor = run
		styling()
		self._cursor = paragraph
		styling()

		# Assemble and append
		h.append(r)
		@@ -1444,18 +1502,25 @@ class TSGenerator:
		Add a <td> (table data cell).
		"""
		cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
		# for _p in cell._tc.xpath("./w:p"):
		# cell._element.remove(_p)
		# self._cursor = cell
		self._cursor = cell.paragraphs[0]
		# self._cursor.alignment = WD_ALIGN_PARAGRAPH.CENTER

		# Decide style based on HTML attributes
		align = el.get("align", "").lower()
		style = el.get("style", "").lower()
		styles = dict(
		rule.strip().split(":", 1)
		for rule in style.split(";") if ":" in rule
		)
		align = styles.get("text-align", "").strip()
		if align == "center":
		self._cursor.style = P_STYLE.TAC
		style = P_STYLE.TAC
		elif align == "right":
		self._cursor.style = P_STYLE.TAR
		style = P_STYLE.TAR
		else:
		self._cursor.style = P_STYLE.TAL # default align left
		self.insert_soup_children(el, styling)
		style = P_STYLE.TAL # default align left
		self.insert_soup_children(el, self.ensure_pstyle(style, styling))

		# ---------------------------------------------------------------------
		# Methods for the ontology reference annex
		@@ -1861,7 +1926,7 @@ class TSGenerator:
		elif literal.datatype == URIRef(
		"http://www.iana.org/assignments/media-types/text/markdown"
		):
		self.insert_soup(markdown(literal, extensions=["extra", "codehilite"]))
		self.insert_soup(markdown(literal, extensions=["extra", "admonition", "codehilite"]))
		else:
		self.new_run(literal.replace("\r", ""))

src/saref_pypeline/docgen/utils.py

+9 −0

Original line number	Diff line number	Diff line
		@@ -9,6 +9,7 @@ from typing import TypeVar
		from rdflib.term import URIRef, Literal
		from rdflib import Graph, RDF, RDFS, OWL, XSD
		from dominate.tags import sup, a, li
		from lxml import etree

		from saref_pypeline.entities import SAREFGraphDocument

		@@ -205,3 +206,11 @@ def print_admonition(classes: str, title: str, md: str):
		prefix = 4 * " "
		content = md.replace("\n", f"\n{prefix}")
		return f'\n!!! {classes} "{title}"\n{prefix}{content}\n'

		def pprint_xml(entity):
		    """
		    Print an indented XML dump of a python-docx object or raw element.

		    When ``entity`` exposes an ``_element`` attribute that element is
		    serialised; otherwise ``entity`` itself is passed to the serialiser.
		    The text is written to standard output; nothing is returned.
		    """
		    element = getattr(entity, "_element", entity)
		    # encoding="unicode" asks lxml for text rather than encoded bytes.
		    print(etree.tostring(element, pretty_print=True, encoding="unicode"))