Loading src/saref_pypeline/__main__.py +4 −0 Original line number Diff line number Diff line Loading @@ -158,6 +158,10 @@ def main(): sys.exit() kwargs = vars(args) if not os.path.isdir(args.directory): print(f"Error: directory {args.directory} does not exist", file=sys.stderr) sys.exit(-2) pipeline = SAREFPipeline(**kwargs) # with Profile() as profile: Loading src/saref_pypeline/docgen/site_manager.py +8 −3 Original line number Diff line number Diff line Loading @@ -137,7 +137,9 @@ class SiteManager: # html documentation html = docgen.render_ontology_documentation(project_version.ontology) Path(target_dir, project_version.ontology.name + ".html").write_text(html) dest = Path(target_dir, project_version.ontology.name + ".html") dest.write_text(html) logger.debug(f"HTML page for {project_version.ontology}: {dest}") if project_version.examples: os.makedirs(os.path.join(target_dir, "example"), exist_ok=True) Loading Loading @@ -196,8 +198,11 @@ class SiteManager: m, t, e = (x.zfill(2) for x in docxgen.context[WK_FIELD.mte].split(".")) document_name = f"ts_{odd}{ddd}{part}v{m}{t}{e}_{project_version.ontology.name}_{time_formatted}.docx" os.makedirs(self.ts_dir, exist_ok=True) document.save(os.path.join(self.ts_dir, document_name)) # os.system(f'cmd.exe /C start "{self.ts_dir}/{document_name}"') dest = os.path.join(self.ts_dir, document_name) document.save(dest) logger.info(f"TS for {project_version} generated at {dest}") os.system(f'cmd.exe /C start "{self.ts_dir}/{document_name}"') def extract_from_ts(self, project_version: SAREFProjectVersion): project = project_version.project Loading src/saref_pypeline/docgen/ts2md_extractor.py +96 −42 Original line number Diff line number Diff line Loading @@ -4,12 +4,12 @@ import os import re from pathlib import Path from functools import cache, cached_property, lru_cache import shutil from typing import Any, Callable, Dict, Generator, List, Tuple from copy import deepcopy from datetime import datetime from functools import cache import zipfile from lxml import etree from dotenv import dotenv_values import requests import platform Loading Loading @@ -46,6 +46,7 @@ from docx.enum.table import ( WD_CELL_VERTICAL_ALIGNMENT, ) from docx.oxml import OxmlElement from docx.oxml.xmlchemy import BaseOxmlElement from docx.oxml.table import CT_Tbl from docx.oxml.text.paragraph import CT_P from docx.table import _Cell, Table Loading @@ -53,7 +54,8 @@ from docx.text.paragraph import Paragraph from docx.text.hyperlink import Hyperlink from docx.shape import InlineShape from saref_pypeline.docgen.utils import OWL_GRAPH, print_admonition from saref_pypeline._logging import TRACE_LEVEL from saref_pypeline.docgen.utils import OWL_GRAPH, print_admonition, with_flags, pprint_xml from saref_pypeline.docgen.docxutils import iter_block_items, get_prev_block from saref_pypeline.entities import ( SAREFCore, Loading Loading @@ -101,14 +103,6 @@ class RunContext: for markup in Markup.__members__.values(): setattr(self, markup.style, None) def pprint_xml(entity): """ Pretty-print the XML of a python-docx entity (_element). """ entity = getattr(entity, "_element", entity) xml_bytes = etree.tostring(entity, pretty_print=True, encoding="unicode") print(xml_bytes) def open_url(url: str) -> None: """ Loading Loading @@ -368,12 +362,19 @@ class TS2MDExtractor: if not os.path.isdir(self.doc_folder): self.doc_folder = os.path.join(self.pipeline.sources_dir, "ts") # self.out_folder = os.path.join( # self.pipeline.target_dir, # "ts", # self.project.name, # os.path.splitext(filename)[0], # ) self.out_folder = os.path.join( self.pipeline.target_dir, "ts", self.project.name, self.pipeline.directory, "documentation", os.path.splitext(filename)[0], ) shutil.rmtree(self.out_folder, ignore_errors=True) self.file_path = os.path.join(self.doc_folder, filename) if not os.path.isfile(self.file_path): Loading @@ -383,6 +384,8 @@ class TS2MDExtractor: if not confirm: return logger.log(TRACE_LEVEL, f"Extracting from TS {self.project_version} with file {self.file_path}" ) self.extract_figures() self.document = Document(self.file_path) Loading Loading @@ -411,6 +414,8 @@ class TS2MDExtractor: Path(self.out_folder, f"annex_{chr(code_char)}.md").write_text(md, "utf-8") code_char += 1 logger.debug(f"Extraction complete: {self.out_folder}") def get_docx_url(self) -> Tuple[str, str]: """ Compute the ETSI TS docx download URL from work item metadata. Loading Loading @@ -562,7 +567,7 @@ class TS2MDExtractor: def extract_paragraph( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): ) -> str: try: style = P_STYLE(paragraph.style.name) except: Loading @@ -589,14 +594,24 @@ class TS2MDExtractor: # keep track of spaces, as md markup needs to be right before/after non-space characters # invariant: last character of last item in ctx.content is a non-blank character for child in paragraph.iter_inner_content(): if isinstance(child, Hyperlink): # We cannot use python-docx method iter_inner_content, as it only considers runs and hyperlinks. # For example elements <fldSimple> are ignored # Special elements like <bookmarkStart>, <bookmarkEnd> are ignored. for _c in paragraph._p.xpath("./w:r | ./w:hyperlink | ./w:fldSimple"): _c:BaseOxmlElement if _c.tag == qn("w:r"): child = Run(_c, paragraph) extract_run(child, ctx) elif _c.tag == qn("w:hyperlink"): child = Hyperlink(_c, paragraph) if self.extract_hyperlink_necessary(child): extract_hyperlink(child, ctx) else: for run in child.runs: extract_run(run, ctx) elif isinstance(child, Run): elif _c.tag == qn("w:fldSimple"): for _r in _c.xpath("./w:r", namespaces=_c.nsmap): child = Run(_r, paragraph) extract_run(child, ctx) for markup in [Markup.STRONG, Markup.EM, Markup.SUP, Markup.CODE]: Loading @@ -618,7 +633,7 @@ class TS2MDExtractor: md = match.group(2) elif match := re.match(r"^Annex.*?\n(.*)", md): md = match.group(1) return prefix + md + "\n" return "\n" + prefix + md + "\n" def extract_Heading_1( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading Loading @@ -702,38 +717,44 @@ class TS2MDExtractor: level: int, extract_format: ExtractFormat = ExtractFormat.MD, ): md = self.extract_inner_content(paragraph) return print_admonition(f"indent-{level}", "", md) content = self.extract_inner_content(paragraph, extract_format) if extract_format == ExtractFormat.MD: return print_admonition(f"indent-{level}", "", content) else: return f"""<li data-docx-pstyle="B{level}">{content}</li>""" def extract_B1( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Indent 1""" return self.extract_B(paragraph, 1) # exception if in table if paragraph._element.getparent().tag == qn("w:tc"): return self.extract_TB1(paragraph, extract_format) return self.extract_B(paragraph, 1, extract_format) def extract_B2( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Indent 2""" return self.extract_B(paragraph, 2) return self.extract_B(paragraph, 2, extract_format) def extract_B3( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Indent 3""" return self.extract_B(paragraph, 3) return self.extract_B(paragraph, 3, extract_format) def extract_B4( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Indent 4""" return self.extract_B(paragraph, 4) return self.extract_B(paragraph, 4, extract_format) def extract_B5( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Indent 5""" return self.extract_B(paragraph, 5) return self.extract_B(paragraph, 5, extract_format) def extract_B_plus( self, Loading @@ -742,26 +763,29 @@ class TS2MDExtractor: extract_format: ExtractFormat = ExtractFormat.MD, ): """Bulleted indent 1 (round bullets)""" md = self.extract_inner_content(paragraph) md = self.extract_inner_content(paragraph, extract_format) return level * " " + f"* {md}" def extract_B1_plus( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Bulleted indent 1 (round bullets)""" return self.extract_B_plus(paragraph, 0) # exception if in table if paragraph._element.getparent().tag == qn("w:tc"): return self.extract_TB1(paragraph, extract_format) return self.extract_B_plus(paragraph, 0, extract_format) def extract_B2_plus( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Bulleted indent 2 (dashes)""" return self.extract_B_plus(paragraph, 4) return self.extract_B_plus(paragraph, 4, extract_format) def extract_B3_plus( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Bulleted indent 3 (square bullets)""" return self.extract_B_plus(paragraph, 8) return self.extract_B_plus(paragraph, 8, extract_format) def extract_BN( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading @@ -784,7 +808,10 @@ class TS2MDExtractor: ): """Standard paragraph, Definition""" md = self.extract_inner_content(paragraph, extract_format) if extract_format == ExtractFormat.MD: return f"\n{md}\n" else: return f"<p>{md}</p>" def extract_TT( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading Loading @@ -827,7 +854,7 @@ class TS2MDExtractor: <figcaption>{label}: {caption}</figcaption> </figure>\n""" else: return "**{md}**" return f"**{md}**" def extract_FL( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading Loading @@ -878,19 +905,21 @@ class TS2MDExtractor: ): """List in tables Level 1""" html = self.extract_inner_content(paragraph, extract_format) return f"\n<li>{html}</li>" return f"""<li data-docx-pstyle="TB1">{html}</li>""" def extract_TB2( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML ): """List in tables Level 2""" return self.extract_TB1(paragraph, extract_format) html = self.extract_inner_content(paragraph, extract_format) return f"""<li data-docx-pstyle="TB2">{html}</li>""" def extract_TAN( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML ): """Note in table => use "tab" between "item/number" and "text".""" return self.extract_NO(paragraph, extract_format) html = self.extract_inner_content(paragraph, extract_format) return f"""\n<li data-docx-pstyle="TAN">{html}</li>""" # --------------------------------------- # For tables Loading @@ -905,14 +934,13 @@ class TS2MDExtractor: return "Table_id_unknown", "label_unknown" prev = get_prev_block(self.document, table) if isinstance(prev, Paragraph) and getattr(prev.style, "name", None) == "TH": m = CAPTION_TABLE_RE.match(prev.text or "") md = self.extract_inner_content(prev) m = CAPTION_TABLE_RE.match(md) if m: num = m.group(1) title = m.group(2) or "" cap_id = f"Table_{num}" # Use the full visible text as caption: "Table N: Title" visible = prev.text.strip() return cap_id, visible return cap_id, md return self.caption_from_prev_paragraph(prev, i + 1) def cell_text_html(self, cell): Loading @@ -920,9 +948,19 @@ class TS2MDExtractor: Join paragraphs in a cell with <br>. Preserve tabs and basic HTML escaping. """ parts = [] for p in cell.paragraphs: parts.append(self.extract_paragraph(p, ExtractFormat.HTML)) return "<br>".join(parts) for first, last, p in with_flags(cell.paragraphs): html = self.extract_paragraph(p, ExtractFormat.HTML) if first and last and html.startswith("<p>"): html = html[3:-4] if html.startswith("<li") and (first or not parts[-1].startswith("<li")): parts.append("<ul>") if not first and parts[-1].startswith("<li") and not html.startswith("<li"): parts.append("</ul>") parts.append(html) if last and html.startswith("<li"): parts.append("</ul>") return "\n".join(parts) def get_paragraph_image_info(self, paragraph: Paragraph, i: int = 0): """ Loading Loading @@ -1008,3 +1046,19 @@ class TS2MDExtractor: lines.append("</table>") return "\n".join(lines) # --------------------------------------- # For some special cases in SAREF TSs # --------------------------------------- def extract_List_Paragraph( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """List paragraphs with list level""" try: level = int(paragraph._element.xpath("w:pPr/w:numPr/w:ilvl/@w:val")[0]) except: level = 1 return level*4*" "+ "* " + self.extract_Normal(paragraph, extract_format).strip() src/saref_pypeline/docgen/ts_generator.py +107 −42 Original line number Diff line number Diff line Loading @@ -46,6 +46,7 @@ from saref_pypeline.docgen.utils import ( EntityDescription, with_flags, materialize_links, pprint_xml ) from saref_pypeline.entities import ( SAREFCore, Loading Loading @@ -290,6 +291,12 @@ class TSGenerator: self._cursor = self._cursor._parent elif isinstance(self._cursor, Table): self.new_paragraph() elif isinstance(self._cursor, _Cell): new_p = OxmlElement("w:p") self._cursor._element.append(new_p) paragraph = Paragraph(new_p, self._cursor) self._cursor = paragraph def ensure_cursor_run(self) -> None: """ Loading @@ -306,10 +313,10 @@ class TSGenerator: def ensure_pstyle(self, style: P_STYLE, styling: Callable = None) -> Callable: def more_styling(el=None): if isinstance(self._cursor, Paragraph): self._cursor.style = style if styling: styling(el) if isinstance(self._cursor, Paragraph): self._cursor.style = style return more_styling Loading @@ -317,19 +324,19 @@ class TSGenerator: self, style_run: Callable[[Run], None], styling: Callable = None ) -> Callable: def more_styling(el=None): if isinstance(self._cursor, Run): style_run(self._cursor) if styling: styling(el) if isinstance(self._cursor, Run): style_run(self._cursor) return more_styling def ensure_cstyle(self, style: C_STYLE, styling: Callable = None) -> Callable: def more_styling(el=None): if isinstance(self._cursor, Run): self._cursor.style = style if styling: styling(el) if isinstance(self._cursor, Run): self._cursor.style = style return more_styling Loading Loading @@ -915,7 +922,12 @@ class TSGenerator: for p in soup.find_all("p"): p.attrs["class"] = "Normal" for li in soup.find_all("li"): li.string = re.sub(r" *: *", "\t", li.string, 1) for desc in li.descendants: if isinstance(desc, NavigableString): new_text, n = re.subn(r"\s*:\s+", "\t", str(desc), 1) if n: # replaced once desc.replace_with(new_text) break li["data-docx-pstyle"] = P_STYLE.EW self.insert_soup(soup) Loading Loading @@ -970,12 +982,13 @@ class TSGenerator: self.insert_soup_for_file("description") self.insert_soup_for_file("examples") self._is_appendix = True self.insert_soup_for_file("annexes") self.insert_soup_for_file("annexes", mandatory=False) self.describe_ontology() def insert_soup_for_file(self, file: str): def insert_soup_for_file(self, file: str, mandatory=True): soup = self.get_soup(file) if not soup: if mandatory: self.new_paragraph() self.new_run( f"File documentation/{file}.md does not exist.", style=C_STYLE.Guidance Loading Loading @@ -1027,7 +1040,7 @@ class TSGenerator: data = html_path.read_text(encoding="utf-8") elif md_path.exists(): data = markdown( md_path.read_text(encoding="utf-8"), extensions=["extra", "codehilite"] md_path.read_text(encoding="utf-8"), extensions=["extra", "admonition", "codehilite"] ) if not data: Loading @@ -1047,6 +1060,8 @@ class TSGenerator: if el == "\n": return self.ensure_cursor_paragraph() if styling: styling(el) self.new_run(el) if styling: styling(el) Loading Loading @@ -1074,20 +1089,56 @@ class TSGenerator: for child in el.children: self.insert_soup(child, styling) def insert_soup_div(self, el: Tag, styling: Callable = None) -> None: # only a few special cases of admonition are supported if "admonition" in el.get("class"): title = el.find("p", class_="admonition-title") body_parts = [p for p in el.find_all("p") if p is not title] if len(body_parts) >= 1: p = body_parts[0] if title: p.insert(0, *list(title.children)) else: p = title style = P_STYLE.NO if title and "NOTE" in title.getText() else P_STYLE.EX self.insert_soup_p(p, self.ensure_pstyle(style)) for p in body_parts[1:]: self.insert_soup_p(p, self.ensure_pstyle(P_STYLE.EW, styling)) else: self.new_paragraph() if styling: styling(el) self.insert_soup_children(el, styling) def insert_soup_br(self, el: Tag, styling: Callable = None) -> None: self.ensure_cursor_paragraph() self.new_paragraph() if styling: styling() def insert_soup_p(self, el: Tag, styling: Callable = None) -> None: self.ensure_cursor_paragraph() if any(parent.name == "table" for parent in el.parents) \ and len(self._cursor._p.getparent().xpath("./w:p")) == 1 \ and len(self._cursor._p.getparent().xpath("./w:p/w:r")) == 0: self.ensure_cursor_paragraph() # do nothing else: self.new_paragraph() if styling: styling(el) add_colon = False add_tab = False if el.getText().startswith("NOTE"): self._cursor.style = P_STYLE.NO add_colon = True add_tab = True elif el.getText().startswith("EXAMPLE"): self._cursor.style = P_STYLE.EX add_colon = True add_tab = True if add_colon: if add_tab: for child in el.children: if isinstance(child, NavigableString) and ":" in child.text: child.replace_with(re.sub(r":\w*", ":\t", child.text, 1)) Loading Loading @@ -1149,18 +1200,24 @@ class TSGenerator: def insert_soup_list(self, el: Tag, styling: Callable = None) -> None: for li in el.find_all("li", recursive=False): self.insert_soup_li(li, styling) def insert_soup_li(self, li: Tag, styling: Callable = None) -> None: self.new_paragraph() if styling: styling(li) if style := li.get("data-docx-pstyle", None): self._cursor.style = style style = self._cursor.style if "data-docx-pstyle" in li.attrs: style = li.get("data-docx-pstyle") styling = self.ensure_pstyle(style, styling) for child in li.children: if isinstance(child, Tag) and child.name == "ul": sub_p_style = SUB_UL_STYLE.get(style, None) sub_p_style = SUB_UL_STYLE.get(style.name, None) more_styling = self.ensure_pstyle(sub_p_style, styling) self.insert_soup_list(child, more_styling) elif isinstance(child, Tag) and child.name == "ol": sub_p_style = SUB_OL_STYLE.get(style, None) sub_p_style = SUB_OL_STYLE.get(style.name, None) more_styling = self.ensure_pstyle(sub_p_style, styling) self.insert_soup_list(child, more_styling) else: Loading Loading @@ -1336,6 +1393,7 @@ class TSGenerator: self._cursor = run styling() self._cursor = paragraph styling() # Assemble and append h.append(r) Loading Loading @@ -1444,18 +1502,25 @@ class TSGenerator: Add a <td> (table data cell). """ cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER # for _p in cell._tc.xpath("./w:p"): # cell._element.remove(_p) # self._cursor = cell self._cursor = cell.paragraphs[0] # self._cursor.alignment = WD_ALIGN_PARAGRAPH.CENTER # Decide style based on HTML attributes align = el.get("align", "").lower() style = el.get("style", "").lower() styles = dict( rule.strip().split(":", 1) for rule in style.split(";") if ":" in rule ) align = styles.get("text-align", "").strip() if align == "center": self._cursor.style = P_STYLE.TAC style = P_STYLE.TAC elif align == "right": self._cursor.style = P_STYLE.TAR style = P_STYLE.TAR else: self._cursor.style = P_STYLE.TAL # default align left self.insert_soup_children(el, styling) style = P_STYLE.TAL # default align left self.insert_soup_children(el, self.ensure_pstyle(style, styling)) # --------------------------------------------------------------------- # Methods for the ontology reference annex Loading Loading @@ -1861,7 +1926,7 @@ class TSGenerator: elif literal.datatype == URIRef( "http://www.iana.org/assignments/media-types/text/markdown" ): self.insert_soup(markdown(literal, extensions=["extra", "codehilite"])) self.insert_soup(markdown(literal, extensions=["extra", "admonition", "codehilite"])) else: self.new_run(literal.replace("\r", "")) Loading src/saref_pypeline/docgen/utils.py +9 −0 Original line number Diff line number Diff line Loading @@ -9,6 +9,7 @@ from typing import TypeVar from rdflib.term import URIRef, Literal from rdflib import Graph, RDF, RDFS, OWL, XSD from dominate.tags import sup, a, li from lxml import etree from saref_pypeline.entities import SAREFGraphDocument Loading Loading @@ -205,3 +206,11 @@ def print_admonition(classes: str, title: str, md: str): prefix = 4 * " " content = md.replace("\n", f"\n{prefix}") return f'\n!!! {classes} "{title}"\n{prefix}{content}\n' def pprint_xml(entity): """ Pretty-print the XML of a python-docx entity (_element). """ entity = getattr(entity, "_element", entity) xml_bytes = etree.tostring(entity, pretty_print=True, encoding="unicode") print(xml_bytes) Loading
src/saref_pypeline/__main__.py +4 −0 Original line number Diff line number Diff line Loading @@ -158,6 +158,10 @@ def main(): sys.exit() kwargs = vars(args) if not os.path.isdir(args.directory): print(f"Error: directory {args.directory} does not exist", file=sys.stderr) sys.exit(-2) pipeline = SAREFPipeline(**kwargs) # with Profile() as profile: Loading
src/saref_pypeline/docgen/site_manager.py +8 −3 Original line number Diff line number Diff line Loading @@ -137,7 +137,9 @@ class SiteManager: # html documentation html = docgen.render_ontology_documentation(project_version.ontology) Path(target_dir, project_version.ontology.name + ".html").write_text(html) dest = Path(target_dir, project_version.ontology.name + ".html") dest.write_text(html) logger.debug(f"HTML page for {project_version.ontology}: {dest}") if project_version.examples: os.makedirs(os.path.join(target_dir, "example"), exist_ok=True) Loading Loading @@ -196,8 +198,11 @@ class SiteManager: m, t, e = (x.zfill(2) for x in docxgen.context[WK_FIELD.mte].split(".")) document_name = f"ts_{odd}{ddd}{part}v{m}{t}{e}_{project_version.ontology.name}_{time_formatted}.docx" os.makedirs(self.ts_dir, exist_ok=True) document.save(os.path.join(self.ts_dir, document_name)) # os.system(f'cmd.exe /C start "{self.ts_dir}/{document_name}"') dest = os.path.join(self.ts_dir, document_name) document.save(dest) logger.info(f"TS for {project_version} generated at {dest}") os.system(f'cmd.exe /C start "{self.ts_dir}/{document_name}"') def extract_from_ts(self, project_version: SAREFProjectVersion): project = project_version.project Loading
src/saref_pypeline/docgen/ts2md_extractor.py +96 −42 Original line number Diff line number Diff line Loading @@ -4,12 +4,12 @@ import os import re from pathlib import Path from functools import cache, cached_property, lru_cache import shutil from typing import Any, Callable, Dict, Generator, List, Tuple from copy import deepcopy from datetime import datetime from functools import cache import zipfile from lxml import etree from dotenv import dotenv_values import requests import platform Loading Loading @@ -46,6 +46,7 @@ from docx.enum.table import ( WD_CELL_VERTICAL_ALIGNMENT, ) from docx.oxml import OxmlElement from docx.oxml.xmlchemy import BaseOxmlElement from docx.oxml.table import CT_Tbl from docx.oxml.text.paragraph import CT_P from docx.table import _Cell, Table Loading @@ -53,7 +54,8 @@ from docx.text.paragraph import Paragraph from docx.text.hyperlink import Hyperlink from docx.shape import InlineShape from saref_pypeline.docgen.utils import OWL_GRAPH, print_admonition from saref_pypeline._logging import TRACE_LEVEL from saref_pypeline.docgen.utils import OWL_GRAPH, print_admonition, with_flags, pprint_xml from saref_pypeline.docgen.docxutils import iter_block_items, get_prev_block from saref_pypeline.entities import ( SAREFCore, Loading Loading @@ -101,14 +103,6 @@ class RunContext: for markup in Markup.__members__.values(): setattr(self, markup.style, None) def pprint_xml(entity): """ Pretty-print the XML of a python-docx entity (_element). """ entity = getattr(entity, "_element", entity) xml_bytes = etree.tostring(entity, pretty_print=True, encoding="unicode") print(xml_bytes) def open_url(url: str) -> None: """ Loading Loading @@ -368,12 +362,19 @@ class TS2MDExtractor: if not os.path.isdir(self.doc_folder): self.doc_folder = os.path.join(self.pipeline.sources_dir, "ts") # self.out_folder = os.path.join( # self.pipeline.target_dir, # "ts", # self.project.name, # os.path.splitext(filename)[0], # ) self.out_folder = os.path.join( self.pipeline.target_dir, "ts", self.project.name, self.pipeline.directory, "documentation", os.path.splitext(filename)[0], ) shutil.rmtree(self.out_folder, ignore_errors=True) self.file_path = os.path.join(self.doc_folder, filename) if not os.path.isfile(self.file_path): Loading @@ -383,6 +384,8 @@ class TS2MDExtractor: if not confirm: return logger.log(TRACE_LEVEL, f"Extracting from TS {self.project_version} with file {self.file_path}" ) self.extract_figures() self.document = Document(self.file_path) Loading Loading @@ -411,6 +414,8 @@ class TS2MDExtractor: Path(self.out_folder, f"annex_{chr(code_char)}.md").write_text(md, "utf-8") code_char += 1 logger.debug(f"Extraction complete: {self.out_folder}") def get_docx_url(self) -> Tuple[str, str]: """ Compute the ETSI TS docx download URL from work item metadata. Loading Loading @@ -562,7 +567,7 @@ class TS2MDExtractor: def extract_paragraph( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): ) -> str: try: style = P_STYLE(paragraph.style.name) except: Loading @@ -589,14 +594,24 @@ class TS2MDExtractor: # keep track of spaces, as md markup needs to be right before/after non-space characters # invariant: last character of last item in ctx.content is a non-blank character for child in paragraph.iter_inner_content(): if isinstance(child, Hyperlink): # We cannot use python-docx method iter_inner_content, as it only considers runs and hyperlinks. # For example elements <fldSimple> are ignored # Special elements like <bookmarkStart>, <bookmarkEnd> are ignored. for _c in paragraph._p.xpath("./w:r | ./w:hyperlink | ./w:fldSimple"): _c:BaseOxmlElement if _c.tag == qn("w:r"): child = Run(_c, paragraph) extract_run(child, ctx) elif _c.tag == qn("w:hyperlink"): child = Hyperlink(_c, paragraph) if self.extract_hyperlink_necessary(child): extract_hyperlink(child, ctx) else: for run in child.runs: extract_run(run, ctx) elif isinstance(child, Run): elif _c.tag == qn("w:fldSimple"): for _r in _c.xpath("./w:r", namespaces=_c.nsmap): child = Run(_r, paragraph) extract_run(child, ctx) for markup in [Markup.STRONG, Markup.EM, Markup.SUP, Markup.CODE]: Loading @@ -618,7 +633,7 @@ class TS2MDExtractor: md = match.group(2) elif match := re.match(r"^Annex.*?\n(.*)", md): md = match.group(1) return prefix + md + "\n" return "\n" + prefix + md + "\n" def extract_Heading_1( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading Loading @@ -702,38 +717,44 @@ class TS2MDExtractor: level: int, extract_format: ExtractFormat = ExtractFormat.MD, ): md = self.extract_inner_content(paragraph) return print_admonition(f"indent-{level}", "", md) content = self.extract_inner_content(paragraph, extract_format) if extract_format == ExtractFormat.MD: return print_admonition(f"indent-{level}", "", content) else: return f"""<li data-docx-pstyle="B{level}">{content}</li>""" def extract_B1( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Indent 1""" return self.extract_B(paragraph, 1) # exception if in table if paragraph._element.getparent().tag == qn("w:tc"): return self.extract_TB1(paragraph, extract_format) return self.extract_B(paragraph, 1, extract_format) def extract_B2( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Indent 2""" return self.extract_B(paragraph, 2) return self.extract_B(paragraph, 2, extract_format) def extract_B3( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Indent 3""" return self.extract_B(paragraph, 3) return self.extract_B(paragraph, 3, extract_format) def extract_B4( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Indent 4""" return self.extract_B(paragraph, 4) return self.extract_B(paragraph, 4, extract_format) def extract_B5( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Indent 5""" return self.extract_B(paragraph, 5) return self.extract_B(paragraph, 5, extract_format) def extract_B_plus( self, Loading @@ -742,26 +763,29 @@ class TS2MDExtractor: extract_format: ExtractFormat = ExtractFormat.MD, ): """Bulleted indent 1 (round bullets)""" md = self.extract_inner_content(paragraph) md = self.extract_inner_content(paragraph, extract_format) return level * " " + f"* {md}" def extract_B1_plus( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Bulleted indent 1 (round bullets)""" return self.extract_B_plus(paragraph, 0) # exception if in table if paragraph._element.getparent().tag == qn("w:tc"): return self.extract_TB1(paragraph, extract_format) return self.extract_B_plus(paragraph, 0, extract_format) def extract_B2_plus( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Bulleted indent 2 (dashes)""" return self.extract_B_plus(paragraph, 4) return self.extract_B_plus(paragraph, 4, extract_format) def extract_B3_plus( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """Bulleted indent 3 (square bullets)""" return self.extract_B_plus(paragraph, 8) return self.extract_B_plus(paragraph, 8, extract_format) def extract_BN( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading @@ -784,7 +808,10 @@ class TS2MDExtractor: ): """Standard paragraph, Definition""" md = self.extract_inner_content(paragraph, extract_format) if extract_format == ExtractFormat.MD: return f"\n{md}\n" else: return f"<p>{md}</p>" def extract_TT( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading Loading @@ -827,7 +854,7 @@ class TS2MDExtractor: <figcaption>{label}: {caption}</figcaption> </figure>\n""" else: return "**{md}**" return f"**{md}**" def extract_FL( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading Loading @@ -878,19 +905,21 @@ class TS2MDExtractor: ): """List in tables Level 1""" html = self.extract_inner_content(paragraph, extract_format) return f"\n<li>{html}</li>" return f"""<li data-docx-pstyle="TB1">{html}</li>""" def extract_TB2( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML ): """List in tables Level 2""" return self.extract_TB1(paragraph, extract_format) html = self.extract_inner_content(paragraph, extract_format) return f"""<li data-docx-pstyle="TB2">{html}</li>""" def extract_TAN( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML ): """Note in table => use "tab" between "item/number" and "text".""" return self.extract_NO(paragraph, extract_format) html = self.extract_inner_content(paragraph, extract_format) return f"""\n<li data-docx-pstyle="TAN">{html}</li>""" # --------------------------------------- # For tables Loading @@ -905,14 +934,13 @@ class TS2MDExtractor: return "Table_id_unknown", "label_unknown" prev = get_prev_block(self.document, table) if isinstance(prev, Paragraph) and getattr(prev.style, "name", None) == "TH": m = CAPTION_TABLE_RE.match(prev.text or "") md = self.extract_inner_content(prev) m = CAPTION_TABLE_RE.match(md) if m: num = m.group(1) title = m.group(2) or "" cap_id = f"Table_{num}" # Use the full visible text as caption: "Table N: Title" visible = prev.text.strip() return cap_id, visible return cap_id, md return self.caption_from_prev_paragraph(prev, i + 1) def cell_text_html(self, cell): Loading @@ -920,9 +948,19 @@ class TS2MDExtractor: Join paragraphs in a cell with <br>. Preserve tabs and basic HTML escaping. """ parts = [] for p in cell.paragraphs: parts.append(self.extract_paragraph(p, ExtractFormat.HTML)) return "<br>".join(parts) for first, last, p in with_flags(cell.paragraphs): html = self.extract_paragraph(p, ExtractFormat.HTML) if first and last and html.startswith("<p>"): html = html[3:-4] if html.startswith("<li") and (first or not parts[-1].startswith("<li")): parts.append("<ul>") if not first and parts[-1].startswith("<li") and not html.startswith("<li"): parts.append("</ul>") parts.append(html) if last and html.startswith("<li"): parts.append("</ul>") return "\n".join(parts) def get_paragraph_image_info(self, paragraph: Paragraph, i: int = 0): """ Loading Loading @@ -1008,3 +1046,19 @@ class TS2MDExtractor: lines.append("</table>") return "\n".join(lines) # --------------------------------------- # For some special cases in SAREF TSs # --------------------------------------- def extract_List_Paragraph( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): """List paragraphs with list level""" try: level = int(paragraph._element.xpath("w:pPr/w:numPr/w:ilvl/@w:val")[0]) except: level = 1 return level*4*" "+ "* " + self.extract_Normal(paragraph, extract_format).strip()
src/saref_pypeline/docgen/ts_generator.py +107 −42 Original line number Diff line number Diff line Loading @@ -46,6 +46,7 @@ from saref_pypeline.docgen.utils import ( EntityDescription, with_flags, materialize_links, pprint_xml ) from saref_pypeline.entities import ( SAREFCore, Loading Loading @@ -290,6 +291,12 @@ class TSGenerator: self._cursor = self._cursor._parent elif isinstance(self._cursor, Table): self.new_paragraph() elif isinstance(self._cursor, _Cell): new_p = OxmlElement("w:p") self._cursor._element.append(new_p) paragraph = Paragraph(new_p, self._cursor) self._cursor = paragraph def ensure_cursor_run(self) -> None: """ Loading @@ -306,10 +313,10 @@ class TSGenerator: def ensure_pstyle(self, style: P_STYLE, styling: Callable = None) -> Callable: def more_styling(el=None): if isinstance(self._cursor, Paragraph): self._cursor.style = style if styling: styling(el) if isinstance(self._cursor, Paragraph): self._cursor.style = style return more_styling Loading @@ -317,19 +324,19 @@ class TSGenerator: self, style_run: Callable[[Run], None], styling: Callable = None ) -> Callable: def more_styling(el=None): if isinstance(self._cursor, Run): style_run(self._cursor) if styling: styling(el) if isinstance(self._cursor, Run): style_run(self._cursor) return more_styling def ensure_cstyle(self, style: C_STYLE, styling: Callable = None) -> Callable: def more_styling(el=None): if isinstance(self._cursor, Run): self._cursor.style = style if styling: styling(el) if isinstance(self._cursor, Run): self._cursor.style = style return more_styling Loading Loading @@ -915,7 +922,12 @@ class TSGenerator: for p in soup.find_all("p"): p.attrs["class"] = "Normal" for li in soup.find_all("li"): li.string = re.sub(r" *: *", "\t", li.string, 1) for desc in li.descendants: if isinstance(desc, NavigableString): new_text, n = re.subn(r"\s*:\s+", "\t", str(desc), 1) if n: # replaced once desc.replace_with(new_text) break li["data-docx-pstyle"] = P_STYLE.EW self.insert_soup(soup) Loading Loading @@ -970,12 +982,13 @@ class TSGenerator: self.insert_soup_for_file("description") self.insert_soup_for_file("examples") self._is_appendix = True self.insert_soup_for_file("annexes") self.insert_soup_for_file("annexes", mandatory=False) self.describe_ontology() def insert_soup_for_file(self, file: str): def insert_soup_for_file(self, file: str, mandatory=True): soup = self.get_soup(file) if not soup: if mandatory: self.new_paragraph() self.new_run( f"File documentation/{file}.md does not exist.", style=C_STYLE.Guidance Loading Loading @@ -1027,7 +1040,7 @@ class TSGenerator: data = html_path.read_text(encoding="utf-8") elif md_path.exists(): data = markdown( md_path.read_text(encoding="utf-8"), extensions=["extra", "codehilite"] md_path.read_text(encoding="utf-8"), extensions=["extra", "admonition", "codehilite"] ) if not data: Loading @@ -1047,6 +1060,8 @@ class TSGenerator: if el == "\n": return self.ensure_cursor_paragraph() if styling: styling(el) self.new_run(el) if styling: styling(el) Loading Loading @@ -1074,20 +1089,56 @@ class TSGenerator: for child in el.children: self.insert_soup(child, styling) def insert_soup_div(self, el: Tag, styling: Callable = None) -> None: # only a few special cases of admonition are supported if "admonition" in el.get("class"): title = el.find("p", class_="admonition-title") body_parts = [p for p in el.find_all("p") if p is not title] if len(body_parts) >= 1: p = body_parts[0] if title: p.insert(0, *list(title.children)) else: p = title style = P_STYLE.NO if title and "NOTE" in title.getText() else P_STYLE.EX self.insert_soup_p(p, self.ensure_pstyle(style)) for p in body_parts[1:]: self.insert_soup_p(p, self.ensure_pstyle(P_STYLE.EW, styling)) else: self.new_paragraph() if styling: styling(el) self.insert_soup_children(el, styling) def insert_soup_br(self, el: Tag, styling: Callable = None) -> None: self.ensure_cursor_paragraph() self.new_paragraph() if styling: styling() def insert_soup_p(self, el: Tag, styling: Callable = None) -> None: self.ensure_cursor_paragraph() if any(parent.name == "table" for parent in el.parents) \ and len(self._cursor._p.getparent().xpath("./w:p")) == 1 \ and len(self._cursor._p.getparent().xpath("./w:p/w:r")) == 0: self.ensure_cursor_paragraph() # do nothing else: self.new_paragraph() if styling: styling(el) add_colon = False add_tab = False if el.getText().startswith("NOTE"): self._cursor.style = P_STYLE.NO add_colon = True add_tab = True elif el.getText().startswith("EXAMPLE"): self._cursor.style = P_STYLE.EX add_colon = True add_tab = True if add_colon: if add_tab: for child in el.children: if isinstance(child, NavigableString) and ":" in child.text: child.replace_with(re.sub(r":\w*", ":\t", child.text, 1)) Loading Loading @@ -1149,18 +1200,24 @@ class TSGenerator: def insert_soup_list(self, el: Tag, styling: Callable = None) -> None: for li in el.find_all("li", recursive=False): self.insert_soup_li(li, styling) def insert_soup_li(self, li: Tag, styling: Callable = None) -> None: self.new_paragraph() if styling: styling(li) if style := li.get("data-docx-pstyle", None): self._cursor.style = style style = self._cursor.style if "data-docx-pstyle" in li.attrs: style = li.get("data-docx-pstyle") styling = self.ensure_pstyle(style, styling) for child in li.children: if isinstance(child, Tag) and child.name == "ul": sub_p_style = SUB_UL_STYLE.get(style, None) sub_p_style = SUB_UL_STYLE.get(style.name, None) more_styling = self.ensure_pstyle(sub_p_style, styling) self.insert_soup_list(child, more_styling) elif isinstance(child, Tag) and child.name == "ol": sub_p_style = SUB_OL_STYLE.get(style, None) sub_p_style = SUB_OL_STYLE.get(style.name, None) more_styling = self.ensure_pstyle(sub_p_style, styling) self.insert_soup_list(child, more_styling) else: Loading Loading @@ -1336,6 +1393,7 @@ class TSGenerator: self._cursor = run styling() self._cursor = paragraph styling() # Assemble and append h.append(r) Loading Loading @@ -1444,18 +1502,25 @@ class TSGenerator: Add a <td> (table data cell). """ cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER # for _p in cell._tc.xpath("./w:p"): # cell._element.remove(_p) # self._cursor = cell self._cursor = cell.paragraphs[0] # self._cursor.alignment = WD_ALIGN_PARAGRAPH.CENTER # Decide style based on HTML attributes align = el.get("align", "").lower() style = el.get("style", "").lower() styles = dict( rule.strip().split(":", 1) for rule in style.split(";") if ":" in rule ) align = styles.get("text-align", "").strip() if align == "center": self._cursor.style = P_STYLE.TAC style = P_STYLE.TAC elif align == "right": self._cursor.style = P_STYLE.TAR style = P_STYLE.TAR else: self._cursor.style = P_STYLE.TAL # default align left self.insert_soup_children(el, styling) style = P_STYLE.TAL # default align left self.insert_soup_children(el, self.ensure_pstyle(style, styling)) # --------------------------------------------------------------------- # Methods for the ontology reference annex Loading Loading @@ -1861,7 +1926,7 @@ class TSGenerator: elif literal.datatype == URIRef( "http://www.iana.org/assignments/media-types/text/markdown" ): self.insert_soup(markdown(literal, extensions=["extra", "codehilite"])) self.insert_soup(markdown(literal, extensions=["extra", "admonition", "codehilite"])) else: self.new_run(literal.replace("\r", "")) Loading
src/saref_pypeline/docgen/utils.py +9 −0 Original line number Diff line number Diff line Loading @@ -9,6 +9,7 @@ from typing import TypeVar from rdflib.term import URIRef, Literal from rdflib import Graph, RDF, RDFS, OWL, XSD from dominate.tags import sup, a, li from lxml import etree from saref_pypeline.entities import SAREFGraphDocument Loading Loading @@ -205,3 +206,11 @@ def print_admonition(classes: str, title: str, md: str): prefix = 4 * " " content = md.replace("\n", f"\n{prefix}") return f'\n!!! {classes} "{title}"\n{prefix}{content}\n' def pprint_xml(entity): """ Pretty-print the XML of a python-docx entity (_element). """ entity = getattr(entity, "_element", entity) xml_bytes = etree.tostring(entity, pretty_print=True, encoding="unicode") print(xml_bytes)