Unverified Commit afabd38a authored by Maxime Lefrançois's avatar Maxime Lefrançois
Browse files

roundtrip TS->md->TS for auto, city, ehaw

parent 75a1b878
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -158,6 +158,10 @@ def main():
        sys.exit()

    kwargs = vars(args)
    if not os.path.isdir(args.directory):
        print(f"Error: directory {args.directory} does not exist", file=sys.stderr)
        sys.exit(-2)

    pipeline = SAREFPipeline(**kwargs)
    # with Profile() as profile:

+8 −3
Original line number Diff line number Diff line
@@ -137,7 +137,9 @@ class SiteManager:

        # html documentation
        html = docgen.render_ontology_documentation(project_version.ontology)
        Path(target_dir, project_version.ontology.name + ".html").write_text(html)
        dest = Path(target_dir, project_version.ontology.name + ".html")
        dest.write_text(html)
        logger.debug(f"HTML page for {project_version.ontology}: {dest}")

        if project_version.examples:
            os.makedirs(os.path.join(target_dir, "example"), exist_ok=True)
@@ -196,8 +198,11 @@ class SiteManager:
        m, t, e = (x.zfill(2) for x in docxgen.context[WK_FIELD.mte].split("."))
        document_name = f"ts_{odd}{ddd}{part}v{m}{t}{e}_{project_version.ontology.name}_{time_formatted}.docx"
        os.makedirs(self.ts_dir, exist_ok=True)
        document.save(os.path.join(self.ts_dir, document_name))
        # os.system(f'cmd.exe /C start "{self.ts_dir}/{document_name}"')
        dest = os.path.join(self.ts_dir, document_name)
        document.save(dest)
        logger.info(f"TS for {project_version} generated at {dest}")

        os.system(f'cmd.exe /C start "{self.ts_dir}/{document_name}"')

    def extract_from_ts(self, project_version: SAREFProjectVersion):
        project = project_version.project
+96 −42
Original line number Diff line number Diff line
@@ -4,12 +4,12 @@ import os
import re
from pathlib import Path
from functools import cache, cached_property, lru_cache
import shutil
from typing import Any, Callable, Dict, Generator, List, Tuple
from copy import deepcopy
from datetime import datetime
from functools import cache
import zipfile
from lxml import etree
from dotenv import dotenv_values
import requests
import platform
@@ -46,6 +46,7 @@ from docx.enum.table import (
    WD_CELL_VERTICAL_ALIGNMENT,
)
from docx.oxml import OxmlElement
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
@@ -53,7 +54,8 @@ from docx.text.paragraph import Paragraph
from docx.text.hyperlink import Hyperlink
from docx.shape import InlineShape

from saref_pypeline.docgen.utils import OWL_GRAPH, print_admonition
from saref_pypeline._logging import TRACE_LEVEL
from saref_pypeline.docgen.utils import OWL_GRAPH, print_admonition, with_flags, pprint_xml
from saref_pypeline.docgen.docxutils import iter_block_items, get_prev_block
from saref_pypeline.entities import (
    SAREFCore,
@@ -101,14 +103,6 @@ class RunContext:
        for markup in Markup.__members__.values():
            setattr(self, markup.style, None)

def pprint_xml(entity):
    """
    Print an indented XML dump of a python-docx entity.

    If ``entity`` exposes an ``_element`` attribute, that element is
    serialised; otherwise ``entity`` itself is serialised.
    """
    node = getattr(entity, "_element", entity)
    # encoding="unicode" makes lxml return text rather than bytes.
    print(etree.tostring(node, encoding="unicode", pretty_print=True))


def open_url(url: str) -> None:
    """
@@ -368,12 +362,19 @@ class TS2MDExtractor:
        if not os.path.isdir(self.doc_folder):
            self.doc_folder = os.path.join(self.pipeline.sources_dir, "ts")

        # self.out_folder = os.path.join(
        #     self.pipeline.target_dir,
        #     "ts",
        #     self.project.name,
        #     os.path.splitext(filename)[0],
        # )

        self.out_folder = os.path.join(
            self.pipeline.target_dir,
            "ts",
            self.project.name,
            self.pipeline.directory,
            "documentation",
            os.path.splitext(filename)[0],
        )
        shutil.rmtree(self.out_folder, ignore_errors=True)

        self.file_path = os.path.join(self.doc_folder, filename)
        if not os.path.isfile(self.file_path):
@@ -383,6 +384,8 @@ class TS2MDExtractor:
            if not confirm:
                return

        logger.log(TRACE_LEVEL, f"Extracting from TS {self.project_version} with file {self.file_path}" )

        self.extract_figures()

        self.document = Document(self.file_path)
@@ -411,6 +414,8 @@ class TS2MDExtractor:
            Path(self.out_folder, f"annex_{chr(code_char)}.md").write_text(md, "utf-8")
            code_char += 1

        logger.debug(f"Extraction complete: {self.out_folder}")

    def get_docx_url(self) -> Tuple[str, str]:
        """
        Compute the ETSI TS docx download URL from work item metadata.
@@ -562,7 +567,7 @@ class TS2MDExtractor:

    def extract_paragraph(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
    ) -> str:
        try:
            style = P_STYLE(paragraph.style.name)
        except:
@@ -589,14 +594,24 @@ class TS2MDExtractor:
        # keep track of spaces, as md markup needs to be right before/after non-space characters
        # invariant: last character of last item in ctx.content is a non-blank character

        for child in paragraph.iter_inner_content():
            if isinstance(child, Hyperlink):
        # We cannot use the python-docx method iter_inner_content, as it only yields runs and hyperlinks:
        # elements such as <fldSimple> would be skipped, so the paragraph children are selected by XPath instead.
        # Special elements like <bookmarkStart> and <bookmarkEnd> are still ignored.
        for _c in paragraph._p.xpath("./w:r | ./w:hyperlink | ./w:fldSimple"):
            _c:BaseOxmlElement
            if _c.tag == qn("w:r"):
                child = Run(_c, paragraph)
                extract_run(child, ctx)
            elif _c.tag == qn("w:hyperlink"):
                child = Hyperlink(_c, paragraph)
                if self.extract_hyperlink_necessary(child):
                    extract_hyperlink(child, ctx)
                else:
                    for run in child.runs:
                        extract_run(run, ctx)
            elif isinstance(child, Run):
            elif _c.tag == qn("w:fldSimple"):
                for _r in _c.xpath("./w:r", namespaces=_c.nsmap):
                    child = Run(_r, paragraph)
                    extract_run(child, ctx)

        for markup in [Markup.STRONG, Markup.EM, Markup.SUP, Markup.CODE]:
@@ -618,7 +633,7 @@ class TS2MDExtractor:
            md = match.group(2)
        elif match := re.match(r"^Annex.*?\n(.*)", md):
            md = match.group(1)
        return prefix + md + "\n"
        return "\n" + prefix + md + "\n"

    def extract_Heading_1(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
@@ -702,38 +717,44 @@ class TS2MDExtractor:
        level: int,
        extract_format: ExtractFormat = ExtractFormat.MD,
    ):
        md = self.extract_inner_content(paragraph)
        return print_admonition(f"indent-{level}", "", md)
        content = self.extract_inner_content(paragraph, extract_format)
        if extract_format == ExtractFormat.MD:
            return print_admonition(f"indent-{level}", "", content)
        else:
            return f"""<li data-docx-pstyle="B{level}">{content}</li>"""

    def extract_B1(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Indent 1.

        A B1 paragraph whose direct parent element is a table cell (``w:tc``)
        is delegated to ``extract_TB1``. Any other B1 paragraph is rendered by
        ``extract_B`` at level 1 in the requested ``extract_format``.
        """
        # Exception: inside a table cell, B1 is handled like the TB1 style.
        if paragraph._element.getparent().tag == qn("w:tc"):
            return self.extract_TB1(paragraph, extract_format)
        return self.extract_B(paragraph, 1, extract_format)

    def extract_B2(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Indent 2.

        Delegates to ``extract_B`` at level 2, forwarding ``extract_format`` so
        the caller's requested output format is honoured.
        """
        return self.extract_B(paragraph, 2, extract_format)

    def extract_B3(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Indent 3.

        Delegates to ``extract_B`` at level 3, forwarding ``extract_format`` so
        the caller's requested output format is honoured.
        """
        return self.extract_B(paragraph, 3, extract_format)

    def extract_B4(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Indent 4.

        Delegates to ``extract_B`` at level 4, forwarding ``extract_format`` so
        the caller's requested output format is honoured.
        """
        return self.extract_B(paragraph, 4, extract_format)

    def extract_B5(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Indent 5.

        Delegates to ``extract_B`` at level 5, forwarding ``extract_format`` so
        the caller's requested output format is honoured.
        """
        return self.extract_B(paragraph, 5, extract_format)

    def extract_B_plus(
        self,
@@ -742,26 +763,29 @@ class TS2MDExtractor:
        extract_format: ExtractFormat = ExtractFormat.MD,
    ):
        """Bulleted indent 1 (round bullets)"""
        md = self.extract_inner_content(paragraph)
        md = self.extract_inner_content(paragraph, extract_format)
        return level * " " + f"* {md}"

    def extract_B1_plus(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Bulleted indent 1 (round bullets).

        A paragraph whose direct parent element is a table cell (``w:tc``) is
        delegated to ``extract_TB1``. Otherwise it is rendered by
        ``extract_B_plus`` with a ``level`` of 0 in the requested
        ``extract_format``.
        """
        # Exception: inside a table cell, B1+ is handled like the TB1 style.
        if paragraph._element.getparent().tag == qn("w:tc"):
            return self.extract_TB1(paragraph, extract_format)
        return self.extract_B_plus(paragraph, 0, extract_format)

    def extract_B2_plus(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Bulleted indent 2 (dashes).

        Delegates to ``extract_B_plus`` with a ``level`` of 4, forwarding
        ``extract_format`` so the caller's requested output format is honoured.
        """
        return self.extract_B_plus(paragraph, 4, extract_format)

    def extract_B3_plus(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Bulleted indent 3 (square bullets).

        Delegates to ``extract_B_plus`` with a ``level`` of 8, forwarding
        ``extract_format`` so the caller's requested output format is honoured.
        """
        return self.extract_B_plus(paragraph, 8, extract_format)

    def extract_BN(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
@@ -784,7 +808,10 @@ class TS2MDExtractor:
    ):
        """Standard paragraph, Definition"""
        md = self.extract_inner_content(paragraph, extract_format)
        if extract_format == ExtractFormat.MD:
            return f"\n{md}\n"
        else:
            return f"<p>{md}</p>"

    def extract_TT(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
@@ -827,7 +854,7 @@ class TS2MDExtractor:
        <figcaption>{label}: {caption}</figcaption>
    </figure>\n"""
        else:
            return "**{md}**"
            return f"**{md}**"

    def extract_FL(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
@@ -878,19 +905,21 @@ class TS2MDExtractor:
    ):
        """List in tables Level 1"""
        html = self.extract_inner_content(paragraph, extract_format)
        return f"\n<li>{html}</li>"
        return f"""<li data-docx-pstyle="TB1">{html}</li>"""

    def extract_TB2(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
    ):
        """List in tables Level 2.

        Returns the paragraph's inner content wrapped in an ``<li>`` element
        whose ``data-docx-pstyle`` attribute is ``TB2``, so the level-2 style
        is carried in the generated markup.
        """
        html = self.extract_inner_content(paragraph, extract_format)
        return f"""<li data-docx-pstyle="TB2">{html}</li>"""

    def extract_TAN(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
    ):
        """Note in table => use "tab" between "item/number" and "text".

        Returns the paragraph's inner content wrapped in an ``<li>`` element
        whose ``data-docx-pstyle`` attribute is ``TAN``, preceded by a newline.
        """
        html = self.extract_inner_content(paragraph, extract_format)
        return f"""\n<li data-docx-pstyle="TAN">{html}</li>"""

    # ---------------------------------------
    # For tables
@@ -905,14 +934,13 @@ class TS2MDExtractor:
            return "Table_id_unknown", "label_unknown"
        prev = get_prev_block(self.document, table)
        if isinstance(prev, Paragraph) and getattr(prev.style, "name", None) == "TH":
            m = CAPTION_TABLE_RE.match(prev.text or "")
            md = self.extract_inner_content(prev)
            m = CAPTION_TABLE_RE.match(md)
            if m:
                num = m.group(1)
                title = m.group(2) or ""
                cap_id = f"Table_{num}"
                # Use the full visible text as caption: "Table N: Title"
                visible = prev.text.strip()
                return cap_id, visible
                return cap_id, md
        return self.caption_from_prev_paragraph(prev, i + 1)

    def cell_text_html(self, cell):
@@ -920,9 +948,19 @@ class TS2MDExtractor:
        Join paragraphs in a cell with <br>. Preserve tabs and basic HTML escaping.
        """
        parts = []
        for p in cell.paragraphs:
            parts.append(self.extract_paragraph(p, ExtractFormat.HTML))
        return "<br>".join(parts)
        for first, last, p in with_flags(cell.paragraphs):
            html = self.extract_paragraph(p, ExtractFormat.HTML)
            if first and last and html.startswith("<p>"):
                html = html[3:-4]

            if html.startswith("<li") and (first or not parts[-1].startswith("<li")):
                parts.append("<ul>")
            if not first and parts[-1].startswith("<li") and not html.startswith("<li"):
                parts.append("</ul>")
            parts.append(html)
            if last and html.startswith("<li"):
                parts.append("</ul>")
        return "\n".join(parts)

    def get_paragraph_image_info(self, paragraph: Paragraph, i: int = 0):
        """
@@ -1008,3 +1046,19 @@ class TS2MDExtractor:

        lines.append("</table>")
        return "\n".join(lines)

    # ---------------------------------------
    # For some special cases in SAREF TSs 
    # ---------------------------------------

    def extract_List_Paragraph(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """List paragraphs with list level.

        The level is read from ``w:pPr/w:numPr/w:ilvl/@w:val`` on the
        paragraph element. When that attribute is absent or is not an integer,
        level 1 is used. The result is a ``* `` bullet indented by four spaces
        per level, followed by the stripped output of ``extract_Normal``.
        """
        levels = paragraph._element.xpath("w:pPr/w:numPr/w:ilvl/@w:val")
        try:
            level = int(levels[0])
        except (IndexError, ValueError):
            # No numbering level on this paragraph, or a non-numeric value.
            level = 1

        body = self.extract_Normal(paragraph, extract_format).strip()
        return level * 4 * " " + "* " + body
+107 −42
Original line number Diff line number Diff line
@@ -46,6 +46,7 @@ from saref_pypeline.docgen.utils import (
    EntityDescription,
    with_flags,
    materialize_links,
    pprint_xml
)
from saref_pypeline.entities import (
    SAREFCore,
@@ -290,6 +291,12 @@ class TSGenerator:
                self._cursor = self._cursor._parent
        elif isinstance(self._cursor, Table):
            self.new_paragraph()
        elif isinstance(self._cursor, _Cell):
            new_p = OxmlElement("w:p")
            self._cursor._element.append(new_p)
            paragraph = Paragraph(new_p, self._cursor)
            self._cursor = paragraph
            

    def ensure_cursor_run(self) -> None:
        """
@@ -306,10 +313,10 @@ class TSGenerator:

    def ensure_pstyle(self, style: P_STYLE, styling: Callable = None) -> Callable:
        """Return a styling callback that finishes by applying paragraph style ``style``.

        The returned callable first invokes ``styling`` (when given) with the
        same ``el`` argument, then, if the cursor is a ``Paragraph``, assigns
        ``style`` to it.
        """

        def more_styling(el=None):
            # Run the wrapped callback first: the cursor is inspected only
            # afterwards, so ``style`` is applied to whatever paragraph is the
            # cursor once ``styling`` has run, and it overrides any paragraph
            # style that callback may have set.
            if styling:
                styling(el)
            if isinstance(self._cursor, Paragraph):
                self._cursor.style = style

        return more_styling

@@ -317,19 +324,19 @@ class TSGenerator:
        self, style_run: Callable[[Run], None], styling: Callable = None
    ) -> Callable:
        def more_styling(el=None):
            if isinstance(self._cursor, Run):
                style_run(self._cursor)
            if styling:
                styling(el)
            if isinstance(self._cursor, Run):
                style_run(self._cursor)

        return more_styling

    def ensure_cstyle(self, style: C_STYLE, styling: Callable = None) -> Callable:
        """Return a styling callback that finishes by applying character style ``style``.

        The returned callable first invokes ``styling`` (when given) with the
        same ``el`` argument, then, if the cursor is a ``Run``, assigns
        ``style`` to it.
        """

        def more_styling(el=None):
            # Run the wrapped callback first: the cursor is inspected only
            # afterwards, so ``style`` is applied to whatever run is the cursor
            # once ``styling`` has run, and it overrides any run style that
            # callback may have set.
            if styling:
                styling(el)
            if isinstance(self._cursor, Run):
                self._cursor.style = style

        return more_styling

@@ -915,7 +922,12 @@ class TSGenerator:
        for p in soup.find_all("p"):
            p.attrs["class"] = "Normal"
        for li in soup.find_all("li"):
            li.string = re.sub(r" *: *", "\t", li.string, 1)
            for desc in li.descendants:
                if isinstance(desc, NavigableString):
                    new_text, n = re.subn(r"\s*:\s+", "\t", str(desc), 1)
                    if n:  # replaced once
                        desc.replace_with(new_text)
                        break
            li["data-docx-pstyle"] = P_STYLE.EW
        self.insert_soup(soup)

@@ -970,12 +982,13 @@ class TSGenerator:
        self.insert_soup_for_file("description")
        self.insert_soup_for_file("examples")
        self._is_appendix = True
        self.insert_soup_for_file("annexes")
        self.insert_soup_for_file("annexes", mandatory=False)
        self.describe_ontology()

    def insert_soup_for_file(self, file: str):
    def insert_soup_for_file(self, file: str, mandatory=True):
        soup = self.get_soup(file)
        if not soup:
            if mandatory:
                self.new_paragraph()
                self.new_run(
                    f"File documentation/{file}.md does not exist.", style=C_STYLE.Guidance
@@ -1027,7 +1040,7 @@ class TSGenerator:
            data = html_path.read_text(encoding="utf-8")
        elif md_path.exists():
            data = markdown(
                md_path.read_text(encoding="utf-8"), extensions=["extra", "codehilite"]
                md_path.read_text(encoding="utf-8"), extensions=["extra", "admonition", "codehilite"]
            )

        if not data:
@@ -1047,6 +1060,8 @@ class TSGenerator:
            if el == "\n":
                return
            self.ensure_cursor_paragraph()
            if styling:
                styling(el)
            self.new_run(el)
            if styling:
                styling(el)
@@ -1074,20 +1089,56 @@ class TSGenerator:
        for child in el.children:
            self.insert_soup(child, styling)

    def insert_soup_div(self, el: Tag, styling: Callable = None) -> None:
        """Insert a ``<div>`` element.

        Only a few special cases of admonition are supported. For a div whose
        classes include ``admonition``, the children of the
        ``p.admonition-title`` element are prepended to the first body ``<p>``
        (or the title itself is used when there is no body). That paragraph is
        inserted with style ``P_STYLE.NO`` when the title text contains
        ``NOTE`` and ``P_STYLE.EX`` otherwise; remaining body paragraphs are
        inserted with ``P_STYLE.EW``.

        Any other div starts a new paragraph, applies ``styling`` and inserts
        the div's children.
        """
        # A <div> without a class attribute yields None from get().
        classes = el.get("class") or []
        if "admonition" not in classes:
            self.new_paragraph()
            if styling:
                styling(el)
            self.insert_soup_children(el, styling)
            return

        title = el.find("p", class_="admonition-title")
        body_parts = [p for p in el.find_all("p") if p is not title]

        # Decide the style before the title's children are moved into the
        # body paragraph: once moved, the title no longer holds that text.
        is_note = title is not None and "NOTE" in title.getText()
        style = P_STYLE.NO if is_note else P_STYLE.EX

        if body_parts:
            first = body_parts[0]
            if title is not None:
                first.insert(0, *list(title.children))
        else:
            first = title
        if first is None:
            # Admonition with neither a title nor a body paragraph.
            return

        self.insert_soup_p(first, self.ensure_pstyle(style))
        for p in body_parts[1:]:
            self.insert_soup_p(p, self.ensure_pstyle(P_STYLE.EW, styling))
                 
        

    def insert_soup_br(self, el: Tag, styling: Callable = None) -> None:
        """Handle a ``<br>`` element by requesting a new paragraph.

        ``el`` is not read. ``styling``, when given, is invoked with no
        argument after ``new_paragraph()`` has been called.
        """
        self.ensure_cursor_paragraph()
        self.new_paragraph()
        if styling:
            # Called without the element, unlike insert_soup_div and
            # insert_soup_p, which pass ``el`` to the callback.
            styling()
        
    def insert_soup_p(self, el: Tag, styling: Callable = None) -> None:
        self.ensure_cursor_paragraph()
        if any(parent.name == "table" for parent in el.parents) \
            and len(self._cursor._p.getparent().xpath("./w:p")) == 1 \
            and len(self._cursor._p.getparent().xpath("./w:p/w:r")) == 0:
                self.ensure_cursor_paragraph() # do nothing
        else: 
            self.new_paragraph()
        
        if styling:
            styling(el)

        add_colon = False
        add_tab = False
        if el.getText().startswith("NOTE"):
            self._cursor.style = P_STYLE.NO
            add_colon = True
            add_tab = True
        elif el.getText().startswith("EXAMPLE"):
            self._cursor.style = P_STYLE.EX
            add_colon = True
            add_tab = True

        if add_colon:
        if add_tab:
            for child in el.children:
                if isinstance(child, NavigableString) and ":" in child.text:
                    child.replace_with(re.sub(r":\w*", ":\t", child.text, 1))
@@ -1149,18 +1200,24 @@ class TSGenerator:

    def insert_soup_list(self, el: Tag, styling: Callable = None) -> None:
        for li in el.find_all("li", recursive=False):
            self.insert_soup_li(li, styling)

    def insert_soup_li(self, li: Tag, styling: Callable = None) -> None:
        self.new_paragraph()
        if styling:
            styling(li)
            if style := li.get("data-docx-pstyle", None):
                self._cursor.style = style
        style = self._cursor.style
        if "data-docx-pstyle" in li.attrs:
            style = li.get("data-docx-pstyle")
            styling = self.ensure_pstyle(style, styling)

        for child in li.children:
            if isinstance(child, Tag) and child.name == "ul":
                    sub_p_style = SUB_UL_STYLE.get(style, None)
                sub_p_style = SUB_UL_STYLE.get(style.name, None)
                more_styling = self.ensure_pstyle(sub_p_style, styling)
                self.insert_soup_list(child, more_styling)
            elif isinstance(child, Tag) and child.name == "ol":
                    sub_p_style = SUB_OL_STYLE.get(style, None)
                sub_p_style = SUB_OL_STYLE.get(style.name, None)
                more_styling = self.ensure_pstyle(sub_p_style, styling)
                self.insert_soup_list(child, more_styling)
            else:
@@ -1336,6 +1393,7 @@ class TSGenerator:
            self._cursor = run
            styling()
            self._cursor = paragraph
            styling()

        # Assemble and append
        h.append(r)
@@ -1444,18 +1502,25 @@ class TSGenerator:
        Add a <td> (table data cell).
        """
        cell.vertical_alignment = WD_CELL_VERTICAL_ALIGNMENT.CENTER
        # for _p in cell._tc.xpath("./w:p"):
        #     cell._element.remove(_p)
        # self._cursor = cell
        self._cursor = cell.paragraphs[0]
        # self._cursor.alignment = WD_ALIGN_PARAGRAPH.CENTER
        
        # Decide style based on HTML attributes
        align = el.get("align", "").lower()
        style = el.get("style", "").lower()
        styles = dict(
            rule.strip().split(":", 1)
            for rule in style.split(";") if ":" in rule
        )
        align = styles.get("text-align", "").strip()
        if align == "center":
            self._cursor.style = P_STYLE.TAC
            style = P_STYLE.TAC
        elif align == "right":
            self._cursor.style = P_STYLE.TAR
            style = P_STYLE.TAR
        else:
            self._cursor.style = P_STYLE.TAL  # default align left
        self.insert_soup_children(el, styling)
            style = P_STYLE.TAL  # default align left
        self.insert_soup_children(el, self.ensure_pstyle(style, styling))

    # ---------------------------------------------------------------------
    # Methods for the ontology reference annex
@@ -1861,7 +1926,7 @@ class TSGenerator:
        elif literal.datatype == URIRef(
            "http://www.iana.org/assignments/media-types/text/markdown"
        ):
            self.insert_soup(markdown(literal, extensions=["extra", "codehilite"]))
            self.insert_soup(markdown(literal, extensions=["extra", "admonition", "codehilite"]))
        else:
            self.new_run(literal.replace("\r", ""))

+9 −0
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@ from typing import TypeVar
from rdflib.term import URIRef, Literal
from rdflib import Graph, RDF, RDFS, OWL, XSD
from dominate.tags import sup, a, li
from lxml import etree

from saref_pypeline.entities import SAREFGraphDocument

@@ -205,3 +206,11 @@ def print_admonition(classes: str, title: str, md: str):
    prefix = 4 * " "
    content = md.replace("\n", f"\n{prefix}")
    return f'\n!!! {classes} "{title}"\n{prefix}{content}\n'

def pprint_xml(entity):
    """
    Dump the XML behind a python-docx entity to standard output, indented.

    Objects exposing an ``_element`` attribute are unwrapped to that element;
    anything else is serialised as given.
    """
    target = getattr(entity, "_element", entity)
    # encoding="unicode" makes lxml return text rather than bytes.
    serialized = etree.tostring(target, encoding="unicode", pretty_print=True)
    print(serialized)
Loading