Unverified Commit 6a695e1e authored by Maxime Lefrançois's avatar Maxime Lefrançois
Browse files

Adjustments to search and roundtrip for SAREF4BLDG

parent 3d9ef7ee
Loading
Loading
Loading
Loading
+2 −2
Original line number Original line Diff line number Diff line
@@ -84,5 +84,5 @@ owl:topObjectProperty a owl:ObjectProperty ;
"""
"""
)
)
PATTERN_TERM_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_-]+$")
PATTERN_TERM_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_-]+$")
CAPTION_TABLE_RE = re.compile(r"^\s*Table[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*(.*)\s*$")
CAPTION_TABLE_RE = re.compile(r"^\s*Table[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*([\s\S]*?)\s*$")
CAPTION_FIGURE_RE = re.compile(r"^\s*Figure[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*(.*)\s*$")
CAPTION_FIGURE_RE = re.compile(r"^\s*Figure[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*([\s\S]*?)\s*$")
+3 −0
Original line number Original line Diff line number Diff line
@@ -208,6 +208,9 @@ class P_STYLE(StrEnum, metaclass=_MetaEnum):
    # Style found in SAREF4AUTO V1.1.1
    # Style found in SAREF4AUTO V1.1.1
    List_Paragraph = "List Paragraph"
    List_Paragraph = "List Paragraph"


    # Style found in SAREF4BLDG V2.1.1
    Annotation_Text = "annotation text"



class C_STYLE(StrEnum, metaclass=_MetaEnum):
class C_STYLE(StrEnum, metaclass=_MetaEnum):
    """ETSI Character Styles, obtained from skeleton"""
    """ETSI Character Styles, obtained from skeleton"""
+1 −0
Original line number Original line Diff line number Diff line
@@ -1132,6 +1132,7 @@ class SAREFPipeline:
                return dumper.represent_str(data.name)
                return dumper.represent_str(data.name)


            yaml.SafeDumper.add_representer(WK_FIELD, represent_string)
            yaml.SafeDumper.add_representer(WK_FIELD, represent_string)
            os.makedirs(os.path.dirname(metadata_file), exist_ok=True)
            with open(metadata_file, "w") as f:
            with open(metadata_file, "w") as f:
                yaml.safe_dump(self.projects_metadata, f)
                yaml.safe_dump(self.projects_metadata, f)


+38 −5
Original line number Original line Diff line number Diff line
@@ -645,7 +645,9 @@ class TS2MDExtractor:
    ):
    ):
        """Reference, Example => use "tab" between "item/number" and "text"."""
        """Reference, Example => use "tab" between "item/number" and "text"."""
        md = self.extract_inner_content(paragraph)
        md = self.extract_inner_content(paragraph)
        if md.startswith("EXAMPLE"):
        if not md:
            return ""
        elif md.startswith("EXAMPLE"):
            # Example
            # Example
            left, right = md.split("\t", 1)
            left, right = md.split("\t", 1)
            content = right.replace("\n", "\n    ")
            content = right.replace("\n", "\n    ")
@@ -797,7 +799,8 @@ class TS2MDExtractor:
    ):
    ):
        """Programming language"""
        """Programming language"""
        md = self.extract_inner_content(paragraph, extract_format)
        md = self.extract_inner_content(paragraph, extract_format)
        return f"```\n{md}\n```"
        md = md.replace("&", "&amp;").replace("<", "&lt;")
        return f"""<pre><code class="language-turtle">{md}</code></pre>""" if md else ""


    def extract_EQ(
    def extract_EQ(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
@@ -830,7 +833,11 @@ class TS2MDExtractor:
    <figcaption>{label}: {caption}</figcaption>
    <figcaption>{label}: {caption}</figcaption>
</figure>\n"""
</figure>\n"""
        else:
        else:
            return f"**{md}**"
            pre_code = self.get_paragraph_pre_code(paragraph)
            return f"""<figure>
    {pre_code}
    <figcaption>{label}: {caption}</figcaption>
</figure>\n"""


    def extract_FL(
    def extract_FL(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
@@ -997,6 +1004,24 @@ class TS2MDExtractor:
        image_paragraph = get_prev_block(self.document, paragraph)
        image_paragraph = get_prev_block(self.document, paragraph)
        return self.get_paragraph_image_info(image_paragraph, i + 1)
        return self.get_paragraph_image_info(image_paragraph, i + 1)


    def get_paragraph_pre_code(self, block, i: int = 0) -> str:
        """
        Return the listing stored in the table at or before ``block`` as HTML.

        Listings are expected to live in a table with exactly one row and one
        column whose single cell contains turtle code. Starting from ``block``,
        the preceding blocks of the document are walked until a table is found.

        Args:
            block: Block to start from (normally the caption paragraph). May be
                None, in which case no listing is found.
            i: Number of lookback steps already consumed. At most four blocks
                (steps 0 to 3) are inspected in total.

        Returns:
            A ``<pre><code class="language-turtle">`` fragment with ``&`` and
            ``<`` escaped, or an empty string when no suitable table is found
            or the cell is empty.
        """
        depth = i
        while depth <= 3:
            if block is None:
                # No previous block is available: stop instead of handing None
                # back to get_prev_block, which dereferences target._element.
                return ""
            if isinstance(block, Table):
                break
            block = get_prev_block(self.document, block)
            depth += 1
        else:
            # Lookback budget exhausted without meeting a table.
            return ""

        if len(block.rows) != 1 or len(block.columns) != 1:
            logger.warning("expected table with one line and one column for Listings")
            return ""

        cell = block.cell(0, 0)
        ttl = "\n".join(p.text for p in cell.paragraphs)
        # Escape the characters that would otherwise be parsed as HTML markup.
        ttl = ttl.replace("&", "&amp;").replace("<", "&lt;")
        return f"""<pre><code class="language-turtle">{ttl}</code></pre>""" if ttl else ""

    def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.HTML):
    def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.HTML):
        """
        """
        Generate HTML for a python-docx Table.
        Generate HTML for a python-docx Table.
@@ -1007,6 +1032,10 @@ class TS2MDExtractor:
        """
        """
        assert isinstance(table, Table)
        assert isinstance(table, Table)


        # ignore if nrows=1 and ncols=1 like in SAREF4BLDG
        if len(table.rows) == 1 and len(table.columns)==1:
            return
        
        cap_id, cap_text = self.caption_from_prev_paragraph(table)
        cap_id, cap_text = self.caption_from_prev_paragraph(table)


        if cap_text and "Prefixes and namespaces" in cap_text:
        if cap_text and "Prefixes and namespaces" in cap_text:
@@ -1118,6 +1147,10 @@ class TS2MDExtractor:
        else:
        else:
            return ""
            return ""


    def extract_Annotation_Text(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Extract an "annotation text" paragraph by delegating to ``extract_Normal``.

        Both arguments are forwarded unchanged, and the result of
        ``extract_Normal`` is returned as-is.
        """
        return self.extract_Normal(paragraph, extract_format)


def print_admonition(classes: str, title: str, md: str):
def print_admonition(classes: str, title: str, md: str):
    prefix = 4 * " "
    prefix = 4 * " "
@@ -1126,7 +1159,7 @@ def print_admonition(classes: str, title: str, md: str):




def get_prev_block(document: Document, target):
def get_prev_block(document: Document, target):
    """Return the block (Paragraph or Table) that appears immediately before target_tbl."""
    """Return the block (Paragraph or Table) that appears immediately before target."""
    prev = None
    prev = None
    for block in iter_block_items(document):
    for block in iter_block_items(document):
        if isinstance(block, Table | Paragraph) and block._element is target._element:
        if isinstance(block, Table | Paragraph) and block._element is target._element:
+12 −3
Original line number Original line Diff line number Diff line
@@ -902,11 +902,11 @@ class TSGenerator:
        informative_soup = BeautifulSoup()
        informative_soup = BeautifulSoup()
        normative_h2 = normative_soup.find("h2")
        normative_h2 = normative_soup.find("h2")
        if normative_h2:
        if normative_h2:
            informative_h3 = normative_h2.find_next("h2")
            informative_h2 = normative_h2.find_next("h2")
            for el in informative_h3.find_next_siblings():
            for el in informative_h2.find_next_siblings():
                el.extract()
                el.extract()
                informative_soup.append(el)
                informative_soup.append(el)
            for h2 in normative_soup.find_all("h2"):
            for h2 in normative_soup.find_all(re.compile("h1|h2")):
                h2.extract()
                h2.extract()


            self.insert_references("Normative references", normative_soup)
            self.insert_references("Normative references", normative_soup)
@@ -1380,6 +1380,8 @@ class TSGenerator:
            if isinstance(child, Tag):
            if isinstance(child, Tag):
                if child.name == "img":
                if child.name == "img":
                    self.insert_soup_img(child, text_width)
                    self.insert_soup_img(child, text_width)
                if child.name == "pre":
                    self.insert_soup_pre(child)
                elif child.name == "figcaption":
                elif child.name == "figcaption":
                    self.insert_soup_figcaption(child)
                    self.insert_soup_figcaption(child)


@@ -1463,6 +1465,13 @@ class TSGenerator:
        with Bookmark(self, id):
        with Bookmark(self, id):
            self.insert_soup_children(el)
            self.insert_soup_children(el)


    def insert_soup_pre(self, pre: Tag) -> None:
        """Insert the text of a ``<pre>`` element as a ``P_STYLE.PL`` paragraph.

        The supported shape is a ``<pre>`` whose only child is a ``<code>``
        element; its text is inserted verbatim. For any other shape a warning
        is logged and the whole text content of the ``<pre>`` is used instead,
        so an empty or multi-child ``<pre>`` neither raises nor loses content.

        Args:
            pre: The ``<pre>`` tag to convert.
        """
        children = pre.contents
        if len(children) == 1 and children[0].name == "code":
            code = children[0].text
        else:
            logger.warning("Not supported <pre> element")
            # Fall back to the full text rather than indexing contents[0],
            # which fails on an empty <pre> and truncates a multi-child one.
            code = pre.get_text()
        self.new_paragraph(code, style=P_STYLE.PL)

            
    def insert_soup_a(self, a: Tag) -> None:
    def insert_soup_a(self, a: Tag) -> None:
        self.ensure_cursor_paragraph()
        self.ensure_cursor_paragraph()
        if href := a.get("href"):
        if href := a.get("href"):
Loading