Loading src/saref_pypeline/constants.py +2 −2 Original line number Original line Diff line number Diff line Loading @@ -84,5 +84,5 @@ owl:topObjectProperty a owl:ObjectProperty ; """ """ ) ) PATTERN_TERM_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_-]+$") PATTERN_TERM_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_-]+$") CAPTION_TABLE_RE = re.compile(r"^\s*Table[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*(.*)\s*$") CAPTION_TABLE_RE = re.compile(r"^\s*Table[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*([\s\S]*?)\s*$") CAPTION_FIGURE_RE = re.compile(r"^\s*Figure[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*(.*)\s*$") CAPTION_FIGURE_RE = re.compile(r"^\s*Figure[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*([\s\S]*?)\s*$") src/saref_pypeline/etsi.py +3 −0 Original line number Original line Diff line number Diff line Loading @@ -208,6 +208,9 @@ class P_STYLE(StrEnum, metaclass=_MetaEnum): # Style found in SAREF4AUTO V1.1.1 # Style found in SAREF4AUTO V1.1.1 List_Paragraph = "List Paragraph" List_Paragraph = "List Paragraph" # Style found in SAREF4BLDG V2.1.1 Annotation_Text = "annotation text" class C_STYLE(StrEnum, metaclass=_MetaEnum): class C_STYLE(StrEnum, metaclass=_MetaEnum): """ETSI Character Styles, obtained from skeleton""" """ETSI Character Styles, obtained from skeleton""" Loading src/saref_pypeline/pipeline.py +1 −0 Original line number Original line Diff line number Diff line Loading @@ -1132,6 +1132,7 @@ class SAREFPipeline: return dumper.represent_str(data.name) return dumper.represent_str(data.name) yaml.SafeDumper.add_representer(WK_FIELD, represent_string) yaml.SafeDumper.add_representer(WK_FIELD, represent_string) os.makedirs(os.path.dirname(metadata_file), exist_ok=True) with open(metadata_file, "w") as f: with open(metadata_file, "w") as f: yaml.safe_dump(self.projects_metadata, f) yaml.safe_dump(self.projects_metadata, f) Loading src/saref_pypeline/ts/ts2md_extractor.py +38 −5 Original line number Original line Diff line number Diff line Loading @@ -645,7 +645,9 @@ class TS2MDExtractor: ): ): """Reference, Example => use "tab" between "item/number" and "text".""" """Reference, Example => use "tab" between "item/number" and "text".""" md = self.extract_inner_content(paragraph) md = self.extract_inner_content(paragraph) if md.startswith("EXAMPLE"): if not md: return "" elif md.startswith("EXAMPLE"): # Example # Example left, right = md.split("\t", 1) left, right = md.split("\t", 1) content = right.replace("\n", "\n ") content = right.replace("\n", "\n ") Loading Loading @@ -797,7 +799,8 @@ class TS2MDExtractor: ): ): """Programming language""" """Programming language""" md = self.extract_inner_content(paragraph, extract_format) md = self.extract_inner_content(paragraph, extract_format) return f"```\n{md}\n```" md = md.replace("&", "&").replace("<", "<") return f"""<pre><code class="language-turtle">{md}</code></pre>""" if md else "" def extract_EQ( def extract_EQ( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading Loading @@ -830,7 +833,11 @@ class TS2MDExtractor: <figcaption>{label}: {caption}</figcaption> <figcaption>{label}: {caption}</figcaption> </figure>\n""" </figure>\n""" else: else: return f"**{md}**" pre_code = self.get_paragraph_pre_code(paragraph) return f"""<figure> {pre_code} <figcaption>{label}: {caption}</figcaption> </figure>\n""" def extract_FL( def extract_FL( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading Loading @@ -997,6 +1004,24 @@ class TS2MDExtractor: image_paragraph = get_prev_block(self.document, paragraph) image_paragraph = get_prev_block(self.document, paragraph) return self.get_paragraph_image_info(image_paragraph, i + 1) return self.get_paragraph_image_info(image_paragraph, i + 1) def get_paragraph_pre_code(self, block, i: int = 0): """ If table with one row and one column, it contains turtle code. """ if i > 3: return "" if not isinstance(block, Table): block = get_prev_block(self.document, block) return self.get_paragraph_pre_code(block, i+1) if len(block.rows) != 1 or len(block.columns) != 1: logger.warning("expected table with one line and one column for Listings") return "" cell = block.cell(0, 0) ttl = "\n".join(p.text for p in cell.paragraphs) ttl = ttl.replace("&", "&").replace("<", "<") return f"""<pre><code class="language-turtle">{ttl}</code></pre>""" if ttl else "" def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.HTML): def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.HTML): """ """ Generate HTML for a python-docx Table. Generate HTML for a python-docx Table. Loading @@ -1007,6 +1032,10 @@ class TS2MDExtractor: """ """ assert isinstance(table, Table) assert isinstance(table, Table) # ignore if nrows=1 and ncols=1 like in SAREF4BLDG if len(table.rows) == 1 and len(table.columns)==1: return cap_id, cap_text = self.caption_from_prev_paragraph(table) cap_id, cap_text = self.caption_from_prev_paragraph(table) if cap_text and "Prefixes and namespaces" in cap_text: if cap_text and "Prefixes and namespaces" in cap_text: Loading Loading @@ -1118,6 +1147,10 @@ class TS2MDExtractor: else: else: return "" return "" def extract_Annotation_Text( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): return self.extract_Normal(paragraph, extract_format) def print_admonition(classes: str, title: str, md: str): def print_admonition(classes: str, title: str, md: str): prefix = 4 * " " prefix = 4 * " " Loading @@ -1126,7 +1159,7 @@ def print_admonition(classes: str, title: str, md: str): def get_prev_block(document: Document, target): def get_prev_block(document: Document, target): """Return the block (Paragraph or Table) that appears immediately before target_tbl.""" """Return the block (Paragraph or Table) that appears immediately before target.""" prev = None prev = None for block in iter_block_items(document): for block in iter_block_items(document): if isinstance(block, Table | Paragraph) and block._element is target._element: if isinstance(block, Table | Paragraph) and block._element is target._element: Loading src/saref_pypeline/ts/ts_generator.py +12 −3 Original line number Original line Diff line number Diff line Loading @@ -902,11 +902,11 @@ class TSGenerator: informative_soup = BeautifulSoup() informative_soup = BeautifulSoup() normative_h2 = normative_soup.find("h2") normative_h2 = normative_soup.find("h2") if normative_h2: if normative_h2: informative_h3 = normative_h2.find_next("h2") informative_h2 = normative_h2.find_next("h2") for el in informative_h3.find_next_siblings(): for el in informative_h2.find_next_siblings(): el.extract() el.extract() informative_soup.append(el) informative_soup.append(el) for h2 in normative_soup.find_all("h2"): for h2 in normative_soup.find_all(re.compile("h1|h2")): h2.extract() h2.extract() self.insert_references("Normative references", normative_soup) self.insert_references("Normative references", normative_soup) Loading Loading @@ -1380,6 +1380,8 @@ class TSGenerator: if isinstance(child, Tag): if isinstance(child, Tag): if child.name == "img": if child.name == "img": self.insert_soup_img(child, text_width) self.insert_soup_img(child, text_width) if child.name == "pre": self.insert_soup_pre(child) elif child.name == "figcaption": elif child.name == "figcaption": self.insert_soup_figcaption(child) self.insert_soup_figcaption(child) Loading Loading @@ -1463,6 +1465,13 @@ class TSGenerator: with Bookmark(self, id): with Bookmark(self, id): self.insert_soup_children(el) self.insert_soup_children(el) def insert_soup_pre(self, pre: Tag) -> None: if not (len(pre.contents) == 1 and pre.contents[0].name == "code"): logger.warning("Not supported <pre> element") code = pre.contents[0].text self.new_paragraph(code, style=P_STYLE.PL) def insert_soup_a(self, a: Tag) -> None: def insert_soup_a(self, a: Tag) -> None: self.ensure_cursor_paragraph() self.ensure_cursor_paragraph() if href := a.get("href"): if href := a.get("href"): Loading Loading
src/saref_pypeline/constants.py +2 −2 Original line number Original line Diff line number Diff line Loading @@ -84,5 +84,5 @@ owl:topObjectProperty a owl:ObjectProperty ; """ """ ) ) PATTERN_TERM_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_-]+$") PATTERN_TERM_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_-]+$") CAPTION_TABLE_RE = re.compile(r"^\s*Table[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*(.*)\s*$") CAPTION_TABLE_RE = re.compile(r"^\s*Table[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*([\s\S]*?)\s*$") CAPTION_FIGURE_RE = re.compile(r"^\s*Figure[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*(.*)\s*$") CAPTION_FIGURE_RE = re.compile(r"^\s*Figure[\s\xa0]+([A-Z0-9\.]+)[\xa0\s:]*([\s\S]*?)\s*$")
src/saref_pypeline/etsi.py +3 −0 Original line number Original line Diff line number Diff line Loading @@ -208,6 +208,9 @@ class P_STYLE(StrEnum, metaclass=_MetaEnum): # Style found in SAREF4AUTO V1.1.1 # Style found in SAREF4AUTO V1.1.1 List_Paragraph = "List Paragraph" List_Paragraph = "List Paragraph" # Style found in SAREF4BLDG V2.1.1 Annotation_Text = "annotation text" class C_STYLE(StrEnum, metaclass=_MetaEnum): class C_STYLE(StrEnum, metaclass=_MetaEnum): """ETSI Character Styles, obtained from skeleton""" """ETSI Character Styles, obtained from skeleton""" Loading
src/saref_pypeline/pipeline.py +1 −0 Original line number Original line Diff line number Diff line Loading @@ -1132,6 +1132,7 @@ class SAREFPipeline: return dumper.represent_str(data.name) return dumper.represent_str(data.name) yaml.SafeDumper.add_representer(WK_FIELD, represent_string) yaml.SafeDumper.add_representer(WK_FIELD, represent_string) os.makedirs(os.path.dirname(metadata_file), exist_ok=True) with open(metadata_file, "w") as f: with open(metadata_file, "w") as f: yaml.safe_dump(self.projects_metadata, f) yaml.safe_dump(self.projects_metadata, f) Loading
src/saref_pypeline/ts/ts2md_extractor.py +38 −5 Original line number Original line Diff line number Diff line Loading @@ -645,7 +645,9 @@ class TS2MDExtractor: ): ): """Reference, Example => use "tab" between "item/number" and "text".""" """Reference, Example => use "tab" between "item/number" and "text".""" md = self.extract_inner_content(paragraph) md = self.extract_inner_content(paragraph) if md.startswith("EXAMPLE"): if not md: return "" elif md.startswith("EXAMPLE"): # Example # Example left, right = md.split("\t", 1) left, right = md.split("\t", 1) content = right.replace("\n", "\n ") content = right.replace("\n", "\n ") Loading Loading @@ -797,7 +799,8 @@ class TS2MDExtractor: ): ): """Programming language""" """Programming language""" md = self.extract_inner_content(paragraph, extract_format) md = self.extract_inner_content(paragraph, extract_format) return f"```\n{md}\n```" md = md.replace("&", "&").replace("<", "<") return f"""<pre><code class="language-turtle">{md}</code></pre>""" if md else "" def extract_EQ( def extract_EQ( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading Loading @@ -830,7 +833,11 @@ class TS2MDExtractor: <figcaption>{label}: {caption}</figcaption> <figcaption>{label}: {caption}</figcaption> </figure>\n""" </figure>\n""" else: else: return f"**{md}**" pre_code = self.get_paragraph_pre_code(paragraph) return f"""<figure> {pre_code} <figcaption>{label}: {caption}</figcaption> </figure>\n""" def extract_FL( def extract_FL( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD Loading Loading @@ -997,6 +1004,24 @@ class TS2MDExtractor: image_paragraph = get_prev_block(self.document, paragraph) image_paragraph = get_prev_block(self.document, paragraph) return self.get_paragraph_image_info(image_paragraph, i + 1) return self.get_paragraph_image_info(image_paragraph, i + 1) def get_paragraph_pre_code(self, block, i: int = 0): """ If table with one row and one column, it contains turtle code. """ if i > 3: return "" if not isinstance(block, Table): block = get_prev_block(self.document, block) return self.get_paragraph_pre_code(block, i+1) if len(block.rows) != 1 or len(block.columns) != 1: logger.warning("expected table with one line and one column for Listings") return "" cell = block.cell(0, 0) ttl = "\n".join(p.text for p in cell.paragraphs) ttl = ttl.replace("&", "&").replace("<", "<") return f"""<pre><code class="language-turtle">{ttl}</code></pre>""" if ttl else "" def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.HTML): def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.HTML): """ """ Generate HTML for a python-docx Table. Generate HTML for a python-docx Table. Loading @@ -1007,6 +1032,10 @@ class TS2MDExtractor: """ """ assert isinstance(table, Table) assert isinstance(table, Table) # ignore if nrows=1 and ncols=1 like in SAREF4BLDG if len(table.rows) == 1 and len(table.columns)==1: return cap_id, cap_text = self.caption_from_prev_paragraph(table) cap_id, cap_text = self.caption_from_prev_paragraph(table) if cap_text and "Prefixes and namespaces" in cap_text: if cap_text and "Prefixes and namespaces" in cap_text: Loading Loading @@ -1118,6 +1147,10 @@ class TS2MDExtractor: else: else: return "" return "" def extract_Annotation_Text( self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD ): return self.extract_Normal(paragraph, extract_format) def print_admonition(classes: str, title: str, md: str): def print_admonition(classes: str, title: str, md: str): prefix = 4 * " " prefix = 4 * " " Loading @@ -1126,7 +1159,7 @@ def print_admonition(classes: str, title: str, md: str): def get_prev_block(document: Document, target): def get_prev_block(document: Document, target): """Return the block (Paragraph or Table) that appears immediately before target_tbl.""" """Return the block (Paragraph or Table) that appears immediately before target.""" prev = None prev = None for block in iter_block_items(document): for block in iter_block_items(document): if isinstance(block, Table | Paragraph) and block._element is target._element: if isinstance(block, Table | Paragraph) and block._element is target._element: Loading
src/saref_pypeline/ts/ts_generator.py +12 −3 Original line number Original line Diff line number Diff line Loading @@ -902,11 +902,11 @@ class TSGenerator: informative_soup = BeautifulSoup() informative_soup = BeautifulSoup() normative_h2 = normative_soup.find("h2") normative_h2 = normative_soup.find("h2") if normative_h2: if normative_h2: informative_h3 = normative_h2.find_next("h2") informative_h2 = normative_h2.find_next("h2") for el in informative_h3.find_next_siblings(): for el in informative_h2.find_next_siblings(): el.extract() el.extract() informative_soup.append(el) informative_soup.append(el) for h2 in normative_soup.find_all("h2"): for h2 in normative_soup.find_all(re.compile("h1|h2")): h2.extract() h2.extract() self.insert_references("Normative references", normative_soup) self.insert_references("Normative references", normative_soup) Loading Loading @@ -1380,6 +1380,8 @@ class TSGenerator: if isinstance(child, Tag): if isinstance(child, Tag): if child.name == "img": if child.name == "img": self.insert_soup_img(child, text_width) self.insert_soup_img(child, text_width) if child.name == "pre": self.insert_soup_pre(child) elif child.name == "figcaption": elif child.name == "figcaption": self.insert_soup_figcaption(child) self.insert_soup_figcaption(child) Loading Loading @@ -1463,6 +1465,13 @@ class TSGenerator: with Bookmark(self, id): with Bookmark(self, id): self.insert_soup_children(el) self.insert_soup_children(el) def insert_soup_pre(self, pre: Tag) -> None: if not (len(pre.contents) == 1 and pre.contents[0].name == "code"): logger.warning("Not supported <pre> element") code = pre.contents[0].text self.new_paragraph(code, style=P_STYLE.PL) def insert_soup_a(self, a: Tag) -> None: def insert_soup_a(self, a: Tag) -> None: self.ensure_cursor_paragraph() self.ensure_cursor_paragraph() if href := a.get("href"): if href := a.get("href"): Loading