improve support for styles in run sequences, and enable html or md generation... (8bde9cf4) · Commits · SAREF / saref-pypeline

src/saref_pypeline/docgen/ts_extractor.py

+260 −103

Original line number	Diff line number	Diff line
		@@ -4,7 +4,7 @@ import os
		import re
		from pathlib import Path
		from functools import cache, cached_property, lru_cache
		from typing import Callable, Dict, Generator, List, Tuple
		from typing import Any, Callable, Dict, Generator, List, Tuple
		from copy import deepcopy
		from datetime import datetime
		from functools import cache
		@@ -18,6 +18,7 @@ import webbrowser
		import logging
		import sys
		import html
		from enum import Enum, auto

		# Third-party libraries
		from bs4 import BeautifulSoup, Tag, NavigableString, PageElement
		@@ -73,6 +74,33 @@ if TYPE_CHECKING:
		logger = logging.getLogger(__name__)


		class ExtractFormat(Enum):
		    """Target syntax an extractor can be asked to emit."""

		    # Explicit values, equal to what auto() would assign (1, 2).
		    HTML = 1
		    MD = 2


		class Markup(Enum):
		    """Inline character styles and the delimiters that render them.

		    Each member value is ``(style, md, html[, func])``:

		    * ``style`` - attribute name read with ``getattr`` from a run or font.
		    * ``md`` / ``html`` - ``(opening, closing)`` delimiters per output format.
		    * ``func`` - predicate applied to the attribute value to decide whether
		      the markup is active; defaults to plain truthiness.
		    """

		    # Markdown bold needs real delimiters; with empty strings bold text
		    # would be emitted without any emphasis in MD output.
		    STRONG = "bold", ("**", "**"), ("<b>", "</b>")
		    EM = "italic", ("_", "_"), ("<em>", "</em>")
		    SUP = "superscript", ("<sup>", "</sup>"), ("<sup>", "</sup>")
		    # Code is detected from the font name rather than from a boolean flag.
		    CODE = (
		        "name",
		        ("`", "`"),
		        ("<code>", "</code>"),
		        lambda x: x in ("Courier New", "Consolas"),
		    )

		    def __init__(
		        self,
		        style: str,
		        md: Tuple[str, str],
		        html: Tuple[str, str],
		        func: Callable = bool,
		    ):
		        self.style = style
		        self.check = func
		        # Delimiters indexed by output format.
		        self.prefix = {ExtractFormat.MD: md[0], ExtractFormat.HTML: html[0]}
		        self.suffix = {ExtractFormat.MD: md[1], ExtractFormat.HTML: html[1]}

		class RunContext:
		    """Mutable rendering state carried from one run to the next."""

		    def __init__(self, format: ExtractFormat):
		        # One attribute per markup, named after ``markup.style``, holding
		        # the state seen on the previous run (None until a run is seen).
		        for markup in Markup:
		            setattr(self, markup.style, None)

		        self.format = format
		        self.content: List[str] = []  # pieces of output produced so far
		        self.buffer_blank: List[str] = []  # whitespace not yet written
		        self.buffer_opening: List[Markup] = []  # openings not yet written
		        self.buffer_closing: List[Markup] = []  # closings not yet written

		def pprint_xml(entity):
		"""
		Pretty-print the XML of a python-docx entity (_element).
		@@ -237,51 +265,77 @@ def cell_is_header(cell):
		return False


		def extract_hyperlink(hyperlink: Hyperlink, ctx: RunContext):
		    """Append ``hyperlink`` and its runs to ``ctx.content``.

		    The target is ``hyperlink.url`` when set, otherwise ``#<fragment>``.
		    Markdown output is ``[text](href)``; HTML output is
		    ``<a href="...">text</a>``. Nothing is returned: the pieces are appended
		    to ``ctx.content`` and the runs are rendered through ``extract_run``.
		    """
		    href = hyperlink.url if hyperlink.url else f"#{hyperlink.fragment}"
		    as_markdown = ctx.format == ExtractFormat.MD

		    if as_markdown:
		        ctx.content.append("[")
		    else:
		        # The target comes from the document: escape it so a quote or
		        # ampersand cannot break out of the attribute value.
		        ctx.content.append(f'<a href="{html.escape(href, quote=True)}">')

		    for run in hyperlink.runs:
		        extract_run(run, ctx)

		    ctx.content.append(f"]({href})" if as_markdown else "</a>")


		def manage_ctx(
		    ctx: "RunContext",
		    this: "Run | Font",
		    markup: "Markup",
		):
		    """Record whether ``markup`` starts or stops on the object ``this``.

		    Compares the state stored on ``ctx`` (attribute ``markup.style``) with
		    ``markup.check(getattr(this, markup.style))``, stores the new state, and
		    queues the markup in ``ctx.buffer_opening`` or ``ctx.buffer_closing``.
		    Nothing is written to the output here.
		    """
		    prev = getattr(ctx, markup.style)
		    now = markup.check(getattr(this, markup.style))
		    setattr(ctx, markup.style, now)

		    if not prev and now:  # opening markup
		        ctx.buffer_opening.append(markup)
		    elif prev and not now:  # closing markup
		        if markup in ctx.buffer_opening:
		            # Opened but never written: cancel the pending opening. It may
		            # sit anywhere in the buffer when several markups were opened on
		            # the same text-less run, so do not only look at the last entry.
		            ctx.buffer_opening.remove(markup)
		        else:
		            ctx.buffer_closing.append(markup)


		def extract_run(run: Run, ctx: RunContext):
		    """Append the rendering of ``run`` to ``ctx.content``.

		    Leading and trailing whitespace is held in ``ctx.buffer_blank`` and
		    opening markups in ``ctx.buffer_opening`` until a run with visible text
		    arrives, so delimiters always touch non-blank characters. Returns
		    nothing; all output goes through ``ctx``.
		    """
		    # invariant: last character of last item in ctx.content is a non-blank character

		    # manage opening/closing tags
		    manage_ctx(ctx, run, Markup.STRONG)
		    manage_ctx(ctx, run, Markup.EM)
		    manage_ctx(ctx, run.font, Markup.SUP)
		    manage_ctx(ctx, run.font, Markup.CODE)

		    # replace blank characters
		    # NOTE(review): as written both arguments are a plain space, which makes
		    # this a no-op; presumably one of them was a non-breaking space - confirm.
		    text = run.text.replace(" ", " ")

		    # extract white space before and after
		    # The three groups are all optional and DOTALL lets the middle group span
		    # line breaks, so this always matches (never None).
		    before, text, after = re.fullmatch(
		        r"(\s*)(.*?)(\s*)", text, re.DOTALL
		    ).groups()
		    if before:
		        ctx.buffer_blank.append(before)

		    # Closings go out first, directly after the last visible character.
		    if ctx.buffer_closing:
		        ctx.content.extend(
		            [m.suffix[ctx.format] for m in reversed(ctx.buffer_closing)]
		        )
		        ctx.buffer_closing.clear()

		    if text:
		        ctx.content.extend(ctx.buffer_blank)
		        ctx.buffer_blank.clear()

		        ctx.content.extend([m.prefix[ctx.format] for m in ctx.buffer_opening])
		        ctx.buffer_opening.clear()

		        ctx.content.append(text)

		    if after:
		        ctx.buffer_blank.append(after)



		class TSExtractor:
		@@ -308,8 +362,14 @@ class TSExtractor:
		url, filename = self.get_docx_url()

		self.doc_folder = os.path.join(self.pipeline.directory, "ts")
		if not os.path.isdir(self.doc_folder):
		self.doc_folder = os.path.join(self.pipeline.sources_dir, "ts")

		self.out_folder = os.path.join(
		self.pipeline.target_dir, "ts", os.path.splitext(filename)[0]
		self.pipeline.target_dir,
		"ts",
		self.project.name,
		os.path.splitext(filename)[0],
		)

		self.file_path = os.path.join(self.doc_folder, filename)
		@@ -489,13 +549,17 @@ class TSExtractor:
		started = True
		if not started or block_item.style.name not in [P_STYLE.NO, P_STYLE.EX]:
		continue
		md_output.append(self.extract_block_item(block_item))
		ref = self.extract_block_item(block_item)
		ref = re.sub(r"(\[(i\.)?\d+\])", r"""<a id="\1">\1</a>""", ref, count=1)
		md_output.append(ref)

		# ---------------------------------------
		# For paragraphs
		# ---------------------------------------

		def extract_paragraph(self, paragraph: Paragraph):
		def extract_paragraph(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		try:
		style = P_STYLE(paragraph.style.name)
		except:
		@@ -504,24 +568,35 @@ class TSExtractor:
		fname = f"extract_{style.name}"
		if hasattr(self, fname) and callable(getattr(self, fname)):
		method = getattr(self, fname)
		return method(paragraph)
		return method(paragraph, extract_format)
		else:
		logger.warning(f"TSExtractor function {fname} not implemented - skipping")

		def extract_inner_content(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Render the runs and hyperlinks of ``paragraph`` as one inline string.

		    A fresh ``RunContext`` is created for ``extract_format``; each child from
		    ``paragraph.iter_inner_content()`` is passed to ``extract_hyperlink`` or
		    ``extract_run``. Markups still active at the end are closed, then any
		    buffered trailing whitespace is appended.
		    """
		    ctx = RunContext(extract_format)
		    # keep track of spaces, as md markup needs to be right before/after non-space characters
		    # invariant: last character of last item in ctx.content is a non-blank character

		    for child in paragraph.iter_inner_content():
		        if isinstance(child, Hyperlink):
		            extract_hyperlink(child, ctx)
		        elif isinstance(child, Run):
		            extract_run(child, ctx)

		    for markup in [Markup.STRONG, Markup.EM, Markup.SUP, Markup.CODE]:
		        # A markup whose opening is still buffered was never written, so
		        # closing it would leave an unmatched closing delimiter.
		        if getattr(ctx, markup.style) and markup not in ctx.buffer_opening:
		            ctx.buffer_closing.append(markup)

		    if ctx.buffer_closing:
		        ctx.content.extend(
		            [m.suffix[ctx.format] for m in reversed(ctx.buffer_closing)]
		        )
		        ctx.buffer_closing.clear()

		    ctx.content.extend(ctx.buffer_blank)
		    ctx.buffer_blank.clear()

		    return "".join(ctx.content)

		def extract_heading(self, paragraph: Paragraph, prefix: str):
		md = self.extract_inner_content(paragraph)
		@@ -531,31 +606,49 @@ class TSExtractor:
		md = match.group(1)
		return prefix + md + "\n"

		def extract_Heading_1(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Delegate to extract_heading with a one-hash prefix; extract_format is not used here."""
		    level = 1
		    return self.extract_heading(paragraph, prefix="#" * level + " ")

		def extract_Heading_2(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Delegate to extract_heading with a two-hash prefix; extract_format is not used here."""
		    level = 2
		    return self.extract_heading(paragraph, prefix="#" * level + " ")

		def extract_Heading_3(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Delegate to extract_heading with a three-hash prefix; extract_format is not used here."""
		    level = 3
		    return self.extract_heading(paragraph, prefix="#" * level + " ")

		def extract_Heading_4(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Delegate to extract_heading with a four-hash prefix; extract_format is not used here."""
		    level = 4
		    return self.extract_heading(paragraph, prefix="#" * level + " ")

		def extract_Heading_5(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Delegate to extract_heading with a five-hash prefix; extract_format is not used here."""
		    level = 5
		    return self.extract_heading(paragraph, prefix="#" * level + " ")

		def extract_H6(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Delegate to extract_heading with a six-hash prefix; extract_format is not used here."""
		    level = 6
		    return self.extract_heading(paragraph, prefix="#" * level + " ")

		def extract_Heading_8(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Delegate to extract_heading with a one-hash prefix; extract_format is not used here."""
		    level = 1  # same prefix as extract_Heading_1
		    return self.extract_heading(paragraph, prefix="#" * level + " ")

		def extract_Heading_9(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Delegate to extract_heading with a one-hash prefix; extract_format is not used here."""
		    level = 1  # same prefix as extract_Heading_1
		    return self.extract_heading(paragraph, prefix="#" * level + " ")

		def extract_EX(self, paragraph: Paragraph):
		def extract_EX(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Reference, Example => use "tab" between "item/number" and "text"."""
		md = self.extract_inner_content(paragraph)
		if md.startswith("EXAMPLE"):
		@@ -567,13 +660,17 @@ class TSExtractor:
		# Reference
		return f"* {md}"

		def extract_EW(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Symbol, Abbreviation, Example continuation in text => use "tab" between "item/number" and "text".

		    Returns ``* <left>: <right>`` where the inline content is split on its
		    first tab. When the content has no tab it is returned as a plain
		    bullet instead of raising on the failed unpacking.
		    """
		    md = self.extract_inner_content(paragraph, extract_format)
		    left, tab, right = md.partition("\t")
		    if not tab:
		        return f"* {md}"
		    return f"* {left}: {right}"

		def extract_NO(self, paragraph: Paragraph):
		def extract_NO(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Note integrated in the text => use "tab" between "item/number" and "text"."""
		md = self.extract_inner_content(paragraph)
		if md.startswith("NOTE"):
		@@ -585,84 +682,124 @@ class TSExtractor:

		# List styles (indents)

		def extract_B(
		    self,
		    paragraph: Paragraph,
		    level: int,
		    extract_format: ExtractFormat = ExtractFormat.MD,
		):
		    """Render an indented paragraph as an ``indent-<level>`` admonition.

		    ``extract_format`` is forwarded to ``extract_inner_content`` so the
		    inline markup follows the requested format.
		    """
		    md = self.extract_inner_content(paragraph, extract_format)
		    return print_admonition(f"indent-{level}", "", md)

		def extract_B1(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Indent 1"""
		    # Forward the format instead of silently falling back to the default.
		    return self.extract_B(paragraph, 1, extract_format)

		def extract_B2(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Indent 2"""
		    # Forward the format instead of silently falling back to the default.
		    return self.extract_B(paragraph, 2, extract_format)

		def extract_B3(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Indent 3"""
		    # Forward the format instead of silently falling back to the default.
		    return self.extract_B(paragraph, 3, extract_format)

		def extract_B4(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Indent 4"""
		    # Forward the format instead of silently falling back to the default.
		    return self.extract_B(paragraph, 4, extract_format)

		def extract_B5(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Indent 5"""
		    # Forward the format instead of silently falling back to the default.
		    return self.extract_B(paragraph, 5, extract_format)

		def extract_B_plus(
		    self,
		    paragraph: Paragraph,
		    level: int,
		    extract_format: ExtractFormat = ExtractFormat.MD,
		):
		    """Render a bulleted item as ``* <content>`` preceded by ``level`` spaces.

		    ``extract_format`` is forwarded to ``extract_inner_content`` so the
		    inline markup follows the requested format.
		    """
		    md = self.extract_inner_content(paragraph, extract_format)
		    return level * " " + f"* {md}"

		def extract_B1_plus(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Bulleted indent 1 (round bullets)"""
		    # No leading spaces at this level; the format is passed through.
		    return self.extract_B_plus(paragraph, 0, extract_format)

		def extract_B2_plus(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Bulleted indent 2 (dashes)"""
		    # Four leading spaces at this level; the format is passed through.
		    return self.extract_B_plus(paragraph, 4, extract_format)

		def extract_B3_plus(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Bulleted indent 3 (square bullets)"""
		    # Eight leading spaces at this level; the format is passed through.
		    return self.extract_B_plus(paragraph, 8, extract_format)

		def extract_BN(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Numbered list item, indent 1: the inline content prefixed with ``1. ``."""
		    content = self.extract_inner_content(paragraph, extract_format)
		    return "1. " + content

		def extract_BL(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Lettered list item, indent 1: the inline content prefixed with ``a. ``."""
		    content = self.extract_inner_content(paragraph, extract_format)
		    return "a. " + content

		# General styles For different items

		def extract_Normal(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Standard paragraph or definition: the inline content between two newlines."""
		    content = self.extract_inner_content(paragraph, extract_format)
		    return "\n" + content + "\n"

		def extract_TT(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Contents list title, rendered through print_admonition with kind ``TT``."""
		    content = self.extract_inner_content(paragraph, extract_format)
		    return print_admonition("TT", "some contents list title", content)

		def extract_PL(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Programming language: the inline content wrapped in a fenced code block."""
		    fence = "```"
		    content = self.extract_inner_content(paragraph, extract_format)
		    return "\n".join((fence, content, fence))

		def extract_EQ(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Equation, rendered through print_admonition with kind ``EQ``."""
		    content = self.extract_inner_content(paragraph, extract_format)
		    return print_admonition("EQ", "some Equation", content)

		# Figure styles For formatting figures

		def extract_TF(self, paragraph: Paragraph):
		def extract_TF(
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Figure title"""
		md = self.extract_inner_content(paragraph)
		md = self.extract_inner_content(paragraph, extract_format)

		match = re.match(r"(\w+[\s\xa0]+[A-Z0-9\.]+)[: \xa0](.)", md)
		if not match:
		@@ -678,48 +815,68 @@ class TSExtractor:
		else:
		return "{md}"

		def extract_FL(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Figure layout paragraph: produces no output (always the empty string)."""
		    nothing = ""
		    return nothing

		def extract_NF(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Note in figure => use "tab" between "item/number" and "text"."""
		    # Rendered like a regular note; pass the requested format along so it
		    # is not silently replaced by extract_NO's default.
		    return self.extract_NO(paragraph, extract_format)

		# Table styles For formatting tables

		def extract_TH(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		    """Table title paragraph: produces no output here (always the empty string)."""
		    nothing = ""
		    return nothing

		def extract_TAH(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
		):
		    """Heading within table or column heading; rendered like a Normal paragraph."""
		    rendered = self.extract_Normal(paragraph, extract_format)
		    return rendered

		def extract_TAC(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
		):
		    """Centred text; rendered like a Normal paragraph."""
		    rendered = self.extract_Normal(paragraph, extract_format)
		    return rendered

		def extract_TAL(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
		):
		    """Left aligned text; rendered like a Normal paragraph."""
		    rendered = self.extract_Normal(paragraph, extract_format)
		    return rendered

		def extract_TAR(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
		):
		    """Right aligned text; rendered like a Normal paragraph."""
		    rendered = self.extract_Normal(paragraph, extract_format)
		    return rendered

		def extract_TB1(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
		):
		    """List in tables, level 1: the inline content wrapped in ``<li>`` on a new line."""
		    inner = self.extract_inner_content(paragraph, extract_format)
		    return "\n<li>" + inner + "</li>"

		def extract_TB2(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
		):
		    """List in tables, level 2; rendered exactly like level 1."""
		    item = self.extract_TB1(paragraph, extract_format)
		    return item

		def extract_TAN(
		    self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
		):
		    """Note in table => use "tab" between "item/number" and "text"; rendered via extract_NO."""
		    note = self.extract_NO(paragraph, extract_format)
		    return note

		# ---------------------------------------
		# For tables
		@@ -730,7 +887,7 @@ class TSExtractor:
		If the paragraph just above `table` has style 'TH' and matches 'Table <n>: <title>',
		return (caption_id, caption_text). Otherwise, return ("Table_id_unknown", "label_unknown").
		"""
		if i > 10:
		if i > 3:
		return "Table_id_unknown", "label_unknown"
		prev = get_prev_block(self.document, table)
		if isinstance(prev, Paragraph) and getattr(prev.style, "name", None) == "TH":
		@@ -750,8 +907,8 @@ class TSExtractor:
		"""
		parts = []
		for p in cell.paragraphs:
		parts.append(self.extract_paragraph(p))
		return "".join(parts)
		parts.append(self.extract_paragraph(p, ExtractFormat.HTML))
		return "<br>".join(parts)

		def get_paragraph_image_info(self, paragraph: Paragraph, i: int = 0):
		"""
		@@ -792,7 +949,7 @@ class TSExtractor:
		image_paragraph = get_prev_block(self.document, paragraph)
		return self.get_paragraph_image_info(image_paragraph, i + 1)

		def extract_table(self, table):
		def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.MD):
		"""
		Generate HTML for a python-docx Table.
		- If the paragraph just above has style TH and matches 'Table N: ...',