Unverified Commit 8bde9cf4 authored by Maxime Lefrançois's avatar Maxime Lefrançois
Browse files

improve support for styles in run sequences, and enable html or md generation...

improve support for styles in run sequences, and enable html or md generation depending on the context
parent 349cc1b2
Loading
Loading
Loading
Loading
+260 −103
Original line number Diff line number Diff line
@@ -4,7 +4,7 @@ import os
import re
from pathlib import Path
from functools import cache, cached_property, lru_cache
from typing import Callable, Dict, Generator, List, Tuple
from typing import Any, Callable, Dict, Generator, List, Tuple
from copy import deepcopy
from datetime import datetime
from functools import cache
@@ -18,6 +18,7 @@ import webbrowser
import logging
import sys
import html
from enum import Enum, auto

# Third-party libraries
from bs4 import BeautifulSoup, Tag, NavigableString, PageElement
@@ -73,6 +74,33 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)


class ExtractFormat(Enum):
    """Output syntax selector for extracted content: HTML or Markdown."""

    # Values come from ``auto()``; code in this file compares members
    # (e.g. ``ctx.format == ExtractFormat.MD``) rather than raw values.
    HTML = auto()
    MD = auto()


class Markup(Enum):
    """Inline text styles with their Markdown and HTML delimiters.

    Each member's value is the tuple ``(style, md, html[, func])``, which the
    Enum machinery unpacks into ``__init__``.

    Attributes set on every member:
        style: Attribute name used to read the raw style value from a run or
            font object, and to store the current state on ``RunContext``.
        check: Predicate turning the raw style value into "style is active".
        prefix: Opening delimiter, keyed by ``ExtractFormat``.
        suffix: Closing delimiter, keyed by ``ExtractFormat``.
    """

    STRONG = "bold", ("**", "**"), ("<b>", "</b>")
    EM = "italic", ("_", "_"), ("<em>", "</em>")
    SUP = "superscript", ("<sup>", "</sup>"), ("<sup>", "</sup>")
    # Code is detected through the value of the ``name`` attribute: only these
    # two font names count as code.
    CODE = (
        "name",
        ("`", "`"),
        ("<code>", "</code>"),
        lambda x: x in ("Courier New", "Consolas"),
    )

    def __init__(
        self,
        style: str,
        md: Tuple[str, str],
        html: Tuple[str, str],
        func: Callable = bool,
    ):
        md_open, md_close = md
        html_open, html_close = html
        self.style = style
        # Default predicate is plain truthiness of the raw style value.
        self.check = func
        self.prefix = {ExtractFormat.MD: md_open, ExtractFormat.HTML: html_open}
        self.suffix = {ExtractFormat.MD: md_close, ExtractFormat.HTML: html_close}

class RunContext:
    """Mutable state carried across consecutive runs during extraction.

    Attributes:
        format: Target ``ExtractFormat`` used to pick markup delimiters.
        content: Output fragments emitted so far (joined by the caller).
        buffer_blank: Whitespace held back until the next non-blank text.
        buffer_opening: Markups that became active but whose prefix has not
            been written yet.
        buffer_closing: Markups that became inactive but whose suffix has not
            been written yet.

    In addition, one attribute per ``Markup`` member (named after its
    ``style``) holds the last seen state of that style; it starts as ``None``.
    """

    def __init__(self, format: ExtractFormat):
        self.format = format
        self.content: List[str] = []
        self.buffer_blank: List[str] = []
        self.buffer_opening: List[Markup] = []
        self.buffer_closing: List[Markup] = []
        # No style has been observed yet for any known markup.
        for member in Markup:
            setattr(self, member.style, None)

def pprint_xml(entity):
    """
    Pretty-print the XML of a python-docx entity (_element).
@@ -237,51 +265,77 @@ def cell_is_header(cell):
    return False


def extract_hyperlink(hyperlink: Hyperlink, ctx: dict):
def extract_hyperlink(hyperlink: Hyperlink, ctx: RunContext):
    if hyperlink.url:
        href = hyperlink.url
    else:
        href = f"#{hyperlink.fragment}"
    text = "".join([extract_run(run, ctx) for run in hyperlink.runs])
    return f"[{text}]({href})"

    if ctx.format == ExtractFormat.MD:
        ctx.content.append("[")
    else:
        ctx.content.append(f"""<a href="{href}">""")

    for run in hyperlink.runs:
        extract_run(run, ctx)

    if ctx.format == ExtractFormat.MD:
        ctx.content.append(f"]({href})")
    else:
        ctx.content.append("</a>")


def manage_ctx(
    ctx: dict,
    ctx: RunContext,
    this: Run | Font,
    attr=None,
    before: str = "",
    after: str = "",
    prefix: str = "",
    suffix: str = "",
):
    prev = ctx.setdefault(attr, False)
    now = getattr(this, attr)
    if not prev and now:
        before = before + prefix
    if prev and not now:
        before = suffix + before
    ctx[attr] = now
    return before, after


def close_ctx(ctx: dict, attr=None, after: str = "", suffix: str = ""):
    prev = ctx.setdefault(attr, False)
    if prev:
        after = suffix + after
    return after


def extract_run(run: Run, ctx: dict):
    markup: Markup,
):
    prev = getattr(ctx, markup.style)
    now = markup.check(getattr(this, markup.style))
    setattr(ctx, markup.style, now)
    if not prev and now: # opening markup
        ctx.buffer_opening.append(markup)
    if prev and not now: # closing markup
        if ctx.buffer_opening and ctx.buffer_opening[-1] == markup:
            # markup opened and immediately closed
            ctx.buffer_opening.pop()
        else:
            ctx.buffer_closing.append(markup)


def extract_run(run: Run, ctx: RunContext):
    # invariant: last character of last item in ctx.content is a non-blank character

    # manage opening/closing tags
    manage_ctx(ctx, run, Markup.STRONG)
    manage_ctx(ctx, run, Markup.EM)
    manage_ctx(ctx, run.font, Markup.SUP)
    manage_ctx(ctx, run.font, Markup.CODE)

    # replace blank characters
    text = run.text.replace(" ", " ")

    # extract white space before and after
    before, text, after = re.match(r"^(\s*)(.*?)(\s*)$", run.text).groups()
    text = text.replace(" ", " ")
    (before, after) = manage_ctx(ctx, run, "bold", before, after, "**", "**")
    (before, after) = manage_ctx(ctx, run, "italic", before, after, "_", "_")
    (before, after) = manage_ctx(
        ctx, run.font, "superscript", before, after, "<sup>", "</sup>"
    )
    return before + text + after
    before, text, after = re.match(r"^(\s*)(.*?)(\s*)$", text).groups()
    if before:
        ctx.buffer_blank.append(before)

    if ctx.buffer_closing:
        ctx.content.extend([m.suffix[ctx.format] for m in reversed(ctx.buffer_closing)])
        ctx.buffer_closing.clear()

    if text:
        ctx.content.extend(ctx.buffer_blank)
        ctx.buffer_blank.clear()

        ctx.content.extend([m.prefix[ctx.format] for m in ctx.buffer_opening])
        ctx.buffer_opening.clear()

        ctx.content.append(text)

        if after:
            ctx.buffer_blank.append(after)
        


class TSExtractor:
@@ -308,8 +362,14 @@ class TSExtractor:
        url, filename = self.get_docx_url()

        self.doc_folder = os.path.join(self.pipeline.directory, "ts")
        if not os.path.isdir(self.doc_folder):
            self.doc_folder = os.path.join(self.pipeline.sources_dir, "ts")

        self.out_folder = os.path.join(
            self.pipeline.target_dir, "ts", os.path.splitext(filename)[0]
            self.pipeline.target_dir,
            "ts",
            self.project.name,
            os.path.splitext(filename)[0],
        )

        self.file_path = os.path.join(self.doc_folder, filename)
@@ -489,13 +549,17 @@ class TSExtractor:
                started = True
            if not started or block_item.style.name not in [P_STYLE.NO, P_STYLE.EX]:
                continue
            md_output.append(self.extract_block_item(block_item))
            ref = self.extract_block_item(block_item)
            ref = re.sub(r"(\[(i\.)?\d+\])", r"""<a id="\1">\1</a>""", ref, count=1)
            md_output.append(ref)

    # ---------------------------------------
    # For paragraphs
    # ---------------------------------------

    def extract_paragraph(self, paragraph: Paragraph):
    def extract_paragraph(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        try:
            style = P_STYLE(paragraph.style.name)
        except:
@@ -504,24 +568,35 @@ class TSExtractor:
        fname = f"extract_{style.name}"
        if hasattr(self, fname) and callable(getattr(self, fname)):
            method = getattr(self, fname)
            return method(paragraph)
            return method(paragraph, extract_format)
        else:
            logger.warning(f"TSExtractor function {fname} not implemented - skipping")

    def extract_inner_content(self, paragraph: Paragraph):
        md_content = []
        ctx = dict()
    def extract_inner_content(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        ctx = RunContext(extract_format)
        # keep track of spaces, as md markup needs to be right before/after non-space characters
        # invariant: last character of last item in ctx.content is a non-blank character

        for child in paragraph.iter_inner_content():
            if isinstance(child, Hyperlink):
                md_content.append(extract_hyperlink(child, ctx))
                extract_hyperlink(child, ctx)
            elif isinstance(child, Run):
                md_content.append(extract_run(child, ctx))
        after = close_ctx(ctx, "bold", "", "**")
        after = close_ctx(ctx, "italic", after, "_")
        after = close_ctx(ctx, "superscript", after, "</sup>")
        md_content.append(after)
                extract_run(child, ctx)

        for markup in [Markup.STRONG, Markup.EM, Markup.SUP, Markup.CODE]:
            if getattr(ctx, markup.style):
                ctx.buffer_closing.append(markup)

        if ctx.buffer_closing:            
            ctx.content.extend([m.suffix[ctx.format] for m in reversed(ctx.buffer_closing)])
            ctx.buffer_closing.clear()

        return "".join(md_content)
        ctx.content.extend(ctx.buffer_blank)
        ctx.buffer_blank.clear()

        return "".join(ctx.content)

    def extract_heading(self, paragraph: Paragraph, prefix: str):
        md = self.extract_inner_content(paragraph)
@@ -531,31 +606,49 @@ class TSExtractor:
            md = match.group(1)
        return prefix + md + "\n"

    def extract_Heading_1(self, paragraph: Paragraph):
    def extract_Heading_1(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        return self.extract_heading(paragraph, prefix=1 * "#" + " ")

    def extract_Heading_2(self, paragraph: Paragraph):
    def extract_Heading_2(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        return self.extract_heading(paragraph, prefix=2 * "#" + " ")

    def extract_Heading_3(self, paragraph: Paragraph):
    def extract_Heading_3(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        return self.extract_heading(paragraph, prefix=3 * "#" + " ")

    def extract_Heading_4(self, paragraph: Paragraph):
    def extract_Heading_4(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        return self.extract_heading(paragraph, prefix=4 * "#" + " ")

    def extract_Heading_5(self, paragraph: Paragraph):
    def extract_Heading_5(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        return self.extract_heading(paragraph, prefix=5 * "#" + " ")

    def extract_H6(self, paragraph: Paragraph):
    def extract_H6(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        return self.extract_heading(paragraph, prefix=6 * "#" + " ")

    def extract_Heading_8(self, paragraph: Paragraph):
    def extract_Heading_8(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        return self.extract_heading(paragraph, prefix=1 * "#" + " ")

    def extract_Heading_9(self, paragraph: Paragraph):
    def extract_Heading_9(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        return self.extract_heading(paragraph, prefix=1 * "#" + " ")

    def extract_EX(self, paragraph: Paragraph):
    def extract_EX(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Reference, Example => use "tab" between "item/number" and "text"."""
        md = self.extract_inner_content(paragraph)
        if md.startswith("EXAMPLE"):
@@ -567,13 +660,17 @@ class TSExtractor:
            # Reference
            return f"* {md}"

    def extract_EW(self, paragraph: Paragraph):
    def extract_EW(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Symbol, Abbreviation, Example continuation in text => use "tab" between "item/number" and "text"."""
        md = self.extract_inner_content(paragraph)
        left, right = md.split("\t", 1)
        return f"* {left}: {right}"

    def extract_NO(self, paragraph: Paragraph):
    def extract_NO(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Note integrated in the text => use "tab" between "item/number" and "text"."""
        md = self.extract_inner_content(paragraph)
        if md.startswith("NOTE"):
@@ -585,84 +682,124 @@ class TSExtractor:

    # List styles (indents)

    def extract_B(self, paragraph: Paragraph, level: int):
    def extract_B(
        self,
        paragraph: Paragraph,
        level: int,
        extract_format: ExtractFormat = ExtractFormat.MD,
    ):
        md = self.extract_inner_content(paragraph)
        return print_admonition(f"indent-{level}", "", md)

    def extract_B1(self, paragraph: Paragraph):
    def extract_B1(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Indent 1"""
        return self.extract_B(paragraph, 1)

    def extract_B2(self, paragraph: Paragraph):
    def extract_B2(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Indent 2"""
        return self.extract_B(paragraph, 2)

    def extract_B3(self, paragraph: Paragraph):
    def extract_B3(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Indent 3"""
        return self.extract_B(paragraph, 3)

    def extract_B4(self, paragraph: Paragraph):
    def extract_B4(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Indent 4"""
        return self.extract_B(paragraph, 4)

    def extract_B5(self, paragraph: Paragraph):
    def extract_B5(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Indent 5"""
        return self.extract_B(paragraph, 5)

    def extract_B_plus(self, paragraph: Paragraph, level: int):
    def extract_B_plus(
        self,
        paragraph: Paragraph,
        level: int,
        extract_format: ExtractFormat = ExtractFormat.MD,
    ):
        """Bulleted indent 1 (round bullets)"""
        md = self.extract_inner_content(paragraph)
        return level * " " + f"* {md}"

    def extract_B1_plus(self, paragraph: Paragraph):
    def extract_B1_plus(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Bulleted indent 1 (round bullets)"""
        return self.extract_B_plus(paragraph, 0)

    def extract_B2_plus(self, paragraph: Paragraph):
    def extract_B2_plus(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Bulleted indent 2 (dashes)"""
        return self.extract_B_plus(paragraph, 4)

    def extract_B3_plus(self, paragraph: Paragraph):
    def extract_B3_plus(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Bulleted indent 3 (square bullets)"""
        return self.extract_B_plus(paragraph, 8)

    def extract_BN(self, paragraph: Paragraph):
    def extract_BN(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Bulleted (numbers) indent 1"""
        md = self.extract_inner_content(paragraph)
        md = self.extract_inner_content(paragraph, extract_format)
        return f"1. {md}"

    def extract_BL(self, paragraph: Paragraph):
    def extract_BL(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Bulleted (letters) indent 1"""
        md = self.extract_inner_content(paragraph)
        md = self.extract_inner_content(paragraph, extract_format)
        return f"a. {md}"

    # General styles                For different items

    def extract_Normal(self, paragraph: Paragraph):
    def extract_Normal(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Standard paragraph, Definition"""
        md = self.extract_inner_content(paragraph)
        md = self.extract_inner_content(paragraph, extract_format)
        return f"\n{md}\n"

    def extract_TT(self, paragraph: Paragraph):
    def extract_TT(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Contents list title"""
        md = self.extract_inner_content(paragraph)
        md = self.extract_inner_content(paragraph, extract_format)
        return print_admonition(f"TT", "some contents list title", md)

    def extract_PL(self, paragraph: Paragraph):
    def extract_PL(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Programming language"""
        md = self.extract_inner_content(paragraph)
        md = self.extract_inner_content(paragraph, extract_format)
        return f"```\n{md}\n```"

    def extract_EQ(self, paragraph: Paragraph):
    def extract_EQ(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Equation"""
        md = self.extract_inner_content(paragraph)
        md = self.extract_inner_content(paragraph, extract_format)
        return print_admonition(f"EQ", "some Equation", md)

    # Figure styles             For formatting figures

    def extract_TF(self, paragraph: Paragraph):
    def extract_TF(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Figure title"""
        md = self.extract_inner_content(paragraph)
        md = self.extract_inner_content(paragraph, extract_format)

        match = re.match(r"(\w+[\s\xa0]+[A-Z0-9\.]+)[: \xa0]*(.*)", md)
        if not match:
@@ -678,48 +815,68 @@ class TSExtractor:
        else:
            return "**{md}**"

    def extract_FL(self, paragraph: Paragraph):
    def extract_FL(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Figure layout, do nothing"""
        return ""

    def extract_NF(self, paragraph: Paragraph):
    def extract_NF(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Note in figure => use "tab" between "item/number" and "text"."""
        return self.extract_NO(paragraph)

    # Table styles              For formatting tables

    def extract_TH(self, paragraph: Paragraph):
    def extract_TH(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Table title"""
        return ""

    def extract_TAH(self, paragraph: Paragraph):
    def extract_TAH(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
    ):
        """Heading within table or column heading"""
        return self.extract_Normal(paragraph)
        return self.extract_Normal(paragraph, extract_format)

    def extract_TAC(self, paragraph: Paragraph):
    def extract_TAC(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
    ):
        """Centred texts"""
        return self.extract_Normal(paragraph)
        return self.extract_Normal(paragraph, extract_format)

    def extract_TAL(self, paragraph: Paragraph):
    def extract_TAL(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
    ):
        """Left aligned text"""
        return self.extract_Normal(paragraph)
        return self.extract_Normal(paragraph, extract_format)

    def extract_TAR(self, paragraph: Paragraph):
    def extract_TAR(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
    ):
        """Right aligned text"""
        return self.extract_Normal(paragraph)
        return self.extract_Normal(paragraph, extract_format)

    def extract_TB1(self, paragraph: Paragraph):
    def extract_TB1(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
    ):
        """List in tables Level 1"""
        md = self.extract_inner_content(paragraph)
        return f"\n<li>{md}</li>"
        html = self.extract_inner_content(paragraph, extract_format)
        return f"\n<li>{html}</li>"

    def extract_TB2(self, paragraph: Paragraph):
    def extract_TB2(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
    ):
        """List in tables Level 2"""
        return self.extract_TB1(paragraph)
        return self.extract_TB1(paragraph, extract_format)

    def extract_TAN(self, paragraph: Paragraph):
    def extract_TAN(
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.HTML
    ):
        """Note in table => use "tab" between "item/number" and "text"."""
        return self.extract_NO(paragraph)
        return self.extract_NO(paragraph, extract_format)

    # ---------------------------------------
    # For tables
@@ -730,7 +887,7 @@ class TSExtractor:
        If the paragraph just above `table` has style 'TH' and matches 'Table <n>: <title>',
        return (caption_id, caption_text). Otherwise, return ("Table_id_unknown", "label_unknown").
        """
        if i > 10:
        if i > 3:
            return "Table_id_unknown", "label_unknown"
        prev = get_prev_block(self.document, table)
        if isinstance(prev, Paragraph) and getattr(prev.style, "name", None) == "TH":
@@ -750,8 +907,8 @@ class TSExtractor:
        """
        parts = []
        for p in cell.paragraphs:
            parts.append(self.extract_paragraph(p))
        return "".join(parts)
            parts.append(self.extract_paragraph(p, ExtractFormat.HTML))
        return "<br>".join(parts)

    def get_paragraph_image_info(self, paragraph: Paragraph, i: int = 0):
        """
@@ -792,7 +949,7 @@ class TSExtractor:
        image_paragraph = get_prev_block(self.document, paragraph)
        return self.get_paragraph_image_info(image_paragraph, i + 1)

    def extract_table(self, table):
    def extract_table(self, table, extract_format: ExtractFormat = ExtractFormat.MD):
        """
        Generate HTML for a python-docx Table.
        - If the paragraph just above has style TH and matches 'Table N: ...',