Unverified commit b8446a3e authored by Maxime Lefrançois
Browse files

better management of Figure identifiers in SAREF4LIFT

parent a61ac989
Loading
Loading
Loading
Loading
+9 −2
Original line number Diff line number Diff line
@@ -853,7 +853,7 @@ class TS2MDExtractor:
        """Figure title"""
        md = self.extract_inner_content(paragraph, extract_format)

        match = re.match(r"(\w+[\s\xa0]+[A-Z0-9\.]+)[: \xa0]*(.*)", md)
        match = re.match(r"(\w+[\s\xa0]+[A-Z0-9\.-]+)[: \xa0]*(.*)", md)
        if not match:
            return ""
        label, caption = match.group(1), match.group(2)
@@ -875,6 +875,13 @@ class TS2MDExtractor:
        self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
    ):
        """Figure layout, do nothing"""
        if paragraph.text.startswith("Figure"):
            logger.warning(f"Wrong style for {paragraph.text}")
            return self.extract_TF(paragraph, extract_format)
        elif paragraph.text.strip():
            logger.warning(f"Styl FL should only be for Figure layout. Got {paragraph.text}")
            return self.extract_Normal(paragraph, extract_format)
        else:
            return ""

    def extract_NF(
+7 −7
Original line number Diff line number Diff line
@@ -935,8 +935,6 @@ class TSGenerator:
            )
            return

        for p in soup.find_all("p"):
            p.attrs["class"] = "Normal"
        for li in soup.find_all("li"):
            li.string = re.sub(r" *: *", "\t", li.string, 1)
            li["data-docx-pstyle"] = P_STYLE.EW
@@ -957,8 +955,6 @@ class TSGenerator:
            )
            return

        for p in soup.find_all("p"):
            p.attrs["class"] = "Normal"
        for li in soup.find_all("li"):
            for desc in li.descendants:
                if isinstance(desc, NavigableString):
@@ -966,7 +962,11 @@ class TSGenerator:
                    if n:  # replaced once
                        desc.replace_with(new_text)
                        break

            if li.find_next_sibling("li") is not None:
                li["data-docx-pstyle"] = P_STYLE.EW
            else:
                li["data-docx-pstyle"] = P_STYLE.EX
        self.insert_soup(soup)

    def edit_history(self):
@@ -1039,7 +1039,7 @@ class TSGenerator:
    # Methods for inserting soup
    # ---------------------------------------------------------------------

    @lru_cache
    # @lru_cache
    def get_soup(self, file_name: str) -> BeautifulSoup | None:
        """
        Load a file (HTML or Markdown) as a BeautifulSoup object.
@@ -1421,7 +1421,7 @@ class TSGenerator:
        self.new_paragraph(style=P_STYLE.TF)
        if el.a:
            el.a.replace_with(el.a.text)
        match = re.match(r"^Figure (\d+)", el.get_text())
        match = re.match(r"^Figure ([\d\.-]+)", el.get_text())
        id = f"Figure_{match.group(1)}" if match else None
        with Bookmark(self, id):
            self.insert_soup_children(el)