better management of Figure identifiers in SAREF4LIFT (b8446a3e) · Commits · SAREF / saref-pypeline

src/saref_pypeline/docgen/ts2md_extractor.py

+9 −2

Original line number	Diff line number	Diff line
		@@ -853,7 +853,7 @@ class TS2MDExtractor:
		"""Figure title"""
		md = self.extract_inner_content(paragraph, extract_format)

		match = re.match(r"(\w+[\s\xa0]+[A-Z0-9\.]+)[: \xa0](.)", md)
		match = re.match(r"(\w+[\s\xa0]+[A-Z0-9\.-]+)[: \xa0](.)", md)
		if not match:
		return ""
		label, caption = match.group(1), match.group(2)
		@@ -875,6 +875,13 @@ class TS2MDExtractor:
		self, paragraph: Paragraph, extract_format: ExtractFormat = ExtractFormat.MD
		):
		"""Figure layout: emit nothing for an empty paragraph; warn and fall back to TF/Normal extraction if it contains text"""
		if paragraph.text.startswith("Figure"):
		logger.warning(f"Wrong style for {paragraph.text}")
		return self.extract_TF(paragraph, extract_format)
		elif paragraph.text.strip():
		logger.warning(f"Styl FL should only be for Figure layout. Got {paragraph.text}")
		return self.extract_Normal(paragraph, extract_format)
		else:
		return ""

		def extract_NF(

+7 −7

Original line number	Diff line number	Diff line
		@@ -935,8 +935,6 @@ class TSGenerator:
		)
		return

		for p in soup.find_all("p"):
		p.attrs["class"] = "Normal"
		for li in soup.find_all("li"):
		li.string = re.sub(r" : ", "\t", li.string, 1)
		li["data-docx-pstyle"] = P_STYLE.EW
		@@ -957,8 +955,6 @@ class TSGenerator:
		)
		return

		for p in soup.find_all("p"):
		p.attrs["class"] = "Normal"
		for li in soup.find_all("li"):
		for desc in li.descendants:
		if isinstance(desc, NavigableString):
		@@ -966,7 +962,11 @@ class TSGenerator:
		if n: # replaced once
		desc.replace_with(new_text)
		break

		if li.find_next_sibling("li") is not None:
		li["data-docx-pstyle"] = P_STYLE.EW
		else:
		li["data-docx-pstyle"] = P_STYLE.EX
		self.insert_soup(soup)

		def edit_history(self):
		@@ -1039,7 +1039,7 @@ class TSGenerator:
		# Methods for inserting soup
		# ---------------------------------------------------------------------

		@lru_cache
		# @lru_cache
		def get_soup(self, file_name: str) -> BeautifulSoup \| None:
		"""
		Load a file (HTML or Markdown) as a BeautifulSoup object.
		@@ -1421,7 +1421,7 @@ class TSGenerator:
		self.new_paragraph(style=P_STYLE.TF)
		if el.a:
		el.a.replace_with(el.a.text)
		match = re.match(r"^Figure (\d+)", el.get_text())
		match = re.match(r"^Figure ([\d\.-]+)", el.get_text())
		id = f"Figure_{match.group(1)}" if match else None
		with Bookmark(self, id):
		self.insert_soup_children(el)