ongoing work on docx generation (292e0c55) · Commits · SAREF / saref-pypeline

saref_pypeline/docgen/docx_generator.py

+918 −191

File changed.

Preview size limit exceeded, changes collapsed.

saref_pypeline/etsi.py

0 → 100644

+293 −0

Original line number	Diff line number	Diff line
		import ast
		import inspect
		from enum import Enum, EnumMeta, StrEnum

		class WK_FIELD(StrEnum):
		"""Enumeration of fields of interest for a work item.

		The value of a field can be found in the ETSI skeleton for Technical Specifications, and need to be replaced by the corresponding value in a work item.
		"""
		# fields not present in the ETSI skeleton
		WK_ID = "Work item ID"
		"""Work Item ID on the ETSI portal."""
		Short_Title = "<short title>"
		"""Short Title of the Work Item"""

		# fields present in the ETSI skeleton
		ONE_DD = "1DD"
		"""First three digits of document number (ex. for TS 103 410-1: "103")"""
		DDD = "DDD"
		"""Last three digits of document number, optionally dash part (ex. for TS 103 410-1: "410-1")"""
		PART = "PART"
		"""Part of the technical specification (ex. for TS 103 410-1: "1")"""
		mte = "m.t.e"
		"""Version number (major, technical, editorial) (ex. for TS 103 410-1 V2.1.1: "2.1.1")"""
		yyyy = "yyyy"
		"""Publication year"""
		mm = "mm"
		"""Publication month"""
		Title = "Title;"
		"""Title 1"""
		Title_Part = "Part #: Part element of title;"
		"""Title 2"""
		Title_Sub_Part = "Sub-part #: Sub-part element of title"
		"""Title 3"""
		Title_Release = "Release #"
		"""Release number"""
		Workitem = "<Workitem>"
		"""Workitem number, ex. RTS/SmartM2M-103410-8v211"""
		Keywords = "<keywords>"
		"""Keywords, ex. AGEING, HEALTH, IoT, oneM2M, ontology, SAREF, Semantic"""
		TB_type = "ETSI Technical Committee\|ETSI Project\|<other>"
		"""Type of technical body, example "ETSI Technical Committee"."""
		long_techbody = "<long techbody>"
		"""Long name of technical body, example "Data Solutions"."""
		short_techbody = "<short techbody>"
		"""Short name of technical body, example "DATA"."""

		class TBType(Enum):
		    """The types of ETSI Technical Bodies.

		    The value of each member is the name of the type, spelled out as a string.
		    """

		    ETSI_Technical_Committee = "ETSI Technical Committee"
		    ETSI_Project = "ETSI Project"

		class TB(Enum):
		    """ETSI Technical Bodies, one member per body.

		    Every member is declared as a ``(TBType member, long name)`` pair. The
		    pair is unpacked into the arguments of ``__init__``, which exposes it
		    through the ``type`` and ``long_name`` attributes of the member.
		    """

		    SmartM2M = (
		        TBType.ETSI_Technical_Committee,
		        "Smart Machine-to-Machine communications",
		    )
		    DATA = (TBType.ETSI_Technical_Committee, "Data Solutions")

		    def __init__(self, type_: TBType, long_name):
		        # ``type`` stores the string value of the TBType member, not the
		        # TBType member itself.
		        self.long_name = long_name
		        self.type = type_.value

		class _MetaEnum(EnumMeta):
		"""Source: https://stackoverflow.com/a/78943193"""
		def __new__(metacls, clsname, bases, classdict):
		cls = super().__new__(metacls, clsname, bases, classdict)

		# Extract source code and split docstrings
		source = inspect.getsource(cls)
		docstrings = source.split('"""')[-2::-2]

		# Assign the docstrings to enum members
		for member_name, doc_str in zip(reversed(cls._member_names_), docstrings):
		enum_member = getattr(cls, member_name)
		enum_member.__doc__ = doc_str.strip()

		return cls

		class P_STYLE(StrEnum, metaclass=_MetaEnum):
		    """ETSI Styles, listed at https://portal.etsi.org/Services/editHelp/Standards-development/Drafting/Styles-listing/Styles-listing

		    The value of each member is the name of the style. The ``_MetaEnum``
		    metaclass reads the source of this class and attaches the string written
		    below each member as that member's ``__doc__``: keep exactly one
		    triple-quoted string directly after every member, and no other
		    triple-quoted text inside the class body.

		    Do not alter existing styles or formats pre-set in the ETSI styles, do not add new styles to the ETSI template and do not delete ETSI styles.

		    The ETSI Secretariat provides a Microsoft® Word template which contains a set of pre-defined styles simplifying the formatting of documents according to the ETSI drafting rules:

		    - applying the ETSI template from the very beginning of work avoids delay throughout the drafting stage;
		    - it can be applied to a new or existing ETSI deliverable;
		    - it is recommended to attach it to the change request (CR) template;
		    - it must only be used for the purpose of the standardization work within ETSI.

		    For Word for Windows® 2007 and higher use the following file:

		    Download ETSIW_2013.dotm (17 kb): https://portal.etsi.org/Portals/0/TBpages/edithelp/Docs/ETSIW_2013.dotm

		    ETSI Technical Specification (TS)

		    TS (ETSI Technical Specification) is the preferred deliverable when the document contains normative provisions and short time to "market", validation and maintenance are essential. A TS may later be converted to an ES or an EN, or be used to publish the contents of a draft ES being sent for vote or a draft EN being sent for Public Enquiry or vote.

		    https://portal.etsi.org/Portals/0/TBpages/edithelp/Docs/ETSI_Skeletons/ETSI_TS_skeleton.docx
		    """

		    # Heading styles: for the different headings

		    Heading_1 = "Heading 1"
		    """Clause"""
		    Heading_2 = "Heading 2"
		    """Subdivision level 2"""
		    Heading_3 = "Heading 3"
		    """Subdivision level 3"""
		    Heading_4 = "Heading 4"
		    """Subdivision level 4"""
		    Heading_5 = "Heading 5"
		    """Subdivision level 5"""
		    H6 = "H6"
		    """Subdivision level 6 (not reflected in the table of contents)"""
		    Heading_8 = "Heading 8"
		    """Annex title (for ENs, HSs, TSs, ESs and GSs only)"""
		    Heading_9 = "Heading 9"
		    """Annex title (for TRs, EGs, GRs and SRs only)"""

		    # Example styles: for examples and abbreviations/symbols lists

		    EX = "EX"
		    """Reference, Example => use "tab" between "item/number" and "text"."""
		    EW = "EW"
		    """Symbol, Abbreviation, Example continuation in text => use "tab" between "item/number" and "text"."""
		    NO = "NO"
		    """Note integrated in the text => use "tab" between "item/number" and "text"."""

		    # Figure styles: for formatting figures
		    TF = "TF"
		    """Figure title"""
		    FL = "FL"
		    """Figure layout"""
		    NF = "NF"
		    """Note in figure => use "tab" between "item/number" and "text"."""


		    # Table styles: for formatting tables

		    TH = "TH"
		    """Table title"""
		    TAH = "TAH"
		    """Heading within table or column heading """
		    TAC = "TAC"
		    """Centred texts"""
		    TAL = "TAL"
		    """Left aligned text """
		    TAR = "TAR"
		    """Right aligned text """
		    TB1 = "TB1"
		    """List in tables Level 1 """
		    TB2 = "TB2"
		    """List in tables Level 2 """
		    TAN = "TAN"
		    """Note in table => use "tab" between "item/number" and "text"."""

		    # List styles (indents)

		    B1 = "B1"
		    """Indent 1"""
		    B2 = "B2"
		    """Indent 2"""
		    B3 = "B3"
		    """Indent 3"""
		    B4 = "B4"
		    """Indent 4"""
		    B5 = "B5"
		    """Indent 5"""
		    B1_plus = "B1+"
		    """Bulleted indent 1 (round bullets)"""
		    B2_plus = "B2+"
		    """Bulleted indent 2 (dashes) """
		    B3_plus = "B3+"
		    """Bulleted indent 3 (square bullets) """
		    BN = "BN"
		    """Bulleted (numbers) indent 1 """
		    BL = "BL"
		    """Bulleted (letters) indent 1 """

		    # General styles: for different items
		    Normal = "Normal"
		    """Standard paragraph, Definition """
		    TT = "TT"
		    """Contents list title """
		    PL = "PL"
		    """Programming language """
		    EQ = "EQ"
		    """Equation """
		    Header = "Header"
		    """Header (portrait and landscape pages)"""

		    # Style which can be user-defined: for formatting defined by the user
		    # that will not be altered by the ETSI processing macros
		    FP = "FP"
		    """Free Paragraph.
		    Style which can be user-defined
		    For formatting defined by the user that will not be altered by the ETSI processing macros """

		class C_STYLE(StrEnum, metaclass=_MetaEnum):
		    """ETSI Character Styles, obtained from skeleton

		    The value of each member is the name of the character style. The
		    ``_MetaEnum`` metaclass reads the source of this class and attaches the
		    string written below each member as that member's ``__doc__``: keep
		    exactly one triple-quoted string directly after every member.
		    """

		    # The docstring below used to be a single-quoted string. A metaclass
		    # that pairs members with triple-quoted strings then sees one docstring
		    # too few and gives ``Normal`` the class docstring instead of its own.
		    Normal = "Default Paragraph Font"
		    """Times New Roman, 10"""

		    Hyperlink = "Hyperlink"
		    """Blue, Underlined"""

		    Guidance = "Guidance"
		    """Green, Italics"""

		    Strong = "Strong"
		    """Bold"""

		    HTML_Definition = "HTML Definition"
		    """Times, 10, Italics"""

		    HTML_Keyboard = "HTML Keyboard"
		    """Courier New, 10"""

		    HTML_Sample = "HTML Sample"
		    """Courier New, 10"""



		# Heading styles in document order: the six clause levels first (indices
		# 0-5), then the two annex-title styles (indices 6-7). The tables below are
		# derived from this order.
		HEADING_STYLES = [
		    P_STYLE.Heading_1,
		    P_STYLE.Heading_2,
		    P_STYLE.Heading_3,
		    P_STYLE.Heading_4,
		    P_STYLE.Heading_5,
		    P_STYLE.H6,
		    P_STYLE.Heading_8,
		    P_STYLE.Heading_9,
		]

		# Heading tag -> heading style: "h1".."h6" are the six clause levels,
		# "h7" and "h8" are the two annex-title styles.
		HEADING_TAG_2_STYLE = {
		    f"h{rank}": heading for rank, heading in enumerate(HEADING_STYLES, start=1)
		}

		# list of sections that announce the end of the given section
		#
		# A clause-level heading is ended by any heading of the same or a higher
		# level, or by an annex title. An annex title is ended by a "Heading 1" or
		# by another annex title.
		SECTION_ENDING_STYLES = {
		    **{
		        heading: HEADING_STYLES[:depth] + HEADING_STYLES[6:]
		        for depth, heading in enumerate(HEADING_STYLES[:6], start=1)
		    },
		    **{
		        annex_title: [P_STYLE.Heading_1, *HEADING_STYLES[6:]]
		        for annex_title in HEADING_STYLES[6:]
		    },
		}

		# NOTE(review): presumably the style of an unordered list nested in an item
		# of the key style -- confirm against the docx generator. Every listed
		# style maps to "B2+", except "B2+" itself which maps to "B3+".
		SUB_UL_STYLE = {
		    outer: (P_STYLE.B3_plus if outer is P_STYLE.B2_plus else P_STYLE.B2_plus)
		    for outer in (
		        P_STYLE.B1,
		        P_STYLE.B1_plus,
		        P_STYLE.B2_plus,
		        P_STYLE.BN,
		        P_STYLE.BL,
		    )
		}

		# Left empty for now.
		SUB_LI_STYLE = {}
		No newline at end of file

saref_pypeline/metadata_fetcher.py

+52 −29

Original line number	Diff line number	Diff line
		@@ -3,12 +3,16 @@ from bs4 import BeautifulSoup
		import re
		import logging
		from saref_pypeline._logging import TRACE_LEVEL
		from saref_pypeline.etsi import WK_FIELD, TBType, TB

		logger = logging.getLogger(__name__)


		def _fetch_keywords(ref, wk_id) -> str:
		logger.log(TRACE_LEVEL, f"fetch for {ref}")
		url = f"https://portal.etsi.org/webapp/WorkProgram/Report_WorkItem.asp?WKI_ID={wk_id}"
		url = (
		f"https://portal.etsi.org/webapp/WorkProgram/Report_WorkItem.asp?WKI_ID={wk_id}"
		)
		resp = requests.get(url)
		try:
		resp.raise_for_status()
		@@ -26,6 +30,7 @@ def _fetch_keywords(ref, wk_id) -> str:
		except Exception as e:
		logger.warning(f"Exception while fetching keywords for {ref} {wk_id}: {e}")


		def fetch_metadata() -> list:
		result = list()
		for search in ["103264", "103548", "103410"]:
		@@ -46,17 +51,26 @@ def fetch_metadata() -> list:
		if not "Drafting" in status and not "Published" in status:
		continue

		pub_date_match = re.search(r"Publication\s+\(([0-9]{4}-[0-9]{2}-[0-9]{2})\)", status)
		pub_date = pub_date_match.group(1) if pub_date_match else None
		pub_date_match = re.search(
		r"Publication\s+\(([0-9]{4})-([0-9]{2})-[0-9]{2}\)", status
		)
		yyyy, mm = pub_date_match.group(1, 2) if pub_date_match else (None, None)

		# Extract version, ref, Work Item ID
		try:
		col_text = cells[1].get_text(separator=" ", strip=True)
		doc_nb = re.search(r"Doc\.\ Nb\.\s*(TS\ [0-9]{3}\ [0-9]{3}(-[0-9]+)?)", col_text).group(1)
		version_match = re.search(r"Ver\.\s*(\d+\.\d+\.\d+)", col_text)
		version = version_match.group(1) if version_match else None
		one_dd, ddd, part = re.search(
		r"Doc\.\ Nb\.\s*TS\ ([0-9]{3})\ ([0-9]{3}(-([0-9]+))?)", col_text
		).group(1, 2, 4)
		mte_match = re.search(r"Ver\.\s*(\d+\.\d+\.\d+)", col_text)
		mte = mte_match.group(1) if mte_match else None
		tb = re.search(r"Technical Body:\s*([\w-]+)", col_text).group(1)
		ref = re.search(r"Ref\.\s*([A-Z]+/[\w-]+)", col_text).group(1)
		wk_id = int(re.search(r"WKI_ID=(\d+)", cells[1].find("a", href=True)["href"]).group(1))
		wk_id = int(
		re.search(
		r"WKI_ID=(\d+)", cells[1].find("a", href=True)["href"]
		).group(1)
		)
		except:
		continue

		@@ -66,31 +80,40 @@ def fetch_metadata() -> list:
		title1 = title_lines[0] if len(title_lines) >= 2 else None
		title2 = title_lines[1] if len(title_lines) >= 3 else None
		title3 = title_lines[2] if len(title_lines) >= 4 else None
		try:
		part = int(re.search(r"Part\s+(\d+)", title3).group(1))
		except:
		part = None
		# try:
		# part = int(re.search(r"Part\s+(\d+)", title3).group(1))
		# except:
		# part = None

		# fetch keywords
		keywords = _fetch_keywords(ref, wk_id)

		result.append({
		"doc_nb": doc_nb,
		"ref": ref,
		"version": version,
		"pub_date": pub_date,
		"wk_id": wk_id,
		"short_title": short_title,
		"title1": title1,
		"title2": title2,
		"title3": title3,
		"part": part,
		"keywords": keywords,
		})
		result.append(
		{
		WK_FIELD.WK_ID: wk_id,
		WK_FIELD.Short_Title: short_title,
		WK_FIELD.ONE_DD: one_dd,
		WK_FIELD.DDD: ddd,
		WK_FIELD.PART: part,
		WK_FIELD.mte: mte,
		WK_FIELD.yyyy: yyyy,
		WK_FIELD.mm: mm,
		WK_FIELD.Title: title1,
		WK_FIELD.Title_Part: title2,
		WK_FIELD.Title_Sub_Part: title3,
		WK_FIELD.Title_Release: None,
		WK_FIELD.Workitem: ref,
		WK_FIELD.Keywords: keywords,
		WK_FIELD.TB_type: TB[tb].type,
		WK_FIELD.short_techbody: tb,
		WK_FIELD.long_techbody: TB[tb].long_name,
		}
		)
		return result


		if __name__ == "__main__":
		details = fetch_metadata()
		import pprint
		pprint.pprint(details)

		pprint.pprint(details)

saref_pypeline/pipeline.py

+19 −0

Original line number	Diff line number	Diff line
		@@ -5,6 +5,7 @@ import re
		from git import Repo, GitCommandError
		from saref_pypeline._logging import TRACE_LEVEL
		from saref_pypeline.utils import skip_if_filtered
		from saref_pypeline.etsi import WK_FIELD
		from saref_pypeline.docgen import SiteManager
		from saref_pypeline.dataset import ManagedDataset
		from saref_pypeline.constants import *
		@@ -509,10 +510,28 @@ class SAREFPipeline:
		def fetch_projects_metadata(self):
		    """Load the projects metadata, from the YAML cache when possible.

		    The cache is ``projects_metadata.yaml`` in ``self.target_dir``. Mapping
		    keys that are names of ``WK_FIELD`` members are turned back into the
		    members while loading. When the cache cannot be read, cannot be parsed
		    or is empty, the metadata is obtained from ``fetch_metadata()`` and the
		    cache is (re)written, with ``WK_FIELD`` keys dumped by member name.

		    The result is stored in ``self.projects_metadata``.
		    """
		    metadata_file = os.path.join(self.target_dir, "projects_metadata.yaml")
		    try:

		        def dict_fields_constructor(loader, node):
		            # Map string keys back to WK_FIELD members if possible
		            mapping = loader.construct_mapping(node)
		            return {
		                WK_FIELD[k] if k in WK_FIELD.__members__ else k: v
		                for k, v in mapping.items()
		            }

		        # NOTE(review): this registers the constructor on yaml.SafeLoader
		        # itself, i.e. for every later yaml.safe_load in the process, and
		        # again on each call. Consider a dedicated SafeLoader subclass if
		        # no other code relies on this.
		        yaml.SafeLoader.add_constructor(
		            "tag:yaml.org,2002:map", dict_fields_constructor
		        )
		        with open(metadata_file, "r") as f:
		            self.projects_metadata = yaml.safe_load(f)
		        if not self.projects_metadata:
		            # An empty cache is treated like a missing one.
		            raise ValueError(f"no projects metadata in {metadata_file}")
		    except Exception:
		        # Any failure to use the cache falls back to a fresh fetch.
		        # ``Exception`` rather than a bare ``except`` so that
		        # KeyboardInterrupt and SystemExit are not swallowed.
		        self.projects_metadata = fetch_metadata()

		        def represent_string(dumper, data):
		            # Dump a WK_FIELD member as its name, which is what the
		            # constructor above maps back to the member.
		            return dumper.represent_str(data.name)

		        # NOTE(review): registered on yaml.SafeDumper itself (process-wide).
		        yaml.SafeDumper.add_representer(WK_FIELD, represent_string)
		        with open(metadata_file, "w") as f:
		            yaml.safe_dump(self.projects_metadata, f)

saref_pypeline/resources/docgen/ETSI_SAREF_TS_skeleton.docx

0 → 100644

+127 KiB

File added.

No diff preview for this file type.

View file