Unverified Commit e18c7d0d authored by Maxime Lefrançois's avatar Maxime Lefrançois
Browse files

start Task 3, extract data from ETSI Portal

parent 5d7a6142
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ dominate = "*"
markdown = "*"
pygments = "*"
beautifulsoup4 = "*"
python-docx = "*"

[dev-packages]
pytest = "*"
+110 −1
Original line number Diff line number Diff line
{
    "_meta": {
        "hash": {
            "sha256": "3ea1c1b66331595be357927e997c45458818799398be5d0a76d74775b8dec1c4"
            "sha256": "61451871c2689e6dcf0ecb3dd9f595052940889c2d2d1a911cd67b12960f3d64"
        },
        "pipfile-spec": 6,
        "requires": {},
@@ -186,6 +186,106 @@
            "markers": "python_version >= '3.7'",
            "version": "==3.1.6"
        },
        "lxml": {
            "hashes": [
                "sha256:013090383863b72c62a702d07678b658fa2567aa58d373d963cca245b017e065",
                "sha256:032e65120339d44cdc3efc326c9f660f5f7205f3a535c1fdbf898b29ea01fb72",
                "sha256:048a930eb4572829604982e39a0c7289ab5dc8abc7fc9f5aabd6fbc08c154e93",
                "sha256:04d67ceee6db4bcb92987ccb16e53bef6b42ced872509f333c04fb58a3315256",
                "sha256:059c4cbf3973a621b62ea3132934ae737da2c132a788e6cfb9b08d63a0ef73f9",
                "sha256:0e32698462aacc5c1cf6bdfebc9c781821b7e74c79f13e5ffc8bfe27c42b1abf",
                "sha256:1676b56d48048a62ef77a250428d1f31f610763636e0784ba67a9740823988ca",
                "sha256:17f090a9bc0ce8da51a5632092f98a7e7f84bca26f33d161a98b57f7fb0004ca",
                "sha256:185efc2fed89cdd97552585c624d3c908f0464090f4b91f7d92f8ed2f3b18f54",
                "sha256:1fa377b827ca2023244a06554c6e7dc6828a10aaf74ca41965c5d8a4925aebb4",
                "sha256:2181e4b1d07dde53986023482673c0f1fba5178ef800f9ab95ad791e8bdded6a",
                "sha256:219e0431ea8006e15005767f0351e3f7f9143e793e58519dc97fe9e07fae5563",
                "sha256:21db1ec5525780fd07251636eb5f7acb84003e9382c72c18c542a87c416ade03",
                "sha256:246b40f8a4aec341cbbf52617cad8ab7c888d944bfe12a6abd2b1f6cfb6f6082",
                "sha256:2793a627e95d119e9f1e19720730472f5543a6d84c50ea33313ce328d870f2dd",
                "sha256:2930aa001a3776c3e2601cb8e0a15d21b8270528d89cc308be4843ade546b9ab",
                "sha256:2ae06fbab4f1bb7db4f7c8ca9897dc8db4447d1a2b9bee78474ad403437bcc29",
                "sha256:2b4790b558bee331a933e08883c423f65bbcd07e278f91b2272489e31ab1e2b4",
                "sha256:2cfcf84f1defed7e5798ef4f88aa25fcc52d279be731ce904789aa7ccfb7e8d2",
                "sha256:2dd1cc3ea7e60bfb31ff32cafe07e24839df573a5e7c2d33304082a5019bcd58",
                "sha256:2f34687222b78fff795feeb799a7d44eca2477c3d9d3a46ce17d51a4f383e32e",
                "sha256:310b719b695b3dd442cdfbbe64936b2f2e231bb91d998e99e6f0daf991a3eba3",
                "sha256:34190a1ec4f1e84af256495436b2d196529c3f2094f0af80202947567fdbf2e7",
                "sha256:35bc626eec405f745199200ccb5c6b36f202675d204aa29bb52e27ba2b71dea8",
                "sha256:36531f81c8214e293097cd2b7873f178997dae33d3667caaae8bdfb9666b76c0",
                "sha256:390240baeb9f415a82eefc2e13285016f9c8b5ad71ec80574ae8fa9605093cd7",
                "sha256:40442e2a4456e9910875ac12951476d36c0870dcb38a68719f8c4686609897c4",
                "sha256:4337e4aec93b7c011f7ee2e357b0d30562edd1955620fdd4aeab6aacd90d43c5",
                "sha256:43cfbb7db02b30ad3926e8fceaef260ba2fb7df787e38fa2df890c1ca7966c3b",
                "sha256:43fe5af2d590bf4691531b1d9a2495d7aab2090547eaacd224a3afec95706d76",
                "sha256:46b9ed911f36bfeb6338e0b482e7fe7c27d362c52fde29f221fddbc9ee2227e7",
                "sha256:4d23854ecf381ab1facc8f353dcd9adeddef3652268ee75297c1164c987c11dc",
                "sha256:4d6036c3a296707357efb375cfc24bb64cd955b9ec731abf11ebb1e40063949f",
                "sha256:4eb114a0754fd00075c12648d991ec7a4357f9cb873042cc9a77bf3a7e30c9db",
                "sha256:4ee56288d0df919e4aac43b539dd0e34bb55d6a12a6562038e8d6f3ed07f9e36",
                "sha256:51a5e4c61a4541bd1cd3ba74766d0c9b6c12d6a1a4964ef60026832aac8e79b3",
                "sha256:522fe7abb41309e9543b0d9b8b434f2b630c5fdaf6482bee642b34c8c70079c8",
                "sha256:54c4855eabd9fc29707d30141be99e5cd1102e7d2258d2892314cf4c110726c3",
                "sha256:5592401cdf3dc682194727c1ddaa8aa0f3ddc57ca64fd03226a430b955eab6f6",
                "sha256:58ffd35bd5425c3c3b9692d078bf7ab851441434531a7e517c4984d5634cd65b",
                "sha256:5967fe415b1920a3877a4195e9a2b779249630ee49ece22021c690320ff07452",
                "sha256:5fcd7d3b1d8ecb91445bd71b9c88bdbeae528fefee4f379895becfc72298d181",
                "sha256:63b634facdfbad421d4b61c90735688465d4ab3a8853ac22c76ccac2baf98d97",
                "sha256:690b20e3388a7ec98e899fd54c924e50ba6693874aa65ef9cb53de7f7de9d64a",
                "sha256:6da7cd4f405fd7db56e51e96bff0865b9853ae70df0e6720624049da76bde2da",
                "sha256:7488a43033c958637b1a08cddc9188eb06d3ad36582cebc7d4815980b47e27ef",
                "sha256:74e748012f8c19b47f7d6321ac929a9a94ee92ef12bc4298c47e8b7219b26541",
                "sha256:78718d8454a6e928470d511bf8ac93f469283a45c354995f7d19e77292f26108",
                "sha256:7bf61bc4345c1895221357af8f3e89f8c103d93156ef326532d35c707e2fb19d",
                "sha256:7da298e1659e45d151b4028ad5c7974917e108afb48731f4ed785d02b6818994",
                "sha256:84ef591495ffd3f9dcabffd6391db7bb70d7230b5c35ef5148354a134f56f2be",
                "sha256:85b14a4689d5cff426c12eefe750738648706ea2753b20c2f973b2a000d3d261",
                "sha256:8a2e76efbf8772add72d002d67a4c3d0958638696f541734304c7f28217a9cab",
                "sha256:8a78d6c9168f5bcb20971bf3329c2b83078611fbe1f807baadc64afc70523b3a",
                "sha256:8cb26f51c82d77483cdcd2b4a53cda55bbee29b3c2f3ddeb47182a2a9064e4eb",
                "sha256:8db5dc617cb937ae17ff3403c3a70a7de9df4852a046f93e71edaec678f721d0",
                "sha256:9ab542c91f5a47aaa58abdd8ea84b498e8e49fe4b883d67800017757a3eb78e8",
                "sha256:9da022c14baeec36edfcc8daf0e281e2f55b950249a455776f0d1adeeada4734",
                "sha256:9f4b481b6cc3a897adb4279216695150bbe7a44c03daba3c894f49d2037e0a24",
                "sha256:a52a4704811e2623b0324a18d41ad4b9fabf43ce5ff99b14e40a520e2190c851",
                "sha256:a55da151d0b0c6ab176b4e761670ac0e2667817a1e0dadd04a01d0561a219349",
                "sha256:a674c0948789e9136d69065cc28009c1b1874c6ea340253db58be7622ce6398f",
                "sha256:ae74f7c762270196d2dda56f8dd7309411f08a4084ff2dfcc0b095a218df2e06",
                "sha256:afd27d8629ae94c5d863e32ab0e1d5590371d296b87dae0a751fb22bf3685741",
                "sha256:b2d71cdefda9424adff9a3607ba5bbfc60ee972d73c21c7e3c19e71037574816",
                "sha256:b34339898bb556a2351a1830f88f751679f343eabf9cf05841c95b165152c9e7",
                "sha256:b372d10d17a701b0945f67be58fae4664fd056b85e0ff0fbc1e6c951cdbc0512",
                "sha256:b3c98d5b24c6095e89e03d65d5c574705be3d49c0d8ca10c17a8a4b5201b72f5",
                "sha256:b8dd6dd0e9c1992613ccda2bcb74fc9d49159dbe0f0ca4753f37527749885c25",
                "sha256:bd5913b4972681ffc9718bc2d4c53cde39ef81415e1671ff93e9aa30b46595e7",
                "sha256:c0b5fa5eda84057a4f1bbb4bb77a8c28ff20ae7ce211588d698ae453e13c6281",
                "sha256:c16304bba98f48a28ae10e32a8e75c349dd742c45156f297e16eeb1ba9287a1f",
                "sha256:c24b8efd9c0f62bad0439283c2c795ef916c5a6b75f03c17799775c7ae3c0c9e",
                "sha256:c2a5e8d207311a0170aca0eb6b160af91adc29ec121832e4ac151a57743a1e1e",
                "sha256:c352fc8f36f7e9727db17adbf93f82499457b3d7e5511368569b4c5bd155a922",
                "sha256:c86df1c9af35d903d2b52d22ea3e66db8058d21dc0f59842ca5deb0595921141",
                "sha256:c907516d49f77f6cd8ead1322198bdfd902003c3c330c77a1c5f3cc32a0e4d16",
                "sha256:ca50bd612438258a91b5b3788c6621c1f05c8c478e7951899f492be42defc0da",
                "sha256:d18a25b19ca7307045581b18b3ec9ead2b1db5ccd8719c291f0cd0a5cec6cb81",
                "sha256:d4f0c66df4386b75d2ab1e20a489f30dc7fd9a06a896d64980541506086be1f1",
                "sha256:d6e200909a119626744dd81bae409fc44134389e03fbf1d68ed2a55a2fb10991",
                "sha256:d7ae472f74afcc47320238b5dbfd363aba111a525943c8a34a1b657c6be934c3",
                "sha256:db0efd6bae1c4730b9c863fc4f5f3c0fa3e8f05cae2c44ae141cb9dfc7d091dc",
                "sha256:dbdd7679a6f4f08152818043dbb39491d1af3332128b3752c3ec5cebc0011a72",
                "sha256:e0b1520ef900e9ef62e392dd3d7ae4f5fa224d1dd62897a792cf353eb20b6cae",
                "sha256:e2030956cf4886b10be9a0285c6802e078ec2391e1dd7ff3eb509c2c95a69b76",
                "sha256:e35e8aaaf3981489f42884b59726693de32dabfc438ac10ef4eb3409961fd402",
                "sha256:e380e85b93f148ad28ac15f8117e2fd8e5437aa7732d65e260134f83ce67911b",
                "sha256:edf6e4c8fe14dfe316939711e3ece3f9a20760aabf686051b537a7562f4da91a",
                "sha256:f3389924581d9a770c6caa4df4e74b606180869043b9073e2cec324bad6e306e",
                "sha256:f64ccf593916e93b8d36ed55401bb7fe9c7d5de3180ce2e10b08f82a8f397316",
                "sha256:f720a14aa102a38907c6d5030e3d66b3b680c3e6f6bc95473931ea3c00c59967",
                "sha256:f8d19565ae3eb956d84da3ef367aa7def14a2735d05bd275cd54c0301f0d0d6c",
                "sha256:f97487996a39cb18278ca33f7be98198f278d0bc3c5d0fd4d7b3d63646ca3c8a"
            ],
            "markers": "python_version >= '3.8'",
            "version": "==6.0.0"
        },
        "markdown": {
            "hashes": [
                "sha256:247b9a70dd12e27f67431ce62523e675b866d254f900c4fe75ce3dda62237c45",
@@ -331,6 +431,15 @@
            "markers": "python_full_version >= '3.8.1' and python_full_version < '4.0.0'",
            "version": "==0.28.1"
        },
        "python-docx": {
            "hashes": [
                "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7",
                "sha256:7bc9d7b7d8a69c9c02ca09216118c86552704edc23bac179283f2e38f86220ce"
            ],
            "index": "pypi",
            "markers": "python_version >= '3.9'",
            "version": "==1.2.0"
        },
        "python-dotenv": {
            "hashes": [
                "sha256:31f23644fe2602f88ff55e1f5c79ba497e01224ee7737937930c448e4d0e24dc",
+4 −1
Original line number Diff line number Diff line
from saref_pypeline.docgen.site_manager import SiteManager, HTMLDocumentationGenerator
from saref_pypeline.docgen.utils import *
from saref_pypeline.docgen.html_generator import HTMLDocumentationGenerator
from saref_pypeline.docgen.docx_generator import DOCXDocumentationGenerator
from saref_pypeline.docgen.site_manager import SiteManager
 No newline at end of file
+243 −0
Original line number Diff line number Diff line
import re
import docx.text
import docx.text.paragraph
import docx.enum.style
import docx.enum.text
import docx.styles
import docx.styles.style
import docx.enum
from docx import Document
from docx.shared import Inches , Pt, Cm
import os
import requests
from bs4 import BeautifulSoup
import re

from git import TYPE_CHECKING
from rdflib import OWL, URIRef

from saref_pypeline.docgen.utils import OWL_GRAPH, EntityDescription
from saref_pypeline.entities import SAREFProjectVersion

if TYPE_CHECKING:
    from saref_pypeline.docgen import SiteManager

def get_saref_work_item(saref_doc: str, version: str, timeout: float = 30.0) -> dict:
    """Find the ETSI work item matching a SAREF document title and version.

    Queries the ETSI portal work item list with ``saref_doc`` as the title
    filter, then scans the result table for the first row whose second
    column carries both a ``Ver. x.y.z`` equal to ``version`` and a
    ``Ref. <TYPE>/SmartM2M-...`` reference.

    Args:
        saref_doc: Title filter sent to the portal (e.g. ``"saref4envi"``).
        version: Version to match, with or without leading ``V``
            (``"V1.1.1"`` and ``"1.1.1"`` are equivalent).
        timeout: Seconds to wait for the portal before giving up.

    Returns:
        A dict with keys ``ref`` (str), ``title`` (str), ``part`` (int or
        ``None`` when the title has no ``Part N``) and ``wk_id`` (int or
        ``None`` when no ``WKI_ID=`` link is found in the row).

    Raises:
        requests.HTTPError: If the portal answers with an error status.
        ValueError: If no row matches the requested document and version.
    """
    # Let requests build the query string so the title is URL-encoded, and
    # always bound the wait: requests has no default timeout.
    response = requests.get(
        "https://portal.etsi.org/webapp/WorkProgram/Frame_WorkItemList.asp",
        params={"qTITLE": saref_doc},
        timeout=timeout,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    # The portal prints versions without the leading "V".
    wanted_version = version.lstrip("V")

    # Work item rows are <tr> elements whose 2nd column contains 'Ref.' and 'Ver.'
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 3:
            continue

        # Version and reference both live in the 2nd column
        col_text = cells[1].get_text(separator=" ", strip=True)
        version_match = re.search(r"Ver\.\s*(\d+\.\d+\.\d+)", col_text)
        ref_match = re.search(r"Ref\.\s*([A-Z]+/SmartM2M-\d+-\w+)", col_text)
        if not version_match or not ref_match:
            continue
        if version_match.group(1) != wanted_version:
            continue

        # Work item id comes from the WKI_ID query parameter of the first link
        wk_id_link = cells[1].find("a", href=True)
        wk_id_match = re.search(r"WKI_ID=(\d+)", wk_id_link["href"]) if wk_id_link else None
        wk_id = int(wk_id_match.group(1)) if wk_id_match else None

        # Title is the 3rd column's text, minus fragments starting with
        # "D2." or "SAREF4"
        title = " ".join(
            fragment
            for fragment in (text.strip() for text in cells[2].stripped_strings)
            if not fragment.startswith(("D2.", "SAREF4"))
        )
        part_match = re.search(r"Part\s+(\d+)", title)
        part = int(part_match.group(1)) if part_match else None

        return {
            "ref": ref_match.group(1),
            "title": title,
            "part": part,
            "wk_id": wk_id
        }

    raise ValueError(f"No matching work item found for document '{saref_doc}' and version '{version}'")

# Example result of get_saref_work_item: {'ref': 'DTS/SmartM2M-103410-7', 'title': 'SmartM2M; Extension to SAREF; Part 7: Automotive Domain', 'part': 7, 'wk_id': 51402}

def _cell_after_header(hdr, child_index: int):
    """Return child ``child_index`` of the second sibling after ``hdr``'s parent, or ``None``.

    The report page is navigated positionally; any missing sibling or
    too-short child list yields ``None`` instead of raising.
    """
    try:
        return list(hdr.parent.next_sibling.next_sibling.children)[child_index]
    except (AttributeError, IndexError):
        return None


def fetch_work_item_details(wk_id: int, timeout: float = 30.0) -> dict:
    """Scrape document number, cover date and keywords of an ETSI work item.

    Args:
        wk_id: Work item id, sent as the ``WKI_ID`` query parameter.
        timeout: Seconds to wait for the portal before giving up.

    Returns:
        A dict with ``"ETSI Doc. Number"`` (always), ``"Keywords"`` (always,
        ``"; "``-joined, empty string when none were found) and ``"Date"``
        (only when a cover date cell was found).

    Raises:
        requests.HTTPError: If the portal answers with an error status.
        ValueError: If the ETSI document number cannot be extracted.
    """
    # requests has no default timeout; always bound the wait.
    resp = requests.get(
        "https://portal.etsi.org/webapp/WorkProgram/Report_WorkItem.asp",
        params={"WKI_ID": wk_id},
        timeout=timeout,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")

    details = {}

    # 1. Header cells (class "RowHead") label values that sit two siblings
    #    further down, at a fixed child position of that sibling.
    for hdr in soup.find_all("td", class_="RowHead"):
        txt = hdr.get_text(strip=True)
        if "ETSI Doc." in txt:
            val_td = _cell_after_header(hdr, 5)
            if val_td:
                details["ETSI Doc. Number"] = val_td.get_text(strip=True)

        if "Cover Date" in txt:
            val_td = _cell_after_header(hdr, 7)
            if val_td:
                details["Date"] = val_td.get_text(strip=True)

    # 2. Keywords: locate the row whose second <td> is the "Keywords" header;
    #    the following row holds the values in the same column position.
    table = soup.find("table", class_="Table")
    if table:
        for row in table.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) >= 2 and "Keywords" in tds[1].get_text(strip=True):
                next_row = row.find_next_sibling("tr")
                next_tds = next_row.find_all("td") if next_row else []
                if len(next_tds) >= 2:
                    # keywords are separated by <br>
                    keywords = [kw.strip() for kw in next_tds[1].stripped_strings]
                    details["Keywords"] = "; ".join(keywords)
                # only the first "Keywords" header row is considered
                break

    if "ETSI Doc. Number" not in details:
        raise ValueError("Unable to extract ETSI Doc. Number")
    details.setdefault("Keywords", "")

    return details

# Example result of fetch_work_item_details (a 'Date' entry is added when a cover date is found): {'ETSI Doc. Number': 'TS 103 410-7', 'Keywords': 'IoT; oneM2M; ontology; SAREF; Semantic; TRANSPORT'}


class DOCXDocumentationGenerator:
    """Render the documentation of one SAREF project version as a DOCX file.

    The generator copies its working context (pipeline, site directory,
    dataset and namespace manager) from the given site manager and keeps a
    per-instance cache of ``EntityDescription`` objects keyed by URI.
    """

    def __init__(self, site_manager: "SiteManager", project_version:SAREFProjectVersion):
        """Bind the generator to a site manager and a project version.

        Args:
            site_manager: Provides ``pipeline``, ``site_dir`` and ``dataset``.
            project_version: Project version to document; its ``project``
                and ``version`` attributes are copied onto the generator.
        """
        self.pipeline = site_manager.pipeline
        self.site_dir = site_manager.site_dir
        self.dataset = site_manager.dataset
        self.nm = self.dataset.namespace_manager
        self.project_version = project_version
        self.project = project_version.project
        self.version = project_version.version

        self._entity_cache = {}
        # Precompute descriptions for the built-in OWL entities from the OWL
        # graph. The original list named bottomDataProperty twice and never
        # topObjectProperty; the duplicate is assumed to have been meant as
        # topObjectProperty (TODO confirm).
        for owl_entity in (
            OWL.Thing,
            OWL.Nothing,
            OWL.topObjectProperty,
            OWL.bottomObjectProperty,
            OWL.topDataProperty,
            OWL.bottomDataProperty,
        ):
            self._description(owl_entity, OWL_GRAPH)

    def render_document(self) -> Document:
        """Build the DOCX document from the bundled stub and return it.

        Loads ``../resources/docgen/stub.docx`` relative to this module,
        registers the two Consolas character styles, resets the heading
        counter and writes placeholder content. The document is also stored
        on ``self.document``.
        """
        self.document = Document(os.path.join(os.path.dirname(__file__), '../resources/docgen/stub.docx'))

        # Character style used for code fragments inside headings
        self.code_title = self.document.styles.add_style("Consolas_title", docx.enum.style.WD_STYLE_TYPE.CHARACTER) #type: docx.styles.style.CharacterStyle
        self.code_title.font.name = "Consolas"

        # Character style used for code fragments inside body text
        code = self.document.styles.add_style("Consolas", docx.enum.style.WD_STYLE_TYPE.CHARACTER) #type: docx.styles.style.CharacterStyle
        code.font.name = "Consolas"
        code.font.size = Pt(9)

        # current heading number; add_heading increments the last component
        # before use, so the first heading is numbered 5
        self.n = [4]

        self.add_heading("Hello docx")

        p = self.document.add_paragraph()
        p.add_run("hi code", style=code).font.bold = True
        p.add_run(" hi text ").font.bold = False

        # todo:
        # find official work item on the etsi portal
        # example https://portal.etsi.org/webapp/WorkProgram/SimpleSearch/QueryForm.asp
        # https://portal.etsi.org/webapp/WorkProgram/Frame_WorkItemList.asp?qTITLE=saref4envi
        # https://portal.etsi.org/webapp/WorkProgram/Report_WorkItem.asp?WKI_ID=63058
        #
        # parse page and extract

        return self.document

    def _description(self, uri:URIRef, graph) -> EntityDescription:
        """Return the cached description for ``uri``, creating it on first use.

        ``graph`` is only used when ``uri`` is not cached yet; a cached
        description is returned as-is whatever graph it was built from.
        """
        # Build the EntityDescription only on a cache miss: passing it to
        # dict.setdefault would construct it on every call.
        try:
            return self._entity_cache[uri]
        except KeyError:
            description = self._entity_cache[uri] = EntityDescription(uri, graph)
            return description

    def add_runs(self, s, p:docx.text.paragraph.Paragraph=None, style=None, header = None):
        """Append ``s`` to a paragraph, styling code-like fragments.

        Args:
            s: Text to add. A list is joined with ``", "``; any other
                non-string value is converted with ``str``.
            p: Paragraph to append to; a new paragraph is added to
                ``self.document`` when omitted.
            style: Style applied to the runs holding backtick-delimited
                fragments; other runs use the default style.
            header: Optional text added first, as a bold run.
        """
        # Fixed: the fallback used an undefined global ``document``.
        _p = p if p is not None else self.document.add_paragraph()
        if header:
            _p.add_run(header).font.bold = True
        if isinstance(s, list):
            s = ", ".join(s)
        if not isinstance(s, str):
            s = str(s)
        # Wrap prefixed names (xsd:, owl:, saref:, s4xxxx:, ...) in backticks,
        # then collapse the doubled backticks this produces around names
        # that were already quoted.
        s = re.sub(r"((xsd|owl|geo|time|foaf|vcard|org|saref|s4[a-z]{4}):[a-zA-Z0-9_-]+)",r"`\1`", s)
        s = re.sub(r"``",r"`", s)
        # Each match is (plain text, `quoted` span, quoted span content)
        for plain, _, quoted in re.findall(r"([^`]*)(`([^`]+)`)?", s):
            if plain:
                _p.add_run(plain)
            if quoted:
                _p.add_run(quoted, style=style)


    # def add_figures(self, title, figures):
    #     for figure in figures:
    #         if os.path.isfile(f"documentation/diagrams/{figure}"):
    #             p = self.document.add_paragraph()
    #             p.paragraph_format.alignment = docx.enum.text.WD_PARAGRAPH_ALIGNMENT.CENTER
    #             p.add_run().add_picture(f'documentation/diagrams/{figure}', width=Cm(17))
    #             # add_runs(title, self.document.add_paragraph(style="TF"))

    def add_heading(self, title):
        """Add the next numbered heading at the current nesting depth.

        Increments the last component of ``self.n``, emits a heading whose
        text is the dotted number followed by a tab, at level
        ``len(self.n)``, then appends ``title`` with the ``code_title``
        style for code fragments. Relies on ``self.document``, ``self.n``
        and ``self.code_title`` having been set by ``render_document``.
        """
        self.n[-1]+=1
        p = self.document.add_heading(f'{".".join([str(x) for x in self.n])}\t', len(self.n))
        self.add_runs(title, p, self.code_title)

    # def start_subsection_if_needed(self, query=None):
    #     has_subsections = not query or g.query(query).askAnswer
    #     if has_subsections:
    #         n.append(-1)           
    #         self.add_heading("Definition")
    #     return has_subsections

    # def add_note_for_query(self, header, query):
    #     ts = [self.nm.qname(t) if isinstance(t, URIRef) else str(t) for t, in g.query(query)]
    #     if ts:
    #         self.add_runs(ts, header=header)
            
    # def add_note_for_query_2(self, header, query):
    #     ts = []
    #     for s,o in g.query(query):
    #         a = self.nm.qname(s)
    #         a += " to "
    #         if isinstance(o, URIRef):
    #             a += self.nm.qname(o)
    #         else:
    #             a += str(o)
    #         ts.append(a)
    #     if ts:
    #         self.add_runs(",".join(ts), header=header)
            
    # def add_comment(self, subject):
    #     for comment, in g.query(f"""SELECT ?comment WHERE {{ {subject.n3()} rdfs:comment ?comment FILTER( lang(?comment)="{LANG}") }}"""):
    #         self.add_runs(str(comment))

    # def add_note_punning(self, subject, fks, fois):
    #     if g.query(f"""ASK{{ {subject.n3()} a owl:Class . }}""").askAnswer:
    #         p = self.document.add_paragraph()
    #         p.add_run(self.nm.qname(subject), style=code).font.bold = True
    #         p.add_run(" belongs to the eponym class ").font.bold = True
    #         p.add_run(self.nm.qname(subject), style=code).font.bold = True
    #         p.add_run(". This class groups ").font.bold = True 
    #         p.add_run(self.nm.qname(subject), style=code).font.bold = True
    #         p.add_run(f", narrower {fks}, and {fois} of this kind.").font.bold = True
+952 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading