Loading saref_pypeline/pipeline.py +159 −60 Original line number Diff line number Diff line Loading @@ -2,12 +2,37 @@ import os import shutil import logging import re from git import Repo, GitCommandError import datetime import logging from itertools import chain import yaml from typing import Set, Dict, List, Tuple from git import Git, Repo, GitCommandError from rdflib import ( BNode, Dataset, Graph, Literal, URIRef, RDF, RDFS, OWL, XSD, DCTERMS, DCMITYPE, ) from gitlab import Gitlab from gitlab.v4.objects import Group from saref_pypeline._logging import TRACE_LEVEL from saref_pypeline.checkers.TS103673_Checker import Checker as TS103673_Checker from saref_pypeline.etsi import WK_FIELD from saref_pypeline.docgen import SiteManager from saref_pypeline.dataset import LevelDBDataset, ManagedDataset from saref_pypeline.constants import * from saref_pypeline.vocabs import EX, SCHEMA from saref_pypeline.metadata_fetcher import fetch_metadata from saref_pypeline.entities import ( DocumentImportCantFindVersionError, DocumentImportDoesntMatchPatternError, Loading @@ -21,18 +46,6 @@ from saref_pypeline.entities import ( SAREFGraphDocument, SAREFGraphDocumentType, ) from saref_pypeline.checkers.TS103673_Checker import Checker as TS103673_Checker from rdflib import RDF, Dataset, Graph, OWL, DCMITYPE from gitlab import Gitlab from gitlab.v4.objects import Group from typing import Set, Dict, List, Tuple import logging from saref_pypeline.constants import METADATA, PATTERN_SAREF_GRAPHS, BASE from saref_pypeline.vocabs import SCHEMA from saref_pypeline.metadata_fetcher import fetch_metadata from rdflib import URIRef, DCTERMS, RDFS, XSD from itertools import chain import yaml class SimpleConditionalFormatter(logging.Formatter): Loading Loading @@ -144,7 +157,9 @@ class SAREFPipeline: if not self.include_versions: return True project_version_str = str(project_version) return any(re.search(regex, project_version_str) for regex in self.include_versions) return any( re.search(regex, project_version_str) for regex in self.include_versions ) def filter_clause(self, clause: str): if not self.include_clauses: Loading Loading @@ -191,6 +206,12 @@ class SAREFPipeline: encoding="utf-8", ) self.dataset_on_disk.graph(LOADING_METADATA).serialize( destination=os.path.join(self.target_dir, "loading-metadata.ttl"), format="turtle", encoding="utf-8", ) def run_on_patterns(self): raise NotImplementedError() SAREFPatterns.directory = self.directory Loading Loading @@ -271,7 +292,6 @@ class SAREFPipeline: if self.mode == PipelineMode.WEBSITE: self.site_manager.generate_htaccess() def run_on_project(self, basename): try: project = SAREFProject(basename) Loading Loading @@ -405,7 +425,7 @@ class SAREFPipeline: versions = [ v for v in versions if v.branch_type in {BranchType.WORKING_DIRECTORY, BranchType.RELEASE} if v.branch_type in [BranchType.WORKING_DIRECTORY, BranchType.RELEASE] ] logger.debug( Loading Loading @@ -619,10 +639,74 @@ class SAREFPipeline: ) -> None: # prerequisite: the project is already fetched, and existing versions have been found if not document.version_iri in self.graph_documents: logger.info(f"Loading {document}") logger.debug(f"Loading {document}") project_version = document.project_version # Retrieve last modification date stored in metadata loading_metadata = self.dataset_on_disk.graph( LOADING_METADATA ) # stores metadata about modification dates of previously loaded graphs last_mod = None try: last_mod_literal = next( loading_metadata.objects(document.version_iri, DCTERMS.modified), None, ) last_mod = datetime.datetime.fromisoformat(last_mod_literal) except: pass # Get current file modification date if project_version.branch_type == BranchType.WORKING_DIRECTORY: path = os.path.join( project_version.project.directory, document.file_path_ttl ) current_mod = datetime.datetime.fromtimestamp(os.path.getmtime(path)) else: # Get commit date of the file in the given branch commit = project_version.project.repo.git.log( "-1", "--format=%ci", f"origin/{project_version.branch_name}", "--", document.file_path_ttl, ) current_mod = datetime.datetime.fromisoformat(commit) # Decide whether to reload or reuse if last_mod and last_mod >= current_mod: # Reuse what's already cached in dataset_on_disk logger.log(TRACE_LEVEL, f"... from cache ({last_mod})") graph = self.dataset_on_disk.graph(document.version_iri) document.bases.extend( { str(o) for o in loading_metadata.objects( document.version_iri, EX.declares_base ) } ) if loading_error := next( loading_metadata.objects(document.version_iri, EX.loading_error), None, ): document.loading_error = str(loading_error) logger.log( TRACE_LEVEL, f"Error while loading {document}: {loading_error}" ) for ns in loading_metadata.objects( document.version_iri, EX.declares_namespace ): prefix = next(loading_metadata.objects(ns, EX.prefix), None) namespace = next(loading_metadata.objects(ns, EX.namespace), None) if not prefix or not namespace: continue document.namespaces.append((prefix, namespace)) else: self.dataset_on_disk.remove_graph(document.version_iri) graph = Graph(identifier=document.version_iri, bind_namespaces="none") try: if project_version.branch_type == BranchType.WORKING_DIRECTORY: Loading @@ -641,22 +725,42 @@ class SAREFPipeline: data=file_content.replace("\r", ""), format="turtle" ) # store declared bases document.bases = list(re.findall(PATTERN_BASE, file_content)) # store graph for later use on_disk_dataset_graph = self.dataset.graph(document.version_iri) on_disk_dataset_graph += graph # store declared bases for later use for base in re.findall(PATTERN_BASE, file_content): document.bases.append(base) loading_metadata.add( (document.version_iri, EX.declares_base, Literal(base)) ) except Exception as e: # store error for later use document.loading_error = e loading_metadata.add( (document.version_iri, EX.loading_error, Literal(str(e))) ) logger.error(f"Error while loading {document}: {e}") document.bases = [] # store original namespaces of the document for later use document.namespaces.extend(graph.namespaces()) for prefix, namespace in graph.namespaces(): document.namespaces.append((prefix, namespace)) ns = BNode() loading_metadata.add( (document.version_iri, EX.declares_namespace, ns) ) loading_metadata.add((ns, EX.prefix, Literal(prefix))) loading_metadata.add((ns, EX.namespace, Literal(namespace))) # return dataset graph (possibly empty) # store current_mod time loading_metadata.add( (document.version_iri, DCTERMS.modified, Literal(current_mod)) ) # use in-memory dataset for better performances dataset_graph = self.dataset.graph(document.version_iri) dataset_graph.remove( (None, None, None) ) # clear the graph before adding new data dataset_graph += graph # store loaded graph documents at the pipeline and project_version levels Loading @@ -670,17 +774,12 @@ class SAREFPipeline: self.resolve_imports(document) self.generate_metadata(document) def load_version(self, project_version: SAREFProjectVersion): # read ontology document ontology = SAREFGraphDocument(project_version, SAREFGraphDocumentType.ONTOLOGY) if self.filter_document(ontology): self.load_graph_document(ontology) metadata = self.dataset.graph(METADATA) # load examples and vocabularies Loading Loading
saref_pypeline/pipeline.py +159 −60 Original line number Diff line number Diff line Loading @@ -2,12 +2,37 @@ import os import shutil import logging import re from git import Repo, GitCommandError import datetime import logging from itertools import chain import yaml from typing import Set, Dict, List, Tuple from git import Git, Repo, GitCommandError from rdflib import ( BNode, Dataset, Graph, Literal, URIRef, RDF, RDFS, OWL, XSD, DCTERMS, DCMITYPE, ) from gitlab import Gitlab from gitlab.v4.objects import Group from saref_pypeline._logging import TRACE_LEVEL from saref_pypeline.checkers.TS103673_Checker import Checker as TS103673_Checker from saref_pypeline.etsi import WK_FIELD from saref_pypeline.docgen import SiteManager from saref_pypeline.dataset import LevelDBDataset, ManagedDataset from saref_pypeline.constants import * from saref_pypeline.vocabs import EX, SCHEMA from saref_pypeline.metadata_fetcher import fetch_metadata from saref_pypeline.entities import ( DocumentImportCantFindVersionError, DocumentImportDoesntMatchPatternError, Loading @@ -21,18 +46,6 @@ from saref_pypeline.entities import ( SAREFGraphDocument, SAREFGraphDocumentType, ) from saref_pypeline.checkers.TS103673_Checker import Checker as TS103673_Checker from rdflib import RDF, Dataset, Graph, OWL, DCMITYPE from gitlab import Gitlab from gitlab.v4.objects import Group from typing import Set, Dict, List, Tuple import logging from saref_pypeline.constants import METADATA, PATTERN_SAREF_GRAPHS, BASE from saref_pypeline.vocabs import SCHEMA from saref_pypeline.metadata_fetcher import fetch_metadata from rdflib import URIRef, DCTERMS, RDFS, XSD from itertools import chain import yaml class SimpleConditionalFormatter(logging.Formatter): Loading Loading @@ -144,7 +157,9 @@ class SAREFPipeline: if not self.include_versions: return True project_version_str = str(project_version) return any(re.search(regex, project_version_str) for regex in self.include_versions) return any( re.search(regex, project_version_str) for regex in self.include_versions ) def filter_clause(self, clause: str): if not self.include_clauses: Loading Loading @@ -191,6 +206,12 @@ class SAREFPipeline: encoding="utf-8", ) self.dataset_on_disk.graph(LOADING_METADATA).serialize( destination=os.path.join(self.target_dir, "loading-metadata.ttl"), format="turtle", encoding="utf-8", ) def run_on_patterns(self): raise NotImplementedError() SAREFPatterns.directory = self.directory Loading Loading @@ -271,7 +292,6 @@ class SAREFPipeline: if self.mode == PipelineMode.WEBSITE: self.site_manager.generate_htaccess() def run_on_project(self, basename): try: project = SAREFProject(basename) Loading Loading @@ -405,7 +425,7 @@ class SAREFPipeline: versions = [ v for v in versions if v.branch_type in {BranchType.WORKING_DIRECTORY, BranchType.RELEASE} if v.branch_type in [BranchType.WORKING_DIRECTORY, BranchType.RELEASE] ] logger.debug( Loading Loading @@ -619,10 +639,74 @@ class SAREFPipeline: ) -> None: # prerequisite: the project is already fetched, and existing versions have been found if not document.version_iri in self.graph_documents: logger.info(f"Loading {document}") logger.debug(f"Loading {document}") project_version = document.project_version # Retrieve last modification date stored in metadata loading_metadata = self.dataset_on_disk.graph( LOADING_METADATA ) # stores metadata about modification dates of previously loaded graphs last_mod = None try: last_mod_literal = next( loading_metadata.objects(document.version_iri, DCTERMS.modified), None, ) last_mod = datetime.datetime.fromisoformat(last_mod_literal) except: pass # Get current file modification date if project_version.branch_type == BranchType.WORKING_DIRECTORY: path = os.path.join( project_version.project.directory, document.file_path_ttl ) current_mod = datetime.datetime.fromtimestamp(os.path.getmtime(path)) else: # Get commit date of the file in the given branch commit = project_version.project.repo.git.log( "-1", "--format=%ci", f"origin/{project_version.branch_name}", "--", document.file_path_ttl, ) current_mod = datetime.datetime.fromisoformat(commit) # Decide whether to reload or reuse if last_mod and last_mod >= current_mod: # Reuse what's already cached in dataset_on_disk logger.log(TRACE_LEVEL, f"... from cache ({last_mod})") graph = self.dataset_on_disk.graph(document.version_iri) document.bases.extend( { str(o) for o in loading_metadata.objects( document.version_iri, EX.declares_base ) } ) if loading_error := next( loading_metadata.objects(document.version_iri, EX.loading_error), None, ): document.loading_error = str(loading_error) logger.log( TRACE_LEVEL, f"Error while loading {document}: {loading_error}" ) for ns in loading_metadata.objects( document.version_iri, EX.declares_namespace ): prefix = next(loading_metadata.objects(ns, EX.prefix), None) namespace = next(loading_metadata.objects(ns, EX.namespace), None) if not prefix or not namespace: continue document.namespaces.append((prefix, namespace)) else: self.dataset_on_disk.remove_graph(document.version_iri) graph = Graph(identifier=document.version_iri, bind_namespaces="none") try: if project_version.branch_type == BranchType.WORKING_DIRECTORY: Loading @@ -641,22 +725,42 @@ class SAREFPipeline: data=file_content.replace("\r", ""), format="turtle" ) # store declared bases document.bases = list(re.findall(PATTERN_BASE, file_content)) # store graph for later use on_disk_dataset_graph = self.dataset.graph(document.version_iri) on_disk_dataset_graph += graph # store declared bases for later use for base in re.findall(PATTERN_BASE, file_content): document.bases.append(base) loading_metadata.add( (document.version_iri, EX.declares_base, Literal(base)) ) except Exception as e: # store error for later use document.loading_error = e loading_metadata.add( (document.version_iri, EX.loading_error, Literal(str(e))) ) logger.error(f"Error while loading {document}: {e}") document.bases = [] # store original namespaces of the document for later use document.namespaces.extend(graph.namespaces()) for prefix, namespace in graph.namespaces(): document.namespaces.append((prefix, namespace)) ns = BNode() loading_metadata.add( (document.version_iri, EX.declares_namespace, ns) ) loading_metadata.add((ns, EX.prefix, Literal(prefix))) loading_metadata.add((ns, EX.namespace, Literal(namespace))) # return dataset graph (possibly empty) # store current_mod time loading_metadata.add( (document.version_iri, DCTERMS.modified, Literal(current_mod)) ) # use in-memory dataset for better performances dataset_graph = self.dataset.graph(document.version_iri) dataset_graph.remove( (None, None, None) ) # clear the graph before adding new data dataset_graph += graph # store loaded graph documents at the pipeline and project_version levels Loading @@ -670,17 +774,12 @@ class SAREFPipeline: self.resolve_imports(document) self.generate_metadata(document) def load_version(self, project_version: SAREFProjectVersion): # read ontology document ontology = SAREFGraphDocument(project_version, SAREFGraphDocumentType.ONTOLOGY) if self.filter_document(ontology): self.load_graph_document(ontology) metadata = self.dataset.graph(METADATA) # load examples and vocabularies Loading