Unverified commit 14c6f76e, authored by Maxime Lefrançois
Browse files

speed up execution with cache

parent 66bf06f7
Loading
Loading
Loading
Loading
+159 −60
Original line number Diff line number Diff line
@@ -2,12 +2,37 @@ import os
import shutil
import logging
import re
from git import Repo, GitCommandError
import datetime
import logging
from itertools import chain
import yaml
from typing import Set, Dict, List, Tuple

from git import Git, Repo, GitCommandError
from rdflib import (
    BNode,
    Dataset,
    Graph,
    Literal,
    URIRef,
    RDF,
    RDFS,
    OWL,
    XSD,
    DCTERMS,
    DCMITYPE,
)
from gitlab import Gitlab
from gitlab.v4.objects import Group

from saref_pypeline._logging import TRACE_LEVEL
from saref_pypeline.checkers.TS103673_Checker import Checker as TS103673_Checker
from saref_pypeline.etsi import WK_FIELD
from saref_pypeline.docgen import SiteManager
from saref_pypeline.dataset import LevelDBDataset, ManagedDataset
from saref_pypeline.constants import *
from saref_pypeline.vocabs import EX, SCHEMA
from saref_pypeline.metadata_fetcher import fetch_metadata
from saref_pypeline.entities import (
    DocumentImportCantFindVersionError,
    DocumentImportDoesntMatchPatternError,
@@ -21,18 +46,6 @@ from saref_pypeline.entities import (
    SAREFGraphDocument,
    SAREFGraphDocumentType,
)
from saref_pypeline.checkers.TS103673_Checker import Checker as TS103673_Checker
from rdflib import RDF, Dataset, Graph, OWL, DCMITYPE
from gitlab import Gitlab
from gitlab.v4.objects import Group
from typing import Set, Dict, List, Tuple
import logging
from saref_pypeline.constants import METADATA, PATTERN_SAREF_GRAPHS, BASE
from saref_pypeline.vocabs import SCHEMA
from saref_pypeline.metadata_fetcher import fetch_metadata
from rdflib import URIRef, DCTERMS, RDFS, XSD
from itertools import chain
import yaml


class SimpleConditionalFormatter(logging.Formatter):
@@ -144,7 +157,9 @@ class SAREFPipeline:
        if not self.include_versions:
            return True
        project_version_str = str(project_version)
        return any(re.search(regex, project_version_str) for regex in self.include_versions)
        return any(
            re.search(regex, project_version_str) for regex in self.include_versions
        )

    def filter_clause(self, clause: str):
        if not self.include_clauses:
@@ -191,6 +206,12 @@ class SAREFPipeline:
                encoding="utf-8",
            )

            self.dataset_on_disk.graph(LOADING_METADATA).serialize(
                destination=os.path.join(self.target_dir, "loading-metadata.ttl"),
                format="turtle",
                encoding="utf-8",
            )

    def run_on_patterns(self):
        raise NotImplementedError()
        SAREFPatterns.directory = self.directory
@@ -271,7 +292,6 @@ class SAREFPipeline:
        if self.mode == PipelineMode.WEBSITE:
            self.site_manager.generate_htaccess()


    def run_on_project(self, basename):
        try:
            project = SAREFProject(basename)
@@ -405,7 +425,7 @@ class SAREFPipeline:
            versions = [
                v
                for v in versions
                if v.branch_type in {BranchType.WORKING_DIRECTORY, BranchType.RELEASE}
                if v.branch_type in [BranchType.WORKING_DIRECTORY, BranchType.RELEASE]
            ]

        logger.debug(
@@ -619,10 +639,74 @@ class SAREFPipeline:
    ) -> None:
        # prerequisite: the project is already fetched, and existing versions have been found
        if not document.version_iri in self.graph_documents:
            logger.info(f"Loading {document}")
            logger.debug(f"Loading {document}")

            project_version = document.project_version

            # Retrieve last modification date stored in metadata
            loading_metadata = self.dataset_on_disk.graph(
                LOADING_METADATA
            )  # stores metadata about modification dates of previously loaded graphs
            last_mod = None
            try:
                last_mod_literal = next(
                    loading_metadata.objects(document.version_iri, DCTERMS.modified),
                    None,
                )
                last_mod = datetime.datetime.fromisoformat(last_mod_literal)
            except:
                pass

            # Get current file modification date
            if project_version.branch_type == BranchType.WORKING_DIRECTORY:
                path = os.path.join(
                    project_version.project.directory, document.file_path_ttl
                )
                current_mod = datetime.datetime.fromtimestamp(os.path.getmtime(path))
            else:
                # Get commit date of the file in the given branch
                commit = project_version.project.repo.git.log(
                    "-1",
                    "--format=%ci",
                    f"origin/{project_version.branch_name}",
                    "--",
                    document.file_path_ttl,
                )
                current_mod = datetime.datetime.fromisoformat(commit)

            # Decide whether to reload or reuse
            if last_mod and last_mod >= current_mod:
                # Reuse what's already cached in dataset_on_disk
                logger.log(TRACE_LEVEL, f"... from cache ({last_mod})")
                graph = self.dataset_on_disk.graph(document.version_iri)
                document.bases.extend(
                    {
                        str(o)
                        for o in loading_metadata.objects(
                            document.version_iri, EX.declares_base
                        )
                    }
                )
                if loading_error := next(
                    loading_metadata.objects(document.version_iri, EX.loading_error),
                    None,
                ):
                    document.loading_error = str(loading_error)
                    logger.log(
                        TRACE_LEVEL, f"Error while loading {document}: {loading_error}"
                    )

                for ns in loading_metadata.objects(
                    document.version_iri, EX.declares_namespace
                ):
                    prefix = next(loading_metadata.objects(ns, EX.prefix), None)
                    namespace = next(loading_metadata.objects(ns, EX.namespace), None)
                    if not prefix or not namespace:
                        continue
                    document.namespaces.append((prefix, namespace))

            else:
                self.dataset_on_disk.remove_graph(document.version_iri)
                graph = Graph(identifier=document.version_iri, bind_namespaces="none")
                try:
                    if project_version.branch_type == BranchType.WORKING_DIRECTORY:
@@ -641,22 +725,42 @@ class SAREFPipeline:
                            data=file_content.replace("\r", ""), format="turtle"
                        )

                # store declared bases
                document.bases = list(re.findall(PATTERN_BASE, file_content))
                    # store graph for later use
                    on_disk_dataset_graph = self.dataset.graph(document.version_iri)
                    on_disk_dataset_graph += graph

                    # store declared bases for later use
                    for base in re.findall(PATTERN_BASE, file_content):
                        document.bases.append(base)
                        loading_metadata.add(
                            (document.version_iri, EX.declares_base, Literal(base))
                        )

                except Exception as e:
                    # store error for later use
                    document.loading_error = e
                    loading_metadata.add(
                        (document.version_iri, EX.loading_error, Literal(str(e)))
                    )
                    logger.error(f"Error while loading {document}: {e}")
                document.bases = []

                # store original namespaces of the document for later use
            document.namespaces.extend(graph.namespaces())
                for prefix, namespace in graph.namespaces():
                    document.namespaces.append((prefix, namespace))
                    ns = BNode()
                    loading_metadata.add(
                        (document.version_iri, EX.declares_namespace, ns)
                    )
                    loading_metadata.add((ns, EX.prefix, Literal(prefix)))
                    loading_metadata.add((ns, EX.namespace, Literal(namespace)))

            # return dataset graph (possibly empty)
                # store current_mod time
                loading_metadata.add(
                    (document.version_iri, DCTERMS.modified, Literal(current_mod))
                )

            # use in-memory dataset for better performance
            dataset_graph = self.dataset.graph(document.version_iri)
            dataset_graph.remove(
                (None, None, None)
            )  # clear the graph before adding new data
            dataset_graph += graph

            # store loaded graph documents at the pipeline and project_version levels
@@ -670,17 +774,12 @@ class SAREFPipeline:
            self.resolve_imports(document)
            self.generate_metadata(document)





    def load_version(self, project_version: SAREFProjectVersion):
        # read ontology document
        ontology = SAREFGraphDocument(project_version, SAREFGraphDocumentType.ONTOLOGY)
        if self.filter_document(ontology):
            self.load_graph_document(ontology)


        metadata = self.dataset.graph(METADATA)

        # load examples and vocabularies