#!/bin/python3
'''
Generate tosca definitions from Docx specification
'''

import logging
import os
import re
import sys
from io import StringIO

import docx
from docx.table import Table
from docx.text.paragraph import Paragraph

from example import generate_examples_between

# Pattern of the generated file names; the placeholder is the model name.
BASE_FILENAME = "etsi_nfv_sol001_{}_types.yaml"
TOSCA_VERSION = "tosca_simple_yaml_1_3"
DEFAULT_TOSCA_VERSION = "tosca_simple_yaml_1_3"
SPEC_VERSION = "v2.6.1"
allowed_versions = ["v2.6.1", "v2.6.3", "v2.7.1", "v2.8.1", "v3.3.1"]
MODEL_NAMES = ['vnfd', 'nsd', 'pnfd', 'common']

# Header template written at the top of every generated file; it is filled
# in with str.format() using the keys tosca_version, model, spec_version
# and imports.
# NOTE(review): the "imports:" key line and the end of this template are
# reconstructed (the indented {imports} placeholder receives a "- <file>"
# list item) -- confirm the exact blank lines against the published files.
HDR = '''tosca_definitions_version: {tosca_version}
description: ETSI NFV SOL 001 {model} types definitions version {spec_version}
metadata:
  template_name: etsi_nfv_sol001_{model}_types
  template_author: ETSI_NFV
  template_version: {spec_version}

imports:
  {imports}

'''

# Number of the level 1 clause of the document -> model it describes.
sections_to_models = {
    6: 'vnfd',
    7: 'nsd',
    8: 'pnfd',
    9: 'common'
}


# Titles accepted as level 2 headings. The list is only used for membership
# tests, so each title is listed once.
SUBSECTIONS = [
    "Artifact Types",
    "Data Types",
    "Capability Types",
    "Interface Types",
    "Requirements Types",
    "Relationship Types",
    "Node Types",
    "Group Types",
    "Policy Types"
]

# Filled while a document is processed.
MODELS = {}
EXAMPLES = {}

class Section():
    '''
    Defines a section of the base document

    Attributes:
        from_id, to_id: boundaries of the section, stored as received
        is_annex: True when the heading starts with "Annex"
        title: text of the heading without its number or annex prefix
        number: clause number of a numbered section, None for an annex
        letter: letter of an annex, None for a numbered section
    '''

    def __init__(self, from_id, to_id, title):
        '''
        Parses the heading text in title, either "<number><TAB><title>"
        or "Annex <letter> <word> <title...>".

        Raises IndexError or ValueError when the heading does not have
        one of these two shapes.
        '''
        heading = title.strip()

        self.from_id = from_id
        self.to_id = to_id
        self.is_annex = heading.startswith("Annex")
        # Both identifiers always exist, so callers can read either of
        # them without first checking which kind of section this is.
        self.number = None
        self.letter = None

        if self.is_annex:
            words = heading.split(" ")
            # words[0] is "Annex", words[1] the letter; words[2] is
            # skipped and the remaining words form the title.
            self.title = " ".join(words[3:])
            self.letter = words[1]
        else:
            fields = heading.split("\t")
            self.title = fields[1]
            self.number = int(fields[0])

    def __repr__(self):
        if self.is_annex:
            return "({}, Annex {}, {}-{})".format(
                self.title, self.letter, self.from_id, self.to_id
            )
        return "({}, {}, {}-{})".format(
            self.title, self.number, self.from_id, self.to_id
        )

def match_definition_incipit(txt):
    '''
    Returns True if txt matches the incipit of a definition,
    identified by the word 'tosca'

    Only the first line of txt is examined, with surrounding blanks
    removed: it has to start with "tosca." and may only contain
    letters, digits, dots, colons and blanks after that.
    '''
    first_line = txt.split("\n")[0].strip()
    return bool(re.match(r'^tosca\.[a-zA-Z\.:0-9\s]*$', first_line))

def is_tosca_def(table):
    '''
    Returns true when a table contains TOSCA definitions, i.e.
    the table is made of exactly one row and one column and the text
    of its only cell starts like a definition (see
    match_definition_incipit)
    '''
    # Check the shape first so that the first cell is only read on tables
    # that actually have one.
    if len(table.rows) != 1 or len(table.columns) != 1:
        return False
    return match_definition_incipit(table.rows[0].cells[0].text)

def tosca_model_info(name, version, imports):
    '''
    Returns a dictionary to hold information on the model

    Keys:
        name: the model name
        fn: name of the file the model is written to
        fd: file object used when writing, None until then
        imports: text placed in the imports part of the header
        buf: in-memory buffer collecting the generated text

    version is accepted for the callers' convenience but not stored.
    '''
    return {
        'name': name,
        # NOTE(review): reconstructed entry -- print_to_files reads
        # mod['fn'] and BASE_FILENAME has no other use; confirm.
        'fn': BASE_FILENAME.format(name),
        'fd': None,
        'imports': imports,
        'buf': StringIO()
    }
def get_content(doc):
    '''
    Returns a list of all paragraphs and tables in the Document

    The items are Paragraph and Table objects, in the order in which
    they appear in the body. Any other kind of body element is ignored.
    '''
    # NOTE(review): the parameter name and this assignment are
    # reconstructed; the loop needs an object exposing `_element` that
    # can act as parent of the proxies -- confirm `doc._body`.
    body = doc._body
    ret = []
    parag_count = 0
    table_count = 0
    for element in body._element:
        if isinstance(element, docx.oxml.text.paragraph.CT_P):
            ret.append(Paragraph(element, body))
            parag_count = parag_count + 1
        elif isinstance(element, docx.oxml.table.CT_Tbl):
            ret.append(Table(element, body))
            table_count = table_count + 1
    logging.info("Paragraphs: " + str(parag_count))
    logging.info("Tables: " + str(table_count))
    return ret

def find_sect(sect_to_find, start_idx, doc_content):
    '''
    Returns the index in the doc_content list to the first paragraph
    or heading of the section with title sect_to_find,
    starting the research from start_idx

    When no paragraph matches, the returned index is the one at which
    the scan stopped, i.e. it is not a valid index of doc_content.
    '''
    while start_idx < len(doc_content):
        my_elem = doc_content[start_idx]
        if isinstance(my_elem, Paragraph) and \
           my_elem.text.strip() == sect_to_find:
            break
        start_idx = start_idx + 1

    # NOTE(review): this message is also emitted when nothing matched.
    logging.info("FOUND " + sect_to_find + " at " + str(start_idx))
    return start_idx


def is_lvl2_section_hdn(txt):
    '''
    Returns true if txt is level 2 heading, i.e. it looks like
    "<digit>.<number><TAB><title>" and the title is one of SUBSECTIONS
    '''
    clean_txt = txt.strip()

    if re.match(r'^[0-9]\.[0-9]+\t[a-zA-Z\s]*$', clean_txt) is None:
        return False

    # The pattern guarantees that there is a tab, hence a second field.
    subtitle = clean_txt.split('\t')[1]
    return subtitle in SUBSECTIONS

def is_lvl1_section_hdn(txt):
    '''
    Returns true if txt is level 1 heading: either a numbered clause
    ("<number><TAB><title>") or an annex ("Annex <letters> ...")
    '''
    clean_txt = txt.strip()
    if re.match(r'^[0-9]+\t[a-zA-Z\s]*$', clean_txt):
        return True
    # Unlike the clause pattern, the annex pattern is not anchored at the
    # end, so any text may follow the part it describes.
    return bool(
        re.match(r'^Annex[\s]*[A-Z]+[\s\t]+[a-zA-Z\s\(\)]*', clean_txt)
    )


def find_all_sections(doc_content):
    '''
    Scans the body of the document to find level 1 sections
    Returns a list of Section

    Every section goes from its level 1 heading to the element that
    precedes the next level 1 heading (or to the last element of the
    document). Elements found before the first heading belong to no
    section; the list is empty when there is no heading at all.
    '''
    sections = []
    # Index of the heading opening the current section. None (rather than
    # 0) means "no heading seen yet", so a heading at index 0 is handled
    # like any other and a document without headings yields no Section.
    start_indx = None

    for indx, my_elem in enumerate(doc_content):
        if not (isinstance(my_elem, Paragraph) and
                is_lvl1_section_hdn(my_elem.text)):
            continue
        if start_indx is not None:
            sections.append(
                Section(start_indx, indx - 1, doc_content[start_indx].text)
            )
        start_indx = indx

    # The last section is closed by the end of the document.
    if start_indx is not None:
        sections.append(
            Section(
                start_indx,
                len(doc_content) - 1,
                doc_content[start_indx].text)
        )
    return sections


def write_subsection_to_file(txt, buf):
    '''
    Writes a subsection header to the text buffer buf: the slugified
    title followed by a colon and an empty line
    '''
    # Drop trailing newlines before appending the colon, otherwise the
    # colon would end up alone on the following line.
    buf.write(slugify(txt.rstrip('\n')) + ":")
    buf.write('\n')
    buf.write('\n')

def write_table_to_file(tab, buf):
    '''
    Writes the content of the first cell of table tab to the text
    buffer buf, followed by an empty line
    '''
    txt = tab.rows[0].cells[0].text
    # NOTE(review): reconstructed statement -- without it only the
    # separators below are emitted; confirm that the text is written
    # as is (no re-indentation).
    buf.write(txt)
    if not txt.endswith('\n'):
        buf.write('\n')
    buf.write('\n')

range_err_mess = "ERR: Out of range (A: {}, B: {}, IDX: {}, LEN(CONTENT): {})"


def gen_tables_btwn(a_id, b_id, content, buf):
    '''
    Loops over content[a_id:b_id] and writes all tosca definitions to
    the buffer buf. Returns the number of written definitions

    The header of a subsection (level 2 heading) is written lazily,
    right before the first definition that follows it, so subsections
    without definitions leave no trace in the output.
    '''
    definitions_count = 0
    # Title of the last level 2 heading met whose header is still to be
    # written; None when there is nothing pending.
    subsection = None

    for idx in range(a_id, b_id):
        if idx >= len(content):
            logging.info(
                range_err_mess.format(a_id, b_id, idx, len(content)))
            return definitions_count

        tmp_elem = content[idx]
        if isinstance(tmp_elem, Paragraph) and \
           is_lvl2_section_hdn(tmp_elem.text):
            logging.info(tmp_elem.text)
            # NOTE(review): reconstructed assignment; it takes the title
            # the same way is_lvl2_section_hdn does -- confirm.
            subsection = tmp_elem.text.strip().split('\t')[1]

        if isinstance(tmp_elem, Table) and is_tosca_def(tmp_elem):
            if subsection is not None:
                write_subsection_to_file(subsection, buf)
                # NOTE(review): the header is counted as a definition,
                # as the code did before -- confirm this is intended.
                definitions_count = definitions_count + 1
                subsection = None
            write_table_to_file(tmp_elem, buf)
            definitions_count = definitions_count + 1
        elif isinstance(tmp_elem, Table):
            txt = tmp_elem.rows[0].cells[0].text.strip()
            # Tables whose first cell starts with one of these words are
            # skipped without further processing.
            if txt.startswith(("Name", "Shorthand", "tosca_def")):
                continue
    return definitions_count

def generate_header(
        model_name,
        buf,
        spec_version=SPEC_VERSION,
        imports=None,
        tosca_version=DEFAULT_TOSCA_VERSION):
    '''
    Writes the header to the file for a specific model

    model_name, spec_version, imports and tosca_version fill the
    placeholders of the HDR template; the result is written to buf.
    When imports is None the placeholder is left empty.
    '''
    buf.write(HDR.format(
        tosca_version=tosca_version,
        model=model_name,
        spec_version=spec_version,
        # Formatting None would print the literal word "None".
        imports='' if imports is None else imports))

def init_models(yaml_root, spec_ver, tosc_ver):
    '''
    Creates an entry of MODELS for each name in MODEL_NAMES and writes
    the header into the buffer of every model in MODELS

    yaml_root selects how the common types file is imported: by its
    plain file name when it is 'local', otherwise by the forge.etsi.org
    raw URL of version spec_ver. tosc_ver is the TOSCA version written
    in the headers.
    '''
    # The import statement is the same for every model.
    import_stmt = 'etsi_nfv_sol001_common_types.yaml'
    if yaml_root != 'local':
        import_stmt = \
            'https://forge.etsi.org/rep/nfv/SOL001/raw/{}/{}'.format(
                spec_ver, import_stmt
            )

    for model in MODEL_NAMES:
        MODELS[model] = tosca_model_info(
            model,
            spec_ver,
            '- ' + import_stmt
        )

    for mod in MODELS.values():
        generate_header(
            mod['name'],
            mod['buf'],
            spec_ver,
            mod['imports'],
            tosc_ver
        )


def generate_templates(
        filename,
        spec_ver=SPEC_VERSION,
        yaml_root='uri',
        tosc_ver=DEFAULT_TOSCA_VERSION):
    '''
    Takes a filename or file object and loads the definition into
    the MODELS dictionary

    The definitions found in the clauses listed in sections_to_models
    go to the buffer of the matching model; the examples found in
    Annex A and Annex E are collected into EXAMPLES.

    Raises ValueError when the document cannot be opened.
    '''
    if isinstance(filename, str):
        logging.info("Opening " + filename)

    init_models(yaml_root, spec_ver, tosc_ver)

    try:
        sol_001 = docx.Document(filename)
    except Exception as err:
        logging.info("Error opening the submitted Docx file")
        raise ValueError("Cannot open the submitted Docx file") from err

    content = get_content(sol_001)
    sections = find_all_sections(content)

    for sect in sections:
        if not sect.is_annex:
            if sect.number in sections_to_models:
                model = sections_to_models[sect.number]
                count = gen_tables_btwn(
                    sect.from_id, sect.to_id, content, MODELS[model]['buf']
                )
                logging.info("Printed " + str(count) + " types to " + model)
        # Only annexes carry a letter, so it is read in this branch only.
        elif sect.letter == "A" or sect.letter == "E":
            count = generate_examples_between(
                sect.from_id, sect.to_id, content, EXAMPLES
            )
            logging.info("Printed {} types to Annex {}".format(
                str(count), sect.letter)
            )

def print_to_files(prefix=None):
    '''
    Writes every model in MODELS and every example in EXAMPLES to a file

    Prefix is a path to a folder to work into. When it is given, the
    'fn' entry of each model is updated with the prefixed path and the
    example files are named "example_<filename>" inside that folder;
    otherwise the files are written under their plain names.
    '''
    for mod in MODELS.values():
        if prefix is not None:
            mod['fn'] = os.path.join(prefix, mod['fn'])
        logging.info("Writing to " + mod['fn'])
        mod['buf'].seek(0)
        # The context manager closes the file even when a write fails.
        with open(mod['fn'], 'w', encoding='utf-8') as fdesc:
            mod['fd'] = fdesc
            fdesc.write(mod['buf'].read())
            fdesc.write('\n')

    for example in EXAMPLES.values():
        if prefix is not None:
            fnm = os.path.join(prefix, "example_" + example.filename)
        else:
            fnm = example.filename
        logging.info("Writing example file: " + fnm)
        with open(fnm, 'w', encoding='utf-8') as newf:
            newf.write(example.text)
            newf.write("\n")

def parse_version_from_filename(filename):
    '''
    Parses the version from the filename

    The base name has to start with "gs_NFV-SOL001v" (in any letter
    case) followed by the version, written either as three two-digit
    fields ("020601") or in dotted form ("2.6.1").

    Returns the version as "v<major>.<minor>.<patch>", e.g. "v2.6.1",
    or None when the name does not follow this convention.
    '''
    base_filename = os.path.basename(filename)

    # str.strip() removes a set of characters, not a prefix or a suffix,
    # so the fields are captured explicitly instead of being stripped out.
    prefix = r'gs_nfv-sol001v'
    patterns = (
        prefix + r'(\d{2})(\d{2})(\d{2})',
        prefix + r'(\d+)\.(\d+)\.(\d+)',
    )
    for pattern in patterns:
        found = re.match(pattern, base_filename, re.IGNORECASE)
        if found:
            # int() drops the zero padding of the fields.
            fields = [str(int(field)) for field in found.groups()]
            return "v" + ".".join(fields)
    return None


def slugify(t):
    '''
    Returns t in lower case with every blank replaced by an underscore
    '''
    lowered = t.lower()
    return "_".join(lowered.split(" "))

if __name__ == "__main__":

    try:
        SOL001_FN = sys.argv[1]
    except IndexError:
        # Logged as errors: with the default logging configuration INFO
        # messages are not shown, so the usage hint would be lost.
        logging.error('Error: Filename missing or filename not a docx document')
        logging.error('Usage: doc2tosca <docx-with-tosca-definitions>')
        sys.exit(1)

    ver = parse_version_from_filename(SOL001_FN)
    generate_templates(SOL001_FN, spec_ver=ver)

    print_to_files()