# doc2tosca.py

#!/bin/python3
'''
Generate TOSCA definitions from a Docx specification
'''

import sys
import os
import re
from io import StringIO

import docx
from docx.table import Table
from docx.text.paragraph import Paragraph

from example import generate_examples_between

# Pattern of the generated file names. tosca_model_info() fills the two
# placeholders with the spec version (dots replaced by dashes) and the
# model name, in that order.
BASE_FILENAME = "etsi_nfv_sol001_{}_{}_types.yaml"
# NOTE(review): TOSCA_VERSION is not referenced in the visible part of this
# module and holds the same value as DEFAULT_TOSCA_VERSION - confirm whether
# it is still needed by other modules.
TOSCA_VERSION = "tosca_simple_yaml_1_2"
# Default value of the tosca_version / tosc_ver parameters of
# generate_header() and generate_templates().
DEFAULT_TOSCA_VERSION = "tosca_simple_yaml_1_2"
# Default value of the spec_version / spec_ver parameters of
# generate_header() and generate_templates().
SPEC_VERSION = "v2.6.1"

# NOTE(review): not referenced in the visible part of this module; presumably
# the list of specification versions accepted by a caller - confirm.
allowed_versions = ["v2.6.1", "v2.6.3", "v2.7.1", "v2.8.1", "v3.3.1"]

# Names of the models for which generate_templates() creates an entry in MODELS.
MODEL_NAMES = ['vnfd', 'nsd', 'pnfd', 'common']

# Header template written at the top of every model buffer by
# generate_header(). Placeholders: tosca_version, model, spec_version, imports.
# NOTE(review): the second metadata entry repeats the key 'template_name'
# (with value ETSI_NFV); presumably a different key such as 'template_author'
# was intended - confirm before changing the generated output.
HDR = '''tosca_definitions_version: {tosca_version}
description: ETSI NFV SOL 001 {model} types definitions version {spec_version}
metadata:
  - template_name: etsi_nfv_sol001_{model}_types
  - template_name: ETSI_NFV
  - template_version: {spec_version}

imports:
  {imports}

data_types:
'''

# Module-level state.
# MODELS maps a model name to the dictionary built by tosca_model_info();
# it is (re)populated by generate_templates() and read by print_to_files().
MODELS = {}
# EXAMPLES is handed to generate_examples_between() by generate_templates();
# print_to_files() reads the 'filename' and 'text' attributes of its values.
EXAMPLES = {}

class Section():
    '''
    Defines a section of the base document

    Attributes:
        from_id:  index of the first element of the section
        to_id:    index of the last element of the section
        is_annex: True when the (stripped) title starts with "Annex"
        title:    title text of the section
        number:   section number as int, None for annexes
        letter:   annex letter, None for numbered sections
    '''

    def __init__(self, from_id, to_id, title):
        '''
        Parses title into the identifier and the title text.

        Numbered sections are expected as "<number>\\t<title>"; annexes are
        split on single spaces, the second word being the letter and
        everything from the fourth word on being the title (presumably
        "Annex <letter> <qualifier> <title>" - verify against the document).

        Raises IndexError or ValueError when a numbered title does not
        contain a tab or does not start with an integer.
        '''
        self.from_id = from_id
        self.to_id = to_id
        self.is_annex = title.strip().startswith("Annex")

        # Always define both identifiers, so that reading the one that does
        # not apply yields None instead of raising AttributeError.
        self.number = None
        self.letter = None

        if not self.is_annex:
            cleaned_title = title.strip().split("\t")
            self.title = cleaned_title[1]
            self.number = int(cleaned_title[0])
        else:
            cleaned_title = title.strip().split(" ")
            self.title = " ".join(cleaned_title[3:])
            self.letter = cleaned_title[1]

    def __repr__(self):
        if self.is_annex:
            return "({}, Annex {}, {}-{})".format(
                self.title, self.letter, self.from_id, self.to_id)
        return "({}, {}, {}-{})".format(
            self.title, self.number, self.from_id, self.to_id)

def match_definition_incipit(txt):
    '''
    Returns True if txt matches the incipit of a definition: its first
    line, once stripped, must consist only of 'tosca.' followed by
    letters, digits, dots, colons or whitespace
    '''
    first_line, _, _ = txt.partition("\n")
    incipit = first_line.strip()
    return re.match(r'^tosca\.[a-zA-Z\.:0-9\s]*$', incipit) is not None

def is_tosca_def(table):
    '''
    Returns true when a table contains a TOSCA definition, i.e.
    the table is made of exactly one row and one column and the first
    line of its only cell matches match_definition_incipit()
    '''
    # Check the shape before touching any cell: a table without rows would
    # otherwise raise IndexError, and tables of any other shape can be
    # rejected without extracting their text.
    if len(table.rows) != 1 or len(table.columns) != 1:
        return False
    return match_definition_incipit(table.rows[0].cells[0].text)

def tosca_model_info(name, version, imports):
    '''
    Builds the dictionary describing one model:
      name    - the model name
      fn      - output file name (BASE_FILENAME filled with the version,
                dots turned into dashes, and the model name)
      fd      - placeholder for the output file object, initially None
      imports - the import statement(s) for the header
      buf     - a fresh StringIO collecting the generated text
    '''
    dashed_version = version.replace(".","-")
    file_name = BASE_FILENAME.format(dashed_version, name)
    return dict(
        name=name,
        fn=file_name,
        fd=None,
        imports=imports,
        buf=StringIO(),
    )

def get_content(doc):
    '''
    Walks the children of the document body in order and returns them
    as a list of Paragraph and Table objects. Children of any other
    type are reported on stdout and left out. The number of paragraphs
    and tables found is printed before returning.
    '''
    body = doc._body
    items = []
    for node in body._element:
        if isinstance(node, docx.oxml.text.paragraph.CT_P):
            items.append(Paragraph(node, body))
        elif isinstance(node, docx.oxml.table.CT_Tbl):
            items.append(Table(node, body))
        else:
            print("Non paragraph or table " +  str(type(node)))

    # Only Paragraph and Table objects are ever appended, so the table
    # count is whatever is left once the paragraphs are counted.
    parag_count = sum(1 for item in items if isinstance(item, Paragraph))
    table_count = len(items) - parag_count
    print("Paragraphs: " + str(parag_count))
    print("Tables: " + str(table_count))
    return items

def find_sect(sect_to_find, start_idx, doc_content):
    '''
    Returns the index in the doc_content list of the first paragraph
    whose stripped text equals sect_to_find, starting the search
    from start_idx.

    When no such paragraph exists the returned index is past the end of
    doc_content (len(doc_content), or start_idx if that is larger), so
    callers can detect the miss by comparing it with len(doc_content).
    '''
    for idx in range(start_idx, len(doc_content)):
        my_elem = doc_content[idx]
        if isinstance(my_elem, Paragraph) and my_elem.text.strip() == sect_to_find:
            print("FOUND " + sect_to_find + " at " + str(idx))
            return idx

    # Do not claim the section was found when the scan ran off the end.
    print("NOT FOUND " + sect_to_find)
    return max(start_idx, len(doc_content))

def is_lvl1_section_hdn(txt):
    '''
    Returns true if txt is a level 1 heading: either a number, a tab
    and a title made of letters and whitespace, or a line starting
    with "Annex" followed by a capital letter identifier
    '''
    heading = txt.strip()
    if re.match(r'^[0-9]+\t[a-zA-Z\s]*$', heading):
        return True
    return re.match(r'^Annex[\s]*[A-Z]+[\s\t]+[a-zA-Z\s\(\)]*', heading) is not None

def find_all_sections(doc_content):
    '''
    Scans the body of the document to find level 1 sections.
    Returns a list of Section, one per level 1 heading, in document
    order. Each Section spans from its heading to the element right
    before the next level 1 heading (or to the last element of the
    document for the final one). Elements preceding the first heading
    belong to no section.

    Returns an empty list when doc_content holds no level 1 heading.
    '''
    sections = []

    # None means "no heading seen yet". Index 0 cannot be used as the
    # sentinel because a heading may legitimately sit at index 0.
    start_indx = None

    for indx, my_elem in enumerate(doc_content):
        if isinstance(my_elem, Paragraph) and is_lvl1_section_hdn(my_elem.text):
            if start_indx is not None:
                # Close the section that was open until this heading
                sections.append(
                    Section(start_indx, indx - 1, doc_content[start_indx].text))
            start_indx = indx

    # Close the last open section, if any heading was found at all
    if start_indx is not None:
        sections.append(
            Section(start_indx, len(doc_content) - 1, doc_content[start_indx].text))
    return sections

def write_table_to_file(tab, buf):
    '''
    Appends the text of the first cell of table tab to buf, making
    sure it ends with a newline, followed by one empty line
    '''
    cell_text = tab.rows[0].cells[0].text
    # One newline terminates the text (unless already there), the other
    # one produces the separating empty line.
    tail = '\n' if cell_text.endswith('\n') else '\n\n'
    buf.write(cell_text + tail)

def gen_tables_btwn(a_id, b_id, content, buf):
    '''
    Loops over content[a_id:b_id] (b_id excluded) and writes every
    table recognised as a TOSCA definition to buf.
    Returns the number of written definitions.

    If the range runs past the end of content, diagnostics are printed
    and the definitions counted so far are returned.
    '''
    definitions_count = 0

    for idx in range(a_id, b_id):
        if idx >= len(content):
            # The requested range exceeds the document: report and stop
            print("A: " + str(a_id))
            print("B: " + str(b_id))
            print("IDX: " + str(idx))
            print("LEN(CONTENT): " + str(len(content)))
            break
        tmp_elem = content[idx]
        # Paragraphs and tables that are not TOSCA definitions are skipped
        if isinstance(tmp_elem, Table) and is_tosca_def(tmp_elem):
            write_table_to_file(tmp_elem, buf)
            definitions_count += 1
    return definitions_count

def generate_header(
        model_name,
        buf,
        spec_version=SPEC_VERSION,
        imports=None,
        tosca_version=DEFAULT_TOSCA_VERSION):
    '''
    Fills the HDR template for the given model and writes the result
    to buf. Note that imports is formatted as is, so the default None
    ends up in the header as the text "None".
    '''
    fields = {
        'tosca_version': tosca_version,
        'model': model_name,
        'spec_version': spec_version,
        'imports': imports,
    }
    header = HDR.format(**fields)
    buf.write(header)

def generate_templates(
        filename,
        spec_ver=SPEC_VERSION,
        yaml_root='uri',
        tosc_ver=DEFAULT_TOSCA_VERSION):
    '''
    Takes a filename or file object and loads the definitions into the
    MODELS dictionary (and the examples of Annex A into EXAMPLES).

    filename:  path of the Docx file, or a file object accepted by
               docx.Document
    spec_ver:  specification version, used in file names, headers and
               in the import URL
    yaml_root: 'local' to import the common types by bare file name,
               anything else to import them from the ETSI forge URL
    tosc_ver:  value of tosca_definitions_version in the headers

    Raises ValueError if the document cannot be opened.
    '''
    if isinstance(filename, str):
        print("Opening " + filename)

    # (Re)create the entry of every model, each with an empty buffer
    for mod in MODEL_NAMES:
        import_stmt = 'etsi_nfv_sol001_common_types.yaml'
        if yaml_root != 'local':
            import_stmt = \
                'https://forge.etsi.org/rep/nfv/SOL001/raw/{}/'.format(spec_ver) + import_stmt
        MODELS[mod] = tosca_model_info(
            mod,
            spec_ver,
            '- ' + import_stmt
        )

    try:
        sol_001 = docx.Document(filename)
    except Exception as err:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not turned into ValueError; the original error
        # is kept as the cause for easier debugging.
        print("Error opening the submitted Docx file")
        raise ValueError("Cannot open the submitted Docx file") from err

    for info in MODELS.values():
        generate_header(
            info['name'],
            info['buf'],
            spec_ver,
            info['imports'],
            tosc_ver
        )

    content = get_content(sol_001)
    sections = find_all_sections(content)

    # Number of the level 1 section holding the definitions of each model
    sections_to_models = {
        6 : 'vnfd',
        7 : 'nsd',
        8 : 'pnfd',
        9 : 'common'
    }

    # NOTE(review): to_id is the index of the last element of a section,
    # while gen_tables_btwn excludes its upper bound, so the very last
    # element of each section is not scanned - confirm this is intended.
    for sect in sections:

        if not sect.is_annex:
            model = sections_to_models.get(sect.number)
            if model is not None:
                count = gen_tables_btwn(sect.from_id, sect.to_id, content, MODELS[model]['buf'])
                print("Printed " + str(count) + " types to " + model)
        else:
            # Only Annex A is processed for examples
            if sect.letter == "A":
                count = generate_examples_between(sect.from_id, sect.to_id, content, EXAMPLES)
                print("Printed " + str(count) + " types to " + "Annex " + sect.letter)


def print_to_files(prefix=None):
    '''
    Writes the buffer of every model in MODELS and the text of every
    example in EXAMPLES to files, encoded as UTF-8.

    prefix is a path to a folder to work into; when given, the 'fn'
    entry of each model is replaced by the prefixed path. When None,
    files are created relative to the current working directory.
    '''
    for mod in MODELS.values():
        if prefix is not None:
            mod['fn'] = os.path.join(prefix, mod['fn'])

        print("Writing to " + mod['fn'])
        # The with statement closes the file even if a write fails.
        # Explicit encoding keeps the output independent of the locale.
        with open(mod['fn'], 'w', encoding='utf-8') as out:
            # Kept for compatibility: 'fd' refers to the (closed on exit)
            # file object of the last write.
            mod['fd'] = out
            out.write(mod['buf'].getvalue())
            out.write('\n')

    for example in EXAMPLES.values():
        # NOTE(review): the "example_" file name prefix is only applied
        # when a folder prefix is given - confirm this asymmetry is wanted.
        if prefix is not None:
            fnm = os.path.join(prefix, "example_"+example.filename)
        else:
            fnm = example.filename
        print("Writing example file: " + fnm)
        with open(fnm, 'w', encoding='utf-8') as newf:
            newf.write(example.text)
            newf.write("\n")

def parse_version_from_filename(filename):
    '''
    Parses the version from the filename.

    The base name must start with "gs_NFV-SOL001v" or "gs_nfv-sol001v",
    followed by the version either packed as three two-digit fields
    (e.g. "gs_NFV-SOL001v020601p.docx" -> "v2.6.1") or already dotted
    (e.g. "gs_NFV-SOL001v2.6.1.docx" -> "v2.6.1").

    Returns an empty string when the version cannot be determined.
    '''
    base_filename = os.path.basename(filename)

    for prefix in ("gs_NFV-SOL001v", "gs_nfv-sol001v"):
        if base_filename.startswith(prefix):
            # Slice the prefix off: str.strip() takes a set of characters,
            # not a prefix, and would also eat leading version digits.
            remainder = base_filename[len(prefix):]
            break
    else:
        return ""

    # Packed form: major, minor and patch on two digits each
    packed = re.match(r'(\d{2})(\d{2})(\d{2})(?!\d)', remainder)
    if packed:
        # int() drops the zero padding ("06" -> 6, "10" -> 10)
        return "v" + ".".join(str(int(field)) for field in packed.groups())

    # Dotted form: take it as written
    dotted = re.match(r'\d+(?:\.\d+){2}', remainder)
    if dotted:
        return "v" + dotted.group(0)

    return ""

if __name__ == "__main__":

    # The only thing that can go wrong here is a missing argument, so test
    # for it explicitly instead of hiding every exception behind a bare except.
    if len(sys.argv) < 2:
        print('Error: Filename missing or filename not a docx document')
        print('Usage: doc2tosca <docx-with-tosca-definitions>')
        sys.exit(1)
    SOL001_FN = sys.argv[1]

    # The specification version is derived from the document file name
    ver = parse_version_from_filename(SOL001_FN)
    generate_templates(SOL001_FN, spec_ver=ver)

    # Write the generated files relative to the current directory
    print_to_files()