# doc2tosca.py

#!/bin/python3
'''
Generate TOSCA definitions from a Docx specification
'''

import sys
import os
import re
from io import StringIO

import docx
from docx.table import Table
from docx.text.paragraph import Paragraph

# Output file name pattern; '{}' is replaced by the model name
BASE_FILENAME = "generated_etsi_nfv_sol001_{}_types.yaml"
# Value written as tosca_definitions_version in every generated header
TOSCA_VERSION = "tosca_simple_yaml_1_2"
# Default specification version used in headers and import URLs
SPEC_VERSION = "v2.6.1"

# Known specification versions (not referenced by the code shown in this file)
allowed_versions = ["v2.6.1","v2.6.3", "v2.7.1"]

# Names of the models for which a types file is generated
MODEL_NAMES = ['vnfd', 'nsd', 'pnfd', 'common']

# Header template of every generated file. The placeholders are
# tosca_version, model, spec_version and imports.
# The second metadata entry holds the author (ETSI_NFV): it is labelled
# template_author so that template_name is not emitted twice.
HDR = '''tosca_definitions_version: {tosca_version}
description: ETSI NFV SOL 001 {model} types definitions version {spec_version}
metadata:
  - template_name: etsi_nfv_sol001_{model}_types
  - template_author: ETSI_NFV
  - template_version: {spec_version}

imports:
  {imports}

data_types:
'''

def match_definition_incipit(txt):
    '''
    Returns True when the first line of txt, stripped of surrounding
    whitespace, starts with "tosca." and is followed only by letters,
    digits, dots, colons and whitespace
    '''
    first_line, _, _ = txt.partition("\n")
    candidate = first_line.strip()
    found = re.match(r'^tosca\.[a-zA-Z\.:0-9\s]*$', candidate)
    return found is not None

def is_tosca_def(table):
    '''
    Returns True when a table contains a TOSCA definition, i.e.
    the table has exactly one row and one column and the first line
    of its only cell is accepted by match_definition_incipit
    '''
    # Check the shape before touching the cell: a table without rows
    # would otherwise raise IndexError on rows[0]
    if len(table.rows) != 1 or len(table.columns) != 1:
        return False
    return match_definition_incipit(table.rows[0].cells[0].text)

def tosca_model_info(name, imports):
    '''
    Builds the information dictionary of a model: its name, the output
    file name derived from BASE_FILENAME, an unset file descriptor,
    the imports text and an empty in-memory buffer
    '''
    model = {}
    model['name'] = name
    model['fn'] = BASE_FILENAME.format(name)
    model['fd'] = None  # no file is opened at this point
    model['imports'] = imports
    model['buf'] = StringIO()
    return model

def get_content(doc):
    '''
    Collects, in document order, every paragraph and table found in
    the body of the Document and returns them as a list. Other
    elements are reported on standard output and left out.
    '''
    body = doc._body
    items = []
    n_paragraphs = 0
    n_tables = 0
    for node in body._element:
        if isinstance(node, docx.oxml.text.paragraph.CT_P):
            items.append(Paragraph(node, body))
            n_paragraphs += 1
            continue
        if isinstance(node, docx.oxml.table.CT_Tbl):
            items.append(Table(node, body))
            n_tables += 1
            continue
        # Neither a paragraph nor a table: only report it
        print("Non paragraph or table " +  str(type(node)))
    print("Paragraphs: " + str(n_paragraphs))
    print("Tables: " + str(n_tables))
    return items

def find_sect(sect_to_find, start_idx, doc_content):
    '''
    Returns the index in the doc_content list of the first paragraph
    whose stripped text equals sect_to_find, searching from start_idx.

    When no such paragraph exists the returned index is
    len(doc_content) (or start_idx itself if it already lies beyond
    the end), and a NOT FOUND message is printed instead of FOUND.
    '''
    for idx in range(start_idx, len(doc_content)):
        candidate = doc_content[idx]
        if isinstance(candidate, Paragraph) and candidate.text.strip() == sect_to_find:
            print("FOUND " + sect_to_find + " at " + str(idx))
            return idx

    # Same return value as a search that ran off the end of the list
    end_idx = max(start_idx, len(doc_content))
    print("NOT FOUND " + sect_to_find + ", returning " + str(end_idx))
    return end_idx

def write_table_to_file(tab, buf):
    '''
    Writes the text of the first cell of table tab to the buffer buf,
    terminated by a newline (added only if missing) and followed by
    one empty line
    '''
    txt = tab.rows[0].cells[0].text
    buf.write(txt)
    if not txt.endswith('\n'):
        buf.write('\n')
    # Blank line separating this definition from the next one
    buf.write('\n')

def generate_tables_between(a_id, b_id, content, buf):
    '''
    Loops over content[a_id:b_id] and writes every table holding a
    TOSCA definition to the buffer buf. Tables that are skipped are
    reported on standard output, except those whose first cell starts
    with "Name", "Shorthand" or "tosca_def".

    Returns the number of written definitions. If the range runs past
    the end of content, diagnostics are printed and the count reached
    so far is returned.
    '''
    definitions_count = 0

    for idx in range(a_id, b_id):
        if idx >= len(content):
            print("A: " + str(a_id))
            print("B: " + str(b_id))
            print("IDX: " + str(idx))
            print("LEN(CONTENT): " + str(len(content)))
            # Stop here but still report what was written so far
            break
        tmp_elem = content[idx]
        if not isinstance(tmp_elem, Table):
            continue
        if is_tosca_def(tmp_elem):
            write_table_to_file(tmp_elem, buf)
            definitions_count = definitions_count + 1
            continue

        txt = tmp_elem.rows[0].cells[0].text
        # These tables are skipped without any report
        if txt.strip().startswith(("Name", "Shorthand", "tosca_def")):
            continue
        print("----- Filtered out: " + txt.split("\n")[0])
        if len(tmp_elem.rows) != 1:
            print("       Rows count != 1 ")
        if len(tmp_elem.columns) != 1:
            print("       Columns count != 1 ")
        if not match_definition_incipit(txt):
            print("       Regex != 1 ")
    return definitions_count

def dump_header(model_name, buf, spec_version=SPEC_VERSION, imports=None):
    '''
    Renders the HDR template for the given model, specification
    version and imports text, and writes the result to buf
    '''
    fields = {
        'tosca_version': TOSCA_VERSION,
        'model': model_name,
        'spec_version': spec_version,
        'imports': imports,
    }
    header = HDR.format(**fields)
    buf.write(header)

# Per-model information dictionaries keyed by model name; filled by
# generate_templates and read by print_to_files
MODELS = {}

def generate_templates(filename, spec_version=SPEC_VERSION, yaml_root_path='uri'):
    '''
    Takes a filename or file object and loads the definitions into the
    MODELS dictionary.

    filename is handed to docx.Document. spec_version is written in
    the headers and used in the import URL. When yaml_root_path is
    'local' the common types file is imported by its bare name; any
    other value prefixes it with the ETSI forge URL for spec_version.

    Raises ValueError when the document cannot be opened.
    '''
    if isinstance(filename, str):
        print("Opening " + filename)

    # The import statement is the same for every model: build it once
    import_stmt = 'etsi_nfv_sol001_common_types.yaml'
    if yaml_root_path != 'local':
        import_stmt = 'https://forge.etsi.org/rep/nfv/SOL001/raw/{}/'.format(spec_version) + import_stmt

    # NOTE(review): the 'common' model gets this import too, i.e. it
    # imports the common types file itself - confirm this is intended
    for mn in MODEL_NAMES:
        MODELS[mn] = tosca_model_info(mn, '- ' + import_stmt)

    try:
        sol_001 = docx.Document(filename)
    except Exception as err:
        # Callers expect ValueError; keep the original cause attached
        print("Error opening the submitted Docx file")
        raise ValueError("Cannot open the submitted Docx file") from err

    for model in MODELS.values():
        dump_header(
            model['name'],
            model['buf'],
            spec_version,
            model['imports'])

    content = get_content(sol_001)

    # Skip everything that comes before the "Foreword" paragraph
    p_id = 0
    while p_id < len(content):
        elem = content[p_id]
        if isinstance(elem, Paragraph) and elem.text == "Foreword":
            break
        p_id = p_id + 1

    if p_id >= len(content):
        print("FOREWORD NOT FOUND")

    # Each search starts where the previous heading was located
    sect_6_id = find_sect("6\tVNFD TOSCA model", p_id, content)

    sect_7_id = find_sect("7\tNSD TOSCA model", sect_6_id, content)

    sect_8_id = find_sect("8\tPNFD TOSCA model", sect_7_id, content)

    sect_9_id = find_sect("9\tCommon Definitions", sect_8_id, content)

    annex_a_id = find_sect("Annex A (informative):", sect_9_id, content)

    # (first index, end index, model key, label used in the report)
    sections = (
        (sect_6_id, sect_7_id, 'vnfd', "VNFD"),
        (sect_7_id, sect_8_id, 'nsd', "NSD"),
        (sect_8_id, sect_9_id, 'pnfd', "PNFD"),
        (sect_9_id, annex_a_id, 'common', "Common"),
    )
    for first_id, end_id, key, label in sections:
        count = generate_tables_between(first_id, end_id, content, MODELS[key]['buf'])
        print("Printed " + str(count) + " types to " + label + "\n\n\n")


def print_to_files(prefix=None):
    '''
    Writes the buffer of every model in MODELS to its file, followed
    by a newline.

    Prefix is a path to a folder to work into. When given, the 'fn'
    entry of each model is replaced by the file name joined to it.
    '''
    for model in MODELS.values():
        if prefix is not None:
            # Join on the bare file name so that calling this function
            # again does not stack the prefix on an already prefixed path
            model['fn'] = os.path.join(prefix, os.path.basename(model['fn']))

        print("Writing to " + model['fn'])
        # The context manager closes the file even if a write fails;
        # an explicit encoding keeps the output independent of the locale
        with open(model['fn'], 'w', encoding='utf-8') as out:
            model['fd'] = out
            out.write(model['buf'].getvalue())
            out.write('\n')

if __name__ == "__main__":

    # The only expected failure here is a missing command line argument
    try:
        SOL001_FN = sys.argv[1]
    except IndexError:
        print('Error: Filename missing or filename not a docx document')
        print('Usage: doc2tosca <docx-with-tosca-definitions>')
        sys.exit(1)

    # Load the definitions in memory, then write one file per model
    # in the current working directory
    generate_templates(SOL001_FN)

    print_to_files()