diff --git a/src/doc2tosca.py b/src/doc2tosca.py index 53572d875fc12a6d6f28669af3915dee7a0a1633..b1c52bd9c8afbd52398804caeeeeaa643de327db 100644 --- a/src/doc2tosca.py +++ b/src/doc2tosca.py @@ -33,9 +33,21 @@ metadata: imports: {imports} -data_types: ''' +SUBSECTIONS = [ + "Artifact Types", + "Data Types", + "Capability Types", + "Interface Types", + "Requirements Types", + "Relationship Types", + "Interface Types", + "Node Types", + "Group Types", + "Policy Types" +] + MODELS = {} EXAMPLES = {} @@ -109,8 +121,8 @@ def get_content(doc): elif isinstance(element, docx.oxml.table.CT_Tbl): ret.append(Table(element, body)) table_count = table_count + 1 - else: - print("Non paragraph or table " + str(type(element))) + #else: + # print("Non paragraph or table " + str(type(element))) print("Paragraphs: " + str(parag_count)) print("Tables: " + str(table_count)) return ret @@ -130,6 +142,17 @@ def find_sect(sect_to_find, start_idx, doc_content): print("FOUND " + sect_to_find + " at " + str(start_idx)) return start_idx +def is_lvl2_section_hdn(txt): + ''' Returns true if txt is level 2 heading''' + clean_txt = txt.strip() + + if not bool(re.match(r'^[0-9]\.[0-9]+\t[a-zA-Z\s]*$', clean_txt)): + return False + + subtitle = clean_txt.split('\t')[1] + + return subtitle in SUBSECTIONS + def is_lvl1_section_hdn(txt): ''' Returns true if txt is level 1 heading''' clean_txt = txt.strip() @@ -159,6 +182,15 @@ def find_all_sections(doc_content): sections.append(Section(start_indx, end_indx-1, doc_content[start_indx].text)) return sections +def write_subsection_to_file(txt, buf): + ''' + Writes a subsection header in utf-8 encoding to file buf + ''' + buf.write(slugify(txt)+":") + if not txt.endswith('\n'): + buf.write('\n') + buf.write('\n') + def write_table_to_file(tab, buf): ''' Writes content of table t in utf-8 encoding to file F @@ -177,28 +209,28 @@ def gen_tables_btwn(a_id, b_id, content, buf): definitions_count = 0 for idx in range(a_id, b_id): + if idx >= len(content): - print("A: " + str(a_id)) - print("B: " + str(b_id)) - print("IDX: " + str(idx)) - print("LEN(CONTENT): " + str(len(content))) - return definitions_count + print("ERROR: Paragraph out of range (A: {}, B: {}, IDX: {}, LEN(CONTENT): {})".format( + a_id, b_id,idx, len(content))) + return definitions_count + tmp_elem = content[idx] + + if isinstance(tmp_elem, Paragraph) and is_lvl2_section_hdn(tmp_elem.text): + print(tmp_elem.text) + write_subsection_to_file(tmp_elem.text.split("\t")[1], buf) + definitions_count = definitions_count + 1 + if isinstance(tmp_elem, Table) and is_tosca_def(tmp_elem): write_table_to_file(tmp_elem, buf) definitions_count = definitions_count + 1 + elif isinstance(tmp_elem, Table): txt = tmp_elem.rows[0].cells[0].text if txt.strip().startswith("Name") or txt.strip().startswith("Shorthand") or \ txt.strip().startswith("tosca_def"): continue - # print("----- Filtered out: " + txt.split("\n")[0]) - #if not len(tmp_elem.rows) == 1: - #print(" Rows count != 1 ") - #if not len(tmp_elem.columns) == 1: - # print(" Columns count != 1 ") - #if not match_definition_incipit(txt): - # print(" Regex != 1 ") return definitions_count def generate_header( @@ -277,7 +309,7 @@ def generate_templates( def print_to_files(prefix=None): - ''' + ''' Prefix is a path to a folder to work into ''' for key in MODELS: @@ -317,6 +349,9 @@ def parse_version_from_filename(filename): .replace("0",".").strip(".").strip("p.docx") return "" +def slugify(t): + return t.replace(" ", "_").lower() + if __name__ == "__main__": try: diff --git a/src/test_doc2tosca.py b/src/test_doc2tosca.py index 6353a3c84fb857ffa16dfc769d6f3bfc3c7c0adc..74cd5cb740c358017d2923501d8de97bd668ba02 100644 --- a/src/test_doc2tosca.py +++ b/src/test_doc2tosca.py @@ -20,6 +20,12 @@ def test_is_lvl1_section_hdn(): assert d2t.is_lvl1_section_hdn("Annex A (informative)") assert d2t.is_lvl1_section_hdn("Annex C (normative):\tConformance\t284") +def test_is_lvl2_section_hdn(): + + assert d2t.is_lvl2_section_hdn("6.3\tData Types") + assert not d2t.is_lvl2_section_hdn("6.4.2\tSomething") + assert not d2t.is_lvl2_section_hdn("6.4\tSomething") + def test_section_init(): ssss = d2t.Section(0, 10, "6\tVNFD TOSCA model")