Loading src/doc2tosca.py +51 −16 Original line number Diff line number Diff line Loading @@ -33,9 +33,21 @@ metadata: imports: {imports} data_types: ''' SUBSECTIONS = [ "Artifact Types", "Data Types", "Capability Types", "Interface Types", "Requirements Types", "Relationship Types", "Interface Types", "Node Types", "Group Types", "Policy Types" ] MODELS = {} EXAMPLES = {} Loading Loading @@ -109,8 +121,8 @@ def get_content(doc): elif isinstance(element, docx.oxml.table.CT_Tbl): ret.append(Table(element, body)) table_count = table_count + 1 else: print("Non paragraph or table " + str(type(element))) #else: # print("Non paragraph or table " + str(type(element))) print("Paragraphs: " + str(parag_count)) print("Tables: " + str(table_count)) return ret Loading @@ -130,6 +142,17 @@ def find_sect(sect_to_find, start_idx, doc_content): print("FOUND " + sect_to_find + " at " + str(start_idx)) return start_idx def is_lvl2_section_hdn(txt): ''' Returns true if txt is level 2 heading''' clean_txt = txt.strip() if not bool(re.match(r'^[0-9]\.[0-9]+\t[a-zA-Z\s]*$', clean_txt)): return False subtitle = clean_txt.split('\t')[1] return subtitle in SUBSECTIONS def is_lvl1_section_hdn(txt): ''' Returns true if txt is level 1 heading''' clean_txt = txt.strip() Loading Loading @@ -159,6 +182,15 @@ def find_all_sections(doc_content): sections.append(Section(start_indx, end_indx-1, doc_content[start_indx].text)) return sections def write_subsection_to_file(txt, buf): ''' Writes a subsection header in utf-8 encoding to file buf ''' buf.write(slugify(txt)+":") if not txt.endswith('\n'): buf.write('\n') buf.write('\n') def write_table_to_file(tab, buf): ''' Writes content of table t in utf-8 encoding to file F Loading @@ -177,28 +209,28 @@ def gen_tables_btwn(a_id, b_id, content, buf): definitions_count = 0 for idx in range(a_id, b_id): if idx >= len(content): print("A: " + str(a_id)) print("B: " + str(b_id)) print("IDX: " + str(idx)) print("LEN(CONTENT): " + str(len(content))) print("ERROR: Paragraph out of range (A: {}, B: {}, IDX: {}, LEN(CONTENT): {})".format( a_id, b_id,idx, len(content))) return definitions_count tmp_elem = content[idx] if isinstance(tmp_elem, Paragraph) and is_lvl2_section_hdn(tmp_elem.text): print(tmp_elem.text) write_subsection_to_file(tmp_elem.text.split("\t")[1], buf) definitions_count = definitions_count + 1 if isinstance(tmp_elem, Table) and is_tosca_def(tmp_elem): write_table_to_file(tmp_elem, buf) definitions_count = definitions_count + 1 elif isinstance(tmp_elem, Table): txt = tmp_elem.rows[0].cells[0].text if txt.strip().startswith("Name") or txt.strip().startswith("Shorthand") or \ txt.strip().startswith("tosca_def"): continue # print("----- Filtered out: " + txt.split("\n")[0]) #if not len(tmp_elem.rows) == 1: #print(" Rows count != 1 ") #if not len(tmp_elem.columns) == 1: # print(" Columns count != 1 ") #if not match_definition_incipit(txt): # print(" Regex != 1 ") return definitions_count def generate_header( Loading Loading @@ -317,6 +349,9 @@ def parse_version_from_filename(filename): .replace("0",".").strip(".").strip("p.docx") return "" def slugify(t): return t.replace(" ", "_").lower() if __name__ == "__main__": try: Loading src/test_doc2tosca.py +6 −0 Original line number Diff line number Diff line Loading @@ -20,6 +20,12 @@ def test_is_lvl1_section_hdn(): assert d2t.is_lvl1_section_hdn("Annex A (informative)") assert d2t.is_lvl1_section_hdn("Annex C (normative):\tConformance\t284") def test_is_lvl2_section_hdn(): assert d2t.is_lvl2_section_hdn("6.3\tData Types") assert not d2t.is_lvl2_section_hdn("6.4.2\tSomething") assert not d2t.is_lvl2_section_hdn("6.4\tSomething") def test_section_init(): ssss = d2t.Section(0, 10, "6\tVNFD TOSCA model") Loading Loading
src/doc2tosca.py +51 −16 Original line number Diff line number Diff line Loading @@ -33,9 +33,21 @@ metadata: imports: {imports} data_types: ''' SUBSECTIONS = [ "Artifact Types", "Data Types", "Capability Types", "Interface Types", "Requirements Types", "Relationship Types", "Interface Types", "Node Types", "Group Types", "Policy Types" ] MODELS = {} EXAMPLES = {} Loading Loading @@ -109,8 +121,8 @@ def get_content(doc): elif isinstance(element, docx.oxml.table.CT_Tbl): ret.append(Table(element, body)) table_count = table_count + 1 else: print("Non paragraph or table " + str(type(element))) #else: # print("Non paragraph or table " + str(type(element))) print("Paragraphs: " + str(parag_count)) print("Tables: " + str(table_count)) return ret Loading @@ -130,6 +142,17 @@ def find_sect(sect_to_find, start_idx, doc_content): print("FOUND " + sect_to_find + " at " + str(start_idx)) return start_idx def is_lvl2_section_hdn(txt): ''' Returns true if txt is level 2 heading''' clean_txt = txt.strip() if not bool(re.match(r'^[0-9]\.[0-9]+\t[a-zA-Z\s]*$', clean_txt)): return False subtitle = clean_txt.split('\t')[1] return subtitle in SUBSECTIONS def is_lvl1_section_hdn(txt): ''' Returns true if txt is level 1 heading''' clean_txt = txt.strip() Loading Loading @@ -159,6 +182,15 @@ def find_all_sections(doc_content): sections.append(Section(start_indx, end_indx-1, doc_content[start_indx].text)) return sections def write_subsection_to_file(txt, buf): ''' Writes a subsection header in utf-8 encoding to file buf ''' buf.write(slugify(txt)+":") if not txt.endswith('\n'): buf.write('\n') buf.write('\n') def write_table_to_file(tab, buf): ''' Writes content of table t in utf-8 encoding to file F Loading @@ -177,28 +209,28 @@ def gen_tables_btwn(a_id, b_id, content, buf): definitions_count = 0 for idx in range(a_id, b_id): if idx >= len(content): print("A: " + str(a_id)) print("B: " + str(b_id)) print("IDX: " + str(idx)) print("LEN(CONTENT): " + str(len(content))) print("ERROR: Paragraph out of range (A: {}, B: {}, IDX: {}, LEN(CONTENT): {})".format( a_id, b_id,idx, len(content))) return definitions_count tmp_elem = content[idx] if isinstance(tmp_elem, Paragraph) and is_lvl2_section_hdn(tmp_elem.text): print(tmp_elem.text) write_subsection_to_file(tmp_elem.text.split("\t")[1], buf) definitions_count = definitions_count + 1 if isinstance(tmp_elem, Table) and is_tosca_def(tmp_elem): write_table_to_file(tmp_elem, buf) definitions_count = definitions_count + 1 elif isinstance(tmp_elem, Table): txt = tmp_elem.rows[0].cells[0].text if txt.strip().startswith("Name") or txt.strip().startswith("Shorthand") or \ txt.strip().startswith("tosca_def"): continue # print("----- Filtered out: " + txt.split("\n")[0]) #if not len(tmp_elem.rows) == 1: #print(" Rows count != 1 ") #if not len(tmp_elem.columns) == 1: # print(" Columns count != 1 ") #if not match_definition_incipit(txt): # print(" Regex != 1 ") return definitions_count def generate_header( Loading Loading @@ -317,6 +349,9 @@ def parse_version_from_filename(filename): .replace("0",".").strip(".").strip("p.docx") return "" def slugify(t): return t.replace(" ", "_").lower() if __name__ == "__main__": try: Loading
src/test_doc2tosca.py +6 −0 Original line number Diff line number Diff line Loading @@ -20,6 +20,12 @@ def test_is_lvl1_section_hdn(): assert d2t.is_lvl1_section_hdn("Annex A (informative)") assert d2t.is_lvl1_section_hdn("Annex C (normative):\tConformance\t284") def test_is_lvl2_section_hdn(): assert d2t.is_lvl2_section_hdn("6.3\tData Types") assert not d2t.is_lvl2_section_hdn("6.4.2\tSomething") assert not d2t.is_lvl2_section_hdn("6.4\tSomething") def test_section_init(): ssss = d2t.Section(0, 10, "6\tVNFD TOSCA model") Loading