From 0c70b06178a492f7117bad10dbd994b937dfd331 Mon Sep 17 00:00:00 2001 From: jetm Date: Sun, 29 Aug 2021 21:26:55 -0500 Subject: [PATCH 1/2] markdown to xml feature --- src/MarkdownFile.py | 46 ++++++++++++ .../expected_outputs/empty-file.xml | 1 + .../expected_outputs/false-headers.xml | 14 ++++ .../expected_outputs/multi-level-headers.xml | 22 ++++++ .../expected_outputs/no-section.xml | 2 + .../expected_outputs/single-level-headers.xml | 14 ++++ .../markdown_files/empty-file.md | 0 .../markdown_files/false-headers.md | 13 ++++ .../markdown_files/multi-level-headers.md | 21 ++++++ .../markdown_files/no-section.md | 1 + .../markdown_files/single-level-headers.md | 13 ++++ test/test_Script.py | 74 +++++++++++++++++-- test/xml_helpers.py | 20 +++++ 13 files changed, 236 insertions(+), 5 deletions(-) create mode 100644 test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml create mode 100644 test/testVault/xml_markdown_tests/expected_outputs/false-headers.xml create mode 100644 test/testVault/xml_markdown_tests/expected_outputs/multi-level-headers.xml create mode 100644 test/testVault/xml_markdown_tests/expected_outputs/no-section.xml create mode 100644 test/testVault/xml_markdown_tests/expected_outputs/single-level-headers.xml create mode 100644 test/testVault/xml_markdown_tests/markdown_files/empty-file.md create mode 100644 test/testVault/xml_markdown_tests/markdown_files/false-headers.md create mode 100644 test/testVault/xml_markdown_tests/markdown_files/multi-level-headers.md create mode 100644 test/testVault/xml_markdown_tests/markdown_files/no-section.md create mode 100644 test/testVault/xml_markdown_tests/markdown_files/single-level-headers.md create mode 100644 test/xml_helpers.py diff --git a/src/MarkdownFile.py b/src/MarkdownFile.py index fd11249..27a2aa6 100644 --- a/src/MarkdownFile.py +++ b/src/MarkdownFile.py @@ -1,9 +1,11 @@ import re from src.YamlParser import YamlParser, YAML_METHOD +from xml.etree.ElementTree import Element, ElementTree, SubElement class MarkdownFile: def __init__(self, fileName, filePath): self._regexFindLinks = r'(?<=\[\[).*?(?=(?:\]\]|#|\|))' # Thanks to https://github.com/archelpeg + self._regexFindHeaders = r'^\s*(#+)(\s+.*)' self.fileName = fileName self.path = filePath self.tags = self._findTags() @@ -77,3 +79,47 @@ def _findLinksInCurrentFile(self) -> set: def _openFile(self): """Open markdown file""" return open(self.path, "r", encoding="utf-8") + + def toXML(self): + file = self._openFile() + lines = file.readlines() + file.close() + + # Get the line numbers and header levels of each header + section_lines = [] + section_levels = [] + section_titles = [] + for i, line in enumerate(lines): + match = re.match(self._regexFindHeaders, line) + if match is not None: + section_lines.append(i) + section_levels.append(len(match.group(1))) + section_titles.append(match.group(2).strip()) + + root = Element('root', {'level': '0'}) + parent_map = {} + + current_node = root + section_ends = section_lines + [len(lines)] + current_node.text = "".join(lines[:section_ends[0]]) + section_ends = section_ends[1:] + + for section_start, level, title, section_end in zip(section_lines, section_levels, section_titles, section_ends): + while int(current_node.get('level')) >= level: + current_node = parent_map[current_node] + section_node = SubElement(current_node, 'section', {'level': str(level), 'title': title}) + parent_map[section_node] = current_node + section_node.text = "".join(lines[section_start:section_end]) + + current_node = section_node + + return ElementTree(root) + +def xmlToMarkdownText(tree: ElementTree): + sections = [] + for node in tree.iter(): + if node.get('level') == '0': + continue + sections.append(node.text) + + return ''.join(sections) \ No newline at end of file diff --git a/test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml b/test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml new file mode 100644 index 0000000..e239f73 --- /dev/null +++ b/test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/test/testVault/xml_markdown_tests/expected_outputs/false-headers.xml b/test/testVault/xml_markdown_tests/expected_outputs/false-headers.xml new file mode 100644 index 0000000..c683cc0 --- /dev/null +++ b/test/testVault/xml_markdown_tests/expected_outputs/false-headers.xml @@ -0,0 +1,14 @@ +Root level text. + +
# A good header + +This header has good text. + +#bad header + +asdf # another bad header + +
## This is okay + +asdfasdf +
diff --git a/test/testVault/xml_markdown_tests/expected_outputs/multi-level-headers.xml b/test/testVault/xml_markdown_tests/expected_outputs/multi-level-headers.xml new file mode 100644 index 0000000..71392ff --- /dev/null +++ b/test/testVault/xml_markdown_tests/expected_outputs/multi-level-headers.xml @@ -0,0 +1,22 @@ +Root level text. + +
# Header 1 + +Level 1 text. + +
## Header 2 + +Level 2 text. + +
### Header 3 + +Level 3 text. + +
###### Header 6 + +Level 6 text. + +
# Section 2 + +Another section. +
diff --git a/test/testVault/xml_markdown_tests/expected_outputs/no-section.xml b/test/testVault/xml_markdown_tests/expected_outputs/no-section.xml new file mode 100644 index 0000000..630aeaa --- /dev/null +++ b/test/testVault/xml_markdown_tests/expected_outputs/no-section.xml @@ -0,0 +1,2 @@ +This file has no sections. + diff --git a/test/testVault/xml_markdown_tests/expected_outputs/single-level-headers.xml b/test/testVault/xml_markdown_tests/expected_outputs/single-level-headers.xml new file mode 100644 index 0000000..ebcea05 --- /dev/null +++ b/test/testVault/xml_markdown_tests/expected_outputs/single-level-headers.xml @@ -0,0 +1,14 @@ +root section + +
# header 1 + +section 1 + +
# header 2 + +section 2 + +
# header 3 + +section 3 +
diff --git a/test/testVault/xml_markdown_tests/markdown_files/empty-file.md b/test/testVault/xml_markdown_tests/markdown_files/empty-file.md new file mode 100644 index 0000000..e69de29 diff --git a/test/testVault/xml_markdown_tests/markdown_files/false-headers.md b/test/testVault/xml_markdown_tests/markdown_files/false-headers.md new file mode 100644 index 0000000..71519df --- /dev/null +++ b/test/testVault/xml_markdown_tests/markdown_files/false-headers.md @@ -0,0 +1,13 @@ +Root level text. + +# A good header + +This header has good text. + +#bad header + +asdf # another bad header + + ## This is okay + +asdfasdf diff --git a/test/testVault/xml_markdown_tests/markdown_files/multi-level-headers.md b/test/testVault/xml_markdown_tests/markdown_files/multi-level-headers.md new file mode 100644 index 0000000..199ee06 --- /dev/null +++ b/test/testVault/xml_markdown_tests/markdown_files/multi-level-headers.md @@ -0,0 +1,21 @@ +Root level text. + +# Header 1 + +Level 1 text. + +## Header 2 + +Level 2 text. + +### Header 3 + +Level 3 text. + +###### Header 6 + +Level 6 text. + +# Section 2 + +Another section. diff --git a/test/testVault/xml_markdown_tests/markdown_files/no-section.md b/test/testVault/xml_markdown_tests/markdown_files/no-section.md new file mode 100644 index 0000000..14efe95 --- /dev/null +++ b/test/testVault/xml_markdown_tests/markdown_files/no-section.md @@ -0,0 +1 @@ +This file has no sections. diff --git a/test/testVault/xml_markdown_tests/markdown_files/single-level-headers.md b/test/testVault/xml_markdown_tests/markdown_files/single-level-headers.md new file mode 100644 index 0000000..51708c5 --- /dev/null +++ b/test/testVault/xml_markdown_tests/markdown_files/single-level-headers.md @@ -0,0 +1,13 @@ +root section + +# header 1 + +section 1 + +# header 2 + +section 2 + +# header 3 + +section 3 diff --git a/test/test_Script.py b/test/test_Script.py index 1cda461..97285fe 100644 --- a/test/test_Script.py +++ b/test/test_Script.py @@ -1,12 +1,10 @@ -from src.Extractor import Extractor -from src.MarkdownFile import MarkdownFile +from src.MarkdownFile import xmlToMarkdownText from src.Parser import Parser -from src.YamlParser import YamlParser -import os +from test.xml_helpers import loadXMLSingleLevelHeaders, loadXMLMultiLevelHeaders, loadXMLFalseHeaders, loadXMLEmptyFile, loadXMLNoSection, treesEqual def testMarkdownsRetrieval(): parser = Parser('./test/testVault') - assert len(parser.mdFiles) == 6 + assert len(parser.mdFiles) == 11 def testMarkdownTags(): def nbFilesWithTag(parser, tag): @@ -22,3 +20,69 @@ def testSubfilesForFile(): file = set([file for file in parser.mdFiles if file.fileName == 'file1.md']) subFiles = parser.findSubFilesForFiles(file) assert len(subFiles) == 3 + +def testXMLBuilderSingleLevelHeaders(): + parser = Parser('./test/testVault') + file = [file for file in parser.mdFiles if file.fileName == 'single-level-headers.md'].pop() + assert treesEqual(file.toXML(), loadXMLSingleLevelHeaders()) + +def testXMLBuilderMultiLevelHeaders(): + parser = Parser('./test/testVault') + file = [file for file in parser.mdFiles if file.fileName == 'multi-level-headers.md'].pop() + assert treesEqual(file.toXML(), loadXMLMultiLevelHeaders()) + +def testXMLBuilderFalseHeaders(): + parser = Parser('./test/testVault') + file = [file for file in parser.mdFiles if file.fileName == 'false-headers.md'].pop() + assert treesEqual(file.toXML(), loadXMLFalseHeaders()) + +def testXMLBuilderEmptyFile(): + parser = Parser('./test/testVault') + file = [file for file in parser.mdFiles if file.fileName == 'empty-file.md'].pop() + assert treesEqual(file.toXML(), loadXMLEmptyFile()) + +def testXMLBuilderNoSectionFile(): + parser = Parser('./test/testVault') + file = [file for file in parser.mdFiles if file.fileName == 'no-section.md'].pop() + assert treesEqual(file.toXML(), loadXMLNoSection()) + +def testXMLWriterSingleLevelHeaders(): + parser = Parser('./test/testVault') + file = [file for file in parser.mdFiles if file.fileName == 'single-level-headers.md'].pop() + with open(file.path, 'r') as mdFile: + text = mdFile.read() + + xmlToMarkdownText(loadXMLSingleLevelHeaders()) == text + +def testXMLWriterMultiLevelHeaders(): + parser = Parser('./test/testVault') + file = [file for file in parser.mdFiles if file.fileName == 'multi-level-headers.md'].pop() + with open(file.path, 'r') as mdFile: + text = mdFile.read() + + xmlToMarkdownText(loadXMLMultiLevelHeaders()) == text + +def testXMLWriterFalseHeaders(): + parser = Parser('./test/testVault') + file = [file for file in parser.mdFiles if file.fileName == 'false-headers.md'].pop() + with open(file.path, 'r') as mdFile: + text = mdFile.read() + + xmlToMarkdownText(loadXMLFalseHeaders()) == text + +def testXMLWriterEmptyFile(): + parser = Parser('./test/testVault') + file = [file for file in parser.mdFiles if file.fileName == 'empty-file.md'].pop() + with open(file.path, 'r') as mdFile: + text = mdFile.read() + + xmlToMarkdownText(loadXMLEmptyFile()) == text + +def testXMLWriterNoSectionFile(): + parser = Parser('./test/testVault') + file = [file for file in parser.mdFiles if file.fileName == 'no-section.md'].pop() + with open(file.path, 'r') as mdFile: + text = mdFile.read() + + xmlToMarkdownText(loadXMLNoSection()) == text + diff --git a/test/xml_helpers.py b/test/xml_helpers.py new file mode 100644 index 0000000..30c4d14 --- /dev/null +++ b/test/xml_helpers.py @@ -0,0 +1,20 @@ +import xml.etree.ElementTree as ET +from xml.etree.ElementTree import Element, ElementTree + +def treesEqual(t1: ElementTree, t2: ElementTree) -> bool: + return ET.canonicalize(ET.tostring(t1.getroot())) == ET.canonicalize(ET.tostring(t2.getroot())) + +def loadXMLSingleLevelHeaders(): + return ET.parse('./test/testVault/xml_markdown_tests/expected_outputs/single-level-headers.xml') + +def loadXMLMultiLevelHeaders(): + return ET.parse('./test/testVault/xml_markdown_tests/expected_outputs/multi-level-headers.xml') + +def loadXMLFalseHeaders(): + return ET.parse('./test/testVault/xml_markdown_tests/expected_outputs/false-headers.xml') + +def loadXMLEmptyFile(): + return ET.parse('./test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml') + +def loadXMLNoSection(): + return ET.parse('./test/testVault/xml_markdown_tests/expected_outputs/no-section.xml') From 0b0e1182229b12d1aa787ca08be9569c566e0fc1 Mon Sep 17 00:00:00 2001 From: jetm Date: Sun, 29 Aug 2021 21:36:02 -0500 Subject: [PATCH 2/2] add newline at end of files --- src/MarkdownFile.py | 2 +- .../xml_markdown_tests/expected_outputs/empty-file.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/MarkdownFile.py b/src/MarkdownFile.py index 27a2aa6..f6cb37c 100644 --- a/src/MarkdownFile.py +++ b/src/MarkdownFile.py @@ -122,4 +122,4 @@ def xmlToMarkdownText(tree: ElementTree): continue sections.append(node.text) - return ''.join(sections) \ No newline at end of file + return ''.join(sections) diff --git a/test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml b/test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml index e239f73..ecc8385 100644 --- a/test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml +++ b/test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml @@ -1 +1 @@ - \ No newline at end of file +