diff --git a/src/MarkdownFile.py b/src/MarkdownFile.py
index fd11249..f6cb37c 100644
--- a/src/MarkdownFile.py
+++ b/src/MarkdownFile.py
@@ -1,9 +1,11 @@
import re
from src.YamlParser import YamlParser, YAML_METHOD
+from xml.etree.ElementTree import Element, ElementTree, SubElement
class MarkdownFile:
def __init__(self, fileName, filePath):
self._regexFindLinks = r'(?<=\[\[).*?(?=(?:\]\]|#|\|))' # Thanks to https://github.com/archelpeg
+ self._regexFindHeaders = r'^\s*(#+)(\s+.*)'
self.fileName = fileName
self.path = filePath
self.tags = self._findTags()
@@ -77,3 +79,47 @@ def _findLinksInCurrentFile(self) -> set:
def _openFile(self):
"""Open markdown file"""
return open(self.path, "r", encoding="utf-8")
+
+    def toXML(self):
+        """Return an ElementTree nesting this file's text by header level.
+
+        The 'root' element (level '0') holds the text before the first header.
+        Each header opens a 'section' element (attributes 'level' and 'title')
+        whose text runs from its header line up to the next header line.
+        """
+        with self._openFile() as file:  # closed even if reading raises
+            lines = file.readlines()
+
+        # One (line index, header level, title) entry per header line
+        headers = []
+        for i, line in enumerate(lines):
+            match = re.match(self._regexFindHeaders, line)
+            if match is not None:
+                headers.append((i, len(match.group(1)), match.group(2).strip()))
+
+        # A section ends where the next header starts, or at the end of the file
+        boundaries = [start for start, _, _ in headers] + [len(lines)]
+        root = Element('root', {'level': '0'})
+        root.text = "".join(lines[:boundaries[0]])
+        parent_map = {}  # Element keeps no parent link, so record it here
+        current_node = root
+
+        for (start, level, title), end in zip(headers, boundaries[1:]):
+            # Climb until current_node is shallower than the new header
+            while int(current_node.get('level')) >= level:
+                current_node = parent_map[current_node]
+            section_node = SubElement(current_node, 'section', {'level': str(level), 'title': title})
+            parent_map[section_node] = current_node
+            section_node.text = "".join(lines[start:end])
+            current_node = section_node
+
+        return ElementTree(root)
+
+def xmlToMarkdownText(tree: ElementTree):
+    """Return the markdown text held in *tree*, joined in document order.
+
+    The root's text (content before the first header) is included so the
+    result round-trips toXML(); elements with no text contribute nothing.
+    """
+    # node.text is None for empty elements, which ''.join() would reject
+    return ''.join(node.text or '' for node in tree.iter())
diff --git a/test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml b/test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml
new file mode 100644
index 0000000..ecc8385
--- /dev/null
+++ b/test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml
@@ -0,0 +1 @@
+<root level="0" />
diff --git a/test/testVault/xml_markdown_tests/expected_outputs/false-headers.xml b/test/testVault/xml_markdown_tests/expected_outputs/false-headers.xml
new file mode 100644
index 0000000..c683cc0
--- /dev/null
+++ b/test/testVault/xml_markdown_tests/expected_outputs/false-headers.xml
@@ -0,0 +1,14 @@
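+<root level="0">Root level text.
+
+<section level="1" title="A good header"># A good header
+
+This header has good text.
+
+#bad header
+
+asdf # another bad header
+
+<section level="2" title="This is okay"> ## This is okay
+
+asdfasdf
+</section></section></root>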
diff --git a/test/testVault/xml_markdown_tests/expected_outputs/multi-level-headers.xml b/test/testVault/xml_markdown_tests/expected_outputs/multi-level-headers.xml
new file mode 100644
index 0000000..71392ff
--- /dev/null
+++ b/test/testVault/xml_markdown_tests/expected_outputs/multi-level-headers.xml
@@ -0,0 +1,22 @@
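+<root level="0">Root level text.
+
+<section level="1" title="Header 1"># Header 1
+
+Level 1 text.
+
+<section level="2" title="Header 2">## Header 2
+
+Level 2 text.
+
+<section level="3" title="Header 3">### Header 3
+
+Level 3 text.
+
+<section level="6" title="Header 6">###### Header 6
+
+Level 6 text.
+
+</section></section></section></section><section level="1" title="Section 2"># Section 2
+
+Another section.
+</section></root>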
diff --git a/test/testVault/xml_markdown_tests/expected_outputs/no-section.xml b/test/testVault/xml_markdown_tests/expected_outputs/no-section.xml
new file mode 100644
index 0000000..630aeaa
--- /dev/null
+++ b/test/testVault/xml_markdown_tests/expected_outputs/no-section.xml
@@ -0,0 +1,2 @@
+<root level="0">This file has no sections.
+</root>
diff --git a/test/testVault/xml_markdown_tests/expected_outputs/single-level-headers.xml b/test/testVault/xml_markdown_tests/expected_outputs/single-level-headers.xml
new file mode 100644
index 0000000..ebcea05
--- /dev/null
+++ b/test/testVault/xml_markdown_tests/expected_outputs/single-level-headers.xml
@@ -0,0 +1,14 @@
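+<root level="0">root section
+
+<section level="1" title="header 1"># header 1
+
+section 1
+
+</section><section level="1" title="header 2"># header 2
+
+section 2
+
+</section><section level="1" title="header 3"># header 3
+
+section 3
+</section></root>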
diff --git a/test/testVault/xml_markdown_tests/markdown_files/empty-file.md b/test/testVault/xml_markdown_tests/markdown_files/empty-file.md
new file mode 100644
index 0000000..e69de29
diff --git a/test/testVault/xml_markdown_tests/markdown_files/false-headers.md b/test/testVault/xml_markdown_tests/markdown_files/false-headers.md
new file mode 100644
index 0000000..71519df
--- /dev/null
+++ b/test/testVault/xml_markdown_tests/markdown_files/false-headers.md
@@ -0,0 +1,13 @@
+Root level text.
+
+# A good header
+
+This header has good text.
+
+#bad header
+
+asdf # another bad header
+
+ ## This is okay
+
+asdfasdf
diff --git a/test/testVault/xml_markdown_tests/markdown_files/multi-level-headers.md b/test/testVault/xml_markdown_tests/markdown_files/multi-level-headers.md
new file mode 100644
index 0000000..199ee06
--- /dev/null
+++ b/test/testVault/xml_markdown_tests/markdown_files/multi-level-headers.md
@@ -0,0 +1,21 @@
+Root level text.
+
+# Header 1
+
+Level 1 text.
+
+## Header 2
+
+Level 2 text.
+
+### Header 3
+
+Level 3 text.
+
+###### Header 6
+
+Level 6 text.
+
+# Section 2
+
+Another section.
diff --git a/test/testVault/xml_markdown_tests/markdown_files/no-section.md b/test/testVault/xml_markdown_tests/markdown_files/no-section.md
new file mode 100644
index 0000000..14efe95
--- /dev/null
+++ b/test/testVault/xml_markdown_tests/markdown_files/no-section.md
@@ -0,0 +1 @@
+This file has no sections.
diff --git a/test/testVault/xml_markdown_tests/markdown_files/single-level-headers.md b/test/testVault/xml_markdown_tests/markdown_files/single-level-headers.md
new file mode 100644
index 0000000..51708c5
--- /dev/null
+++ b/test/testVault/xml_markdown_tests/markdown_files/single-level-headers.md
@@ -0,0 +1,13 @@
+root section
+
+# header 1
+
+section 1
+
+# header 2
+
+section 2
+
+# header 3
+
+section 3
diff --git a/test/test_Script.py b/test/test_Script.py
index 1cda461..97285fe 100644
--- a/test/test_Script.py
+++ b/test/test_Script.py
@@ -1,12 +1,10 @@
-from src.Extractor import Extractor
-from src.MarkdownFile import MarkdownFile
+from src.MarkdownFile import xmlToMarkdownText
from src.Parser import Parser
-from src.YamlParser import YamlParser
-import os
+from test.xml_helpers import loadXMLSingleLevelHeaders, loadXMLMultiLevelHeaders, loadXMLFalseHeaders, loadXMLEmptyFile, loadXMLNoSection, treesEqual
def testMarkdownsRetrieval():
parser = Parser('./test/testVault')
- assert len(parser.mdFiles) == 6
+ assert len(parser.mdFiles) == 11
def testMarkdownTags():
def nbFilesWithTag(parser, tag):
@@ -22,3 +20,69 @@ def testSubfilesForFile():
file = set([file for file in parser.mdFiles if file.fileName == 'file1.md'])
subFiles = parser.findSubFilesForFiles(file)
assert len(subFiles) == 3
+
+def testXMLBuilderSingleLevelHeaders():
+    """toXML() on single-level-headers.md must match the stored expected tree."""
+    mdFile = [f for f in Parser('./test/testVault').mdFiles if f.fileName == 'single-level-headers.md'].pop()
+    assert treesEqual(mdFile.toXML(), loadXMLSingleLevelHeaders())
+
+def testXMLBuilderMultiLevelHeaders():
+    """toXML() on multi-level-headers.md must match the stored expected tree."""
+    mdFile = [f for f in Parser('./test/testVault').mdFiles if f.fileName == 'multi-level-headers.md'].pop()
+    assert treesEqual(mdFile.toXML(), loadXMLMultiLevelHeaders())
+
+def testXMLBuilderFalseHeaders():
+    """toXML() on false-headers.md must match the stored expected tree."""
+    mdFile = [f for f in Parser('./test/testVault').mdFiles if f.fileName == 'false-headers.md'].pop()
+    assert treesEqual(mdFile.toXML(), loadXMLFalseHeaders())
+
+def testXMLBuilderEmptyFile():
+    """toXML() on empty-file.md must match the stored expected tree."""
+    mdFile = [f for f in Parser('./test/testVault').mdFiles if f.fileName == 'empty-file.md'].pop()
+    assert treesEqual(mdFile.toXML(), loadXMLEmptyFile())
+
+def testXMLBuilderNoSectionFile():
+    """toXML() on no-section.md must match the stored expected tree."""
+    mdFile = [f for f in Parser('./test/testVault').mdFiles if f.fileName == 'no-section.md'].pop()
+    assert treesEqual(mdFile.toXML(), loadXMLNoSection())
+
+def testXMLWriterSingleLevelHeaders():
+    """xmlToMarkdownText() on the expected tree must reproduce single-level-headers.md."""
+    parser = Parser('./test/testVault')
+    mdFile = [f for f in parser.mdFiles if f.fileName == 'single-level-headers.md'].pop()
+    with open(mdFile.path, 'r', encoding='utf-8') as source:
+        text = source.read()
+    assert xmlToMarkdownText(loadXMLSingleLevelHeaders()) == text
+
+def testXMLWriterMultiLevelHeaders():
+    """xmlToMarkdownText() on the expected tree must reproduce multi-level-headers.md."""
+    parser = Parser('./test/testVault')
+    mdFile = [f for f in parser.mdFiles if f.fileName == 'multi-level-headers.md'].pop()
+    with open(mdFile.path, 'r', encoding='utf-8') as source:
+        text = source.read()
+    assert xmlToMarkdownText(loadXMLMultiLevelHeaders()) == text
+
+def testXMLWriterFalseHeaders():
+    """xmlToMarkdownText() on the expected tree must reproduce false-headers.md."""
+    parser = Parser('./test/testVault')
+    mdFile = [f for f in parser.mdFiles if f.fileName == 'false-headers.md'].pop()
+    with open(mdFile.path, 'r', encoding='utf-8') as source:
+        text = source.read()
+    assert xmlToMarkdownText(loadXMLFalseHeaders()) == text
+
+def testXMLWriterEmptyFile():
+    """xmlToMarkdownText() on the expected tree must reproduce empty-file.md."""
+    parser = Parser('./test/testVault')
+    mdFile = [f for f in parser.mdFiles if f.fileName == 'empty-file.md'].pop()
+    with open(mdFile.path, 'r', encoding='utf-8') as source:
+        text = source.read()
+    assert xmlToMarkdownText(loadXMLEmptyFile()) == text
+
+def testXMLWriterNoSectionFile():
+    """xmlToMarkdownText() on the expected tree must reproduce no-section.md."""
+    parser = Parser('./test/testVault')
+    mdFile = [f for f in parser.mdFiles if f.fileName == 'no-section.md'].pop()
+    with open(mdFile.path, 'r', encoding='utf-8') as source:
+        text = source.read()
+    assert xmlToMarkdownText(loadXMLNoSection()) == text
+
diff --git a/test/xml_helpers.py b/test/xml_helpers.py
new file mode 100644
index 0000000..30c4d14
--- /dev/null
+++ b/test/xml_helpers.py
@@ -0,0 +1,20 @@
+import xml.etree.ElementTree as ET
+from xml.etree.ElementTree import Element, ElementTree
+
+def treesEqual(t1: ElementTree, t2: ElementTree) -> bool:  # True when both roots share one canonical (C14N) form
+    return len({ET.canonicalize(ET.tostring(tree.getroot())) for tree in (t1, t2)}) == 1
+
+def loadXMLSingleLevelHeaders():  # parsed tree stored in expected_outputs/single-level-headers.xml
+    return ElementTree(file='./test/testVault/xml_markdown_tests/expected_outputs/single-level-headers.xml')
+
+def loadXMLMultiLevelHeaders():  # parsed tree stored in expected_outputs/multi-level-headers.xml
+    return ElementTree(file='./test/testVault/xml_markdown_tests/expected_outputs/multi-level-headers.xml')
+
+def loadXMLFalseHeaders():  # parsed tree stored in expected_outputs/false-headers.xml
+    return ElementTree(file='./test/testVault/xml_markdown_tests/expected_outputs/false-headers.xml')
+
+def loadXMLEmptyFile():  # parsed tree stored in expected_outputs/empty-file.xml
+    return ElementTree(file='./test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml')
+
+def loadXMLNoSection():  # parsed tree stored in expected_outputs/no-section.xml
+    return ElementTree(file='./test/testVault/xml_markdown_tests/expected_outputs/no-section.xml')