Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions src/MarkdownFile.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import re
from src.YamlParser import YamlParser, YAML_METHOD
from xml.etree.ElementTree import Element, ElementTree, SubElement

class MarkdownFile:
def __init__(self, fileName, filePath):
self._regexFindLinks = r'(?<=\[\[).*?(?=(?:\]\]|#|\|))' # Thanks to https://github.com/archelpeg
self._regexFindHeaders = r'^\s*(#+)(\s+.*)'
self.fileName = fileName
self.path = filePath
self.tags = self._findTags()
Expand Down Expand Up @@ -77,3 +79,47 @@ def _findLinksInCurrentFile(self) -> set:
def _openFile(self):
"""Open markdown file"""
return open(self.path, "r", encoding="utf-8")

def toXML(self):
    """Convert the markdown file into an XML tree that mirrors its headers.

    Every line matching ``self._regexFindHeaders`` starts a new ``section``
    element carrying two attributes: ``level`` (the number of ``#``
    characters, as a string) and ``title`` (the stripped header text).
    A section's ``text`` is the raw markdown from its header line up to,
    but not including, the next header line of any level. Sections are
    nested under the closest preceding section with a strictly smaller
    level; the text before the first header is stored on the ``root``
    element, whose ``level`` is ``'0'``.

    Returns:
        ElementTree: tree whose root is the ``root`` element described above.
    """
    # The context manager closes the handle even if reading raises.
    with self._openFile() as file:
        lines = file.readlines()

    # Collect (line index, header depth, title) for every header line.
    header_pattern = re.compile(self._regexFindHeaders)
    headers = []
    for index, line in enumerate(lines):
        match = header_pattern.match(line)
        if match is not None:
            headers.append((index, len(match.group(1)), match.group(2).strip()))

    root = Element('root', {'level': '0'})
    # Everything before the first header (or the whole file) belongs to root.
    first_header_line = headers[0][0] if headers else len(lines)
    root.text = "".join(lines[:first_header_line])

    # Each section ends where the next header starts; the last one runs to EOF.
    section_ends = [start for start, _, _ in headers[1:]] + [len(lines)]

    # Stack of currently open ancestors as (level, node). The root sits at
    # level 0 and header levels are at least 1, so it is never popped.
    open_nodes = [(0, root)]
    for (section_start, level, title), section_end in zip(headers, section_ends):
        # Close every open section that is not a proper ancestor of this one.
        while open_nodes[-1][0] >= level:
            open_nodes.pop()
        parent = open_nodes[-1][1]
        section_node = SubElement(parent, 'section', {'level': str(level), 'title': title})
        section_node.text = "".join(lines[section_start:section_end])
        open_nodes.append((level, section_node))

    return ElementTree(root)

def xmlToMarkdownText(tree: ElementTree):
    """Rebuild the markdown text stored in an XML section tree.

    The ``text`` of every element, including the root, is concatenated in
    document order. The root is included so that markdown appearing before
    the first header is part of the result. Elements without text (for
    example a self-closing ``<root level="0" />``) contribute an empty
    string instead of raising.

    Args:
        tree: tree whose elements carry chunks of markdown in ``text``.

    Returns:
        str: the concatenated markdown text.
    """
    # iter() yields the root first and then its descendants in document
    # order, which is the order the chunks appear in the markdown file.
    return ''.join(node.text or '' for node in tree.iter())
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<root level="0" />
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<root level="0">Root level text.

<section level="1" title="A good header"># A good header

This header has good text.

#bad header

asdf # another bad header

<section level="2" title="This is okay"> ## This is okay

asdfasdf
</section></section></root>
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<root level="0">Root level text.

<section level="1" title="Header 1"># Header 1

Level 1 text.

<section level="2" title="Header 2">## Header 2

Level 2 text.

<section level="3" title="Header 3">### Header 3

Level 3 text.

<section level="6" title="Header 6">###### Header 6

Level 6 text.

</section></section></section></section><section level="1" title="Section 2"># Section 2

Another section.
</section></root>
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<root level="0">This file has no sections.
</root>
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<root level="0">root section

<section level="1" title="header 1"># header 1

section 1

</section><section level="1" title="header 2"># header 2

section 2

</section><section level="1" title="header 3"># header 3

section 3
</section></root>
Empty file.
13 changes: 13 additions & 0 deletions test/testVault/xml_markdown_tests/markdown_files/false-headers.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Root level text.

# A good header

This header has good text.

#bad header

asdf # another bad header

## This is okay

asdfasdf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
Root level text.

# Header 1

Level 1 text.

## Header 2

Level 2 text.

### Header 3

Level 3 text.

###### Header 6

Level 6 text.

# Section 2

Another section.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This file has no sections.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
root section

# header 1

section 1

# header 2

section 2

# header 3

section 3
74 changes: 69 additions & 5 deletions test/test_Script.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
from src.Extractor import Extractor
from src.MarkdownFile import MarkdownFile
from src.MarkdownFile import xmlToMarkdownText
from src.Parser import Parser
from src.YamlParser import YamlParser
import os
from test.xml_helpers import loadXMLSingleLevelHeaders, loadXMLMultiLevelHeaders, loadXMLFalseHeaders, loadXMLEmptyFile, loadXMLNoSection, treesEqual

def testMarkdownsRetrieval():
    """The parser must discover every markdown file in the test vault."""
    parser = Parser('./test/testVault')
    # Eleven files: the six original ones plus the five XML/markdown fixtures.
    assert len(parser.mdFiles) == 11

def testMarkdownTags():
def nbFilesWithTag(parser, tag):
Expand All @@ -22,3 +20,69 @@ def testSubfilesForFile():
file = set([file for file in parser.mdFiles if file.fileName == 'file1.md'])
subFiles = parser.findSubFilesForFiles(file)
assert len(subFiles) == 3

def testXMLBuilderSingleLevelHeaders():
    """toXML() on single-level-headers.md must equal the stored expected tree."""
    vault = Parser('./test/testVault')
    candidates = [md for md in vault.mdFiles if md.fileName == 'single-level-headers.md']
    markdown = candidates.pop()
    actual = markdown.toXML()
    expected = loadXMLSingleLevelHeaders()
    assert treesEqual(actual, expected)

def testXMLBuilderMultiLevelHeaders():
    """toXML() on multi-level-headers.md must equal the stored expected tree."""
    vault = Parser('./test/testVault')
    candidates = [md for md in vault.mdFiles if md.fileName == 'multi-level-headers.md']
    markdown = candidates.pop()
    actual = markdown.toXML()
    expected = loadXMLMultiLevelHeaders()
    assert treesEqual(actual, expected)

def testXMLBuilderFalseHeaders():
    """toXML() on false-headers.md must equal the stored expected tree."""
    vault = Parser('./test/testVault')
    candidates = [md for md in vault.mdFiles if md.fileName == 'false-headers.md']
    markdown = candidates.pop()
    actual = markdown.toXML()
    expected = loadXMLFalseHeaders()
    assert treesEqual(actual, expected)

def testXMLBuilderEmptyFile():
    """toXML() on empty-file.md must equal the stored expected tree."""
    vault = Parser('./test/testVault')
    candidates = [md for md in vault.mdFiles if md.fileName == 'empty-file.md']
    markdown = candidates.pop()
    actual = markdown.toXML()
    expected = loadXMLEmptyFile()
    assert treesEqual(actual, expected)

def testXMLBuilderNoSectionFile():
    """toXML() on no-section.md must equal the stored expected tree."""
    vault = Parser('./test/testVault')
    candidates = [md for md in vault.mdFiles if md.fileName == 'no-section.md']
    markdown = candidates.pop()
    actual = markdown.toXML()
    expected = loadXMLNoSection()
    assert treesEqual(actual, expected)

def testXMLWriterSingleLevelHeaders():
    """The expected XML for single-level-headers.md must convert back to the file's text."""
    parser = Parser('./test/testVault')
    file = [file for file in parser.mdFiles if file.fileName == 'single-level-headers.md'].pop()
    # Read with an explicit encoding so the result does not depend on the locale.
    with open(file.path, 'r', encoding='utf-8') as mdFile:
        text = mdFile.read()

    # Without `assert` the comparison is evaluated and discarded, so the
    # test could never fail.
    assert xmlToMarkdownText(loadXMLSingleLevelHeaders()) == text

def testXMLWriterMultiLevelHeaders():
    """The expected XML for multi-level-headers.md must convert back to the file's text."""
    parser = Parser('./test/testVault')
    file = [file for file in parser.mdFiles if file.fileName == 'multi-level-headers.md'].pop()
    # Read with an explicit encoding so the result does not depend on the locale.
    with open(file.path, 'r', encoding='utf-8') as mdFile:
        text = mdFile.read()

    # Without `assert` the comparison is evaluated and discarded, so the
    # test could never fail.
    assert xmlToMarkdownText(loadXMLMultiLevelHeaders()) == text

def testXMLWriterFalseHeaders():
    """The expected XML for false-headers.md must convert back to the file's text."""
    parser = Parser('./test/testVault')
    file = [file for file in parser.mdFiles if file.fileName == 'false-headers.md'].pop()
    # Read with an explicit encoding so the result does not depend on the locale.
    with open(file.path, 'r', encoding='utf-8') as mdFile:
        text = mdFile.read()

    # Without `assert` the comparison is evaluated and discarded, so the
    # test could never fail.
    assert xmlToMarkdownText(loadXMLFalseHeaders()) == text

def testXMLWriterEmptyFile():
    """The expected XML for empty-file.md must convert back to the file's text."""
    parser = Parser('./test/testVault')
    file = [file for file in parser.mdFiles if file.fileName == 'empty-file.md'].pop()
    # Read with an explicit encoding so the result does not depend on the locale.
    with open(file.path, 'r', encoding='utf-8') as mdFile:
        text = mdFile.read()

    # Without `assert` the comparison is evaluated and discarded, so the
    # test could never fail.
    assert xmlToMarkdownText(loadXMLEmptyFile()) == text

def testXMLWriterNoSectionFile():
    """The expected XML for no-section.md must convert back to the file's text."""
    parser = Parser('./test/testVault')
    file = [file for file in parser.mdFiles if file.fileName == 'no-section.md'].pop()
    # Read with an explicit encoding so the result does not depend on the locale.
    with open(file.path, 'r', encoding='utf-8') as mdFile:
        text = mdFile.read()

    # Without `assert` the comparison is evaluated and discarded, so the
    # test could never fail.
    assert xmlToMarkdownText(loadXMLNoSection()) == text

20 changes: 20 additions & 0 deletions test/xml_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, ElementTree

def treesEqual(t1: ElementTree, t2: ElementTree) -> bool:
    """Return True when both trees serialize to the same canonical XML.

    Each tree's root is serialized with ``ET.tostring`` and normalized with
    ``ET.canonicalize`` before the two results are compared.
    """
    canonical_forms = [
        ET.canonicalize(ET.tostring(tree.getroot()))
        for tree in (t1, t2)
    ]
    left, right = canonical_forms
    return left == right

def loadXMLSingleLevelHeaders():
    """Parse and return the expected XML tree for single-level-headers.md."""
    xml_path = './test/testVault/xml_markdown_tests/expected_outputs/single-level-headers.xml'
    return ET.parse(xml_path)

def loadXMLMultiLevelHeaders():
    """Parse and return the expected XML tree for multi-level-headers.md."""
    xml_path = './test/testVault/xml_markdown_tests/expected_outputs/multi-level-headers.xml'
    return ET.parse(xml_path)

def loadXMLFalseHeaders():
    """Parse and return the expected XML tree for false-headers.md."""
    xml_path = './test/testVault/xml_markdown_tests/expected_outputs/false-headers.xml'
    return ET.parse(xml_path)

def loadXMLEmptyFile():
    """Parse and return the expected XML tree for empty-file.md."""
    xml_path = './test/testVault/xml_markdown_tests/expected_outputs/empty-file.xml'
    return ET.parse(xml_path)

def loadXMLNoSection():
    """Parse and return the expected XML tree for no-section.md."""
    xml_path = './test/testVault/xml_markdown_tests/expected_outputs/no-section.xml'
    return ET.parse(xml_path)