diff --git a/config.yaml b/config.yaml index aa03275..9fc0b35 100644 --- a/config.yaml +++ b/config.yaml @@ -5,7 +5,7 @@ wp_exports: wordpress-xml build_dir: build # Output format: primary choices are html or markdown. -target_format: markdown +target_format: html # The date format of the wikipedia export file. # I'm not sure if this ever differs depending on wordpress localization. diff --git a/exitwp.py b/exitwp.py index e7851af..a9c95a4 100755 --- a/exitwp.py +++ b/exitwp.py @@ -12,6 +12,7 @@ from urlparse import urlparse, urljoin from urllib import urlretrieve from html2text import html2text_file +from linebreaks_wp import linebreaks_wp ''' exitwp - Wordpress xml exports to Jekykll blog format conversion @@ -51,7 +52,7 @@ def html2fmt(html, target_format): # html = html.replace('
', '', ']]>
')
if target_format == 'html':
- return html
+ return linebreaks_wp(html);
else:
return html2text_file(html, None)
@@ -183,7 +184,8 @@ def get_item_uid(item, date_prefix=False, namespace=''):
dt = datetime.strptime(item['date'], date_fmt)
uid.append(dt.strftime('%Y-%m-%d'))
uid.append('-')
- s_title = item['slug']
+ #s_title = item['slug']
+ s_title = item['wp_id']
if s_title is None or s_title == '':
s_title = item['title']
if s_title is None or s_title == '':
diff --git a/linebreaks_wp.py b/linebreaks_wp.py
new file mode 100644
index 0000000..84a8f4b
--- /dev/null
+++ b/linebreaks_wp.py
@@ -0,0 +1,75 @@
+import re
+from django import template
+from django.utils.functional import allow_lazy
+from django.template.defaultfilters import stringfilter
+from django.utils.safestring import mark_safe, SafeData
+from django.utils.encoding import force_unicode
+from django.utils.html import escape
+from django.utils.text import normalize_newlines
+register = template.Library()
+
+def linebreaks_wp(pee, autoescape=False):
+ """Straight up port of http://codex.wordpress.org/Function_Reference/wpautop"""
+ if (pee.strip() == ""):
+ return ""
+ pee = normalize_newlines(pee)
+ pee = pee + "\n"
+ pee = re.sub(r'
\s*
', "\n\n", pee)
+ allblocks = r'(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|input|p|h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)'
+ pee = re.sub(r'(<' + allblocks + '[^>]*>)', lambda m: "\n"+m.group(1) if m.group(1) else "\n", pee)
+ pee = re.sub(r'(' + allblocks + '>)', lambda m: m.group(1)+"\n\n" if m.group(1) else "\n\n", pee)
+ #pee = pee.replace("\r\n", "\n")
+ #pee = pee.replace("\r", "\n") #these taken care of by normalize_newlines
+ if (pee.find(""
+ else:
+ text = m.group(0)
+ text = text.replace('', "\n") + text = text.replace('
', '') + + return text + pee = re.sub('(?is)(]*>)(.*?)', clean_pre, pee) + pee = re.sub( r"\n$", '', pee) + return pee +linebreaks_wp = allow_lazy(linebreaks_wp, unicode) + +@register.filter("linebreaks_wp") +@stringfilter +def linebreaks_wp_filter(value, autoescape=None): + """Straight up port of http://codex.wordpress.org/Function_Reference/wpautop""" + autoescape = autoescape and not isinstance(value, SafeData) + return mark_safe(linebreaks_wp(value, autoescape)) +linebreaks_wp_filter.is_safe = True +linebreaks_wp_filter.needs_autoescape = True +linebreaks_wp = stringfilter(linebreaks_wp)