diff --git a/config.yaml b/config.yaml index aa03275..9fc0b35 100644 --- a/config.yaml +++ b/config.yaml @@ -5,7 +5,7 @@ wp_exports: wordpress-xml build_dir: build # Output format: primary choices are html or markdown. -target_format: markdown +target_format: html # The date format of the wikipedia export file. # I'm not sure if this ever differs depending on wordpress localization. diff --git a/exitwp.py b/exitwp.py index e7851af..a9c95a4 100755 --- a/exitwp.py +++ b/exitwp.py @@ -12,6 +12,7 @@ from urlparse import urlparse, urljoin from urllib import urlretrieve from html2text import html2text_file +from linebreaks_wp import linebreaks_wp ''' exitwp - Wordpress xml exports to Jekykll blog format conversion @@ -51,7 +52,7 @@ def html2fmt(html, target_format): # html = html.replace('
', '
', ']]>
') if target_format == 'html': - return html + return linebreaks_wp(html); else: return html2text_file(html, None) @@ -183,7 +184,8 @@ def get_item_uid(item, date_prefix=False, namespace=''): dt = datetime.strptime(item['date'], date_fmt) uid.append(dt.strftime('%Y-%m-%d')) uid.append('-') - s_title = item['slug'] + #s_title = item['slug'] + s_title = item['wp_id'] if s_title is None or s_title == '': s_title = item['title'] if s_title is None or s_title == '': diff --git a/linebreaks_wp.py b/linebreaks_wp.py new file mode 100644 index 0000000..84a8f4b --- /dev/null +++ b/linebreaks_wp.py @@ -0,0 +1,75 @@ +import re +from django import template +from django.utils.functional import allow_lazy +from django.template.defaultfilters import stringfilter +from django.utils.safestring import mark_safe, SafeData +from django.utils.encoding import force_unicode +from django.utils.html import escape +from django.utils.text import normalize_newlines +register = template.Library() + +def linebreaks_wp(pee, autoescape=False): + """Straight up port of http://codex.wordpress.org/Function_Reference/wpautop""" + if (pee.strip() == ""): + return "" + pee = normalize_newlines(pee) + pee = pee + "\n" + pee = re.sub(r'
\s*
', "\n\n", pee) + allblocks = r'(?:table|thead|tfoot|caption|col|colgroup|tbody|tr|td|th|div|dl|dd|dt|ul|ol|li|pre|select|option|form|map|area|blockquote|address|math|style|input|p|h[1-6]|hr|fieldset|legend|section|article|aside|hgroup|header|footer|nav|figure|figcaption|details|menu|summary)' + pee = re.sub(r'(<' + allblocks + '[^>]*>)', lambda m: "\n"+m.group(1) if m.group(1) else "\n", pee) + pee = re.sub(r'()', lambda m: m.group(1)+"\n\n" if m.group(1) else "\n\n", pee) + #pee = pee.replace("\r\n", "\n") + #pee = pee.replace("\r", "\n") #these taken care of by normalize_newlines + if (pee.find("]*)>\s*', lambda m: "" % (m.group(1) if m.group(1) else "", ), pee) # no pee inside object/embed + pee = re.sub(r'\s*\s*', '', pee) + pee = re.sub(r"\n\n+", "\n\n", pee) # take care of duplicates + pees = re.split(r'\n\s*\n', pee) # since PHP has a PREG_SPLIT_NO_EMPTY, may need to go through and drop any empty strings + #pees = [p for p in pees if p] + pee = "".join(["

%s

\n" % tinkle.strip('\n') for tinkle in pees]) + pee = re.sub(r'

\s*

', '', pee) #under certain strange conditions it could create a P of entirely whitespace + pee = re.sub(r'

([^<]+)', lambda m: "

%s

" % ((lambda x: x.group(1) if x.group(1) else "")(m), (lambda x: x.group(2) if x.group(2) else "")(m), ), pee) + pee = re.sub(r'

\s*(]*>)\s*

', lambda m: m.group(1) if m.group(1) else "", pee) # don't pee all over a tag + pee = re.sub(r"

(", lambda m: m.group(1) if m.group(1) else "", pee) # problem with nested lists + pee = re.sub(r'

]*)>', lambda m: "

" % (m.group(1) if m.group(1) else "",), pee, flags=re.IGNORECASE) + pee = pee.replace('

', '

') + pee = re.sub(r'

\s*(]*>)', lambda m: m.group(1) if m.group(1) else "", pee) + pee = re.sub(r'(]*>)\s*

', lambda m: m.group(1) if m.group(1) else "", pee) + + def _autop_newline_preservation_helper(matches): + return matches.group(0).replace("\n", "") + pee = re.sub(r'<(script|style).*?', _autop_newline_preservation_helper, pee, flags=re.DOTALL) + pee = re.sub(r'(?)\s*\n', "
\n", pee) # make line breaks + pee = pee.replace('', "\n") + + pee = re.sub(r'(]*>)\s*
', lambda m: m.group(1) if m.group(1) else "", pee) + pee = re.sub(r'
(\s*]*>)', lambda m: m.group(1) if m.group(1) else "", pee) + if (pee.find('', '') + text = text.replace('

', "\n") + text = text.replace('

', '') + text = m.group(1)+escape(text)+"
" + else: + text = m.group(0) + text = text.replace('
', '') + text = text.replace('

', "\n") + text = text.replace('

', '') + + return text + pee = re.sub('(?is)(]*>)(.*?)', clean_pre, pee) + pee = re.sub( r"\n

$", '

', pee) + return pee +linebreaks_wp = allow_lazy(linebreaks_wp, unicode) + +@register.filter("linebreaks_wp") +@stringfilter +def linebreaks_wp_filter(value, autoescape=None): + """Straight up port of http://codex.wordpress.org/Function_Reference/wpautop""" + autoescape = autoescape and not isinstance(value, SafeData) + return mark_safe(linebreaks_wp(value, autoescape)) +linebreaks_wp_filter.is_safe = True +linebreaks_wp_filter.needs_autoescape = True +linebreaks_wp = stringfilter(linebreaks_wp)