From d6109e684252475050db7c2c24f68aade317211f Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Tue, 12 Feb 2019 16:08:43 +0700 Subject: [PATCH 1/8] Refactor: remove unnecessary comments. --- exitwp.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/exitwp.py b/exitwp.py index 0a59010..57b51a0 100755 --- a/exitwp.py +++ b/exitwp.py @@ -146,7 +146,6 @@ def gi(q, unicode_wrap=True, empty=False): img_srcs.append(img['src']) except: print 'could not parse html: ' + body - # print img_srcs excerpt = gi('excerpt:encoded', empty=True) @@ -270,8 +269,6 @@ def get_attachment_path(src, dir, dir_prefix='images'): if (not os.path.exists(target_dir)): os.makedirs(target_dir) - # if src not in attachments[dir]: - # print target_name return target_file for i in data['items']: From 0869cde4322e661a76bafd6f2a3871755f0a1204 Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Tue, 12 Feb 2019 16:26:05 +0700 Subject: [PATCH 2/8] Refactor: don't parse html without download_images option. When download_images option is disabled there's no other need to parse html, so I'm suggesting to optimise and only parse html when it's actually needed. 
--- exitwp.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/exitwp.py b/exitwp.py index 57b51a0..7de6945 100755 --- a/exitwp.py +++ b/exitwp.py @@ -137,16 +137,6 @@ def gi(q, unicode_wrap=True, empty=False): # body = body.replace(key, body_replace[key]) body = re.sub(key, body_replace[key], body) - img_srcs = [] - if body is not None: - try: - soup = BeautifulSoup(body) - img_tags = soup.find_all('img') - for img in img_tags: - img_srcs.append(img['src']) - except: - print 'could not parse html: ' + body - excerpt = gi('excerpt:encoded', empty=True) export_item = { @@ -162,8 +152,7 @@ def gi(q, unicode_wrap=True, empty=False): 'comments': gi('wp:comment_status') == u'open', 'taxanomies': export_taxanomies, 'body': body, - 'excerpt': excerpt, - 'img_srcs': img_srcs + 'excerpt': excerpt } export_items.append(export_item) @@ -325,15 +314,20 @@ def get_attachment_path(src, dir, dir_prefix='images'): else: print 'Unknown item type :: ' + i['type'] - if download_images: - for img in i['img_srcs']: - try: - urlretrieve(urljoin(data['header']['link'], - img.encode('utf-8')), - get_attachment_path(img, i['uid'])) - except: - print '\n unable to download ' + urljoin( - data['header']['link'], img.encode('utf-8')) + if download_images and i['body'] is not None: + try: + soup = BeautifulSoup(i['body']) + img_tags = soup.find_all('img') + for img in img_tags: + try: + urlretrieve(urljoin(data['header']['link'], + img['src'].encode('utf-8')), + get_attachment_path(img['src'], i['uid'])) + except: + print '\n unable to download ' + urljoin( + data['header']['link'], img['src'].encode('utf-8')) + except: + print 'could not parse html: ' + i['body'] if out is not None: def toyaml(data): From a8fb6ef4e958f06ab922b1434c098cd01313f8f0 Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Tue, 12 Feb 2019 17:12:22 +0700 Subject: [PATCH 3/8] Refactor: remove unused variables. 
--- exitwp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/exitwp.py b/exitwp.py index 7de6945..5e12537 100755 --- a/exitwp.py +++ b/exitwp.py @@ -116,7 +116,6 @@ def parse_items(): def gi(q, unicode_wrap=True, empty=False): namespace = '' - tag = '' if q.find(':') > 0: namespace, tag = q.split(':', 1) else: @@ -190,7 +189,6 @@ def open_file(file): return f def get_item_uid(item, date_prefix=False, namespace=''): - result = None if namespace not in item_uids: item_uids[namespace] = {} From a25b3c0b66552d57c6c6ed199df58008b63360a1 Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Tue, 12 Feb 2019 17:35:55 +0700 Subject: [PATCH 4/8] Refactor: remove redundant parentheses. --- exitwp.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/exitwp.py b/exitwp.py index 5e12537..482b5d0 100755 --- a/exitwp.py +++ b/exitwp.py @@ -69,9 +69,6 @@ def _start_ns(self, prefix, ns): def html2fmt(html, target_format): - # html = html.replace("\n\n", '

') - # html = html.replace('
', '
', ']]>
') if target_format == 'html': return html else: @@ -222,7 +219,7 @@ def get_item_path(item, dir=''): filename_parts = [full_dir, '/'] filename_parts.append(item['uid']) if item['type'] == 'page': - if (not os.path.exists(''.join(filename_parts))): + if not os.path.exists(''.join(filename_parts)): os.makedirs(''.join(filename_parts)) filename_parts.append('/index') filename_parts.append('.') @@ -253,7 +250,7 @@ def get_attachment_path(src, dir, dir_prefix='images'): target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + dir) target_file = os.path.normpath(target_dir + '/' + filename) - if (not os.path.exists(target_dir)): + if not os.path.exists(target_dir): os.makedirs(target_dir) return target_file @@ -262,11 +259,11 @@ def get_attachment_path(src, dir, dir_prefix='images'): skip_item = False for field, value in item_field_filter.iteritems(): - if(i[field] == value): + if i[field] == value: skip_item = True break - if(skip_item): + if skip_item: continue sys.stdout.write('.') From a11765edf346b8121290e094bee238720df1be7d Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Tue, 12 Feb 2019 20:02:49 +0700 Subject: [PATCH 5/8] Refactor: do not shadow outer scope name. 
--- exitwp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/exitwp.py b/exitwp.py index 482b5d0..3bf1ed6 100755 --- a/exitwp.py +++ b/exitwp.py @@ -226,11 +226,11 @@ def get_item_path(item, dir=''): filename_parts.append(target_format) return ''.join(filename_parts) - def get_attachment_path(src, dir, dir_prefix='images'): + def get_attachment_path(src, uid, dir_prefix='images'): try: - files = attachments[dir] + files = attachments[uid] except KeyError: - attachments[dir] = files = {} + attachments[uid] = files = {} try: filename = files[src] @@ -247,7 +247,7 @@ def get_attachment_path(src, dir, dir_prefix='images'): file_infix = file_infix + 1 files[src] = filename = maybe_filename - target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + dir) + target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + uid) target_file = os.path.normpath(target_dir + '/' + filename) if not os.path.exists(target_dir): From 91391cb43c599686e3b543e51a74b6295a4adce1 Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Wed, 13 Feb 2019 14:06:09 +0700 Subject: [PATCH 6/8] Skip remaining logic for ignored post types. --- exitwp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exitwp.py b/exitwp.py index 3bf1ed6..7370733 100755 --- a/exitwp.py +++ b/exitwp.py @@ -305,7 +305,7 @@ def get_attachment_path(src, uid, dir_prefix='images'): out = open_file(fn) yaml_header['layout'] = 'page' elif i['type'] in item_type_filter: - pass + continue else: print 'Unknown item type :: ' + i['type'] From 1ffc7dcb740ac011250b151aba94a74eb132e795 Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Wed, 13 Feb 2019 15:13:27 +0700 Subject: [PATCH 7/8] Refactor: do target_dir related logic only once. Instead of doing it repeatedly for every loop iteration. 
--- exitwp.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/exitwp.py b/exitwp.py index 7370733..a4751bb 100755 --- a/exitwp.py +++ b/exitwp.py @@ -226,7 +226,7 @@ def get_item_path(item, dir=''): filename_parts.append(target_format) return ''.join(filename_parts) - def get_attachment_path(src, uid, dir_prefix='images'): + def get_attachment_path(src, uid, target_dir): try: files = attachments[uid] except KeyError: @@ -247,12 +247,8 @@ def get_attachment_path(src, uid, dir_prefix='images'): file_infix = file_infix + 1 files[src] = filename = maybe_filename - target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + uid) target_file = os.path.normpath(target_dir + '/' + filename) - if not os.path.exists(target_dir): - os.makedirs(target_dir) - return target_file for i in data['items']: @@ -313,11 +309,19 @@ def get_attachment_path(src, uid, dir_prefix='images'): try: soup = BeautifulSoup(i['body']) img_tags = soup.find_all('img') + + image_dir = os.path.join('images', i['uid']) + target_dir = os.path.normpath(os.path.join(blog_dir, image_dir)) + + if img_tags and not os.path.exists(target_dir): + os.makedirs(target_dir) + for img in img_tags: try: urlretrieve(urljoin(data['header']['link'], img['src'].encode('utf-8')), - get_attachment_path(img['src'], i['uid'])) + get_attachment_path(img['src'], i['uid'], + target_dir)) except: print '\n unable to download ' + urljoin( data['header']['link'], img['src'].encode('utf-8')) From 642bfa83c852296077331bd6f7b2281d9c1511da Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Wed, 13 Feb 2019 18:07:25 +0700 Subject: [PATCH 8/8] Rewrite of image/attachment links if they are downloaded. 
--- README.rst | 1 - exitwp.py | 22 ++++++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index cdaeb1a..12a1306 100644 --- a/README.rst +++ b/README.rst @@ -66,7 +66,6 @@ Some things like custom handling of non standard post types is not fully configu Known issues ============ * Target file names are some times less than optimal. - * Rewriting of image/attachment links if they are downloaded would be a good feature * There will probably be issues when migrating non utf-8 encoded wordpress dump files (if they exist). Other Tools diff --git a/exitwp.py b/exitwp.py index a4751bb..020f633 100755 --- a/exitwp.py +++ b/exitwp.py @@ -226,7 +226,7 @@ def get_item_path(item, dir=''): filename_parts.append(target_format) return ''.join(filename_parts) - def get_attachment_path(src, uid, target_dir): + def get_attachment_file_name(src, uid): try: files = attachments[uid] except KeyError: @@ -247,9 +247,7 @@ def get_attachment_path(src, uid, target_dir): file_infix = file_infix + 1 files[src] = filename = maybe_filename - target_file = os.path.normpath(target_dir + '/' + filename) - - return target_file + return filename for i in data['items']: skip_item = False @@ -318,13 +316,25 @@ def get_attachment_path(src, uid, target_dir): for img in img_tags: try: + + attachment_file_name = \ + get_attachment_file_name(img['src'], i['uid']) + attachment_file_path = os.path.join(target_dir, attachment_file_name) + attachment_url = "/" + os.path.join(image_dir, attachment_file_name) + urlretrieve(urljoin(data['header']['link'], img['src'].encode('utf-8')), - get_attachment_path(img['src'], i['uid'], - target_dir)) + attachment_file_path) + + # Substitute image link with a path of a downloaded image + img['src'] = attachment_url + except: print '\n unable to download ' + urljoin( data['header']['link'], img['src'].encode('utf-8')) + + if img_tags: + i['body'] = soup.prettify() except: print 'could not parse html: ' + i['body']