From d6109e684252475050db7c2c24f68aade317211f Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Tue, 12 Feb 2019 16:08:43 +0700 Subject: [PATCH 1/8] Refactor: remove unnecessary comments. --- exitwp.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/exitwp.py b/exitwp.py index 0a59010..57b51a0 100755 --- a/exitwp.py +++ b/exitwp.py @@ -146,7 +146,6 @@ def gi(q, unicode_wrap=True, empty=False): img_srcs.append(img['src']) except: print 'could not parse html: ' + body - # print img_srcs excerpt = gi('excerpt:encoded', empty=True) @@ -270,8 +269,6 @@ def get_attachment_path(src, dir, dir_prefix='images'): if (not os.path.exists(target_dir)): os.makedirs(target_dir) - # if src not in attachments[dir]: - # print target_name return target_file for i in data['items']: From 0869cde4322e661a76bafd6f2a3871755f0a1204 Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Tue, 12 Feb 2019 16:26:05 +0700 Subject: [PATCH 2/8] Refactor: don't parse html without download_images option. When download_images option is disabled there's no other need to parse html, so I'm suggesting to optimise and only parse html when it's actually needed. 
--- exitwp.py | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/exitwp.py b/exitwp.py index 57b51a0..7de6945 100755 --- a/exitwp.py +++ b/exitwp.py @@ -137,16 +137,6 @@ def gi(q, unicode_wrap=True, empty=False): # body = body.replace(key, body_replace[key]) body = re.sub(key, body_replace[key], body) - img_srcs = [] - if body is not None: - try: - soup = BeautifulSoup(body) - img_tags = soup.find_all('img') - for img in img_tags: - img_srcs.append(img['src']) - except: - print 'could not parse html: ' + body - excerpt = gi('excerpt:encoded', empty=True) export_item = { @@ -162,8 +152,7 @@ def gi(q, unicode_wrap=True, empty=False): 'comments': gi('wp:comment_status') == u'open', 'taxanomies': export_taxanomies, 'body': body, - 'excerpt': excerpt, - 'img_srcs': img_srcs + 'excerpt': excerpt } export_items.append(export_item) @@ -325,15 +314,20 @@ def get_attachment_path(src, dir, dir_prefix='images'): else: print 'Unknown item type :: ' + i['type'] - if download_images: - for img in i['img_srcs']: - try: - urlretrieve(urljoin(data['header']['link'], - img.encode('utf-8')), - get_attachment_path(img, i['uid'])) - except: - print '\n unable to download ' + urljoin( - data['header']['link'], img.encode('utf-8')) + if download_images and i['body'] is not None: + try: + soup = BeautifulSoup(i['body']) + img_tags = soup.find_all('img') + for img in img_tags: + try: + urlretrieve(urljoin(data['header']['link'], + img['src'].encode('utf-8')), + get_attachment_path(img['src'], i['uid'])) + except: + print '\n unable to download ' + urljoin( + data['header']['link'], img['src'].encode('utf-8')) + except: + print 'could not parse html: ' + i['body'] if out is not None: def toyaml(data): From a8fb6ef4e958f06ab922b1434c098cd01313f8f0 Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Tue, 12 Feb 2019 17:12:22 +0700 Subject: [PATCH 3/8] Refactor: remove unused variables. 
--- exitwp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/exitwp.py b/exitwp.py index 7de6945..5e12537 100755 --- a/exitwp.py +++ b/exitwp.py @@ -116,7 +116,6 @@ def parse_items(): def gi(q, unicode_wrap=True, empty=False): namespace = '' - tag = '' if q.find(':') > 0: namespace, tag = q.split(':', 1) else: @@ -190,7 +189,6 @@ def open_file(file): return f def get_item_uid(item, date_prefix=False, namespace=''): - result = None if namespace not in item_uids: item_uids[namespace] = {} From a25b3c0b66552d57c6c6ed199df58008b63360a1 Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Tue, 12 Feb 2019 17:35:55 +0700 Subject: [PATCH 4/8] Refactor: remove redundant parentheses. --- exitwp.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/exitwp.py b/exitwp.py index 5e12537..482b5d0 100755 --- a/exitwp.py +++ b/exitwp.py @@ -69,9 +69,6 @@ def _start_ns(self, prefix, ns): def html2fmt(html, target_format): - # html = html.replace("\n\n", '

') - # html = html.replace('
', '
', ']]>
') if target_format == 'html': return html else: @@ -222,7 +219,7 @@ def get_item_path(item, dir=''): filename_parts = [full_dir, '/'] filename_parts.append(item['uid']) if item['type'] == 'page': - if (not os.path.exists(''.join(filename_parts))): + if not os.path.exists(''.join(filename_parts)): os.makedirs(''.join(filename_parts)) filename_parts.append('/index') filename_parts.append('.') @@ -253,7 +250,7 @@ def get_attachment_path(src, dir, dir_prefix='images'): target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + dir) target_file = os.path.normpath(target_dir + '/' + filename) - if (not os.path.exists(target_dir)): + if not os.path.exists(target_dir): os.makedirs(target_dir) return target_file @@ -262,11 +259,11 @@ def get_attachment_path(src, dir, dir_prefix='images'): skip_item = False for field, value in item_field_filter.iteritems(): - if(i[field] == value): + if i[field] == value: skip_item = True break - if(skip_item): + if skip_item: continue sys.stdout.write('.') From a11765edf346b8121290e094bee238720df1be7d Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Tue, 12 Feb 2019 20:02:49 +0700 Subject: [PATCH 5/8] Refactor: do not shadow outer scope name. 
--- exitwp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/exitwp.py b/exitwp.py index 482b5d0..3bf1ed6 100755 --- a/exitwp.py +++ b/exitwp.py @@ -226,11 +226,11 @@ def get_item_path(item, dir=''): filename_parts.append(target_format) return ''.join(filename_parts) - def get_attachment_path(src, dir, dir_prefix='images'): + def get_attachment_path(src, uid, dir_prefix='images'): try: - files = attachments[dir] + files = attachments[uid] except KeyError: - attachments[dir] = files = {} + attachments[uid] = files = {} try: filename = files[src] @@ -247,7 +247,7 @@ def get_attachment_path(src, dir, dir_prefix='images'): file_infix = file_infix + 1 files[src] = filename = maybe_filename - target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + dir) + target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + uid) target_file = os.path.normpath(target_dir + '/' + filename) if not os.path.exists(target_dir): From 91391cb43c599686e3b543e51a74b6295a4adce1 Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Wed, 13 Feb 2019 14:06:09 +0700 Subject: [PATCH 6/8] Skip remaining logic for ignored post types. --- exitwp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exitwp.py b/exitwp.py index 3bf1ed6..7370733 100755 --- a/exitwp.py +++ b/exitwp.py @@ -305,7 +305,7 @@ def get_attachment_path(src, uid, dir_prefix='images'): out = open_file(fn) yaml_header['layout'] = 'page' elif i['type'] in item_type_filter: - pass + continue else: print 'Unknown item type :: ' + i['type'] From 1ffc7dcb740ac011250b151aba94a74eb132e795 Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Wed, 13 Feb 2019 15:13:27 +0700 Subject: [PATCH 7/8] Refactor: do target_dir related logic only once. Instead of doing it repeatedly for every loop iteration. 
--- exitwp.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/exitwp.py b/exitwp.py index 7370733..a4751bb 100755 --- a/exitwp.py +++ b/exitwp.py @@ -226,7 +226,7 @@ def get_item_path(item, dir=''): filename_parts.append(target_format) return ''.join(filename_parts) - def get_attachment_path(src, uid, dir_prefix='images'): + def get_attachment_path(src, uid, target_dir): try: files = attachments[uid] except KeyError: @@ -247,12 +247,8 @@ def get_attachment_path(src, uid, dir_prefix='images'): file_infix = file_infix + 1 files[src] = filename = maybe_filename - target_dir = os.path.normpath(blog_dir + '/' + dir_prefix + '/' + uid) target_file = os.path.normpath(target_dir + '/' + filename) - if not os.path.exists(target_dir): - os.makedirs(target_dir) - return target_file for i in data['items']: @@ -313,11 +309,19 @@ def get_attachment_path(src, uid, dir_prefix='images'): try: soup = BeautifulSoup(i['body']) img_tags = soup.find_all('img') + + image_dir = os.path.join('images', i['uid']) + target_dir = os.path.normpath(os.path.join(blog_dir, image_dir)) + + if img_tags and not os.path.exists(target_dir): + os.makedirs(target_dir) + for img in img_tags: try: urlretrieve(urljoin(data['header']['link'], img['src'].encode('utf-8')), - get_attachment_path(img['src'], i['uid'])) + get_attachment_path(img['src'], i['uid'], + target_dir)) except: print '\n unable to download ' + urljoin( data['header']['link'], img['src'].encode('utf-8')) From 642bfa83c852296077331bd6f7b2281d9c1511da Mon Sep 17 00:00:00 2001 From: Jakub Kukul Date: Wed, 13 Feb 2019 18:07:25 +0700 Subject: [PATCH 8/8] Rewrite of image/attachment links if they are downloaded. 
--- README.rst | 1 - exitwp.py | 22 ++++++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index cdaeb1a..12a1306 100644 --- a/README.rst +++ b/README.rst @@ -66,7 +66,6 @@ Some things like custom handling of non standard post types is not fully configu Known issues ============ * Target file names are some times less than optimal. - * Rewriting of image/attachment links if they are downloaded would be a good feature * There will probably be issues when migrating non utf-8 encoded wordpress dump files (if they exist). Other Tools diff --git a/exitwp.py b/exitwp.py index a4751bb..020f633 100755 --- a/exitwp.py +++ b/exitwp.py @@ -226,7 +226,7 @@ def get_item_path(item, dir=''): filename_parts.append(target_format) return ''.join(filename_parts) - def get_attachment_path(src, uid, target_dir): + def get_attachment_file_name(src, uid): try: files = attachments[uid] except KeyError: @@ -247,9 +247,7 @@ def get_attachment_path(src, uid, target_dir): file_infix = file_infix + 1 files[src] = filename = maybe_filename - target_file = os.path.normpath(target_dir + '/' + filename) - - return target_file + return filename for i in data['items']: skip_item = False @@ -318,13 +316,25 @@ def get_attachment_path(src, uid, target_dir): for img in img_tags: try: + + attachment_file_name = \ + get_attachment_file_name(img['src'], i['uid']) + attachment_file_path = os.path.join(target_dir, attachment_file_name) + attachment_url = "/" + os.path.join(image_dir, attachment_file_name) + urlretrieve(urljoin(data['header']['link'], img['src'].encode('utf-8')), - get_attachment_path(img['src'], i['uid'], - target_dir)) + attachment_file_path) + + # Substitute image link with a path of a downloaded image + img['src'] = attachment_url + except: print '\n unable to download ' + urljoin( data['header']['link'], img['src'].encode('utf-8')) + + if img_tags: + i['body'] = soup.prettify() except: print 'could not parse html: ' + i['body']