Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions extraction/techniques.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class HeadTags(Technique):
def extract(self, html):
"Extract data from meta, link and title tags within the head tag."
extracted = {}
- soup = BeautifulSoup(html)
+ soup = BeautifulSoup(html, "lxml")
# extract data from title tag
title_tag = soup.find('title')
if title_tag:
Expand Down Expand Up @@ -168,7 +168,7 @@ def extract(self, html):
titles = []
descriptions = []
videos = []
- soup = BeautifulSoup(html)
+ soup = BeautifulSoup(html, "lxml")
for article in soup.find_all('article') or []:
title = article.find('h1')
if title:
Expand Down Expand Up @@ -207,7 +207,7 @@ class SemanticTags(Technique):
def extract(self, html):
"Extract data from usual semantic tags."
extracted = {}
- soup = BeautifulSoup(html)
+ soup = BeautifulSoup(html, "lxml")

for tag, dest, max_to_store in self.extract_string:
for found in soup.find_all(tag)[:max_to_store] or []:
Expand Down