From 47e29133eecbbebe248348a0cf4875e009fc2aaf Mon Sep 17 00:00:00 2001
From: nibir-paul
Date: Sat, 18 Jul 2020 00:24:26 +0530
Subject: [PATCH 1/6] Added the content of get_title() method

---
 predicteasy/core/classifier/link_genuinity.py | 100 +++++++++++++++---
 1 file changed, 85 insertions(+), 15 deletions(-)

diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py
index 09e27b4..080906f 100644
--- a/predicteasy/core/classifier/link_genuinity.py
+++ b/predicteasy/core/classifier/link_genuinity.py
@@ -1,5 +1,10 @@
-import os
+# pylint: disable=R0904
+"""
+anomaly detection in given urls
+"""
+#import os
 import requests
+from bs4 import BeautifulSoup


 class LinkGenuinityClassifier:
@@ -11,7 +16,15 @@ class LinkGenuinityClassifier:
     ======

     >>> clf = LinkGenuinityClassifier(url="https://economictime.com/articles.apx")
-    >>> clf = LinkGenClassifier(url="[https://www.thehindu.com/sport/cricket/sri-lankan-cricketer-kusal-mendis-arrested-for-causing-fatal-motor-accident/article31993605.ece](https://www.thehindu.com/sport/cricket/sri-lankan-cricketer-kusal-mendis-arrested-for-causing-fatal-motor-accident/article31993605.ece)")
+    >>> clf = LinkGenuinityClassifier(url="https://www.ndtv.com/india-news/
+        up-gangster-vikas-dubey-wanted-in-killing-of-8-cops-arrested
+        -in-ujjain-madhya-pradesh-2259611")
+    >>> clf = LinkGenClassifier(url="[https://www.thehindu.com/sport/
+        cricket/sri-lankan-cricketer-kusal-mendis-arrested-for-
+        causing-fatal-motor-accident/article31993605.ece]
+        (https://www.thehindu.com/sport/cricket/sri-lankan-cricketer-
+        kusal-mendis-arrested-for-causing-fatal-motor-accident/
+        article31993605.ece)")

     >>> clf.score()

@@ -19,7 +32,7 @@ class LinkGenuinityClassifier:
     =======
     >>> {
             url: 'given_url',
-            freshness: 0.8, 
+            freshness: 0.8,
             geniuness: 0.8,
             spam_proximity: 0.4,
             is_date_relevant: True,
@@ -48,117 +61,174 @@ class LinkGenuinityClassifier:
     def __init__(self, url=""):
         self.url = url
         self.html_string = requests.get(url).text
+        self.soup = BeautifulSoup(self.html_string, 'lxml')


     def get_ip_address(self):
+        """
         pass
+        """


     def get_title(self):
-        pass
+        """
+        Fetching the title of the url given.
+
+        Usage:
+        ======
+
+        >>> clf = LinkGenuinityClassifier(url="https://www.ndtv.com/india-news/
+            up-gangster-vikas-dubey-wanted-in-killing-of-8-cops-arrested
+            -in-ujjain-madhya-pradesh-2259611")
+        >>> clf.get_title()
+
+        Output:
+        =======
+
+        >>> After 5-Day Run, UP Gangster Vikas Dubey Arrested At Madhya Pradesh Temple
+        """
+        return self.soup.title.text


     def get_content(self):
+        """
         pass
+        """


     def check_relevence(self, title, content):
+        """
         pass
+        """


     def parse_actual_date(self):
         """
         self.html_string parse and fetch the date of publications
         """
-        pass


     def count_redirected_urls(self):
+        """
         pass
+        """


     def fetch_relevent_urls(self):
+        """
         pass
+        """


     def fetch_irrelevent_urls(self):
+        """
         pass
+        """


     def compare_content(self, content1, content2):
+        """
         pass
+        """


     def count_standard_url_shortners(self):
+        """
         pass
+        """


     def count_unconventional_url_shortners(self):
+        """
         pass
+        """


-    def calculate_age_of_content(title, content):
+    def calculate_age_of_content(self, title, content):
+        """
         pass
+        """


     def calculate_promises(self):
+        """
         pass
+        """


     def calculate_misleading_score(self):
+        """
         pass
+        """


     def calculate_block_sentiment(self):
+        """
         pass
+        """


     def fetch_topic_intensity(self):
+        """
         pass
+        """


     def is_nudity(self):
+        """
         pass
+        """


     def is_violence(self):
+        """
         pass
+        """


     def is_adult_content(self):
+        """
         pass
+        """


     def score_nudity(self):
+        """
         pass
+        """


     def score_violence(self):
+        """
         pass
+        """

+
     def score_adult_content(self):
+        """
         pass
+        """


     def calculate_content_freshness(self):
+        """
         pass
+        """


-    def get_pagerank_score(url):
+    def get_pagerank_score(self, url):
+        """
         pass
+        """


     def learn(self):
+        """
         pass
+        """


     def score(self):
+        """
         pass
-
-
-
-
-
-
-
-
+        """

From 003c1a240e3be816d9f0f5353cf07e1c962b3ff6 Mon Sep 17 00:00:00 2001
From: nibir-paul
Date: Tue, 21 Jul 2020 00:21:25 +0530
Subject: [PATCH 2/6] Added: check_relevence method to the class

---
 predicteasy/core/classifier/link_genuinity.py | 31 +++++++++++++++++--
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py
index 080906f..3152404 100644
--- a/predicteasy/core/classifier/link_genuinity.py
+++ b/predicteasy/core/classifier/link_genuinity.py
@@ -5,6 +5,8 @@
 #import os
 import requests
 from bs4 import BeautifulSoup
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity


 class LinkGenuinityClassifier:
@@ -72,7 +74,7 @@ def get_ip_address(self):

     def get_title(self):
         """
-        Fetching the title of the url given.
+        Fetching the title of the url given. #18-07-2020

         Usage:
         ======
@@ -96,10 +98,33 @@ def get_content(self):
         """


-    def check_relevence(self, title, content):
+    @classmethod
+    def check_relevence(cls, title, content):
         """
-        pass
+        Checking if the content and the title of the article are related to each other or not.
+        #21-07-2020
+
+        Usage:
+        ======
+
+        >>> clf = LinkGenuinityClassifier(url="https://www.ndtv.com/india-news/
+            up-gangster-vikas-dubey-wanted-in-killing-of-8-cops-arrested
+            -in-ujjain-madhya-pradesh-2259611")
+        >>> title = clf.get_title()
+        >>> content = clf.get_content()
+        >>> clf.check_relevance(title, content)
+
+        Output:
+        =======
+
+        >>> True
         """
+        documents = [title, content]
+        count_vectorizer = CountVectorizer(stop_words='english')
+        count_vectorizer = CountVectorizer()
+        sparse_matrix = count_vectorizer.fit_transform(documents)
+        value = cosine_similarity(sparse_matrix, sparse_matrix)[0, 1]
+        return value >= 0.5


     def parse_actual_date(self):

From 15b2a95f7634f3cc6ace4ee7bd75e8619a150463 Mon Sep 17 00:00:00 2001
From: nibir-paul
Date: Wed, 22 Jul 2020 01:21:15 +0530
Subject: [PATCH 3/6] Added: count_redirected_urls() method has been added

---
 predicteasy/core/classifier/link_genuinity.py | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py
index 3152404..d7cf213 100644
--- a/predicteasy/core/classifier/link_genuinity.py
+++ b/predicteasy/core/classifier/link_genuinity.py
@@ -74,7 +74,7 @@ def get_ip_address(self):

     def get_title(self):
         """
-        Fetching the title of the url given. #18-07-2020
+        Fetching the title of the url given. #17-07-2020

         Usage:
         ======
@@ -102,7 +102,7 @@ def get_content(self):
     def check_relevence(cls, title, content):
         """
         Checking if the content and the title of the article are related to each other or not.
-        #21-07-2020
+        #20-07-2020

         Usage:
         ======
@@ -135,8 +135,22 @@ def parse_actual_date(self):

     def count_redirected_urls(self):
         """
-        pass
+        Returns the number of redirected urls present in the website.
+        #21-07-2020
+
+        Usage:
+        ======
+
+        >>> clf = LinkGenuinityClassifier(url="http://coreyms.com")
+        >>> clf.count_redirected_urls()
+
+        Output:
+        =======
+
+        >>> 131
         """
+        links = [link['href'] for link in self.soup.find_all('a')]
+        return len(links)


     def fetch_relevent_urls(self):

From 75147058f53ee090f2dd544b602dcb9f8c64f7b9 Mon Sep 17 00:00:00 2001
From: nibir-paul
Date: Thu, 23 Jul 2020 01:31:49 +0530
Subject: [PATCH 4/6] Created: get_domain() method, Modified: count_redirected_urls() method to only redirected_urls(), Added: count_standard_url_shorteners()

---
 predicteasy/core/classifier/link_genuinity.py | 106 +++++++++++++++---
 1 file changed, 93 insertions(+), 13 deletions(-)

diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py
index d7cf213..ad69653 100644
--- a/predicteasy/core/classifier/link_genuinity.py
+++ b/predicteasy/core/classifier/link_genuinity.py
@@ -3,6 +3,7 @@
 anomaly detection in given urls
 """
 #import os
+import urllib
 import requests
 from bs4 import BeautifulSoup
 from sklearn.feature_extraction.text import CountVectorizer
@@ -72,6 +73,30 @@ def get_ip_address(self):
         """


+    def get_domain_name(self, remove_http=True):
+        """
+        Fetching the domain name of the url given.
+        #22-07-2020
+
+        Usage:
+        ======
+
+        >>> clf = LinkGenuinityClassifier(url= 'http://bit.ly/bcFOko')
+        >>> clf.get_domain_name()
+
+        Output:
+        =======
+
+        >>> bit.ly
+        """
+        uri = urllib.parse.urlparse(self.url)
+        if remove_http:
+            domain_name = f"{uri.netloc}"
+        else:
+            domain_name = f"{uri.scheme}://{uri.netloc}"
+        return domain_name
+
+
     def get_title(self):
         """
         Fetching the title of the url given. #17-07-2020

         Usage:
         ======
@@ -98,8 +123,8 @@ def get_content(self):
         """


-    @classmethod
-    def check_relevence(cls, title, content):
+    @staticmethod
+    def check_relevence(title, content):
         """
         Checking if the content and the title of the article are related to each other or not.
         #20-07-2020
@@ -133,24 +158,52 @@ def parse_actual_date(self):
         """


-    def count_redirected_urls(self):
+    def redirected_urls(self):
         """
-        Returns the number of redirected urls present in the website.
+        Returns all the redirected urls present in the website.
         #21-07-2020

         Usage:
         ======

-        >>> clf = LinkGenuinityClassifier(url="http://coreyms.com")
+        >>> clf = LinkGenuinityClassifier(url = "http://coreyms.com")
         >>> clf.count_redirected_urls()

         Output:
         =======

-        >>> 131
-        """
-        links = [link['href'] for link in self.soup.find_all('a')]
-        return len(links)
+        >>> {'https://blog.codepen.io/radio/', 'http://carasantamaria.com/podcast/',
+            'https://twitter.com/CoreyMSchafer', 'https://coreyms.com/tag/standard-library', }
+        """
+        def is_valid(url):
+            parsed = urllib.parse.urlparse(url)
+            return bool(parsed.netloc) and bool(parsed.scheme)
+
+        links = set()
+        internal_urls = set()
+        external_urls = set()
+        urls = set()
+        domain_name = urllib.parse.urlparse(self.url).netloc
+        for a_tag in self.soup.find_all('a'):
+            href = a_tag.attrs.get('href')
+            if href == "" or href is None:
+                continue
+            href = urllib.parse.urljoin(self.url, href)
+            parsed_href = urllib.parse.urlparse(href)
+            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
+            if not is_valid(href):
+                continue
+            if href in internal_urls:
+                continue
+            if domain_name not in href:
+                if href not in external_urls:
+                    external_urls.add(href)
+                    links.add(href)
+                continue
+            urls.add(href)
+            links.add(href)
+            internal_urls.add(href)
+        return links


     def fetch_relevent_urls(self):
@@ -171,13 +224,40 @@ def compare_content(self, content1, content2):
         """


-    def count_standard_url_shortners(self):
-        """
-        pass
+    def count_standard_url_shorteners(self):
         """
+        Returns the number of standard url shorteners used.
+ #22-07-2020 + + Usage: + ====== + + >>> clf = LinkGenuinityClassifier(url = "http://bit.ly/bcFOko") + >>> clf.count_standard_url_shorteners() + Output: + ======= - def count_unconventional_url_shortners(self): + >>> 1 + """ + shorteners = ["bit.ly", "goo.gl", "Owl.ly", "Deck.ly", "Su.pr" + "lnk.co", "fur.ly", "moourl.com"] + count = 0 + links = self.redirected_urls() + links.add(self.url) + for link in links: + resp = urllib.request.urlopen(link) + clf_1 = LinkGenuinityClassifier(url=link) + clf_2 = LinkGenuinityClassifier(url=resp.url) + given_url_domain = clf_1.get_domain_name(link) + original_url_domain = clf_2.get_domain_name(resp.url) + if given_url_domain != original_url_domain: + if given_url_domain in shorteners: + count = count + 1 + return count + + + def count_unconventional_url_shorteners(self): """ pass """ From adfb4979c7815106b6c8ede711ece17ae64f162f Mon Sep 17 00:00:00 2001 From: nibir-paul Date: Fri, 24 Jul 2020 01:34:03 +0530 Subject: [PATCH 5/6] Added: fetch_relevant_urls and fetch_irrelevant_urls methods --- predicteasy/core/classifier/link_genuinity.py | 58 ++++++++++++++----- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py index ad69653..942c217 100644 --- a/predicteasy/core/classifier/link_genuinity.py +++ b/predicteasy/core/classifier/link_genuinity.py @@ -1,4 +1,5 @@ # pylint: disable=R0904 +# pylint: disable=E1111 """ anomaly detection in given urls """ @@ -124,9 +125,9 @@ def get_content(self): @staticmethod - def check_relevence(title, content): + def check_relevance(content1, content2): """ - Checking if the content and the title of the article are related to each other or not. + Checking if two given contents are related to each other or not. #20-07-2020 Usage: @@ -144,7 +145,7 @@ def check_relevence(title, content): >>> True """ - documents = [title, content] + documents = [content1, content2] count_vectorizer = CountVectorizer(stop_words='english') count_vectorizer = CountVectorizer() sparse_matrix = count_vectorizer.fit_transform(documents) @@ -173,7 +174,7 @@ def redirected_urls(self): ======= >>> {'https://blog.codepen.io/radio/', 'http://carasantamaria.com/podcast/', - 'https://twitter.com/CoreyMSchafer', 'https://coreyms.com/tag/standard-library', } + 'https://twitter.com/CoreyMSchafer', 'https://coreyms.com/tag/standard-library',} """ def is_valid(url): parsed = urllib.parse.urlparse(url) @@ -206,22 +207,51 @@ def is_valid(url): return links - def fetch_relevent_urls(self): - """ - pass + def fetch_relevant_urls(self): """ + Returns all the relevant urls from the set of redirected urls. + #23-07-2020 + Usage: + ====== - def fetch_irrelevent_urls(self): - """ - pass + >>> clf = LinkGenuinityClassifier(url = "http://coreyms.com") + >>> clf.fetc_relevant_urls() + + Output: + ======= + + >>> {'https://twitter.com/CoreyMSchafer', 'https://coreyms.com/tag/standard-library',} """ + content = self.get_content() + relevant_links = set() + links = self.redirected_urls() + for link in links: + clf_1 = LinkGenuinityClassifier(link) + if self.check_relevance(content, clf_1.get_content()) == bool(True): + relevant_links.add(link) + return relevant_links - def compare_content(self, content1, content2): + def fetch_irrelevant_urls(self): """ - pass + Returns all the irrelevant urls from the set of redirected urls. 
+ #23-07-2020 + + Usage: + ====== + + >>> clf = LinkGenuinityClassifier(url = "http://coreyms.com") + >>> clf.fetc_irrelevant_urls() + + Output: + ======= + + >>> {'https://blog.codepen.io/radio/', 'http://carasantamaria.com/podcast/',} """ + links = self.redirected_urls() + relevant_links = self.fetch_relevant_urls() + return links - relevant_links def count_standard_url_shorteners(self): @@ -249,8 +279,8 @@ def count_standard_url_shorteners(self): resp = urllib.request.urlopen(link) clf_1 = LinkGenuinityClassifier(url=link) clf_2 = LinkGenuinityClassifier(url=resp.url) - given_url_domain = clf_1.get_domain_name(link) - original_url_domain = clf_2.get_domain_name(resp.url) + given_url_domain = clf_1.get_domain_name() + original_url_domain = clf_2.get_domain_name() if given_url_domain != original_url_domain: if given_url_domain in shorteners: count = count + 1 From 0a372290f55421823584ba5620d1e90ba4cafbe4 Mon Sep 17 00:00:00 2001 From: nibir-paul Date: Sat, 25 Jul 2020 01:05:34 +0530 Subject: [PATCH 6/6] Added: count_unconventional_url_shorteners method --- predicteasy/core/classifier/link_genuinity.py | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py index 942c217..0315ae0 100644 --- a/predicteasy/core/classifier/link_genuinity.py +++ b/predicteasy/core/classifier/link_genuinity.py @@ -289,8 +289,35 @@ def count_standard_url_shorteners(self): def count_unconventional_url_shorteners(self): """ - pass + Returns the number of unconventional url shorteners used. + #23-07-2020 + + Usage: + ====== + + >>> clf = LinkGenuinityClassifier(url = "http://bit.ly/bcFOko") + >>> clf.count_unconventional_url_shorteners() + + Output: + ======= + + >>> 0 """ + shorteners = ["bit.ly", "goo.gl", "Owl.ly", "Deck.ly", "Su.pr" + "lnk.co", "fur.ly", "moourl.com"] + count = 0 + links = self.redirected_urls() + links.add(self.url) + for link in links: + resp = urllib.request.urlopen(link) + clf_1 = LinkGenuinityClassifier(url=link) + clf_2 = LinkGenuinityClassifier(url=resp.url) + given_url_domain = clf_1.get_domain_name() + original_url_domain = clf_2.get_domain_name() + if given_url_domain != original_url_domain: + if given_url_domain not in shorteners: + count = count + 1 + return count def calculate_age_of_content(self, title, content):