From 47e29133eecbbebe248348a0cf4875e009fc2aaf Mon Sep 17 00:00:00 2001
From: nibir-paul
Date: Sat, 18 Jul 2020 00:24:26 +0530
Subject: [PATCH 1/6] Added the content of get_title() method

---
 predicteasy/core/classifier/link_genuinity.py | 100 +++++++++++++++---
 1 file changed, 85 insertions(+), 15 deletions(-)

diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py
index 09e27b4..080906f 100644
--- a/predicteasy/core/classifier/link_genuinity.py
+++ b/predicteasy/core/classifier/link_genuinity.py
@@ -1,5 +1,10 @@
-import os
+# pylint: disable=R0904
+"""
+anomaly detection in given urls
+"""
+#import os
 import requests
+from bs4 import BeautifulSoup


 class LinkGenuinityClassifier:
@@ -11,7 +16,15 @@ class LinkGenuinityClassifier:
     ======

     >>> clf = LinkGenuinityClassifier(url="https://economictime.com/articles.apx")
-    >>> clf = LinkGenClassifier(url="[https://www.thehindu.com/sport/cricket/sri-lankan-cricketer-kusal-mendis-arrested-for-causing-fatal-motor-accident/article31993605.ece](https://www.thehindu.com/sport/cricket/sri-lankan-cricketer-kusal-mendis-arrested-for-causing-fatal-motor-accident/article31993605.ece)")
+    >>> clf = LinkGenuinityClassifier(url="https://www.ndtv.com/india-news/
+        up-gangster-vikas-dubey-wanted-in-killing-of-8-cops-arrested
+        -in-ujjain-madhya-pradesh-2259611")
+    >>> clf = LinkGenClassifier(url="[https://www.thehindu.com/sport/
+        cricket/sri-lankan-cricketer-kusal-mendis-arrested-for-
+        causing-fatal-motor-accident/article31993605.ece]
+        (https://www.thehindu.com/sport/cricket/sri-lankan-cricketer-
+        kusal-mendis-arrested-for-causing-fatal-motor-accident/
+        article31993605.ece)")

     >>> clf.score()

@@ -19,7 +32,7 @@ class LinkGenuinityClassifier:
     =======
     >>> {
             url: 'given_url',
-            freshness: 0.8, 
+            freshness: 0.8,
             geniuness: 0.8,
             spam_proximity: 0.4,
             is_date_relevant: True,
@@ -48,117 +61,174 @@ class LinkGenuinityClassifier:
     def __init__(self, url=""):
         self.url = url
         self.html_string = requests.get(url).text
+        self.soup = BeautifulSoup(self.html_string, 'lxml')


     def get_ip_address(self):
+        """
         pass
+        """


     def get_title(self):
-        pass
+        """
+        Fetching the title of the url given.
+
+        Usage:
+        ======
+
+        >>> clf = LinkGenuinityClassifier(url="https://www.ndtv.com/india-news/
+            up-gangster-vikas-dubey-wanted-in-killing-of-8-cops-arrested
+            -in-ujjain-madhya-pradesh-2259611")
+        >>> clf.get_title()
+
+        Output:
+        =======
+
+        >>> After 5-Day Run, UP Gangster Vikas Dubey Arrested At Madhya Pradesh Temple
+        """
+        return self.soup.title.text


     def get_content(self):
+        """
         pass
+        """


     def check_relevence(self, title, content):
+        """
         pass
+        """


     def parse_actual_date(self):
         """
         self.html_string parse and fetch the date of publications
         """
-        pass


     def count_redirected_urls(self):
+        """
         pass
+        """


     def fetch_relevent_urls(self):
+        """
         pass
+        """


     def fetch_irrelevent_urls(self):
+        """
         pass
+        """


     def compare_content(self, content1, content2):
+        """
         pass
+        """


     def count_standard_url_shortners(self):
+        """
         pass
+        """


     def count_unconventional_url_shortners(self):
+        """
         pass
+        """


-    def calculate_age_of_content(title, content):
+    def calculate_age_of_content(self, title, content):
+        """
         pass
+        """


     def calculate_promises(self):
+        """
         pass
+        """


     def calculate_misleading_score(self):
+        """
         pass
+        """


     def calculate_block_sentiment(self):
+        """
         pass
+        """


     def fetch_topic_intensity(self):
+        """
         pass
+        """


     def is_nudity(self):
+        """
         pass
+        """


     def is_violence(self):
+        """
         pass
+        """


     def is_adult_content(self):
+        """
         pass
+        """


     def score_nudity(self):
+        """
         pass
+        """


     def score_violence(self):
+        """
         pass
+        """

+
     def score_adult_content(self):
+        """
         pass
+        """


     def calculate_content_freshness(self):
+        """
         pass
+        """


-    def get_pagerank_score(url):
+    def get_pagerank_score(self, url):
+        """
         pass
+        """


     def learn(self):
+        """
         pass
+        """


     def score(self):
+        """
         pass
-
-
-
-
-
-
-
-
+        """

From 003c1a240e3be816d9f0f5353cf07e1c962b3ff6 Mon Sep 17 00:00:00 2001
From: nibir-paul
Date: Tue, 21 Jul 2020 00:21:25 +0530
Subject: [PATCH 2/6] Added: check_relevence method to the class

---
 predicteasy/core/classifier/link_genuinity.py | 31 +++++++++++++++++--
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py
index 080906f..3152404 100644
--- a/predicteasy/core/classifier/link_genuinity.py
+++ b/predicteasy/core/classifier/link_genuinity.py
@@ -5,6 +5,8 @@
 #import os
 import requests
 from bs4 import BeautifulSoup
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity


 class LinkGenuinityClassifier:
@@ -72,7 +74,7 @@ def get_ip_address(self):

     def get_title(self):
         """
-        Fetching the title of the url given.
+        Fetching the title of the url given. #18-07-2020

         Usage:
         ======
@@ -96,10 +98,33 @@ def get_content(self):
         """


-    def check_relevence(self, title, content):
+    @classmethod
+    def check_relevence(cls, title, content):
         """
-        pass
+        Checking if the content and the title of the article are related to each other or not.
+        #21-07-2020
+
+        Usage:
+        ======
+
+        >>> clf = LinkGenuinityClassifier(url="https://www.ndtv.com/india-news/
+            up-gangster-vikas-dubey-wanted-in-killing-of-8-cops-arrested
+            -in-ujjain-madhya-pradesh-2259611")
+        >>> title = clf.get_title()
+        >>> content = clf.get_content()
+        >>> clf.check_relevance(title, content)
+
+        Output:
+        =======
+
+        >>> True
         """
+        documents = [title, content]
+        count_vectorizer = CountVectorizer(stop_words='english')
+        count_vectorizer = CountVectorizer()
+        sparse_matrix = count_vectorizer.fit_transform(documents)
+        value = cosine_similarity(sparse_matrix, sparse_matrix)[0, 1]
+        return value >= 0.5


     def parse_actual_date(self):

From 15b2a95f7634f3cc6ace4ee7bd75e8619a150463 Mon Sep 17 00:00:00 2001
From: nibir-paul
Date: Wed, 22 Jul 2020 01:21:15 +0530
Subject: [PATCH 3/6] Added: count_redirected_urls() method has been added

---
 predicteasy/core/classifier/link_genuinity.py | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py
index 3152404..d7cf213 100644
--- a/predicteasy/core/classifier/link_genuinity.py
+++ b/predicteasy/core/classifier/link_genuinity.py
@@ -74,7 +74,7 @@ def get_ip_address(self):

     def get_title(self):
         """
-        Fetching the title of the url given. #18-07-2020
+        Fetching the title of the url given. #17-07-2020

         Usage:
         ======
@@ -102,7 +102,7 @@ def get_content(self):
     def check_relevence(cls, title, content):
         """
         Checking if the content and the title of the article are related to each other or not.
-        #21-07-2020
+        #20-07-2020

         Usage:
         ======
@@ -135,8 +135,22 @@ def parse_actual_date(self):

     def count_redirected_urls(self):
         """
-        pass
+        Returns the number of redirected urls present in the website.
+        #21-07-2020
+
+        Usage:
+        ======
+
+        >>> clf = LinkGenuinityClassifier(url="http://coreyms.com")
+        >>> clf.count_redirected_urls()
+
+        Output:
+        =======
+
+        >>> 131
         """
+        links = [link['href'] for link in self.soup.find_all('a')]
+        return len(links)


     def fetch_relevent_urls(self):

From 75147058f53ee090f2dd544b602dcb9f8c64f7b9 Mon Sep 17 00:00:00 2001
From: nibir-paul
Date: Thu, 23 Jul 2020 01:31:49 +0530
Subject: [PATCH 4/6] Created: get_domain() method, Modified: count_redirected_urls() method to only redirected_urls(), Added: count_standard_url_shorteners()

---
 predicteasy/core/classifier/link_genuinity.py | 106 +++++++++++++++---
 1 file changed, 93 insertions(+), 13 deletions(-)

diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py
index d7cf213..ad69653 100644
--- a/predicteasy/core/classifier/link_genuinity.py
+++ b/predicteasy/core/classifier/link_genuinity.py
@@ -3,6 +3,7 @@
 anomaly detection in given urls
 """
 #import os
+import urllib
 import requests
 from bs4 import BeautifulSoup
 from sklearn.feature_extraction.text import CountVectorizer
@@ -72,6 +73,30 @@ def get_ip_address(self):
         """


+    def get_domain_name(self, remove_http=True):
+        """
+        Fetching the domain name of the url given.
+        #22-07-2020
+
+        Usage:
+        ======
+
+        >>> clf = LinkGenuinityClassifier(url= 'http://bit.ly/bcFOko')
+        >>> clf.get_domain_name()
+
+        Output:
+        =======
+
+        >>> bit.ly
+        """
+        uri = urllib.parse.urlparse(self.url)
+        if remove_http:
+            domain_name = f"{uri.netloc}"
+        else:
+            domain_name = f"{uri.scheme}://{uri.netloc}"
+        return domain_name
+
+
     def get_title(self):
         """
         Fetching the title of the url given. #17-07-2020

         Usage:
         ======
@@ -98,8 +123,8 @@ def get_content(self):
         """


-    @classmethod
-    def check_relevence(cls, title, content):
+    @staticmethod
+    def check_relevence(title, content):
         """
         Checking if the content and the title of the article are related to each other or not.
         #20-07-2020
@@ -133,24 +158,52 @@ def parse_actual_date(self):
         """


-    def count_redirected_urls(self):
+    def redirected_urls(self):
         """
-        Returns the number of redirected urls present in the website.
+        Returns all the redirected urls present in the website.
         #21-07-2020

         Usage:
         ======

-        >>> clf = LinkGenuinityClassifier(url="http://coreyms.com")
+        >>> clf = LinkGenuinityClassifier(url = "http://coreyms.com")
         >>> clf.count_redirected_urls()

         Output:
         =======

-        >>> 131
-        """
-        links = [link['href'] for link in self.soup.find_all('a')]
-        return len(links)
+        >>> {'https://blog.codepen.io/radio/', 'http://carasantamaria.com/podcast/',
+            'https://twitter.com/CoreyMSchafer', 'https://coreyms.com/tag/standard-library', }
+        """
+        def is_valid(url):
+            parsed = urllib.parse.urlparse(url)
+            return bool(parsed.netloc) and bool(parsed.scheme)
+
+        links = set()
+        internal_urls = set()
+        external_urls = set()
+        urls = set()
+        domain_name = urllib.parse.urlparse(self.url).netloc
+        for a_tag in self.soup.find_all('a'):
+            href = a_tag.attrs.get('href')
+            if href == "" or href is None:
+                continue
+            href = urllib.parse.urljoin(self.url, href)
+            parsed_href = urllib.parse.urlparse(href)
+            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
+            if not is_valid(href):
+                continue
+            if href in internal_urls:
+                continue
+            if domain_name not in href:
+                if href not in external_urls:
+                    external_urls.add(href)
+                    links.add(href)
+                continue
+            urls.add(href)
+            links.add(href)
+            internal_urls.add(href)
+        return links


     def fetch_relevent_urls(self):
@@ -171,13 +224,40 @@ def compare_content(self, content1, content2):
         """


-    def count_standard_url_shortners(self):
-        """
-        pass
+    def count_standard_url_shorteners(self):
         """
+        Returns the number of standard url shorteners used.
+ #22-07-2020 + + Usage: + ====== + + >>> clf = LinkGenuinityClassifier(url = "http://bit.ly/bcFOko") + >>> clf.count_standard_url_shorteners() + Output: + ======= - def count_unconventional_url_shortners(self): + >>> 1 + """ + shorteners = ["bit.ly", "goo.gl", "Owl.ly", "Deck.ly", "Su.pr" + "lnk.co", "fur.ly", "moourl.com"] + count = 0 + links = self.redirected_urls() + links.add(self.url) + for link in links: + resp = urllib.request.urlopen(link) + clf_1 = LinkGenuinityClassifier(url=link) + clf_2 = LinkGenuinityClassifier(url=resp.url) + given_url_domain = clf_1.get_domain_name(link) + original_url_domain = clf_2.get_domain_name(resp.url) + if given_url_domain != original_url_domain: + if given_url_domain in shorteners: + count = count + 1 + return count + + + def count_unconventional_url_shorteners(self): """ pass """ From adfb4979c7815106b6c8ede711ece17ae64f162f Mon Sep 17 00:00:00 2001 From: nibir-paul Date: Fri, 24 Jul 2020 01:34:03 +0530 Subject: [PATCH 5/6] Added: fetch_relevant_urls and fetch_irrelevant_urls methods --- predicteasy/core/classifier/link_genuinity.py | 58 ++++++++++++++----- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py index ad69653..942c217 100644 --- a/predicteasy/core/classifier/link_genuinity.py +++ b/predicteasy/core/classifier/link_genuinity.py @@ -1,4 +1,5 @@ # pylint: disable=R0904 +# pylint: disable=E1111 """ anomaly detection in given urls """ @@ -124,9 +125,9 @@ def get_content(self): @staticmethod - def check_relevence(title, content): + def check_relevance(content1, content2): """ - Checking if the content and the title of the article are related to each other or not. + Checking if two given contents are related to each other or not. #20-07-2020 Usage: @@ -144,7 +145,7 @@ def check_relevence(title, content): >>> True """ - documents = [title, content] + documents = [content1, content2] count_vectorizer = CountVectorizer(stop_words='english') count_vectorizer = CountVectorizer() sparse_matrix = count_vectorizer.fit_transform(documents) @@ -173,7 +174,7 @@ def redirected_urls(self): ======= >>> {'https://blog.codepen.io/radio/', 'http://carasantamaria.com/podcast/', - 'https://twitter.com/CoreyMSchafer', 'https://coreyms.com/tag/standard-library', } + 'https://twitter.com/CoreyMSchafer', 'https://coreyms.com/tag/standard-library',} """ def is_valid(url): parsed = urllib.parse.urlparse(url) @@ -206,22 +207,51 @@ def is_valid(url): return links - def fetch_relevent_urls(self): - """ - pass + def fetch_relevant_urls(self): """ + Returns all the relevant urls from the set of redirected urls. + #23-07-2020 + Usage: + ====== - def fetch_irrelevent_urls(self): - """ - pass + >>> clf = LinkGenuinityClassifier(url = "http://coreyms.com") + >>> clf.fetc_relevant_urls() + + Output: + ======= + + >>> {'https://twitter.com/CoreyMSchafer', 'https://coreyms.com/tag/standard-library',} """ + content = self.get_content() + relevant_links = set() + links = self.redirected_urls() + for link in links: + clf_1 = LinkGenuinityClassifier(link) + if self.check_relevance(content, clf_1.get_content()) == bool(True): + relevant_links.add(link) + return relevant_links - def compare_content(self, content1, content2): + def fetch_irrelevant_urls(self): """ - pass + Returns all the irrelevant urls from the set of redirected urls. 
+ #23-07-2020 + + Usage: + ====== + + >>> clf = LinkGenuinityClassifier(url = "http://coreyms.com") + >>> clf.fetc_irrelevant_urls() + + Output: + ======= + + >>> {'https://blog.codepen.io/radio/', 'http://carasantamaria.com/podcast/',} """ + links = self.redirected_urls() + relevant_links = self.fetch_relevant_urls() + return links - relevant_links def count_standard_url_shorteners(self): @@ -249,8 +279,8 @@ def count_standard_url_shorteners(self): resp = urllib.request.urlopen(link) clf_1 = LinkGenuinityClassifier(url=link) clf_2 = LinkGenuinityClassifier(url=resp.url) - given_url_domain = clf_1.get_domain_name(link) - original_url_domain = clf_2.get_domain_name(resp.url) + given_url_domain = clf_1.get_domain_name() + original_url_domain = clf_2.get_domain_name() if given_url_domain != original_url_domain: if given_url_domain in shorteners: count = count + 1 From 0a372290f55421823584ba5620d1e90ba4cafbe4 Mon Sep 17 00:00:00 2001 From: nibir-paul Date: Sat, 25 Jul 2020 01:05:34 +0530 Subject: [PATCH 6/6] Added: count_unconventional_url_shorteners method --- predicteasy/core/classifier/link_genuinity.py | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/predicteasy/core/classifier/link_genuinity.py b/predicteasy/core/classifier/link_genuinity.py index 942c217..0315ae0 100644 --- a/predicteasy/core/classifier/link_genuinity.py +++ b/predicteasy/core/classifier/link_genuinity.py @@ -289,8 +289,35 @@ def count_standard_url_shorteners(self): def count_unconventional_url_shorteners(self): """ - pass + Returns the number of unconventional url shorteners used. + #23-07-2020 + + Usage: + ====== + + >>> clf = LinkGenuinityClassifier(url = "http://bit.ly/bcFOko") + >>> clf.count_unconventional_url_shorteners() + + Output: + ======= + + >>> 0 """ + shorteners = ["bit.ly", "goo.gl", "Owl.ly", "Deck.ly", "Su.pr" + "lnk.co", "fur.ly", "moourl.com"] + count = 0 + links = self.redirected_urls() + links.add(self.url) + for link in links: + resp = urllib.request.urlopen(link) + clf_1 = LinkGenuinityClassifier(url=link) + clf_2 = LinkGenuinityClassifier(url=resp.url) + given_url_domain = clf_1.get_domain_name() + original_url_domain = clf_2.get_domain_name() + if given_url_domain != original_url_domain: + if given_url_domain not in shorteners: + count = count + 1 + return count def calculate_age_of_content(self, title, content):