From aba8b0d50dd666e5dcdaab0a7581bcf9321861af Mon Sep 17 00:00:00 2001 From: Luca Baronti Date: Sat, 25 Feb 2017 15:43:12 +0000 Subject: [PATCH 1/4] Added --citations-only option. It prints all the articles that cite the queried one --- scholar.py | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/scholar.py b/scholar.py index 13ccd43..a56538d 100755 --- a/scholar.py +++ b/scholar.py @@ -1026,6 +1026,28 @@ def send_query(self, query): self.parse(html) + def get_citations(self,query): + """ + Given a query, it retrieve the list of articles that cite the first + article returned by the query. + It's done in two steps: first it retrieves the citations url of the + first article, then it retrieves the articles that cite it + """ + self.send_query(query) + + if self.articles[0]['url_citations'] is None: + return + citations_url=self.articles[0]['url_citations'] + self.clear_articles() + + html = self._get_http_response(url=citations_url, + log_msg='dump of query response HTML', + err_msg='results retrieval failed') + if html is None: + return + + self.parse(html) + def get_citation_data(self, article): """ Given an article, retrieves citation link. 
Note, this requires that @@ -1187,6 +1209,8 @@ def main(): help='Do not include patents in results') group.add_option('--no-citations', action='store_true', default=False, help='Do not include citations in results') + group.add_option('--citations-only', action='store_true', default=False, + help='Prints only the citations list in results') group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None, help='Do not search, just use articles in given cluster ID') group.add_option('-c', '--count', type='int', default=None, @@ -1290,7 +1314,11 @@ def main(): options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS) query.set_num_page_results(options.count) - querier.send_query(query) + + if options.citations_only: + querier.get_citations(query) + else: + querier.send_query(query) if options.csv: csv(querier) From 2fec066dc8e8d0834d0f1cecdd71db49007f91a7 Mon Sep 17 00:00:00 2001 From: Luca Baronti Date: Thu, 2 Mar 2017 15:04:09 +0000 Subject: [PATCH 2/4] Fixed the return of just one result if asked for a specific citation format --- scholar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scholar.py b/scholar.py index a56538d..e84549e 100755 --- a/scholar.py +++ b/scholar.py @@ -879,7 +879,7 @@ class ScholarSettings(object): def __init__(self): self.citform = 0 # Citation format, default none - self.per_page_results = None + self.per_page_results = 10 self._is_configured = False def set_citation_format(self, citform): @@ -1035,7 +1035,7 @@ def get_citations(self,query): """ self.send_query(query) - if self.articles[0]['url_citations'] is None: + if len(self.articles)==0 or self.articles[0]['url_citations'] is None: return citations_url=self.articles[0]['url_citations'] self.clear_articles() From 6ebd846882e64f9e7f5cdbed87a68dcaf2d97b4f Mon Sep 17 00:00:00 2001 From: Luca Baronti Date: Mon, 17 Apr 2017 14:18:36 +0100 Subject: [PATCH 3/4] Fixed a typo --- scholar.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff 
--git a/scholar.py b/scholar.py index e84549e..c34f8db 100755 --- a/scholar.py +++ b/scholar.py @@ -893,8 +893,7 @@ def set_citation_format(self, citform): def set_per_page_results(self, per_page_results): self.per_page_results = ScholarUtils.ensure_int( per_page_results, 'page results must be integer') - self.per_page_results = min( - self.per_page_results, ScholarConf.MAX_PAGE_RESULTS) + self.per_page_results = min(self.per_page_results, ScholarConf.MAX_PAGE_RESULTS) self._is_configured = True def is_configured(self): From 44d797246e20c2948bd50291283eddc9ae723f90 Mon Sep 17 00:00:00 2001 From: Luca Baronti Date: Mon, 17 Apr 2017 20:36:04 +0100 Subject: [PATCH 4/4] Fixed a bug that prevented downloading more than 10 citations --- scholar.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/scholar.py b/scholar.py index c34f8db..ce1b191 100755 --- a/scholar.py +++ b/scholar.py @@ -166,6 +166,7 @@ import re import sys import warnings +import time try: # Try importing for Python 3 @@ -1037,6 +1038,7 @@ def get_citations(self,query): if len(self.articles)==0 or self.articles[0]['url_citations'] is None: return citations_url=self.articles[0]['url_citations'] + citations_num=self.articles[0]['num_citations'] self.clear_articles() html = self._get_http_response(url=citations_url, @@ -1044,8 +1046,17 @@ def get_citations(self,query): err_msg='results retrieval failed') if html is None: return - self.parse(html) + while len(self.articles)