Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 32 additions & 5 deletions scholar.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
#! /usr/bin/env python
"""
This module provides classes for querying Google Scholar and parsing
returned results. It currently *only* processes the first results
page. It is not a recursive crawler.
returned results. It is not a recursive crawler.
"""
# ChangeLog
# ---------
#
# 2.12 Added a paging system to get results on more pages than
# the initial one. The number of results requested is forced back
# to the default of 10. (You still cannot request more than 20 results.)
#
# 2.11 The Scholar site seems to have become more picky about the
# number of results requested. The default of 20 in scholar.py
# could cause HTTP 503 responses. scholar.py now doesn't request
Expand Down Expand Up @@ -238,7 +241,7 @@ def make_soup(markup, parser=None):
class ScholarConf(object):
"""Helper class for global settings."""

VERSION = '2.10'
VERSION = '2.12'
LOG_LEVEL = 1
MAX_PAGE_RESULTS = 10 # Current default for per-page results
SCHOLAR_SITE = 'http://scholar.google.com'
Expand Down Expand Up @@ -632,7 +635,8 @@ def __init__(self):
# The number of results requested from Scholar -- not the
# total number of results it reports (the latter gets stored
# in attrs, see below).
self.num_results = None
self.num_results = 10 # force results to show only 10 at a time
self.start = 0

# Queries may have global result attributes, similar to
# per-article attributes in ScholarArticle. The exact set of
Expand All @@ -645,6 +649,11 @@ def set_num_page_results(self, num_page_results):
num_page_results,
'maximum number of results on page must be numeric')

def set_start(self, page_num):
    """
    Selects which results page to request by storing the matching
    result offset in self.start.

    page_num is a zero-based page index: page 0 starts at result 0,
    page 1 at result num_results, and so on. The argument is passed
    through ScholarUtils.ensure_int before any arithmetic is done
    (presumably that helper rejects non-numeric input -- confirm
    against its definition).
    """
    # Validate first, multiply second. Multiplying the raw argument
    # would repeat a string ('2' * 10 == '2222222222'), and that
    # repeated string would then be accepted as a huge bogus offset.
    page_num = ScholarUtils.ensure_int(
        page_num,
        'page number must be numeric')

    # get_url treats num_results == None as "no explicit count", so
    # None is a possible value here; use the configured per-page
    # default in that case instead of failing on None * int.
    page_size = (self.num_results if self.num_results is not None
                 else ScholarConf.MAX_PAGE_RESULTS)
    self.start = page_num * page_size

def get_url(self):
"""
Returns a complete, submittable URL string for this particular
Expand Down Expand Up @@ -708,11 +717,13 @@ class ClusterScholarQuery(ScholarQuery):
"""
SCHOLAR_CLUSTER_URL = ScholarConf.SCHOLAR_SITE + '/scholar?' \
+ 'cluster=%(cluster)s' \
+ '%(num)s'
+ '%(num)s' \
+ '%(start)s'

def __init__(self, cluster=None):
ScholarQuery.__init__(self)
self._add_attribute_type('num_results', 'Results', 0)
self._add_attribute_type('page', 'Page', 0)
self.cluster = None
self.set_cluster(cluster)

Expand All @@ -737,6 +748,10 @@ def get_url(self):
urlargs['num'] = ('&num=%d' % self.num_results
if self.num_results is not None else '')

# Paging: emit the start offset only when one is set. The fallback
# must be an empty string, not 0, because this value is spliced
# verbatim into the URL template via %(start)s -- a 0 here would
# append a stray "0" to the preceding parameter.
urlargs['start'] = ('&start=%d' % self.start
                    if self.start is not None else '')

return self.SCHOLAR_CLUSTER_URL % urlargs


Expand All @@ -758,11 +773,13 @@ class SearchScholarQuery(ScholarQuery):
+ '&as_vis=%(citations)s' \
+ '&btnG=&hl=en' \
+ '%(num)s' \
+ '%(start)s' \
+ '&as_sdt=%(patents)s%%2C5'

def __init__(self):
ScholarQuery.__init__(self)
self._add_attribute_type('num_results', 'Results', 0)
self._add_attribute_type('page', 'Page', 0)
self.words = None # The default search behavior
self.words_some = None # At least one of those words
self.words_none = None # None of these words
Expand Down Expand Up @@ -862,6 +879,10 @@ def get_url(self):
urlargs['num'] = ('&num=%d' % self.num_results
if self.num_results is not None else '')

# Paging: emit the start offset only when one is set. The fallback
# must be an empty string, not 0, because this value is spliced
# verbatim into the URL template via %(start)s -- a 0 here would
# append a stray "0" to the preceding parameter.
urlargs['start'] = ('&start=%d' % self.start
                    if self.start is not None else '')

return self.SCHOLAR_QUERY_URL % urlargs


Expand Down Expand Up @@ -920,6 +941,7 @@ class ScholarQuerier(object):
+ '&as_sdt=1,5' \
+ '&as_sdtp=' \
+ '&num=%(num)s' \
+ '&start=%(start)s' \
+ '&scis=%(scis)s' \
+ '%(scisf)s' \
+ '&hl=en&lang=all&instq=&inst=569367360547434339&save='
Expand Down Expand Up @@ -1191,6 +1213,8 @@ def main():
help='Do not search, just use articles in given cluster ID')
group.add_option('-c', '--count', type='int', default=None,
help='Maximum number of results')
group.add_option('-S', '--start', type='int', default=None,
help='Navigate to page')
parser.add_option_group(group)

group = optparse.OptionGroup(parser, 'Output format',
Expand Down Expand Up @@ -1290,6 +1314,9 @@ def main():
options.count = min(options.count, ScholarConf.MAX_PAGE_RESULTS)
query.set_num_page_results(options.count)

if options.start is not None:
query.set_start(options.start)

querier.send_query(query)

if options.csv:
Expand Down