From b89d443c4fcc32c4368616a6c601e03764f7d53e Mon Sep 17 00:00:00 2001 From: Yakov Date: Fri, 10 Oct 2025 03:30:12 +0000 Subject: [PATCH 1/4] Add `--no-save-cookies` --- bot/job_options_parser.rb | 2 ++ bot/pipeline_options.rb | 5 +++++ doc/commands.rst | 3 +++ lib/job.rb | 4 ++++ pipeline/archivebot/seesaw/tasks.py | 1 + pipeline/archivebot/seesaw/wpull.py | 4 +++- spec/bot/job_options_parser_spec.rb | 4 ++++ 7 files changed, 22 insertions(+), 1 deletion(-) diff --git a/bot/job_options_parser.rb b/bot/job_options_parser.rb index b7c96954..e5fe06df 100644 --- a/bot/job_options_parser.rb +++ b/bot/job_options_parser.rb @@ -5,6 +5,7 @@ class JobOptionsParser def initialize @parser = Trollop::Parser.new do opt :no_offsite_links, 'Do not fetch offsite links' + opt :no_save_cookies, 'Do not save cookies' opt :youtube_dl, 'Use youtube-dl on grabbed pages' opt :ignore_sets, 'Ignore sets to apply', :type => :string opt :pipeline, 'Run job on this pipeline', :type => :string @@ -23,6 +24,7 @@ def parse(str) b[0] = (case b[0] when '--ignoresets','--ignore_sets','--ignoreset','--ignore-set','--ignore_set','--ig-set','--igset' then '--ignore-sets' when '--nooffsitelinks','--no-offsite','--nooffsite' then '--no-offsite-links' + when '--nosavecookies','--no-cookies','--nocookies' then '--no-save-cookies' when '--useragentalias','--user-agent','--useragent' then '--user-agent-alias' when '--concurrent' then '--concurrency' when '--reason' then '--explain' diff --git a/bot/pipeline_options.rb b/bot/pipeline_options.rb index 03b2db54..b1b3150d 100644 --- a/bot/pipeline_options.rb +++ b/bot/pipeline_options.rb @@ -17,6 +17,11 @@ def run_post_registration_hooks(m, job, params) messages << 'offsite links: no' end + if params[:no_save_cookies] + job.no_save_cookies! + messages << 'save cookies: no' + end + if !messages.empty? 
reply m, "Options: #{messages.join('; ')}" end diff --git a/doc/commands.rst b/doc/commands.rst index ba192dd9..c51c92e8 100644 --- a/doc/commands.rst +++ b/doc/commands.rst @@ -67,6 +67,9 @@ Accepted parameters Aliases: ``--nooffsitelinks``, ``--no-offsite``, ``--nooffsite`` +``--no-save-cookies`` + do not save cookies to a cookie jar + ``--user-agent-alias ALIAS`` specify a user-agent to use:: diff --git a/lib/job.rb b/lib/job.rb index 48ecc4db..9569bafb 100644 --- a/lib/job.rb +++ b/lib/job.rb @@ -380,6 +380,10 @@ def no_offsite_links! redis.hset(ident, 'no_offsite_links', true) end + def no_save_cookies! + redis.hset(ident, 'no_save_cookies', true) + end + def yahoo silently do set_delay(0, 0) diff --git a/pipeline/archivebot/seesaw/tasks.py b/pipeline/archivebot/seesaw/tasks.py index 022fa5df..22a064da 100644 --- a/pipeline/archivebot/seesaw/tasks.py +++ b/pipeline/archivebot/seesaw/tasks.py @@ -140,6 +140,7 @@ def process(self, item): item['url_file'] = job_data.get('url_file') item['user_agent'] = job_data.get('user_agent') item['no_offsite_links'] = job_data.get('no_offsite_links') + item['no_save_cookies'] = job_data.get('no_save_cookies') item['youtube_dl'] = job_data.get('youtube_dl') item.log_output('Received item %s.' 
% ident) diff --git a/pipeline/archivebot/seesaw/wpull.py b/pipeline/archivebot/seesaw/wpull.py index 48fa0659..a3c4bbf8 100644 --- a/pipeline/archivebot/seesaw/wpull.py +++ b/pipeline/archivebot/seesaw/wpull.py @@ -22,7 +22,6 @@ def make_args(item, default_user_agent, wpull_exe, youtube_dl_exe, finished_warc '-o', '%(item_dir)s/wpull.log' % item, '--database', '%(item_dir)s/wpull.db' % item, '--html-parser', 'libxml2-lxml', - '--save-cookies', '%(cookie_jar)s' % item, '--no-check-certificate', '--no-strong-crypto', '--delete-after', @@ -51,6 +50,9 @@ def make_args(item, default_user_agent, wpull_exe, youtube_dl_exe, finished_warc '--youtube-dl-exe', youtube_dl_exe ] + if not item.get('no_save_cookies'): + add_args(args, ['--save-cookies', '%(cookie_jar)s'], item) + if item['url'].startswith("http://www.reddit.com/") or \ item['url'].startswith("https://www.reddit.com/"): add_args(args, ['--header', 'Cookie: over18=1'], item) diff --git a/spec/bot/job_options_parser_spec.rb b/spec/bot/job_options_parser_spec.rb index ed03b877..b7a9e0f8 100644 --- a/spec/bot/job_options_parser_spec.rb +++ b/spec/bot/job_options_parser_spec.rb @@ -53,6 +53,10 @@ expect(parser.parse('--concurrency=4')[:concurrency]).to eq(4) end + it 'recognizes --no-save-cookies' do + expect(parser.parse('--no-save-cookies')[:no_save_cookies]).to eq(true) + end + describe 'when unknown options are present' do it 'raises UnknownOptionError' do expect(lambda { parser.parse('--foo=bar') }).to raise_error(JobOptionsParser::UnknownOptionError) From 66d0ab1f004f020bc55442495a7b51ea3b631d48 Mon Sep 17 00:00:00 2001 From: Yakov Date: Sun, 12 Oct 2025 00:34:06 +0000 Subject: [PATCH 2/4] Pass `--no-cookies` to wpull instead --- pipeline/archivebot/seesaw/wpull.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipeline/archivebot/seesaw/wpull.py b/pipeline/archivebot/seesaw/wpull.py index a3c4bbf8..34732f56 100644 --- a/pipeline/archivebot/seesaw/wpull.py +++ 
b/pipeline/archivebot/seesaw/wpull.py @@ -50,7 +50,9 @@ def make_args(item, default_user_agent, wpull_exe, youtube_dl_exe, finished_warc '--youtube-dl-exe', youtube_dl_exe ] - if not item.get('no_save_cookies'): + if item.get('no_save_cookies'): + args.append('--no-cookies') + else: add_args(args, ['--save-cookies', '%(cookie_jar)s'], item) if item['url'].startswith("http://www.reddit.com/") or \ From 30a328e82fee3091f723f1637fb0fd9bb11a89e7 Mon Sep 17 00:00:00 2001 From: Yakov Date: Sun, 12 Oct 2025 18:34:02 +0000 Subject: [PATCH 3/4] Change `--no-save-cookies` flag to `--no-cookies` --- bot/job_options_parser.rb | 4 ++-- bot/pipeline_options.rb | 6 +++--- doc/commands.rst | 4 ++-- lib/job.rb | 4 ++-- pipeline/archivebot/seesaw/tasks.py | 2 +- pipeline/archivebot/seesaw/wpull.py | 2 +- spec/bot/job_options_parser_spec.rb | 4 ++-- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/bot/job_options_parser.rb b/bot/job_options_parser.rb index e5fe06df..4bbc35d1 100644 --- a/bot/job_options_parser.rb +++ b/bot/job_options_parser.rb @@ -5,7 +5,7 @@ class JobOptionsParser def initialize @parser = Trollop::Parser.new do opt :no_offsite_links, 'Do not fetch offsite links' - opt :no_save_cookies, 'Do not save cookies' + opt :no_cookies, 'Do not use cookies' opt :youtube_dl, 'Use youtube-dl on grabbed pages' opt :ignore_sets, 'Ignore sets to apply', :type => :string opt :pipeline, 'Run job on this pipeline', :type => :string @@ -24,7 +24,7 @@ def parse(str) b[0] = (case b[0] when '--ignoresets','--ignore_sets','--ignoreset','--ignore-set','--ignore_set','--ig-set','--igset' then '--ignore-sets' when '--nooffsitelinks','--no-offsite','--nooffsite' then '--no-offsite-links' - when '--nosavecookies','--no-cookies','--nocookies' then '--no-save-cookies' + when '--nocookies', then '--no-cookies' when '--useragentalias','--user-agent','--useragent' then '--user-agent-alias' when '--concurrent' then '--concurrency' when '--reason' then '--explain' diff --git 
a/bot/pipeline_options.rb b/bot/pipeline_options.rb index b1b3150d..e1b6be6c 100644 --- a/bot/pipeline_options.rb +++ b/bot/pipeline_options.rb @@ -17,9 +17,9 @@ def run_post_registration_hooks(m, job, params) messages << 'offsite links: no' end - if params[:no_save_cookies] - job.no_save_cookies! - messages << 'save cookies: no' + if params[:no_cookies] + job.no_cookies! + messages << 'use cookies: no' end if !messages.empty? diff --git a/doc/commands.rst b/doc/commands.rst index c51c92e8..b1765811 100644 --- a/doc/commands.rst +++ b/doc/commands.rst @@ -67,8 +67,8 @@ Accepted parameters Aliases: ``--nooffsitelinks``, ``--no-offsite``, ``--nooffsite`` -``--no-save-cookies`` - do not save cookies to a cookie jar +``--no-cookies`` + disable cookie support (passes ``--no-cookies`` to wpull instead of ``--save-cookies``) ``--user-agent-alias ALIAS`` specify a user-agent to use:: diff --git a/lib/job.rb b/lib/job.rb index 9569bafb..cb6ffcc3 100644 --- a/lib/job.rb +++ b/lib/job.rb @@ -380,8 +380,8 @@ def no_offsite_links! redis.hset(ident, 'no_offsite_links', true) end - def no_save_cookies! - redis.hset(ident, 'no_save_cookies', true) + def no_cookies! + redis.hset(ident, 'no_cookies', true) end def yahoo diff --git a/pipeline/archivebot/seesaw/tasks.py b/pipeline/archivebot/seesaw/tasks.py index 22a064da..9b31932e 100644 --- a/pipeline/archivebot/seesaw/tasks.py +++ b/pipeline/archivebot/seesaw/tasks.py @@ -140,7 +140,7 @@ def process(self, item): item['url_file'] = job_data.get('url_file') item['user_agent'] = job_data.get('user_agent') item['no_offsite_links'] = job_data.get('no_offsite_links') - item['no_save_cookies'] = job_data.get('no_save_cookies') + item['no_cookies'] = job_data.get('no_cookies') item['youtube_dl'] = job_data.get('youtube_dl') item.log_output('Received item %s.' 
% ident) diff --git a/pipeline/archivebot/seesaw/wpull.py b/pipeline/archivebot/seesaw/wpull.py index 34732f56..c2a8d7a6 100644 --- a/pipeline/archivebot/seesaw/wpull.py +++ b/pipeline/archivebot/seesaw/wpull.py @@ -50,7 +50,7 @@ def make_args(item, default_user_agent, wpull_exe, youtube_dl_exe, finished_warc '--youtube-dl-exe', youtube_dl_exe ] - if item.get('no_save_cookies'): + if item.get('no_cookies'): args.append('--no-cookies') else: add_args(args, ['--save-cookies', '%(cookie_jar)s'], item) diff --git a/spec/bot/job_options_parser_spec.rb b/spec/bot/job_options_parser_spec.rb index b7a9e0f8..9039af68 100644 --- a/spec/bot/job_options_parser_spec.rb +++ b/spec/bot/job_options_parser_spec.rb @@ -53,8 +53,8 @@ expect(parser.parse('--concurrency=4')[:concurrency]).to eq(4) end - it 'recognizes --no-save-cookies' do - expect(parser.parse('--no-save-cookies')[:no_save_cookies]).to eq(true) + it 'recognizes --no-cookies' do + expect(parser.parse('--no-cookies')[:no_cookies]).to eq(true) end describe 'when unknown options are present' do From 12870fff7c4de1aea41c52e263337a1f2e221003 Mon Sep 17 00:00:00 2001 From: Yakov Date: Mon, 20 Oct 2025 10:12:23 -0400 Subject: [PATCH 4/4] Remove extra comma in option handling --- bot/job_options_parser.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/job_options_parser.rb b/bot/job_options_parser.rb index 4bbc35d1..76b8fe61 100644 --- a/bot/job_options_parser.rb +++ b/bot/job_options_parser.rb @@ -24,7 +24,7 @@ def parse(str) b[0] = (case b[0] when '--ignoresets','--ignore_sets','--ignoreset','--ignore-set','--ignore_set','--ig-set','--igset' then '--ignore-sets' when '--nooffsitelinks','--no-offsite','--nooffsite' then '--no-offsite-links' - when '--nocookies', then '--no-cookies' + when '--nocookies' then '--no-cookies' when '--useragentalias','--user-agent','--useragent' then '--user-agent-alias' when '--concurrent' then '--concurrency' when '--reason' then '--explain'