From 11493ad426d8cb773d0a47b143f5b195be017b39 Mon Sep 17 00:00:00 2001 From: Luis Mendo Date: Sat, 24 Nov 2018 23:00:53 +0100 Subject: [PATCH 1/3] Added file. See comments in source code This is motivated by the fact that `x = MATLAnswer.fetch();` gives an error, apparently because there are too many answers and some download limitation kicks in. Not sure if that can be fixed, maybe using some pause to comply with the limitation. So I've written this file, which downloads answers after a specified date and merges them with answers contained in a specified file (which was generated in the specified date), removing duplicates that may exist in the specified date. --- extend_file.m | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 extend_file.m diff --git a/extend_file.m b/extend_file.m new file mode 100644 index 0000000..e9866de --- /dev/null +++ b/extend_file.m @@ -0,0 +1,43 @@ +function x = extend_file(existing_file, date_available) + +% function x = extend_file(existing_file, date_available) +% +% Extends a file containing an MATLAnswer object named `x,` to include newer answers. The file name +% is specified as the char vector `date_available`. The second input, `data_available`, indicates the +% date, in format '2018-06-20', up to which that file contains answers. The file is assumed to contain +% all answers corresponding to dates before the specified date. On the other hand, newer answers may +% exist in the specified date that are not in the file; and those will be included. +% +% This is useful because often it is not possible to download all answers up to the current date +% (probably because they are too many and the Stack Exchange API imposes some limitation in the number +% of answers that can be downloaded). +% +% This script downloads new answers starting from the specified date (which must be the date in which +% the existing file was generated), and merges the new and the old. 
When merging, duplicates are removed. +% Duplicates may arise if the existing file contains some answers in the specified date. Those answers +% will be downloaded again, and should be removed. Two criteria are used for checking duplicates: +% URL and CreationDate. If the two criteria give different results an error is issued. +% +% Note that old answers in the site may have changed and will not be updated. Only answers newer than +% the specified date are downloaded. +% +% Example use: x = extend_file('12viii18.mat', '2018-08-12'); +% +% Luis Mendo + +load(existing_file, 'x'); +assert(logical(exist('x', 'var')), 'File does not contain a variable named x') +x_old = x; +x_new = MATLAnswer.fetch('fromdate', date_available); % 1 is most recent, end is earliest +x = [x_new; x_old]; +[~, ind_keep] = unique({x.URL}, 'stable'); +creationDate = cellfun(@char, {x.CreationDate}, 'UniformOutput', false); % convert from datetime to char +[~, ind_keep_2] = unique(creationDate, 'stable'); % we need 'stable' so that ind_keep and ind_keep_2 have +% the same order +assert(isequal(ind_keep, ind_keep_2), 'Criteria for duplicates give different results') +ind_remove = setdiff(1:numel(x), ind_keep); +for k = ind_remove + disp(['Removing duplicate answer, date ' char(x(k).CreationDate)]) +end +x = x(ind_keep); +% save(datestr(now,1), 'x') \ No newline at end of file From 542c8149bbae27d4113d54f8f085f0645cde6d20 Mon Sep 17 00:00:00 2001 From: Luis Mendo Date: Sun, 25 Nov 2018 00:48:00 +0100 Subject: [PATCH 2/3] In duplicates, the newest version now survives This allows specifying an early date and having existing answers updated, in addition to including new answers --- extend_file.m | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/extend_file.m b/extend_file.m index e9866de..9559f15 100644 --- a/extend_file.m +++ b/extend_file.m @@ -1,43 +1,47 @@ -function x = extend_file(existing_file, date_available) +function x = 
extend_file(existing_file, date_start) -% function x = extend_file(existing_file, date_available) +% function x = extend_file(existing_file, date_start ) % % Extends a file containing an MATLAnswer object named `x,` to include newer answers. The file name -% is specified as the char vector `date_available`. The second input, `data_available`, indicates the +% is specified as the char vector `existing_file`. The second input, `date_start`, indicates a % date, in format '2018-06-20', up to which that file contains answers. The file is assumed to contain % all answers corresponding to dates before the specified date. On the other hand, newer answers may -% exist in the specified date that are not in the file; and those will be included. +% exist in the specified date, or after the date, that are not in the file; and those will get included. % % This is useful because often it is not possible to download all answers up to the current date % (probably because they are too many and the Stack Exchange API imposes some limitation in the number % of answers that can be downloaded). % % This script downloads new answers starting from the specified date (which must be the date in which -% the existing file was generated), and merges the new and the old. When merging, duplicates are removed. -% Duplicates may arise if the existing file contains some answers in the specified date. Those answers -% will be downloaded again, and should be removed. Two criteria are used for checking duplicates: -% URL and CreationDate. If the two criteria give different results an error is issued. +% the existing file was generated), and merges the new and the old. When merging, older duplicates +% are removed. Duplicates arise if the existing file contains some answers in the specified date or +% after that. Those answers will be downloaded again, and the old version should be removed. % -% Note that old answers in the site may have changed and will not be updated. 
Only answers newer than -% the specified date are downloaded. +% Note that answers in the site before the specified date may have changed and will not be updated. Only +% answers newer than the specified date are downloaded. If the existing file contains answers up to +% date D2, and an earlier date D1 is specified as `date_start`, answers from D1 to D2 will get updated. % -% Example use: x = extend_file('12viii18.mat', '2018-08-12'); +% So the idea is to specify `date_start` as old as possible, to update existing answers in +% addition to including the new answers. +% +% Example use: x = extend_file('12viii18.mat', '2018-01-15'); % % Luis Mendo load(existing_file, 'x'); assert(logical(exist('x', 'var')), 'File does not contain a variable named x') x_old = x; -x_new = MATLAnswer.fetch('fromdate', date_available); % 1 is most recent, end is earliest -x = [x_new; x_old]; -[~, ind_keep] = unique({x.URL}, 'stable'); -creationDate = cellfun(@char, {x.CreationDate}, 'UniformOutput', false); % convert from datetime to char -[~, ind_keep_2] = unique(creationDate, 'stable'); % we need 'stable' so that ind_keep and ind_keep_2 have -% the same order -assert(isequal(ind_keep, ind_keep_2), 'Criteria for duplicates give different results') +x_new = MATLAnswer.fetch('fromdate', date_start); % 1 is most recent, end is earliest +x = [x_new; x_old]; % from most recent to earliest +x = flip(x); % from earliest to recent +creationDate = cellfun(@datenum, {x.CreationDate}); % convert from datetime to number. Thanks to the +% previous flip, this will be an increasing vector +[~, ind_keep] = unique(creationDate, 'last'); % 'last', together with the fact that creationDate is +% increasing, ensures we keep the most recently obtained version of each duplicate ind_remove = setdiff(1:numel(x), ind_keep); -for k = ind_remove - disp(['Removing duplicate answer, date ' char(x(k).CreationDate)]) -end +disp(['Removing ' num2str(numel(ind_remove)) ' duplicate answers; dates from ' ... 
+ datestr(x(ind_remove(1)).CreationDate, 1) ' to ' datestr(x(ind_remove(end)).CreationDate, 1)]) +disp(['The result contains ' num2str(numel(ind_keep)) ' answers']) x = x(ind_keep); +x = flip(x); % undo previous flip, so that most recent appears first, as returned by MATLAnswer % save(datestr(now,1), 'x') \ No newline at end of file From 6235dad829ec57e7e3fb65004d03cc9ec9342a8d Mon Sep 17 00:00:00 2001 From: Luis Mendo Date: Fri, 12 Jul 2019 19:42:36 +0200 Subject: [PATCH 3/3] Two small changes Modified the regexp so that potential leading spaces are matched Removed -1 in the for loop that fetches counts --- MATLAnswer.m | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MATLAnswer.m b/MATLAnswer.m index 2936699..2260db7 100644 --- a/MATLAnswer.m +++ b/MATLAnswer.m @@ -930,7 +930,7 @@ function plots(varargin) % now that they are all answers answers = cat(1, answers{:}); - isMATL = ~cellfun(@isempty, regexp({answers.body}, '^MATL')); + isMATL = ~cellfun(@isempty, regexp({answers.body}, '^\s*MATL')); answers = answers(isMATL); % Now we want the actual content from the answers @@ -943,7 +943,7 @@ function plots(varargin) for k = 1:ceil(numel(answers) / chunksize) chunk = answers(((k - 1) * chunksize + 1) : ... - (min(k * chunksize - 1, numel(answers)))); + (min(k * chunksize, numel(answers)))); ids = sprintf('%d;', chunk.answer_id); url = strcat(MATLAnswer.API_URL, '/answers/', ids(1:end-1));