diff --git a/Gemfile b/Gemfile index a973bdc5..76e0672b 100644 --- a/Gemfile +++ b/Gemfile @@ -46,8 +46,8 @@ end # Padrino Stable Gem gem 'padrino', '0.12.5' -gem 'pry', :group => 'development' -gem 'pry-byebug', :group => 'development' +gem 'pry', :group => ['development', 'test'] +gem 'pry-byebug', :group => ['development', 'test'] gem 'newrelic_rpm' # Or Padrino Edge diff --git a/Gemfile.lock b/Gemfile.lock index 1215d608..9af9624c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -242,6 +242,3 @@ DEPENDENCIES safe_yaml sass stretchy - -BUNDLED WITH - 1.10.6 diff --git a/app/controllers.rb b/app/controllers.rb index 1938ba6c..9b292ec1 100644 --- a/app/controllers.rb +++ b/app/controllers.rb @@ -47,27 +47,49 @@ data.to_json end - get :index, with: :endpoint, provides: [:json, :csv] do - options = get_search_args_from_params(params) - endpoint = options[:endpoint] - content_type options[:format].to_sym if options[:format] - DataMagic.logger.debug "-----> APP GET #{params.inspect}" - - unless DataMagic.config.api_endpoints.keys.include? endpoint - halt 404, { - error: 404, - message: "#{endpoint} not found. Available endpoints: #{DataMagic.config.api_endpoints.keys.join(',')}" - }.to_json - end + get :index, with: ':endpoint/:command', provides: [:json] do + process_params + end - data = DataMagic.search(params, options) - halt 400, data.to_json if data.key?(:errors) + get :index, with: ':endpoint', provides: [:json, :csv] do + process_params + end +end - if content_type == :csv - output_data_as_csv(data['results']) - else - data.to_json - end +def process_params + options = get_search_args_from_params(params) + DataMagic.logger.debug "-----> APP GET #{params.inspect} with options #{options.inspect}" + + check_endpoint!(options) + set_content_type(options) + search_and_respond(options) +end + +def search_and_respond(options) + data = DataMagic.search(params, options) + halt 400, data.to_json if data.key?(:errors) + + if content_type == :csv + output_data_as_csv(data['results']) + else + data.to_json + end +end + +def check_endpoint!(options) + unless DataMagic.config.api_endpoints.keys.include? options[:endpoint] + halt 404, { + error: 404, + message: "#{options[:endpoint]} not found. Available endpoints: #{DataMagic.config.api_endpoints.keys.join(',')}" + }.to_json + end +end + +def set_content_type(options) + if options[:command] == 'stats' + content_type :json + else + content_type(options[:format].nil? ? :json : options[:format].to_sym) end end @@ -76,7 +98,7 @@ # see comment in method body def get_search_args_from_params(params) options = {} - %w(sort fields zip distance page per_page debug).each do |opt| + %w(metrics sort fields zip distance page per_page debug).each do |opt| options[opt.to_sym] = params.delete("_#{opt}") # TODO: remove next line to end support for un-prefixed option parameters options[opt.to_sym] ||= params.delete(opt) @@ -84,6 +106,9 @@ def get_search_args_from_params(params) options[:endpoint] = params.delete("endpoint") # these two params are options[:format] = params.delete("format") # supplied by Padrino options[:fields] = (options[:fields] || "").split(',') + options[:command] = params.delete("command") + + options[:metrics] = options[:metrics].split(/\s*,\s*/) if options[:metrics] options end diff --git a/lib/data_magic.rb b/lib/data_magic.rb index b538671f..22f79d29 100644 --- a/lib/data_magic.rb +++ b/lib/data_magic.rb @@ -76,12 +76,20 @@ def self.search(terms, options = {}) body: query_body } + # Per https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html: + # "the search_type and the query_cache must be passed as query-string parameters" + if options[:command] == 'stats' + full_query.merge! :search_type => 'count' + end + logger.info "FULL_QUERY: #{full_query.inspect}" time_start = Time.now.to_f result = client.search full_query + search_time = Time.now.to_f - time_start logger.info "ES query time (ms): #{result["took"]} ; Query fetch time (s): #{search_time} ; result: #{result.inspect[0..500]}" + hits = result["hits"] total = hits["total"] results = [] @@ -97,7 +105,7 @@ def self.search(terms, options = {}) found.keys.each { |key| found[key] = found[key][0] } # now it should look like this: - # {"city"=>"Springfield", "address"=>"742 Evergreen Terrace + # {"city"=>"Springfield", "address"=>"742 Evergreen Terrace} # re-insert null fields that didn't get returned by ES query_body[:fields].each do |field| @@ -118,10 +126,28 @@ def self.search(terms, options = {}) end # assemble a simpler json document to return + simple_result = { "metadata" => metadata, "results" => results } + + if options[:command] == 'stats' + # Remove metrics that weren't requested. + aggregations = result['aggregations'] + aggregations.each do |f_name, values| + if options[:metrics] && options[:metrics].size > 0 + aggregations[f_name] = values.reject { |k, v| !(options[:metrics].include? k) } + else + # Keep everything is no metric list is provided + aggregations[f_name] = values + end + end + + simple_result.merge!({"aggregations" => aggregations}) + end + + simple_result end private diff --git a/lib/data_magic/error_checker.rb b/lib/data_magic/error_checker.rb index 2bd31b75..d66ac928 100644 --- a/lib/data_magic/error_checker.rb +++ b/lib/data_magic/error_checker.rb @@ -2,7 +2,8 @@ module DataMagic module ErrorChecker class << self def check(params, options, config) - report_nonexistent_params(params, config) + + report_required_params_absent(options) + + report_nonexistent_params(params, config) + report_nonexistent_operators(params) + report_nonexistent_fields(options[:fields], config) + report_bad_range_argument(params) + @@ -11,6 +12,14 @@ def check(params, options, config) private + def report_required_params_absent(options) + if options[:command] == 'stats' && options[:fields].length == 0 + [build_error(error: 'invalid_or_incomplete_parameters', input: options[:command])] + else + [] + end + end + def report_nonexistent_params(params, config) return [] unless config.dictionary_only_search? params.keys.reject { |p| config.field_type(strip_op(p)) }. @@ -66,6 +75,8 @@ def report_wrong_field_type(params, config) def build_error(opts) opts[:message] = case opts[:error] + when 'invalid_or_incomplete_parameters' + "The command #{opts[:input]} requires a fields parameter." when 'parameter_not_found' "The input parameter '#{opts[:input]}' is not known in this dataset." when 'field_not_found' diff --git a/lib/data_magic/query_builder.rb b/lib/data_magic/query_builder.rb index 17330d5a..3e5b1358 100644 --- a/lib/data_magic/query_builder.rb +++ b/lib/data_magic/query_builder.rb @@ -8,9 +8,15 @@ def from_params(params, options, config) per_page = DataMagic::MAX_PAGE_SIZE if per_page > DataMagic::MAX_PAGE_SIZE query_hash = { from: page * per_page, - size: per_page + size: per_page, } + query_hash[:query] = generate_squery(params, options, config).to_search + + if options[:command] == 'stats' + query_hash.merge! add_aggregations(params, options, config) + end + if options[:fields] && !options[:fields].empty? query_hash[:fields] = get_restrict_fields(options) query_hash[:_source] = false @@ -31,6 +37,20 @@ def generate_squery(params, options, config) search_fields_and_ranges(squery, params, config) end + # Wrapper for Stretchy aggregation clause builder (which wraps ElasticSearch (ES) :aggs parameter) + # Extracts all extended_stats aggregations from ES, to be filtered later + # Is a no-op if no fields are specified, or none of them are numeric + def add_aggregations(params, options, config) + agg_hash = options[:fields].inject({}) do |memo, f| + if config.column_field_types[f.to_s] && ["integer", "float"].include?(config.column_field_types[f.to_s]) + memo[f.to_s] = { extended_stats: { "field" => f.to_s } } + end + memo + end + + agg_hash.empty? ? {} : { aggs: agg_hash } + end + def get_restrict_fields(options) options[:fields].map(&:to_s) end diff --git a/sample-data/data.yaml b/sample-data/data.yaml index 1eff28f5..57b38f59 100644 --- a/sample-data/data.yaml +++ b/sample-data/data.yaml @@ -48,7 +48,8 @@ dictionary: type: integer location.lat: INTPTLAT location.lon: INTPTLONG - area.land: + land_area: + source: ALAND_SQMI description: Land Area (square miles) source: ALAND_SQMI type: float diff --git a/spec/features/api_spec.rb b/spec/features/api_spec.rb index eafbae1e..641f64a6 100644 --- a/spec/features/api_spec.rb +++ b/spec/features/api_spec.rb @@ -344,6 +344,67 @@ end end end + + describe "With residents CSV data" do + before do + ENV['DATA_PATH'] = './spec/fixtures/numeric_data' + DataMagic.init(load_now: false) + num_rows, fields = DataMagic.import_csv(address_data) + end + + after do + DataMagic.destroy + end + + let(:all_aggregs) do + { "aggregations" => { + "age"=>{"count"=>2, "min"=>14.0, "max"=>70.0, "avg"=>42.0, "sum"=>84.0, "sum_of_squares"=>5096.0, "variance"=>784.0, "std_deviation"=>28.0, "std_deviation_bounds"=>{"upper"=>98.0, "lower"=>-14.0}}, + "height"=>{"count"=>2, "min"=>2.0, "max"=>142.0, "avg"=>72.0, "sum"=>144.0, "sum_of_squares"=>20168.0, "variance"=>4900.0, "std_deviation"=>70.0, "std_deviation_bounds"=>{"upper"=>212.0, "lower"=>-68.0}} + } + } + end + + let(:max_avg_aggregs) do + { "aggregations" => { + "age" => { "max" => 70.0, "avg" => 42.0}, + "height" => { "max" => 142.0, "avg" => 72.0} + } + } + end + + let(:stats_envelope) do + { "metadata" => { "total" => 2, + "page" => 0, + "per_page" => DataMagic::DEFAULT_PAGE_SIZE + }, + "results" => [] + } + end + + it "/stats returns the correct results for Springfield residents" do + get '/v1/cities/stats?city=Springfield&_fields=address,age,height&_metrics=max,avg' + expect(last_response).to be_ok + json_response["results"] = json_response["results"].sort_by { |k| k["age"] } + expect(json_response).to eq(stats_envelope.merge(max_avg_aggregs)) + end + + it "/stats returns all metrics when none are specified" do + get '/v1/cities/stats?city=Springfield&_fields=address,age,height' + expect(last_response).to be_ok + json_response["results"] = json_response["results"].sort_by { |k| k["age"] } + + age_expected = stats_envelope.merge(all_aggregs)["aggregations"]["age"] + expect(json_response["aggregations"]["age"]["std_deviation"]).to eq(age_expected["std_deviation"]) + + height_expected = stats_envelope.merge(all_aggregs)["aggregations"]["height"] + expect(json_response["aggregations"]["height"]["std_deviation"]).to eq(height_expected["std_deviation"]) + end + + it "/stats requires fields option" do + get '/v1/cities/stats?city=Springfield&_metrics=max,avg' + expect(last_response.status).to eq(400) + end + end describe "deprecated option syntax" do before do diff --git a/spec/fixtures/data.rb b/spec/fixtures/data.rb index f567416d..47004d1a 100644 --- a/spec/fixtures/data.rb +++ b/spec/fixtures/data.rb @@ -1,16 +1,16 @@ - - +# Ages adjusted for Springfield residents to average to 42 +# Heights randomly set to generate a max of 142 def address_data @address_data ||= StringIO.new <<-eos -name,address,city -Paul,15 Penny Lane,Liverpool -Michelle,600 Pennsylvania Avenue,Washington -Marilyn,1313 Mockingbird Lane,Springfield -Sherlock,221B Baker Street,London -Clark,66 Lois Lane,Smallville -Bart,742 Evergreen Terrace,Springfield -Paul,19 N Square,Boston -Peter,66 Parker Lane,New York +name,address,city,age,height +Paul,15 Penny Lane,Liverpool,10,142 +Michelle,600 Pennsylvania Avenue,Washington,12,1 +Marilyn,1313 Mockingbird Lane,Springfield,14,2 +Sherlock,221B Baker Street,London,16,123 +Clark,66 Lois Lane,Smallville,18,141 +Bart,742 Evergreen Terrace,Springfield,70,142 +Paul,19 N Square,Boston,70,55.2 +Peter,66 Parker Lane,New York,74,11.5123 eos @address_data.rewind @address_data diff --git a/spec/fixtures/numeric_data/data.yaml b/spec/fixtures/numeric_data/data.yaml new file mode 100644 index 00000000..80ddf3a5 --- /dev/null +++ b/spec/fixtures/numeric_data/data.yaml @@ -0,0 +1,21 @@ +# cities100.txt +# Test YAML file +index: numeric-data +api: cities + +dictionary: + name: + source: name + type: string + address: + source: address + type: string + city: + source: city + type: string + age: + source: age + type: integer + height: + source: height + type: float diff --git a/spec/lib/data_magic/config_spec.rb b/spec/lib/data_magic/config_spec.rb index fddbea40..0e02765c 100644 --- a/spec/lib/data_magic/config_spec.rb +++ b/spec/lib/data_magic/config_spec.rb @@ -80,7 +80,6 @@ "version" => "cities100-2010", "index" => "city-data", "api" => "cities", "files" => [{ "name" => "cities100.csv" }], - "data_path" => "./sample-data", "options" => {:search=>"dictionary_only"}, "unique" => ["name"], "data_path" => "./sample-data" @@ -89,7 +88,7 @@ dictionary = config.data.delete 'dictionary' expect(dictionary.keys.sort).to eq %w(id code name state population - location.lat location.lon area.land area.water).sort + location.lat location.lon land_area area.water).sort categories = config.data.delete 'categories' expect(categories.keys.sort).to eq %w(general general2 general3 general4 general5 geographic).sort expect(config.data).to eq(default_config) diff --git a/spec/lib/data_magic/search_spec.rb b/spec/lib/data_magic/search_spec.rb index 6afe45db..16915903 100644 --- a/spec/lib/data_magic/search_spec.rb +++ b/spec/lib/data_magic/search_spec.rb @@ -27,22 +27,27 @@ it "can find document with one attribute" do result = DataMagic.search({name: "Marilyn"}) - expected["results"] = [{"name" => "Marilyn", "address" => "1313 Mockingbird Lane", "city" => "Springfield"}] + expected["results"] = [{"name" => "Marilyn", "address" => "1313 Mockingbird Lane", "city" => "Springfield", + "age" => "14", "height" => "2"}] expect(result).to eq(expected) end it "can find document with multiple search terms" do result = DataMagic.search({name: "Paul", city:"Liverpool"}) - expected["results"] = [{"name" => "Paul", "address" => "15 Penny Lane", "city" => "Liverpool"}] + expected["results"] = [{"name" => "Paul", "address" => "15 Penny Lane", "city" => "Liverpool", + "age" => "10", "height" => "142"}] expect(result).to eq(expected) end it "can find a document with a set of values delimited by commas" do result = DataMagic.search({name: "Paul,Marilyn"}) expected['metadata']["total"] = 3 - expect(result["results"]).to include({"name" => "Marilyn", "address" => "1313 Mockingbird Lane", "city" => "Springfield"}) - expect(result["results"]).to include({"name" => "Paul", "address" => "15 Penny Lane", "city" => "Liverpool"}) - expect(result["results"]).to include({"name" => "Paul", "address" => "19 N Square", "city" => "Boston"}) + expect(result["results"]).to include({"name" => "Marilyn", "address" => "1313 Mockingbird Lane", "city" => "Springfield", + "age" => "14", "height" => "2"}) + expect(result["results"]).to include({"name" => "Paul", "address" => "15 Penny Lane", "city" => "Liverpool", + "age" => "10", "height" => "142"}) + expect(result["results"]).to include({"name" => "Paul", "address" => "19 N Square", "city" => "Boston", + "age" => "70", "height" => "55.2"}) end it "can return a single attribute" do @@ -68,7 +73,6 @@ expect(result).to eq(expected) end - describe "supports pagination" do it "can specify both page and page size" do result = DataMagic.search({ address: "Lane" }, page:1, per_page: 3) @@ -84,8 +88,8 @@ expect(result["results"].length).to eq(0) end end - end + describe "with mapping" do before (:all) do ENV['DATA_PATH']="./no-data" @@ -108,9 +112,52 @@ end + end + + describe "with numeric data" do + before (:all) do + ENV['DATA_PATH'] = './spec/fixtures/numeric_data' + DataMagic.init(load_now: false) + num_rows, fields = DataMagic.import_csv(address_data) + end + after(:all) do + DataMagic.destroy + end + + it "can correctly compute filtered statistics" do + expected["metadata"]["total"] = 2 + result = DataMagic.search({city: "Springfield"}, command: 'stats', fields: ["age", "height", "address"], + metrics: ['max', 'avg']) + result["results"] = result["results"].sort_by { |k| k["age"] } + + expected["results"] = [] + expected["aggregations"] = { + "age" => { "max" => 70.0, "avg" => 42.0}, + "height" => {"max"=>142.0, "avg"=>72.0} + } + expect(result).to eq(expected) + end + it "can correctly compute unfiltered statistics" do + expected["metadata"]["total"] = 2 + result = DataMagic.search({city: "Springfield"}, command: 'stats', fields: ["age", "height", "address"]) + result["results"] = result["results"].sort_by { |k| k["age"] } + + expected["results"] = [] + expected["aggregations"] = { + "age"=>{ + "count"=>2, "min"=>14.0, "max"=>70.0, "avg"=>42.0, "sum"=>84.0, "sum_of_squares"=>5096.0, "variance"=>784.0, "std_deviation"=>28.0, "std_deviation_bounds"=>{"upper"=>98.0, "lower"=>-14.0}}, + "height"=>{ + "count"=>2, "min"=>2.0, "max"=>142.0, "avg"=>72.0, "sum"=>144.0, "sum_of_squares"=>20168.0, "variance"=>4900.0, "std_deviation"=>70.0, "std_deviation_bounds"=>{"upper"=>212.0, "lower"=>-68.0} + } + } + + expect(result["age"]).to eq(expected["age"]) + expect(result["height"]).to eq(expected["height"]) + end end + describe "with geolocation" do before (:all) do ENV['DATA_PATH'] = './spec/fixtures/geo_no_files'