diff --git a/lib/mw_dictionary_api/parsers/definition_parser.rb b/lib/mw_dictionary_api/parsers/definition_parser.rb index bd59a3d..943aefa 100644 --- a/lib/mw_dictionary_api/parsers/definition_parser.rb +++ b/lib/mw_dictionary_api/parsers/definition_parser.rb @@ -26,6 +26,10 @@ class DefinitionParser end end + rule :sense_divider do |data, opts| + data[:sense_divider] && data[:sense_divider].content + end + rule :text do |data, opts| dt_without_vi = data[:dt].dup if dt_without_vi.respond_to? :css diff --git a/lib/mw_dictionary_api/parsers/entry_parser.rb b/lib/mw_dictionary_api/parsers/entry_parser.rb index 9faabb2..c5fc9d2 100644 --- a/lib/mw_dictionary_api/parsers/entry_parser.rb +++ b/lib/mw_dictionary_api/parsers/entry_parser.rb @@ -40,10 +40,11 @@ class EntryParser rule :definitions do |data, opts| nodes = data.xpath("def//sn | def//dt") + sd = nil # first step we will add dummy nodes if the list of nodes is not # strictly sn/dt pairs - nodes = add_dumy_nodes(nodes) if nodes.count % 2 != 0 + nodes = add_dummy_nodes(nodes) # data.xpath("def//sn | def//dt") nodes.each_slice(2).inject([]) do |definitions, nodes| @@ -57,6 +58,7 @@ class EntryParser end hash = Hash[names.zip(values)] hash[:prev_sn] = definitions[-1][:sense_number] if definitions[-1] + hash[:sense_divider] = sd if sd = previous_sense_divider(nodes[1]) definitions << DefinitionParser.new(parser_options(opts)).parse(hash) end @@ -87,6 +89,18 @@ class EntryParser inflections end + rule :undefined_run_ons do |data, opts| + data.xpath("uro").inject([]) do |uros, uro_node| + hash = {} + hash[:entry] = parse_entity(uro_node, "ure") + hash[:sound] = parse_entity(uro_node, "sound wav") + hash[:pronunciation] = parse_entity(uro_node, "pr") + hash[:part_of_speech] = parse_entity(uro_node, "fl") + + uros << hash + end + end + rule_helpers do def parser_options(opts) { api_type: opts[:api_type], response_format: opts[:response_format] } @@ -96,7 +110,15 @@ def parse_entity(data, tag) data.at_css(tag).content if data.at_css(tag) end - def add_dumy_nodes(nodes) + def previous_sense_divider(node) + if node.previous_element && node.previous_element.name == 'sd' + node.previous_element + else + nil + end + end + + def add_dummy_nodes(nodes) temp = [] previous_sense_number = nil nodes.each do |node| diff --git a/spec/fixtures/insouciance_collegiate.xml b/spec/fixtures/insouciance_collegiate.xml new file mode 100644 index 0000000..2606b5a --- /dev/null +++ b/spec/fixtures/insouciance_collegiate.xml @@ -0,0 +1,4 @@ + + + insouciancein*sou*ci*anceinsouc02.wavin-!sU-sE-un(t)sinsouc01.wava~-sUs-y@~sin-ˈsü-sē-ən(t)s, aⁿ-süs-ˈyäⁿsnounFrench, from in- + soucier to trouble, disturb, from Old French, from Latin sollicitare solicit1799
:lighthearted unconcern :nonchalance
in*sou*ci*antinsouc03.wavin-!sU-sE-unt in-ˈsü-sē-ənt, aⁿ-süs-yäⁿ adjectivein*sou*ci*ant*lyinsouc04.wavin-!sU-sE-unt-lE in-ˈsü-sē-ənt-lē adverb
+
\ No newline at end of file diff --git a/spec/fixtures/scant_collegiate.xml b/spec/fixtures/scant_collegiate.xml new file mode 100644 index 0000000..dfe1f5a --- /dev/null +++ b/spec/fixtures/scant_collegiate.xml @@ -0,0 +1,93 @@ + + + + scant + DI-1 + scant + + scant001.wav + !skant + + ˈskant + adjective + Middle English, from Old Norse + skamt, + neuter of + skammr + short + + 14th century + 1 + dialect + a +
:excessively frugal
+ b +
:not prodigal : + chary
+ 2 a +
:barely or scarcely sufficient
+ especially +
:not quite coming up to a stated measure + + a + scant + teaspoon +
+ b +
:lacking in amplitude or quantity + + scant + growth +
+ 3 +
:having a small or insufficient supply + + he's fat, and + scant + of breath + Shakespeare +
+ meager +
+ + scant*ly + adverb + + + scant*ness + noun + +
+ + scant + DI-1 + scant + adverb + + 15th century + dialect +
: + scarcely + hardly
+
+
+ + scant + scant + verb + + transitive verb + circa 1580 + 1 +
:to provide an incomplete supply of
+ 2 +
:to make small, narrow, or meager
+ 3 +
:to give scant attention to : + slight
+ 4 +
:to provide with a meager or inadequate portion or supply : + stint
+
+
+
\ No newline at end of file diff --git a/spec/lib/mw_dictionary_api/parsers/entry_parser_spec.rb b/spec/lib/mw_dictionary_api/parsers/entry_parser_spec.rb index e07d6a5..cee2c4d 100644 --- a/spec/lib/mw_dictionary_api/parsers/entry_parser_spec.rb +++ b/spec/lib/mw_dictionary_api/parsers/entry_parser_spec.rb @@ -18,6 +18,9 @@ module Parsers let(:one_collegiate_entry) { one_collegiate_xml_doc.at_css("entry") } let(:shrift_collegiate_entry) { Nokogiri::XML(File.open(fixture_path('shrift_collegiate.xml')).read).at_css("entry") } + let(:scant_collegiate_entry) { Nokogiri::XML(File.open(fixture_path('scant_collegiate.xml')).read).at_css("entry") } + + let(:insouciance_entry) { Nokogiri::XML(File.open(fixture_path('insouciance_collegiate.xml')).read).at_css("entry") } let(:parser) { EntryParser.new } @@ -96,11 +99,41 @@ def parse(data) end describe "definitions" do - it 'returns a list of definition pairs' do - definitions = parse(shrift_collegiate_entry)[:definitions] - expect(definitions.count).to eq 4 + context "when there's an odd number of sense/definition pairs" do + it 'returns a list of definition pairs' do + definitions = parse(shrift_collegiate_entry)[:definitions] + expect(definitions.count).to eq 4 + end + end + + context "when there's a mismatched set of sense/definition pairs" do + it 'returns a list of definition pairs' do + definitions = parse(scant_collegiate_entry)[:definitions] + expect(definitions.count).to eq 7 + end + end + + it 'identifies sense dividers in adjacent definitions' do + definitions = parse(scant_collegiate_entry)[:definitions] + expect(definitions[4][:sense_divider]).to eq 'especially' end end + + describe "undefined_run_ons" do + let(:undefined_run_ons) { parse(one_entry1)[:undefined_run_ons] } + let(:insouciance_uros) { parse(insouciance_entry)[:undefined_run_ons] } + + it "returns a list of run_ons if available" do + expect(undefined_run_ons).to be_empty + + expect(insouciance_uros).to eq([ + {:entry=>"in*sou*ci*ant", :sound=>"insouc03.wav", + :pronunciation=>"in-ˈsü-sē-ənt, aⁿ-süs-yäⁿ", :part_of_speech=>"adjective"}, + {:entry=>"in*sou*ci*ant*ly", :sound=>"insouc04.wav", + :pronunciation=>"in-ˈsü-sē-ənt-lē", :part_of_speech=>"adverb"}]) + end + + end end end end