diff --git a/README.md b/README.md index c17f155..5faf9c0 100644 --- a/README.md +++ b/README.md @@ -26,11 +26,19 @@ creek = Creek::Book.new 'spec/fixtures/sample.xlsx' sheet = creek.sheets[0] sheet.rows.each do |row| - puts row # => {"A1"=>"Content 1", "B1"=>nil, C1"=>nil, "D1"=>"Content 3"} + puts row # => ["Content 1", nil, nil, "Content 3"] +end + +sheet.rows(headers: true).each do |row| + puts row # => { 'header1' => "Content 1", 'header2' => nil, 'header3' => nil, 'header4' => "Content 3" } end sheet.rows_with_meta_data.each do |row| - puts row # => {"collapsed"=>"false", "customFormat"=>"false", "customHeight"=>"true", "hidden"=>"false", "ht"=>"12.1", "outlineLevel"=>"0", "r"=>"1", "cells"=>{"A1"=>"Content 1", "B1"=>nil, C1"=>nil, "D1"=>"Content 3"}} + puts row # => {"collapsed"=>"false", "customFormat"=>"false", "customHeight"=>"true", "hidden"=>"false", "ht"=>"12.1", "outlineLevel"=>"0", "r"=>"1", "header_row" => false, "cells"=>["Content 1", nil, nil, "Content 3"]} +end + +sheet.rows_with_meta_data(headers: true).each do |row| + puts row # => {"collapsed"=>"false", "customFormat"=>"false", "customHeight"=>"true", "hidden"=>"false", "ht"=>"12.1", "outlineLevel"=>"0", "r"=>"1", "header_row" => false, "cells"=>{ 'header1' => "Content 1", 'header2' => nil, 'header3' => nil, 'header4' => "Content 3" }} end sheet.state # => 'visible' @@ -38,6 +46,48 @@ sheet.name # => 'Sheet1' sheet.rid # => 'rId2' ``` +## Headers +`rows` and `rows_with_meta_data` both accept the kwargs `headers` and `header_row_number` to load +the rows as hashes with the headers as keys. Also, a `header_row` boolean is added to the row metadata. +See examples above. + +Headers (as an array or nil if empty) are loaded once by parsing the file a first time until the `header_row_number` is reached. +Rows are then returned as an Enumerator as usual (a new Enumerator instance starting from the beginning of the file, which will include the header row as well). 
It's the caller's responsibility to filter the header row as needed. + +`extract_headers` can also be called manually from the sheet instance. +Once extracted, the headers can be accessed through the `headers` attr_reader. +As headers are matched to their respective value in the row by index, it's possible to modify the array in `headers` to customize the headers (to fix typos, make them unique, etc.). + +An empty header row returns `nil` and the rows are then returned as an array (same as calling `rows(headers: false)`). + +```ruby +creek = Creek::Book.new 'spec/fixtures/sample.xlsx' +sheet = creek.sheets[0] + +# Parse the file up to row 3 (file starts at row 1) +sheet.extract_headers(3) +# => ['Header1', 'Other Header', 'More header'] + +sheet.headers +# => ['Header1', 'Other Header', 'More header'] + +# Headers can be modified before parsing the file to customize them +sheet.headers[0] = 'A better Header' +sheet.headers +# => ['A better Header', 'Other Header', 'More header'] + +# Parse the rows as hashes, including the (modified) headers +sheet.rows(headers: true).each do |row| + puts row # => { 'A better Header' => "Content 1", 'Other Header' => nil, 'More header' => nil } +end + +# Or both can be done directly when accessing rows +sheet2 = creek.sheets[1] +sheet2.rows(headers: true, header_row_number: 3).each do |row| + puts row # => { 'Header1' => "Content 2", 'Other Header' => nil, 'More header' => nil } +end +``` + ## Filename considerations By default, Creek will ensure that the file extension is either *.xlsx or *.xlsm, but this check can be circumvented as needed: @@ -82,13 +132,6 @@ puts sheet.images_at('C1') # => nil Creek will most likely return nil for a cell with images if there is no other text cell in that row - you can use *images_at* method for retrieving images in that cell. 
-## Remote files - -```ruby -remote_url = 'http://dev-builds.libreoffice.org/tmp/test.xlsx' -Creek::Book.new remote_url, remote: true -``` - ## Contributing Contributions are welcomed. You can fork a repository, add your code changes to the forked branch, ensure all existing unit tests pass, create new unit tests which cover your new changes and finally create a pull request. diff --git a/creek.gemspec b/creek.gemspec index 45cdc00..09dc319 100644 --- a/creek.gemspec +++ b/creek.gemspec @@ -28,5 +28,4 @@ Gem::Specification.new do |spec| spec.add_dependency 'nokogiri', '>= 1.7.0' spec.add_dependency 'rubyzip', '>= 1.0.0' - spec.add_dependency 'httparty', '~> 0.15.5' end diff --git a/lib/creek/book.rb b/lib/creek/book.rb index 8c51151..a6d8fda 100644 --- a/lib/creek/book.rb +++ b/lib/creek/book.rb @@ -1,7 +1,6 @@ require 'zip/filesystem' require 'nokogiri' require 'date' -require 'httparty' module Creek @@ -20,13 +19,7 @@ def initialize path, options = {} extension = File.extname(options[:original_filename] || path).downcase raise 'Not a valid file format.' unless (['.xlsx', '.xlsm'].include? 
extension) end - if options[:remote] - zipfile = Tempfile.new("file") - zipfile.binmode - zipfile.write(HTTParty.get(path).body) - zipfile.close - path = zipfile.path - end + @files = Zip::File.open(path) @shared_strings = SharedStrings.new(self) end diff --git a/lib/creek/shared_strings.rb b/lib/creek/shared_strings.rb index 677d37d..12b630a 100644 --- a/lib/creek/shared_strings.rb +++ b/lib/creek/shared_strings.rb @@ -24,7 +24,7 @@ def parse_shared_shared_strings when Nokogiri::XML::Reader::TYPE_ELEMENT case node.name when 'si' then - str = '' + str = ''.dup when 't' then buffer = true end @@ -46,4 +46,3 @@ def parse_shared_shared_strings end end end - diff --git a/lib/creek/sheet.rb b/lib/creek/sheet.rb index dc76f1e..5fdb503 100644 --- a/lib/creek/sheet.rb +++ b/lib/creek/sheet.rb @@ -11,10 +11,17 @@ class Creek::Sheet :state, :visible, :rid, - :index - + :index, + :headers + + # An XLS file has only 256 columns, however, an XLSX or XLSM file can contain up to 16384 columns. + # This function creates a hash with all valid XLSX column names and associated indices. + # Note: load and memoize on demand + def self.column_indexes + @column_indexes ||= ('A'..'XFD').each_with_index.to_h.freeze + end - def initialize book, name, sheetid, state, visible, rid, sheetfile + def initialize(book, name, sheetid, state, visible, rid, sheetfile) @book = book @name = name @sheetid = sheetid @@ -23,13 +30,10 @@ def initialize book, name, sheetid, state, visible, rid, sheetfile @state = state @sheetfile = sheetfile @images_present = false + end - # An XLS file has only 256 columns, however, an XLSX or XLSM file can contain up to 16384 columns. - # This function creates a hash with all valid XLSX column names and associated indices. 
- @excel_col_names = Hash.new - ('A'..'XFD').each_with_index do |col_name, index| - @excel_col_names[col_name] = index - end + def column_indexes + self.class.column_indexes end ## @@ -56,15 +60,33 @@ def images_at(cell) ## # Provides an Enumerator that returns a hash representing each row. # The key of the hash is the Cell id and the value is the value of the cell. - def rows - rows_generator + def rows(headers: false, header_row_number: 1, metadata: false) + extract_headers(header_row_number) if headers + + rows_generator(include_headers: headers, include_meta_data: metadata) end ## # Provides an Enumerator that returns a hash representing each row. # The hash contains meta data of the row and a 'cells' embended hash which contains the cell contents. - def rows_with_meta_data - rows_generator true + def rows_with_meta_data(headers: false, header_row_number: 1) + rows(headers: headers, header_row_number: header_row_number, metadata: true) + end + + # Parses the file until the header row is reached. + # Returns the headers as an array, or nil if the headers are empty. + def extract_headers(row_number = 1) + return @headers if defined?(@headers) + + # Extracted row numbers are String, convert it here to facilitate comparison + @header_row_number = row_number.to_s + + rows_with_meta_data.each do |row| + if @header_row_number == row['r'] + @headers = row['cells'] if row['cells'].any? + return @headers + end + end end private @@ -79,52 +101,65 @@ def rows_with_meta_data TEXT = 't'.freeze ## - # Returns a hash per row that includes the cell ids and values. - # Empty cells will be also included in the hash with a nil value. - def rows_generator include_meta_data=false - path = if @sheetfile.start_with? "/xl/" or @sheetfile.start_with? "xl/" then @sheetfile else "xl/#{@sheetfile}" end + # Returns an array or hash (with headers as key) per row that includes the cell ids and values. + # Empty cells will be also included with a nil value. 
+ def rows_generator(include_meta_data: false, include_headers: false) + path = + if @sheetfile.start_with?("/xl/") || @sheetfile.start_with?("xl/") + @sheetfile + else + "xl/#{@sheetfile}" + end + if @book.files.file.exist?(path) # SAX parsing, Each element in the stream comes through as two events: # one to open the element and one to close it. opener = Nokogiri::XML::Reader::TYPE_ELEMENT closer = Nokogiri::XML::Reader::TYPE_END_ELEMENT + Enumerator.new do |y| - row, cells, cell = nil, {}, nil + row, cells, cell = nil, [], nil row_number = nil cell_type = nil cell_style_idx = nil + @book.files.file.open(path) do |xml| Nokogiri::XML::Reader.from_io(xml).each do |node| node_name = node.name - next unless node_name == CELL || node_name == ROW || node_name == VALUE || node_name == TEXT + next if node.node_type != opener && node_name != ROW + if node_name == ROW case node.node_type - when opener then + when opener row = node.attributes row_number = row[ROW_NUMBER] - if spans = row['spans'] + + if (spans = row['spans']) spans = spans.split(":").last.to_i - 1 else spans = 0 end + cells = Array.new(spans) - row['cells'] = cells - y << (include_meta_data ? row : cells) if node.self_closing? + + if node.self_closing? + y << to_formatted_row(row, cells, include_meta_data, include_headers) + end when closer - y << (include_meta_data ? 
row : cells) + y << to_formatted_row(row, cells, include_meta_data, include_headers) end - elsif (node_name == CELL) && node.node_type == opener + elsif node_name == CELL attributes = node.attributes cell_type = attributes[CELL_TYPE] cell_style_idx = attributes[STYLE_INDEX] cell = attributes[CELL_REF] - elsif node_name == VALUE && node.node_type == opener + elsif node_name == VALUE if cell - cells[@excel_col_names[cell.sub(row_number, '')]] = convert(node.inner_xml, cell_type, cell_style_idx) + cells[column_indexes[cell.sub(row_number, '')]] = convert(node.inner_xml, cell_type, cell_style_idx) end - elsif node_name == TEXT && node.node_type == opener + elsif node_name == TEXT if cell - cells[@excel_col_names[cell.sub(row_number, '')]] = convert(node.inner_xml, cell_type, cell_style_idx) + cells[column_indexes[cell.sub(row_number, '')]] = convert(node.inner_xml, cell_type, cell_style_idx) end end end @@ -133,6 +168,24 @@ def rows_generator include_meta_data=false end end + def to_formatted_row(row, cells, include_meta_data, include_headers) + if include_headers + row['header_row'] = row[ROW_NUMBER] == @header_row_number + cells = cells_with_headers(cells) if @headers + end + + if include_meta_data + row['cells'] = cells + row + else + cells + end + end + + def cells_with_headers(cells) + cells.empty? ? 
{} : @headers.zip(cells).to_h + end + def convert(value, type, style_idx) style = @book.style_types[style_idx.to_i] Creek::Styles::Converter.call(value, type, style, converter_options) diff --git a/lib/creek/styles/constants.rb b/lib/creek/styles/constants.rb index 1b4f156..4cde485 100644 --- a/lib/creek/styles/constants.rb +++ b/lib/creek/styles/constants.rb @@ -36,9 +36,6 @@ module Constants 48 => :bignum, # ##0.0E+0 49 => :unsupported # @ } - - DATE_SYSTEM_1900 = 25569 #Date.new(1899, 12, 30) - DATE_SYSTEM_1904 = Date.new(1904, 1, 1) end end end diff --git a/lib/creek/styles/converter.rb b/lib/creek/styles/converter.rb index 786052c..aaefba4 100644 --- a/lib/creek/styles/converter.rb +++ b/lib/creek/styles/converter.rb @@ -60,8 +60,10 @@ def self.call(value, type, style, options = {}) value.to_i when :float, :percentage value.to_f - when :date, :time, :date_time + when :date convert_date(value, options) + when :time, :date_time + convert_datetime(value, options) when :bignum convert_bignum(value) @@ -71,21 +73,25 @@ def self.call(value, type, style, options = {}) end end - # the trickiest. note that all these formats can vary on - # whether they actually contain a date, time, or datetime. 
def self.convert_date(value, options) - value = value.to_f + base_date(options) + value.to_i + end - Time.at(((value - DATE_SYSTEM_1900) * 86400).round) + def self.convert_datetime(value, options) + base_date(options).to_datetime + value.to_f.round(6) end def self.convert_bignum(value) if defined?(BigDecimal) - BigDecimal.new(value) + BigDecimal(value) else value.to_f end end + + def self.base_date(options) + options.fetch(:base_date, Date.new(1899, 12, 30)) + end end end end diff --git a/spec/fixtures/sample_dates.xlsx b/spec/fixtures/sample_dates.xlsx new file mode 100644 index 0000000..e4edea5 Binary files /dev/null and b/spec/fixtures/sample_dates.xlsx differ diff --git a/spec/fixtures/sheets/sample_dates.xlsx b/spec/fixtures/sheets/sample_dates.xlsx new file mode 100644 index 0000000..0a5d92f Binary files /dev/null and b/spec/fixtures/sheets/sample_dates.xlsx differ diff --git a/spec/fixtures/sheets/single_data_programme.xlsx b/spec/fixtures/sheets/single_data_programme.xlsx new file mode 100644 index 0000000..468537f Binary files /dev/null and b/spec/fixtures/sheets/single_data_programme.xlsx differ diff --git a/spec/styles/converter_spec.rb b/spec/styles/converter_spec.rb index 270c9ad..c9628bc 100644 --- a/spec/styles/converter_spec.rb +++ b/spec/styles/converter_spec.rb @@ -3,13 +3,20 @@ describe Creek::Styles::Converter do describe :call do + def convert(value, type, style) Creek::Styles::Converter.call(value, type, style) end + describe :date do + it "works" do + expect(convert('41275', 'n', :date)).to eq(Date.new(2013,01,01)) + end + end + describe :date_time do it "works" do - expect(convert('41275', 'n', :date_time)).to eq(Date.new(2013,01,01)) + expect(convert('41275', 'n', :date_time)).to eq(DateTime.new(2013,01,01)) end end end diff --git a/spec/test_spec.rb b/spec/test_spec.rb index 062e6b3..d1f648a 100644 --- a/spec/test_spec.rb +++ b/spec/test_spec.rb @@ -26,6 +26,34 @@ end end +describe 'Creek parsing dates on a sample XLSX file' do + 
before(:all) do + @creek = Creek::Book.new 'spec/fixtures/sample_dates.xlsx' + + @expected_datetime_rows = [ + {'A3' => 'Date', 'B3' => Date.parse('2018-01-01')}, + {'A4' => 'Datetime 00:00:00', 'B4' => Time.parse('2018-01-01 00:00:00')}, + {'A5' => 'Datetime', 'B5' => Time.parse('2018-01-01 23:59:59')}] + end + + after(:all) do + @creek.close + end + + it 'parses dates successfully' do + rows = Array.new + row_count = 0 + @creek.sheets[0].rows.each do |row| + rows << row + row_count += 1 + end + + (2..5).each do |number| + expect(rows[number]).to eq(@expected_datetime_rows[number-2]) + end + end +end + describe 'Creek parsing a sample XLSX file' do before(:all) do @creek = Creek::Book.new 'spec/fixtures/sample.xlsx' @@ -63,15 +91,9 @@ row_count += 1 end - expect(rows[0]).to eq(@expected_rows[0]) - expect(rows[1]).to eq(@expected_rows[1]) - expect(rows[2]).to eq(@expected_rows[2]) - expect(rows[3]).to eq(@expected_rows[3]) - expect(rows[4]).to eq(@expected_rows[4]) - expect(rows[5]).to eq(@expected_rows[5]) - expect(rows[6]).to eq(@expected_rows[6]) - expect(rows[7]).to eq(@expected_rows[7]) - expect(rows[8]).to eq(@expected_rows[8]) + (0..8).each do |number| + expect(rows[number]).to eq(@expected_rows[number]) + end expect(row_count).to eq(9) end