diff --git a/lib/nimble_csv.ex b/lib/nimble_csv.ex index 16e63c6..d62c545 100644 --- a/lib/nimble_csv.ex +++ b/lib/nimble_csv.ex @@ -91,6 +91,27 @@ defmodule NimbleCSV do defexception [:message] end + @doc """ + Returns the options used to define this parser/dumper module. + + This function allows you to retrieve the original options and use them + as a base for defining new modules with modified options. + + ## Examples + + # Create a new parser based on RFC4180 but with formula escaping + NimbleCSV.define( + MyApp.CSV, + NimbleCSV.RFC4180.options() + |> Keyword.merge( + escape_formula: %{["@", "+", "-", "=", "\\t", "\\r"] => "'"}, + moduledoc: "RFC4180 with formula escaping" + ) + ) + + """ + @callback options() :: keyword() + @doc """ Eagerly dumps an enumerable into iodata (a list of binaries and bytes and other lists). """ @@ -207,10 +228,10 @@ defmodule NimbleCSV do * `:dump_bom` - includes BOM (byte order marker) in the dumped document * `:reserved` - the list of characters to be escaped, defaults to the `:separator`, `:newlines`, and `:escape` characters above - * `:escape_formula` - the formula prefix(es) and formula escape sequence, - defaults to `nil`, which disabled formula escaping - `%{["@", "+", "-", "=", "\t", "\r"] => "'"}` would escape all fields starting - with `@`, `+`, `-`, `=`, tab or carriage return using the `'` character. + * `:escape_formula` - an optional map of formula prefixes to escape sequences. + When `nil` (the default), formula escaping is disabled. For example, + `%{["@", "+", "-", "=", "\t", "\r"] => "'"}` escapes fields starting with `@`, `+`, `-`, `=`, + tab, or carriage return by prefixing them with `'`. Although parsing may support multiple newline delimiters, when dumping, only one of them must be picked, which is controlled by @@ -239,17 +260,27 @@ defmodule NimbleCSV do `@`, `+`, `-`, `=`, tab or carriage return). 
Use the following config to follow the [OWASP recommendations](https://owasp.org/www-community/attacks/CSV_Injection): - escape_formula: %{["@", "+", "-", "=", "\t", "\r"] => "'"} + escape_formula: %{["@", "+", "-", "=", "\t", "\r"] => "'"} Applications that want more control over this process, to allow formulas in specific cases, or possibly minimize false positives, should leave this option disabled and escape the value, as necessary, within their code. + + ## Extending existing CSV modules + + Each module defined with `define/2` includes an `c:options/0` function that + returns the original options used to create the CSV module. This allows you + to easily create new modules based on existing ones. For example, you can + extend an existing CSV module to add formula escaping or customize other + options as needed. """ def define(module, options) do defmodule module do @behaviour NimbleCSV @moduledoc Keyword.get(options, :moduledoc) + @original_options options + @escape Keyword.get(options, :escape, "\"") @escape_formula Enum.to_list(Keyword.get(options, :escape_formula, [])) @@ -342,6 +373,8 @@ defmodule NimbleCSV do @compile {:inline, maybe_dump_bom: 1, maybe_trim_bom: 1, maybe_to_utf8: 1, maybe_to_encoding: 1} + def options, do: @original_options + ## Parser def parse_stream(stream, opts \\ []) when is_list(opts) do diff --git a/test/nimble_csv_test.exs b/test/nimble_csv_test.exs index f6485ff..862bd62 100644 --- a/test/nimble_csv_test.exs +++ b/test/nimble_csv_test.exs @@ -9,6 +9,15 @@ defmodule NimbleCSVTest do line_separator: "\r\n" ) + NimbleCSV.define( + DerivedParser, + CSV.options() + |> Keyword.merge( + escape_formula: %{~w(@ + - = \t \r) => "'"}, + moduledoc: "Test parser based on RFC4180" + ) + ) + test "parse_string/2 without headers" do assert CSV.parse_string(""" name,last,year @@ -488,6 +497,39 @@ defmodule NimbleCSVTest do Spreadsheet.to_line_stream(stream) |> Spreadsheet.parse_stream() |> Enum.to_list() end + test "options/0 returns the original options" do + 
# Test that RFC4180 has the expected options + rfc4180_options = CSV.options() + assert Keyword.get(rfc4180_options, :separator) == "," + assert Keyword.get(rfc4180_options, :escape) == "\"" + assert Keyword.get(rfc4180_options, :line_separator) == "\r\n" + + # Test that Spreadsheet has the expected options + spreadsheet_options = Spreadsheet.options() + assert Keyword.get(spreadsheet_options, :separator) == "\t" + assert Keyword.get(spreadsheet_options, :encoding) == {:utf16, :little} + assert Keyword.get(spreadsheet_options, :trim_bom) == true + assert Keyword.get(spreadsheet_options, :dump_bom) == true + end + + test "creating a new parser based on existing options" do + # Verify the new parser has the combined options + test_options = DerivedParser.options() + assert Keyword.get(test_options, :separator) == "," + assert Keyword.get(test_options, :escape) == "\"" + assert Keyword.get(test_options, :escape_formula) == %{~w(@ + - = \t \r) => "'"} + assert Keyword.get(test_options, :moduledoc) == "Test parser based on RFC4180" + + # Test that the new parser works + assert DerivedParser.parse_string("name,value\njohn,123") == [~w(john 123)] + + # Test that formula escaping is applied + data = [~w(name formula), ["test", "@SUM(A1:A2)"]] + result = DerivedParser.dump_to_iodata(data) + dumped = IO.iodata_to_binary(result) + assert dumped == "name,formula\r\ntest,'@SUM(A1:A2)\r\n" + end + defp utf16le(binary), do: :unicode.characters_to_binary(binary, :utf8, {:utf16, :little}) defp utf16le_bom(), do: :unicode.encoding_to_bom({:utf16, :little}) end