-
Notifications
You must be signed in to change notification settings - Fork 5
Feature/csv destination #92
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| import csv | ||
| import dask.dataframe as dd | ||
| import jinja2 | ||
| import os | ||
| import pandas as pd | ||
|
|
@@ -18,8 +19,11 @@ class Destination(Node): | |
| mode: str = None # Documents which class was chosen. | ||
| allowed_configs: Tuple[str] = ('debug', 'expect', 'show_progress', 'repartition', 'source',) | ||
|
|
||
| def __new__(cls, *args, **kwargs): | ||
| return object.__new__(FileDestination) | ||
def __new__(cls, name: str, config: 'YamlMapping', *, earthmover: 'Earthmover'):
    """
    Pick the concrete destination class from the configured `extension`.

    A `csv` or `tsv` extension yields a `CsvDestination`; any other value
    (including a missing key) yields a `FileDestination`.
    """
    extension = config.get('extension')
    chosen_class = CsvDestination if extension in ('csv', 'tsv') else FileDestination
    return object.__new__(chosen_class)
|
|
||
| def __init__(self, *args, **kwargs): | ||
| super().__init__(*args, **kwargs) | ||
|
|
@@ -132,3 +136,79 @@ def render_row(self, row: pd.Series): | |
| raise | ||
|
|
||
| return json_string | ||
|
|
||
|
|
||
| class CsvDestination(Destination): | ||
| """ | ||
|
|
||
| """ | ||
| mode: str = 'csv' # Documents which class was chosen. | ||
| allowed_configs: Tuple[str] = ( | ||
| 'debug', 'expect', 'show_progress', 'repartition', 'source', | ||
| 'extension', 'header', 'separator', 'limit', 'keep_columns' | ||
| ) | ||
|
|
||
|
|
||
| def __init__(self, *args, **kwargs): | ||
| super().__init__(*args, **kwargs) | ||
| self.header = self.error_handler.assert_get_key(self.config, 'header', dtype=bool, required=False, default=True) | ||
| self.separator = self.error_handler.assert_get_key(self.config, 'separator', dtype=str, required=False, default=",") | ||
| self.limit = self.error_handler.assert_get_key(self.config, 'limit', dtype=int, required=False, default=None) | ||
| self.extension = self.error_handler.assert_get_key(self.config, 'extension', dtype=str, required=False, default="csv") | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This field is technically required, since it has to be populated to initialize the CSV Destination.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I used a different approach here, adding another This does perhaps make Eventually we may add an (optional) (earthmover could parse the
Eventually we may add an optional One other relatively unrelated comment, if/when we support writing to databases, the order in which we process destinations will become important (if there are primary/foreign key references in the data). Currently there's no way in earthmover to control the order in which destinations are processed, we'd have to figure out how to handle that... maybe (like dbt does) an optional |
||
| self.keep_columns = self.error_handler.assert_get_key(self.config, 'keep_columns', required=False, default=None) | ||
|
|
||
| self.file = os.path.join( | ||
| self.earthmover.state_configs['output_dir'], | ||
| f"{self.name}.{self.extension}" | ||
| ) | ||
|
|
||
def execute(self, **kwargs):
    """
    Write the upstream source's dataframe to a single delimited text file.

    Steps, in order: optionally truncate to the first `limit` rows, create the
    output directory, optionally subset to `keep_columns`, then write
    `self.file` using the configured separator and header setting.

    :param kwargs: forwarded unchanged to the parent `execute`.
    :return: None
    """
    super().execute(**kwargs)

    self.data = self.upstream_sources[self.source].data

    # Apply limit to dataframe if specified.
    if self.limit:
        # By default dask's `head` reads only the first partition and returns
        # whatever it found there (with a warning) when that partition is short;
        # npartitions=-1 takes rows from every partition.
        limited = self.data.head(n=self.limit, npartitions=-1)

        # A short result means the data holds fewer rows than `limit`. Checking
        # the truncated frame avoids a separate full-length count of the data.
        # NOTE(review): the PR author questioned whether raising is right here
        # ("If the user specifies more rows than exist in the dataframe, we
        # should just return all rows."); the error is kept to preserve behavior.
        if len(limited) < self.limit:
            self.error_handler.throw(
                "Limit value exceeds the number of rows in the data"
            )
            raise  # presumably unreachable because `throw` raises - TODO confirm

        self.data = dd.from_pandas(limited, npartitions=1)

    # Verify the output directory exists.
    output_dir = os.path.dirname(self.file)
    os.makedirs(output_dir, exist_ok=True)
    self.logger.info(f"Directory created: {output_dir}")

    # Subset dataframe columns if specified
    if self.keep_columns:
        try:
            self.data = self.data[self.keep_columns]
        except KeyError as e:
            self.error_handler.throw(
                f"Error occurred while subsetting the data: {e.args[0]}"
            )
            raise

    # Change separator to tab if extension is tsv
    # (this overrides any `separator` value supplied in the config).
    if self.extension == 'tsv':
        self.separator = '\t'

    try:
        self.data.to_csv(
            filename=self.file, single_file=True, index=False,
            sep=self.separator, header=self.header
        )
    except Exception as err:
        self.error_handler.throw(
            f"Error writing data to {self.extension} file: ({err})"
        )
        raise
    else:
        self.logger.info(f"Output `{self.file}` written")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I want to reiterate my opinion that
`limit` and `keep_columns` should not be part of destination configs. These are data transformations which should be done separately. We already have a `keep_columns` transformation operation; adding a `limit` operation would be simple (and, for performance reasons, should be done as far upstream as possible, not at the final destination).