diff --git a/opentabulate/main/algorithm.py b/opentabulate/main/algorithm.py deleted file mode 100644 index 08a84c7..0000000 --- a/opentabulate/main/algorithm.py +++ /dev/null @@ -1,485 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Tabulation API. - -This module defines the core functionality of OpenTabulate, which contains the -Algorithm class and its children. The classes provide methods for parsing, -processing and tabulating input data into CSV format. - -Created and written by Maksym Neyra-Nesterenko, with support and funding from the -*Center for Special Business Projects* (CSBP) at *Statistics Canada*. -""" - -####################### -# MODULES AND IMPORTS # -####################### - -import csv -import os -import re -from xml.etree import ElementTree - -from opentabulate.main.config import SUPPORTED_ENCODINGS -from opentabulate.main.thread_exception import ThreadInterruptError - -##################################### -# DATA PROCESSING ALGORITHM CLASSES # -##################################### - -class Algorithm(): - """ - Parent algorithm class for data processing. - - Attributes: - source (Source): Dataset processing configuration and metadata. - interrupt (threading.Event): Event to halt multi-threaded processing. - label_map (dict): Column name mapping to output CSV. - FORCE_REGEXP (re.Pattern): Regular expression for 'force' values in source. - - OUTPUT_ENC_ERRORS (str): Flag for how to handle character encoding errors. - FILTER_FLAG (bool): Flag for data filtering. - PROVIDER_FLAG (bool): Flag for 'provider' column. - ADD_INDEX (bool): Flag for 'idx' column. - NO_WHITESPACE (bool): Flag for handling unnecessary whitespace (e.g. new lines, - tabs, separation of words by multiple spaces) - LOWERCASE (bool): Flag to whether or not the output is made lowercase. - """ - def __init__(self, source=None, interrupt=None): - """ - Initializes Algorithm object. - - Args: - source (Source): Dataset abstraction. - interrupt (threading.Event): Event to halt multi-threaded processing. - """ - self.source = source - self.interrupt = interrupt - self.label_map = None - - self.FORCE_REGEXP = re.compile('force:.*') - - # flags - self.OUTPUT_ENC_ERRORS = None - self.FILTER_FLAG = None - self.PROVIDER_FLAG = None - self.ADD_INDEX = None - self.NO_WHITESPACE = None - self.LOWERCASE = None - - if source is not None: - # flags from source file metadata - self.FILTER_FLAG = True if 'filter' in source.metadata else False - self.PROVIDER_FLAG = True if 'provider' in source.metadata else False - - self.OUTPUT_ENC_ERRORS = source.config.get('general', 'output_encoding_errors') - - if source.config is not None: - # configuration or command line flags - self.ADD_INDEX = True if source.config.getboolean('general', 'add_index') else False - self.NO_WHITESPACE = True if source.config.getboolean('general', 'clean_whitespace') else False - self.LOWERCASE = True if source.config.getboolean('general', 'lowercase_output') else False - - source.logger.debug("FILTER_FLAG set to %s" % self.FILTER_FLAG) - source.logger.debug("PROVIDER_FLAG set to %s" % self.PROVIDER_FLAG) - source.logger.debug("ADD_INDEX set to %s" % self.ADD_INDEX) - source.logger.debug("NO_WHITESPACE set to %s" % self.NO_WHITESPACE) - source.logger.debug("LOWERCASE set to %s" % self.LOWERCASE) - source.logger.debug("OUTPUT_ENC_ERRORS set to %s" % self.OUTPUT_ENC_ERRORS) - - def char_encode_check(self): - """ - Heuristic test to identify the character encoding of a source. 
Every - line in the file is attempted to be decoded over a set of supported - encodings in a fixed order. The first encoding that successfully - decodes the entire file is taken to be its encoding for the tabulation - step. Otherwise if all fail, then a RunTimeError is raised. - - Returns: - e (str): Python character encoding string. - - Raises: - ValueError: Invalid encoding from source. - RunTimeError: Character encoding test failed. - ThreadInterruptError: Interrupt event occurred in main thread. - """ - metadata = self.source.metadata - if 'encoding' in metadata: - data_enc = metadata['encoding'] - if data_enc in SUPPORTED_ENCODINGS: - return data_enc - else: - raise ValueError(data_enc + " is not a valid encoding.") - else: - for enc in SUPPORTED_ENCODINGS: - try: - with open(self.source.input_path, encoding=enc) as f: - for _ in f: - if self.interrupt is not None and self.interrupt.is_set(): - raise ThreadInterruptError("Interrupt event occurred.") - return enc - except UnicodeDecodeError: - pass - raise RuntimeError("Could not guess original character encoding.") - - - ############################################## - # Helper functions for the 'tabulate' method # - ############################################## - - def _generateFieldNames(self, keys): - """Generate column names for the target tabulated data.""" - return [k for k in keys] - - def _isRowEmpty(self, row): - """ - Check if a row (dict) has no non-empty entries. - - Raises: - AssertionError: Row value is not a string. - """ - for key in row: - if row[key] != "": - assert isinstance(row[key], str), 'Row value is not a string' - return False - return True - - def _quickCleanEntry(self, entry): - """Reformat a string using regex and return it.""" - if isinstance(entry, bytes): - entry = entry.decode() - - if self.NO_WHITESPACE: # remove redundant [:space:] char class characters - # since this includes removal of newlines, the next regexps are safe and - # do not require the "DOTALL" flag - entry = re.sub(r"\s+", " ", entry) - # remove spaces occuring at the beginning and end of an entry - entry = re.sub(r"^\s+([^\s].*)", r"\1", entry) - entry = re.sub(r"(.*[^\s])\s+$", r"\1", entry) - entry = re.sub(r"^\s+$", "", entry) - - if self.LOWERCASE: # make entries lowercase - entry = entry.lower() - - return entry - - def _isForceValue(self, value): - """Returns True if value contains the prefix 'force:'.""" - return bool(self.FORCE_REGEXP.match(value)) - - -class CSV_Algorithm(Algorithm): - """ - Algorithm child class designed to handle CSV formatted data. - """ - def construct_label_map(self): - """ - Constructs a dictionary from a column map that the 'tabulate' function uses to - to reformat input data. - """ - self.label_map = self.source.column_map - - def tabulate(self): - """ - Parses a dataset in CSV format to transform into a standardized CSV format. - - Exceptions raised must be handled external to this module. - - Raises: - ValueError: Label map for parsing data is missing. - csv.Error: Incorrect format of CSV data - ThreadInterruptError: Interrupt event occurred in main thread. 
- """ - if not hasattr(self, 'label_map'): - raise ValueError("Missing 'label_map' for parsing, 'construct_label_map' was not ran") - - tags = self.label_map - enc = self.char_encode_check() - - with open(self.source.input_path, 'r', encoding=enc) as csv_file_read, \ - open(self.source.output_path, 'w', - encoding=self.source.config.get('general', 'target_encoding'), - errors=self.OUTPUT_ENC_ERRORS - ) as csv_file_write: - # define column labels - fieldnames = self._generateFieldNames(tags) - - if self.PROVIDER_FLAG: - fieldnames.append('provider') - - if self.ADD_INDEX: - fieldnames.insert(0, 'idx') - - # define reader/writer - csvreader = csv.DictReader( - csv_file_read, - delimiter=self.source.metadata['format']['delimiter'], - quotechar=self.source.metadata['format']['quote'] - ) - csvwriter = csv.DictWriter( - csv_file_write, - fieldnames, - delimiter=',', - quotechar='"', - quoting=csv.QUOTE_MINIMAL - ) - - # remove (possibly existing) byte order mark (BOM) - csvreader.fieldnames[0] = re.sub(r"^\ufeff(.+)", r"\1", csvreader.fieldnames[0]) - no_columns = len(csvreader.fieldnames) - - csvwriter.writeheader() - - idx = 0 - - for entity in csvreader: - if self.interrupt is not None and self.interrupt.is_set(): - raise ThreadInterruptError("Interrupt event occurred") - - row = dict() - - no_row_entries = 0 - for x in entity: - if entity[x] is not None: - no_row_entries += 1 - - # if there are more or less row entries than number of columns, throw error - if no_row_entries != no_columns: - raise csv.Error("Incorrect number of entries on line %s" % csvreader.line_num) - - # filter entry - if not self._csv_keep_entry(entity): - continue - - for key in tags: - - # --%-- check if tags[key] is a JSON array --%-- - if isinstance(tags[key], list): - components = [] - for subentry in tags[key]: - # is 'i' a 'force' entry? - if self._isForceValue(subentry): - components.append(subentry.split(':')[1]) - else: - components.append(entity[subentry]) - - entry = ' '.join(components) - entry = self._quickCleanEntry(entry) - - row[key] = entry - continue - - # --%-- all other cases handled here --%-- - # is 'tags[key]' a 'force' entry? - if self._isForceValue(tags[key]): - entry = tags[key].split(':')[1] - else: - entry = entity[tags[key]] - - row[key] = self._quickCleanEntry(entry) - - if not self._isRowEmpty(row): - # add customized entries here (e.g. provider) - if self.PROVIDER_FLAG: - row['provider'] = self.source.metadata['provider'] - - if self.ADD_INDEX: - row['idx'] = idx - idx += 1 - - csvwriter.writerow(row) - - - def _csv_keep_entry(self, entity): - """ - Regular expression filtering implementation. - """ - if not self.FILTER_FLAG: - # keep entries if no filter flag is used - return True - else: - BOOL_MATCHES = [] - for attribute in self.source.metadata['filter']: - match = False - regexp = self.source.metadata['filter'][attribute] - if regexp.search(entity[attribute]): - match = True - BOOL_MATCHES.append(match) - - for var in BOOL_MATCHES: - # if one of the matches failed, discard entry - if not var: - return False - # otherwise, keep entry - return True - - -class XML_Algorithm(Algorithm): - """ - Algorithm child class with methods designed to handle XML formatted data. - """ - def construct_label_map(self): - """ - Constructs a dictionary from a column map that the 'tabulate' function uses to - to reformat input data. In this case (XML formatted data), the values in the - column map must be converted to XPath expressions. 
- """ - label_map = dict() - # append existing data using XPath expressions (for parsing) - for k in self.source.column_map: - if isinstance(self.source.column_map[k], list): - label_map[k] = list() - for t in self.source.column_map[k]: - label_map[k].append(t if self._isForceValue(t) else (".//" + t)) - else: - val = self.source.column_map[k] - label_map[k] = val if self._isForceValue(val) else (".//" + val) - - self.label_map = label_map - - - def tabulate(self): - """ - Parses a dataset in XML format to transform into a standardized CSV format. - - Exceptions raised must be handled external to this module. - - Raises: - ValueError: Label map for parsing data is missing. - ThreadInterruptError: Interrupt event occurred in main thread. - """ - if not hasattr(self, 'label_map'): - raise ValueError("Missing 'label_map' for parsing, 'construct_label_map' was not ran") - - tags = self.label_map - header = self.source.metadata['format']['header'] - enc = self.char_encode_check() - - xmlp = ElementTree.XMLParser(encoding=enc) - tree = ElementTree.parse(self.source.input_path, parser=xmlp) - root = tree.getroot() - - with open(self.source.output_path, 'w', - encoding=self.source.config.get('general', 'target_encoding'), - errors=self.OUTPUT_ENC_ERRORS - ) as csvfile: - # write the initial row which identifies each column - fieldnames = self._generateFieldNames(tags) - - if self.PROVIDER_FLAG: - fieldnames.append('provider') - - if self.ADD_INDEX: - fieldnames.insert(0, 'idx') - - csvwriter = csv.DictWriter( - csvfile, - fieldnames, - delimiter=',', - quotechar='"', - quoting=csv.QUOTE_MINIMAL - ) - - csvwriter.writeheader() - - idx = 0 - - for head_element in root.iter(header): - if self.interrupt is not None and self.interrupt.is_set(): - raise ThreadInterruptError("Interrupt event occurred") - - row = dict() - - # filter entry - if not self._xml_keep_entry(head_element): - continue - - for key in tags: - # --%-- check if tags[key] is a JSON array --%-- - if isinstance(tags[key], list): - components = [] - for val in tags[key]: - # is val a 'force' entry? - if self._isForceValue(val): - components.append(val.split(':')[1]) - else: - assert val[:3] == './/' - tag_name = val[3:] # removes './/' prefix - subelement = head_element.find(val) - subelement = self._xml_is_element_missing(subelement, tag_name, head_element) - components.append(subelement) - - entry = ' '.join(components) - row[key] = self._quickCleanEntry(entry) - continue - - # --%-- all other cases handled here --%-- - # is 'tags[key]' a 'force' entry? - if self._isForceValue(tags[key]): - entry = tags[key].split(':')[1] - else: - assert tags[key][:3] == './/' - tag_name = tags[key][3:] # removes './/' prefix - element = head_element.find(tags[key]) - element = self._xml_is_element_missing(element, tag_name, head_element) - entry = element - - row[key] = self._quickCleanEntry(entry) - - if not self._isRowEmpty(row): - # add customized entries here (e.g. provider) - if self.PROVIDER_FLAG: - row['provider'] = self.source.metadata['provider'] - - if self.ADD_INDEX: - row['idx'] = idx - idx += 1 - - csvwriter.writerow(row) - - - def _xml_keep_entry(self, head_element): - """ - Regular expression filtering implementation. 
- """ - if not self.FILTER_FLAG: - # keep entries if no filter flag is used - return True - else: - BOOL_MATCHES = [] - for attribute in self.source.metadata['filter']: - match = False - regexp = self.source.metadata['filter'][attribute] - element = head_element.find(".//" + attribute) - element = self._xml_is_element_missing(element, attribute, head_element) - if regexp.search(element): - match = True - BOOL_MATCHES.append(match) - - for var in BOOL_MATCHES: - # if one of the matches failed, discard entry - if not var: - return False - # otherwise, keep entry - return True - - def _xml_is_element_missing(self, element, tag_name, head_element): - """ - The xml.etree module returns 'None' if there is no text in a tag. Moreover, if - the element cannot be found, the element is None. - - Args: - element (ElementTree.Element): Target node in XML tree. - tag_name (str): Target tag name in tree parsing. - head_element (ElementTree.Element): Header node in XML tree. - - Returns: - str: Empty string if missing or empty tag, otherwise element.text. - """ - # Note: tag_name and head_element were originally intended for logging and - # might be used this way in the future. It's current use is for debugging! - if element is None: - return '' - elif element.text is not None: - return element.text - else: - return '' diff --git a/opentabulate/main/algorithms/__init__.py b/opentabulate/main/algorithms/__init__.py new file mode 100644 index 0000000..3bfaa1d --- /dev/null +++ b/opentabulate/main/algorithms/__init__.py @@ -0,0 +1,5 @@ +from .algorithm import Algorithm +from .csv_algorithm import CSV_Algorithm +from .geojson_algorithm import GeoJSON_Algorithm +from .json_algorithm import JSON_Algorithm +from .xml_algorithm import XML_Algorithm diff --git a/opentabulate/main/algorithms/algorithm.py b/opentabulate/main/algorithms/algorithm.py new file mode 100644 index 0000000..a89de83 --- /dev/null +++ b/opentabulate/main/algorithms/algorithm.py @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +""" +Tabulation API. + +This module defines the core functionality of OpenTabulate, which contains the +Algorithm class. The classes provide methods for parsing, +processing and tabulating input data into CSV format. + +Created and written by Maksym Neyra-Nesterenko, with support and funding from the +*Center for Special Business Projects* (CSBP) at *Statistics Canada*. +""" + +####################### +# MODULES AND IMPORTS # +####################### + +import platform +import re +from abc import ABC, abstractmethod + +from opentabulate.main.config import SUPPORTED_ENCODINGS +from opentabulate.main.thread_exception import ThreadInterruptError + +##################################### +# DATA PROCESSING ALGORITHM CLASSES # +##################################### + +class Algorithm(ABC): + """ + Parent algorithm class for data processing. + + Attributes: + source (Source): Dataset processing configuration and metadata. + interrupt (threading.Event): Event to halt multi-threaded processing. + label_map (dict): Column name mapping to output CSV. + FORCE_REGEXP (re.Pattern): Regular expression for 'force' values in source. + + OUTPUT_ENC_ERRORS (str): Flag for how to handle character encoding errors. + FILTER_FLAG (bool): Flag for data filtering. + PROVIDER_FLAG (bool): Flag for 'provider' column. + ADD_INDEX (bool): Flag for 'idx' column. + NO_WHITESPACE (bool): Flag for handling unnecessary whitespace (e.g. 
new lines, + tabs, separation of words by multiple spaces) + LOWERCASE (bool): Flag for whether the output is made lowercase. + TITLECASE (bool): Flag for whether the output is made titlecase; ignored + if LOWERCASE is set. + UPPERCASE (bool): Flag for whether the output is made uppercase; ignored + if LOWERCASE or TITLECASE is set. + """ + + @abstractmethod + def tabulate(self): + """Mandatory method for all subclasses.""" + + def __init__(self, source=None, interrupt=None): + """ + Initializes Algorithm object. + + Args: + source (Source): Dataset abstraction. + interrupt (threading.Event): Event to halt multi-threaded processing. + """ + self.source = source + self.interrupt = interrupt + self.label_map = None + + self.platform = platform.system() + + self.FORCE_REGEXP = re.compile('force:.*') + + # flags + self.OUTPUT_ENC_ERRORS = None + self.FILTER_FLAG = None + self.PROVIDER_FLAG = None + self.ADD_INDEX = None + self.NO_WHITESPACE = None + self.LOWERCASE = None + self.TITLECASE = None + self.UPPERCASE = None + + if source is not None: + # flags from source file metadata + self.FILTER_FLAG = True if 'filter' in source.metadata else False + self.PROVIDER_FLAG = True if 'provider' in source.metadata else False + + self.OUTPUT_ENC_ERRORS = source.config.get('general', 'output_encoding_errors') + + if source.config is not None: + # configuration or command line flags + self.ADD_INDEX = True if source.config.getboolean('general', 'add_index') else False + self.NO_WHITESPACE = True if source.config.getboolean('general', 'clean_whitespace') else False + self.LOWERCASE = True if source.config.getboolean('general', 'lowercase_output') else False + # titlecase has lower priority than lowercase + self.TITLECASE = (not self.LOWERCASE) and source.config.getboolean('general', 'titlecase_output') + # uppercase has the lowest priority + self.UPPERCASE = (not (self.LOWERCASE or self.TITLECASE)) and source.config.getboolean('general', 'uppercase_output') + + source.logger.debug("FILTER_FLAG set to %s" % self.FILTER_FLAG) + source.logger.debug("PROVIDER_FLAG set to %s" % self.PROVIDER_FLAG) + source.logger.debug("ADD_INDEX set to %s" % self.ADD_INDEX) + source.logger.debug("NO_WHITESPACE set to %s" % self.NO_WHITESPACE) + source.logger.debug("LOWERCASE set to %s" % self.LOWERCASE) + source.logger.debug("TITLECASE set to %s" % self.TITLECASE) + source.logger.debug("UPPERCASE set to %s" % self.UPPERCASE) + source.logger.debug("OUTPUT_ENC_ERRORS set to %s" % self.OUTPUT_ENC_ERRORS) + + def char_encode_check(self): + """ + Heuristic test to identify the character encoding of a source. Each + supported encoding is tried in a fixed order, decoding the file line + by line. The first encoding that successfully decodes the entire file + is taken to be its encoding for the tabulation step. If all of them + fail, a RuntimeError is raised. + + Returns: + e (str): Python character encoding string. + + Raises: + ValueError: Invalid encoding from source. + RuntimeError: Character encoding test failed. + ThreadInterruptError: Interrupt event occurred in main thread.
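+ + Example (illustrative; assumes a parsed Source whose dataset is UTF-8): + >>> CSV_Algorithm(source).char_encode_check() # doctest: +SKIP + 'utf-8'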
+ """ + metadata = self.source.metadata + if 'encoding' in metadata: + data_enc = metadata['encoding'] + if data_enc in SUPPORTED_ENCODINGS: + return data_enc + else: + raise ValueError(data_enc + " is not a valid encoding.") + else: + for enc in SUPPORTED_ENCODINGS: + try: + with open(self.source.input_path, encoding=enc) as f: + for _ in f: + if self.interrupt is not None and self.interrupt.is_set(): + raise ThreadInterruptError("Interrupt event occurred.") + return enc + except UnicodeDecodeError: + pass + raise RuntimeError("Could not guess original character encoding.") + + + ############################################## + # Helper functions for the 'tabulate' method # + ############################################## + + def _generateFieldNames(self, keys): + """Generate column names for the target tabulated data.""" + return [k for k in keys] + + def _isRowEmpty(self, row): + """ + Check if a row (dict) has no non-empty entries. + + Raises: + AssertionError: Row value is not a string. + """ + for key in row: + if row[key] != "": + assert isinstance(row[key], str), 'Row value is not a string' + return False + return True + + def _quickCleanEntry(self, entry): + """Reformat a string using regex and return it.""" + if isinstance(entry, bytes): + entry = entry.decode() + + if self.NO_WHITESPACE: # remove redundant [:space:] char class characters + # since this includes removal of newlines, the next regexps are safe and + # do not require the "DOTALL" flag + entry = re.sub(r"\s+", " ", entry) + # remove spaces occuring at the beginning and end of an entry + entry = re.sub(r"^\s+([^\s].*)", r"\1", entry) + entry = re.sub(r"(.*[^\s])\s+$", r"\1", entry) + entry = re.sub(r"^\s+$", "", entry) + + if self.LOWERCASE: # make entries lowercase + entry = entry.lower() + elif self.TITLECASE: # make entries titlecase + entry = entry.title() + elif self.UPPERCASE: # make entries uppercase + entry = entry.upper() + + + return entry + + def _isForceValue(self, value): + """Returns True if value contains the prefix 'force:'.""" + return bool(self.FORCE_REGEXP.match(value)) + + def _openInputFile(self): + """Open input file""" + return open(self.source.input_path, 'r', encoding=self.char_encode_check()) + + def _openOutputFile(self): + """Platform-specific file open, avoids empty CSV lines on Windows""" + + # on Windows, remove extra newline + # https://docs.python.org/3/library/csv.html#examples + if self.platform == "Windows": + newl = '' + else: + newl = '\n' + + return open(self.source.output_path, 'w', + encoding=self.source.config.get('general', 'target_encoding'), + errors=self.OUTPUT_ENC_ERRORS, + newline=newl) \ No newline at end of file diff --git a/opentabulate/main/algorithms/csv_algorithm.py b/opentabulate/main/algorithms/csv_algorithm.py new file mode 100644 index 0000000..18322e4 --- /dev/null +++ b/opentabulate/main/algorithms/csv_algorithm.py @@ -0,0 +1,165 @@ +# -*- coding: utf-8 -*- +""" +CSV_Algorithm. + +Algorithm class. The CSV_Algorithm class provides methods for parsing, +processing and tabulating CSV input data into CSV format. + +Created and written by Maksym Neyra-Nesterenko, with support and funding from the +*Center for Special Business Projects* (CSBP) at *Statistics Canada*. 
+""" + +####################### +# MODULES AND IMPORTS # +####################### + +import csv +import re + +from .algorithm import Algorithm +from opentabulate.main.thread_exception import ThreadInterruptError + +##################################### +# DATA PROCESSING ALGORITHM CLASSES # +##################################### + +class CSV_Algorithm(Algorithm): + """ + Algorithm child class designed to handle CSV formatted data. + """ + def construct_label_map(self): + """ + Constructs a dictionary from a column map that the 'tabulate' function uses to + to reformat input data. + """ + self.label_map = self.source.column_map + + def tabulate(self): + """ + Parses a dataset in CSV format to transform into a standardized CSV format. + + Exceptions raised must be handled external to this module. + + Raises: + ValueError: Label map for parsing data is missing. + csv.Error: Incorrect format of CSV data + ThreadInterruptError: Interrupt event occurred in main thread. + """ + if not hasattr(self, 'label_map'): + raise ValueError("Missing 'label_map' for parsing, 'construct_label_map' was not ran") + + tags = self.label_map + + with self._openInputFile() as csv_file_read, \ + self._openOutputFile() as csv_file_write: + # define column labels + fieldnames = self._generateFieldNames(tags) + + if self.PROVIDER_FLAG: + fieldnames.append('provider') + + if self.ADD_INDEX: + fieldnames.insert(0, 'idx') + + # define reader/writer + csvreader = csv.DictReader( + csv_file_read, + delimiter=self.source.metadata['format']['delimiter'], + quotechar=self.source.metadata['format']['quote'] + ) + csvwriter = csv.DictWriter( + csv_file_write, + fieldnames, + delimiter=',', + quotechar='"', + quoting=csv.QUOTE_MINIMAL + ) + + # remove (possibly existing) byte order mark (BOM) + csvreader.fieldnames[0] = re.sub(r"^\ufeff(.+)", r"\1", csvreader.fieldnames[0]) + no_columns = len(csvreader.fieldnames) + + csvwriter.writeheader() + + idx = 0 + + for entity in csvreader: + if self.interrupt is not None and self.interrupt.is_set(): + raise ThreadInterruptError("Interrupt event occurred") + + row = dict() + + no_row_entries = 0 + for x in entity: + if entity[x] is not None: + no_row_entries += 1 + + # if there are more or less row entries than number of columns, throw error + if no_row_entries != no_columns: + raise csv.Error("Incorrect number of entries on line %s" % csvreader.line_num) + + # filter entry + if not self._csv_keep_entry(entity): + continue + + for key in tags: + + # --%-- check if tags[key] is a JSON array --%-- + if isinstance(tags[key], list): + components = [] + for subentry in tags[key]: + # is 'i' a 'force' entry? + if self._isForceValue(subentry): + components.append(subentry.split(':')[1]) + else: + components.append(entity[subentry]) + + entry = ' '.join(components) + entry = self._quickCleanEntry(entry) + + row[key] = entry + continue + + # --%-- all other cases handled here --%-- + # is 'tags[key]' a 'force' entry? + if self._isForceValue(tags[key]): + entry = tags[key].split(':')[1] + else: + entry = entity[tags[key]] + + row[key] = self._quickCleanEntry(entry) + + if not self._isRowEmpty(row): + # add customized entries here (e.g. provider) + if self.PROVIDER_FLAG: + row['provider'] = self.source.metadata['provider'] + + if self.ADD_INDEX: + row['idx'] = idx + idx += 1 + + csvwriter.writerow(row) + + + def _csv_keep_entry(self, entity): + """ + Regular expression filtering implementation. 
+ """ + if not self.FILTER_FLAG: + # keep entries if no filter flag is used + return True + else: + BOOL_MATCHES = [] + for attribute in self.source.metadata['filter']: + match = False + regexp = self.source.metadata['filter'][attribute] + if regexp.search(entity[attribute]): + match = True + BOOL_MATCHES.append(match) + + for var in BOOL_MATCHES: + # if one of the matches failed, discard entry + if not var: + return False + # otherwise, keep entry + return True diff --git a/opentabulate/main/algorithms/geojson_algorithm.py b/opentabulate/main/algorithms/geojson_algorithm.py new file mode 100644 index 0000000..df14f13 --- /dev/null +++ b/opentabulate/main/algorithms/geojson_algorithm.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +""" +GeoJSON_Algorithm. + +Algorithm class. The GeoJSON_Algorithm class provides methods for parsing, +processing and tabulating GeoJSON input data into CSV format. + +Created and written by Marcello Barisonzi, with support and funding from the +*Center for Special Business Projects* (CSBP) at *Statistics Canada*. +""" + +####################### +# MODULES AND IMPORTS # +####################### + +import geopandas as gpd + +from .algorithm import Algorithm +from opentabulate.main.thread_exception import ThreadInterruptError + +##################################### +# DATA PROCESSING ALGORITHM CLASSES # +##################################### + +class GeoJSON_Algorithm(Algorithm): + """ + Algorithm child class designed to handle GeoJSON formatted data. + """ + + def construct_label_map(self): + """ + Constructs a dictionary from a column map that the 'tabulate' function uses to + to reformat input data. + """ + self.label_map = self.source.column_map + + def tabulate(self): + """ + Parses a dataset in JSON format to transform into a standardized CSV format. + + Exceptions raised must be handled external to this module. + + Raises: + ValueError: Label map for parsing data is missing. + """ + + if not hasattr(self, 'label_map'): + raise ValueError("Missing 'label_map' for parsing, 'construct_label_map' was not ran") + + tags = dict([(v,k) for k,v in self.label_map.items()]) + + crs = self.source.metadata['format']['crs'] + enc = self.char_encode_check() + + # read input file into DataFrame + with open(self.source.input_path, 'r', encoding=enc) as f: + df = gpd.read_file(f) + df.crs = crs + + # if geocoordinates not set, get them from geometry + if "LONGITUDE" not in df.columns: + df["LONGITUDE"] = df.geometry.x + if "LATITUDE" not in df.columns: + df["LATITUDE"] = df.geometry.y + + df.drop(columns="geometry", inplace=True) + + df.rename(columns=tags, inplace=True) + + # drop columns not in tags: + drop_columns = [i for i in df.columns if i not in self.label_map.keys()] + df.drop(columns=drop_columns, inplace=True) + + if self.ADD_INDEX: + df.reset_index(inplace=True, names="idx") + + if self.PROVIDER_FLAG: + df['provider'] = self.source.metadata['provider'] + + with self._openOutputFile() as o_f: + df.to_csv(o_f, index=False) + + # TODO: UPPER/LOWER/TITLECASE, FORCE \ No newline at end of file diff --git a/opentabulate/main/algorithms/json_algorithm.py b/opentabulate/main/algorithms/json_algorithm.py new file mode 100644 index 0000000..a28094a --- /dev/null +++ b/opentabulate/main/algorithms/json_algorithm.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +""" +JSON_Algorithm. + +Algorithm class. The JSON_Algorithm class provides methods for parsing, +processing and tabulating JSON input data into CSV format. 
+ + Created and written by Marcello Barisonzi, with support and funding from the + *Center for Special Business Projects* (CSBP) at *Statistics Canada*. + """ + + ####################### + # MODULES AND IMPORTS # + ####################### + + import pandas as pd + + from .algorithm import Algorithm + + ##################################### + # DATA PROCESSING ALGORITHM CLASSES # + ##################################### + + class JSON_Algorithm(Algorithm): + """ + Algorithm child class designed to handle JSON formatted data. + """ + + def construct_label_map(self): + """ + Constructs a dictionary from a column map that the 'tabulate' function uses + to reformat input data. + """ + self.label_map = self.source.column_map + + def tabulate(self): + """ + Parses a dataset in JSON format to transform into a standardized CSV format. + + Exceptions raised must be handled external to this module. + + Raises: + ValueError: Label map for parsing data is missing. + """ + + if not hasattr(self, 'label_map'): + raise ValueError("Missing 'label_map' for parsing, 'construct_label_map' was not run") + + tags = dict([(v,k) for k,v in self.label_map.items()]) + + orient = self.source.metadata['format']['orient'] + enc = self.char_encode_check() + + # read input file into DataFrame + with open(self.source.input_path, 'r', encoding=enc) as f: + df = pd.read_json(f, encoding=enc, orient=orient) + + df.rename(columns=tags, inplace=True) + + # drop columns not in tags: + drop_columns = [i for i in df.columns if i not in self.label_map.keys()] + df.drop(columns=drop_columns, inplace=True) + + if self.ADD_INDEX: + df.reset_index(inplace=True, names="idx") + + if self.PROVIDER_FLAG: + df['provider'] = self.source.metadata['provider'] + + with self._openOutputFile() as o_f: + df.to_csv(o_f, index=False) + + # TODO: UPPER/LOWER/TITLECASE, FORCE \ No newline at end of file diff --git a/opentabulate/main/algorithms/xml_algorithm.py b/opentabulate/main/algorithms/xml_algorithm.py new file mode 100644 index 0000000..b1308b3 --- /dev/null +++ b/opentabulate/main/algorithms/xml_algorithm.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- +""" +XML_Algorithm. + +Algorithm subclass. The XML_Algorithm class provides methods for parsing, +processing and tabulating XML input data into CSV format. + +Created and written by Maksym Neyra-Nesterenko, with support and funding from the +*Center for Special Business Projects* (CSBP) at *Statistics Canada*. +""" + +####################### + # MODULES AND IMPORTS # + ####################### + + import csv + from xml.etree import ElementTree + + from .algorithm import Algorithm + from opentabulate.main.thread_exception import ThreadInterruptError + + ##################################### + # DATA PROCESSING ALGORITHM CLASSES # + ##################################### + + class XML_Algorithm(Algorithm): + """ + Algorithm child class with methods designed to handle XML formatted data. + """ + def construct_label_map(self): + """ + Constructs a dictionary from a column map that the 'tabulate' function uses + to reformat input data. In this case (XML formatted data), the values in the + column map must be converted to XPath expressions.
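+ For example, a column map value 'name' becomes the XPath expression './/name'; values carrying the 'force:' prefix are left unchanged.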
+ """ + label_map = dict() + # append existing data using XPath expressions (for parsing) + for k in self.source.column_map: + if isinstance(self.source.column_map[k], list): + label_map[k] = list() + for t in self.source.column_map[k]: + label_map[k].append(t if self._isForceValue(t) else (".//" + t)) + else: + val = self.source.column_map[k] + label_map[k] = val if self._isForceValue(val) else (".//" + val) + + self.label_map = label_map + + + def tabulate(self): + """ + Parses a dataset in XML format to transform into a standardized CSV format. + + Exceptions raised must be handled external to this module. + + Raises: + ValueError: Label map for parsing data is missing. + ThreadInterruptError: Interrupt event occurred in main thread. + """ + if not hasattr(self, 'label_map'): + raise ValueError("Missing 'label_map' for parsing, 'construct_label_map' was not ran") + + tags = self.label_map + header = self.source.metadata['format']['header'] + enc = self.char_encode_check() + + xmlp = ElementTree.XMLParser(encoding=enc) + tree = ElementTree.parse(self.source.input_path, parser=xmlp) + root = tree.getroot() + + with self._openOutputFile() as csvfile: + # write the initial row which identifies each column + fieldnames = self._generateFieldNames(tags) + + if self.PROVIDER_FLAG: + fieldnames.append('provider') + + if self.ADD_INDEX: + fieldnames.insert(0, 'idx') + + csvwriter = csv.DictWriter( + csvfile, + fieldnames, + delimiter=',', + quotechar='"', + quoting=csv.QUOTE_MINIMAL + ) + + csvwriter.writeheader() + + idx = 0 + + for head_element in root.iter(header): + if self.interrupt is not None and self.interrupt.is_set(): + raise ThreadInterruptError("Interrupt event occurred") + + row = dict() + + # filter entry + if not self._xml_keep_entry(head_element): + continue + + for key in tags: + # --%-- check if tags[key] is a JSON array --%-- + if isinstance(tags[key], list): + components = [] + for val in tags[key]: + # is val a 'force' entry? + if self._isForceValue(val): + components.append(val.split(':')[1]) + else: + assert val[:3] == './/' + tag_name = val[3:] # removes './/' prefix + subelement = head_element.find(val) + subelement = self._xml_is_element_missing(subelement, tag_name, head_element) + components.append(subelement) + + entry = ' '.join(components) + row[key] = self._quickCleanEntry(entry) + continue + + # --%-- all other cases handled here --%-- + # is 'tags[key]' a 'force' entry? + if self._isForceValue(tags[key]): + entry = tags[key].split(':')[1] + else: + assert tags[key][:3] == './/' + tag_name = tags[key][3:] # removes './/' prefix + element = head_element.find(tags[key]) + element = self._xml_is_element_missing(element, tag_name, head_element) + entry = element + + row[key] = self._quickCleanEntry(entry) + + if not self._isRowEmpty(row): + # add customized entries here (e.g. provider) + if self.PROVIDER_FLAG: + row['provider'] = self.source.metadata['provider'] + + if self.ADD_INDEX: + row['idx'] = idx + idx += 1 + + csvwriter.writerow(row) + + + def _xml_keep_entry(self, head_element): + """ + Regular expression filtering implementation. 
+ """ + if not self.FILTER_FLAG: + # keep entries if no filter flag is used + return True + else: + BOOL_MATCHES = [] + for attribute in self.source.metadata['filter']: + match = False + regexp = self.source.metadata['filter'][attribute] + element = head_element.find(".//" + attribute) + element = self._xml_is_element_missing(element, attribute, head_element) + if regexp.search(element): + match = True + BOOL_MATCHES.append(match) + + for var in BOOL_MATCHES: + # if one of the matches failed, discard entry + if not var: + return False + # otherwise, keep entry + return True + + def _xml_is_element_missing(self, element, tag_name, head_element): + """ + The xml.etree module returns 'None' if there is no text in a tag. Moreover, if + the element cannot be found, the element is None. + + Args: + element (ElementTree.Element): Target node in XML tree. + tag_name (str): Target tag name in tree parsing. + head_element (ElementTree.Element): Header node in XML tree. + + Returns: + str: Empty string if missing or empty tag, otherwise element.text. + """ + # Note: tag_name and head_element were originally intended for logging and + # might be used this way in the future. It's current use is for debugging! + if element is None: + return '' + elif element.text is not None: + return element.text + else: + return '' diff --git a/opentabulate/main/args.py b/opentabulate/main/args.py index 9d799f3..1ff9dfb 100644 --- a/opentabulate/main/args.py +++ b/opentabulate/main/args.py @@ -115,8 +115,9 @@ def validate_args_and_config(p_args, config, cache_mgrs): except Exception as err: # other errors (such as loading file) print("Error: %s" % err, file=sys.stderr) sys.exit(1) - - root_dir = config.get('general', 'root_directory') + + # remove quotes from around root dir + root_dir = config.get('general', 'root_directory').replace('"', '').replace("'", "") # check that root directory is an absolute path try: diff --git a/opentabulate/main/config.py b/opentabulate/main/config.py index c318bbd..a0d98f4 100644 --- a/opentabulate/main/config.py +++ b/opentabulate/main/config.py @@ -79,7 +79,8 @@ def validate(self): base_sections = ('general', 'labels') general_section = ('root_directory', 'add_index', 'target_encoding', - 'output_encoding_errors', 'clean_whitespace', 'lowercase_output', + 'output_encoding_errors', 'clean_whitespace', + 'lowercase_output', 'titlecase_output', 'uppercase_output', 'log_level') reserved_cols = ('idx', 'provider') @@ -117,6 +118,8 @@ def validate(self): 'add_index' : 'false', 'clean_whitespace' : 'false', 'lowercase_output' : 'false', + 'titlecase_output' : 'false', + 'uppercase_output' : 'false', 'log_level' : '3'} for def_opt in defaults: @@ -124,7 +127,7 @@ def validate(self): self.set('general', def_opt, defaults[def_opt]) # validate boolean options - boolean_options = ('add_index', 'clean_whitespace', 'lowercase_output') + boolean_options = ('add_index', 'clean_whitespace', 'lowercase_output', 'titlecase_output', 'uppercase_output') for option in boolean_options: try: self.getboolean('general', option) diff --git a/opentabulate/main/source.py b/opentabulate/main/source.py index 65f2200..625fcb0 100644 --- a/opentabulate/main/source.py +++ b/opentabulate/main/source.py @@ -16,7 +16,7 @@ import logging import os import re -import sys + from ast import literal_eval class Source(object): @@ -177,6 +177,26 @@ def parse(self): raise LookupError("%s 'format.header' tag is missing for format 'xml'" % src_basename) elif not isinstance(self.metadata['format']['header'], str): raise 
TypeError("%s 'format.header' must be a string." % src_basename) + + elif (self.metadata['format']['type'] == 'json'): + # -- JSON -- + # json orientation for pandas + orients = ['split','records','index', 'columns', 'table'] + if 'orient' not in self.metadata['format']: + raise LookupError("%s 'format.orient' tag is missing for format 'json'" % src_basename) + elif not isinstance(self.metadata['format']['orient'], str): + raise TypeError("%s 'format.orient' must be a string." % src_basename) + elif self.metadata['format']['orient'] not in orients: + raise TypeError("%s 'format.orient' must be one of: %s." % (src_basename, ", ".join(orients))) + + elif (self.metadata['format']['type'] == 'geojson'): + # -- GeoJSON -- + # CRS + if 'crs' not in self.metadata['format']: + raise LookupError("%s 'format.crs' tag is missing for format 'geojson'" % src_basename) + elif not isinstance(self.metadata['format']['crs'], str): + raise TypeError("%s 'format.crs' must be a string." % src_basename) + else: # -- unsupported format -- raise ValueError("%s Unsupported data format '%s'" % (src_basename, self.metadata['format']['type'])) @@ -215,7 +235,7 @@ def parse(self): 'input' : './data/input', 'output' : './data/output' } - extensions = ('.csv', '.xml') + extensions = ('.csv', '.xml', '.json', '.geojson') basename = os.path.splitext(self.localfile) assert basename[1] in extensions, \ @@ -227,7 +247,7 @@ def parse(self): # check entire source to make sure correct keys are being used root_layer = ('localfile', 'format', 'schema_groups', 'encoding', 'schema', 'filter', 'provider', 'licence', 'source') - format_layer = ('type', 'header', 'quote', 'delimiter') + format_layer = ('type', 'header', 'quote', 'delimiter', 'orient', 'crs') for i in self.metadata: if i not in root_layer: diff --git a/opentabulate/main/tabulate.py b/opentabulate/main/tabulate.py index 6aadbd8..cc82de3 100644 --- a/opentabulate/main/tabulate.py +++ b/opentabulate/main/tabulate.py @@ -22,7 +22,7 @@ from xml.etree import ElementTree from opentabulate.main.source import Source -from opentabulate.main.algorithm import * +from opentabulate.main.algorithms import * class DataProcess(object): """ @@ -51,14 +51,24 @@ def prepareData(self, interrupt=None): Args: interrupt (threading.Event): Event to halt multi-threaded processing. + + Raises: + Data format unknown. """ - if self.source.metadata['format']['type'] == 'csv': + format = self.source.metadata['format']['type'] + if format == 'csv': fmt_algorithm = CSV_Algorithm(self.source, interrupt) if 'encoding' not in self.source.metadata: csv_encoding = fmt_algorithm.char_encode_check() self.source.metadata['encoding'] = csv_encoding # prevents redundant encoding checks - elif self.source.metadata['format']['type'] == 'xml': + elif format == 'xml': fmt_algorithm = XML_Algorithm(self.source, interrupt) + elif format == 'geojson': + fmt_algorithm = GeoJSON_Algorithm(self.source, interrupt) + elif format == 'json': + fmt_algorithm = JSON_Algorithm(self.source, interrupt) + else: + raise ValueError("Data format %s unknown." 
% format) # initialize self.algorithm for other methods self.algorithm = fmt_algorithm diff --git a/opentabulate/tests/data/.gitignore b/opentabulate/tests/data/.gitignore index 63d5225..aae2cbb 100644 --- a/opentabulate/tests/data/.gitignore +++ b/opentabulate/tests/data/.gitignore @@ -1,2 +1 @@ -/csv-test-output.csv -/xml-test-output.csv +/*-test-output.csv \ No newline at end of file diff --git a/opentabulate/tests/data/geojson-data.geojson b/opentabulate/tests/data/geojson-data.geojson new file mode 100644 index 0000000..9b0df74 --- /dev/null +++ b/opentabulate/tests/data/geojson-data.geojson @@ -0,0 +1,46 @@ +{ + "type" : "FeatureCollection", + "name" : "TestData", + "features" : [ + { + "type" : "Feature", + "geometry" : { + "type" : "Point", + "coordinates" : [ 0, -1 ] + }, + "properties" : { + "name" : "A,B" + } + }, + { + "type" : "Feature", + "geometry" : { + "type" : "Point", + "coordinates" : [ 1, 0 ] + }, + "properties" : { + "name" : "C D" + } + }, + { + "type" : "Feature", + "geometry" : { + "type" : "Point", + "coordinates" : [ 0, 0 ] + }, + "properties" : { + "name" : "A,D" + } + }, + { + "type" : "Feature", + "geometry" : { + "type" : "Point", + "coordinates" : [ -1, 1 ] + }, + "properties" : { + "name" : "C C" + } + } + ] +} \ No newline at end of file diff --git a/opentabulate/tests/data/geojson-source.json b/opentabulate/tests/data/geojson-source.json new file mode 100644 index 0000000..4c5dfd3 --- /dev/null +++ b/opentabulate/tests/data/geojson-source.json @@ -0,0 +1,16 @@ +{ + "localfile": "geojson-data.geojson", + "format": { + "type": "geojson", + "crs": "epsg:4326" + }, + "encoding": "utf-8", + "schema_groups": ["label", "coordinates"], + "schema": { + "i" : "name", + "coordinates" : { + "X" : "LONGITUDE", + "Y" : "LATITUDE" + } + } +} diff --git a/opentabulate/tests/data/geojson-target-output.csv b/opentabulate/tests/data/geojson-target-output.csv new file mode 100644 index 0000000..fe3ccda --- /dev/null +++ b/opentabulate/tests/data/geojson-target-output.csv @@ -0,0 +1,5 @@ +i,X,Y +"A,B",0.0,-1.0 +C D,1.0,0.0 +"A,D",0.0,0.0 +C C,-1.0,1.0 diff --git a/opentabulate/tests/data/json-columns-data.json b/opentabulate/tests/data/json-columns-data.json new file mode 100644 index 0000000..2adc500 --- /dev/null +++ b/opentabulate/tests/data/json-columns-data.json @@ -0,0 +1,20 @@ +{ + "name": { + "0": "A,B", + "1": "C D", + "2": "A,D", + "3": "C C" + }, + "x": { + "0": 0, + "1": 1, + "2": 0, + "3": -1 + }, + "y": { + "0": -1, + "1": 0, + "2": 0, + "3": 1 + } +} \ No newline at end of file diff --git a/opentabulate/tests/data/json-columns-source.json b/opentabulate/tests/data/json-columns-source.json new file mode 100644 index 0000000..ebc0819 --- /dev/null +++ b/opentabulate/tests/data/json-columns-source.json @@ -0,0 +1,16 @@ +{ + "localfile": "json-columns-data.json", + "format": { + "type": "json", + "orient": "columns" + }, + "encoding": "utf-8", + "schema_groups": ["label", "coordinates"], + "schema": { + "i" : "name", + "coordinates" : { + "X" : "x", + "Y" : "y" + } + } +} diff --git a/opentabulate/tests/data/json-index-data.json b/opentabulate/tests/data/json-index-data.json new file mode 100644 index 0000000..c748214 --- /dev/null +++ b/opentabulate/tests/data/json-index-data.json @@ -0,0 +1,22 @@ +{ + "0": { + "name": "A,B", + "x": 0, + "y": -1 + }, + "1": { + "name": "C D", + "x": 1, + "y": 0 + }, + "2": { + "name": "A,D", + "x": 0, + "y": 0 + }, + "3": { + "name": "C C", + "x": -1, + "y": 1 + } +} \ No newline at end of file diff --git 
a/opentabulate/tests/data/json-index-source.json b/opentabulate/tests/data/json-index-source.json new file mode 100644 index 0000000..e1cc646 --- /dev/null +++ b/opentabulate/tests/data/json-index-source.json @@ -0,0 +1,16 @@ +{ + "localfile": "json-index-data.json", + "format": { + "type": "json", + "orient": "index" + }, + "encoding": "utf-8", + "schema_groups": ["label", "coordinates"], + "schema": { + "i" : "name", + "coordinates" : { + "X" : "x", + "Y" : "y" + } + } +} diff --git a/opentabulate/tests/data/json-records-data.json b/opentabulate/tests/data/json-records-data.json new file mode 100644 index 0000000..11e5ea2 --- /dev/null +++ b/opentabulate/tests/data/json-records-data.json @@ -0,0 +1,22 @@ +[ + { + "name": "A,B", + "x": 0, + "y": -1 + }, + { + "name": "C D", + "x": 1, + "y": 0 + }, + { + "name": "A,D", + "x": 0, + "y": 0 + }, + { + "name": "C C", + "x": -1, + "y": 1 + } +] \ No newline at end of file diff --git a/opentabulate/tests/data/json-records-source.json b/opentabulate/tests/data/json-records-source.json new file mode 100644 index 0000000..b8a4c66 --- /dev/null +++ b/opentabulate/tests/data/json-records-source.json @@ -0,0 +1,16 @@ +{ + "localfile": "json-records-data.json", + "format": { + "type": "json", + "orient": "records" + }, + "encoding": "utf-8", + "schema_groups": ["label", "coordinates"], + "schema": { + "i" : "name", + "coordinates" : { + "X" : "x", + "Y" : "y" + } + } +} diff --git a/opentabulate/tests/data/json-split-data.json b/opentabulate/tests/data/json-split-data.json new file mode 100644 index 0000000..218dc4f --- /dev/null +++ b/opentabulate/tests/data/json-split-data.json @@ -0,0 +1,35 @@ +{ + "columns": [ + "name", + "x", + "y" + ], + "index": [ + 0, + 1, + 2, + 3 + ], + "data": [ + [ + "A,B", + 0, + -1 + ], + [ + "C D", + 1, + 0 + ], + [ + "A,D", + 0, + 0 + ], + [ + "C C", + -1, + 1 + ] + ] +} \ No newline at end of file diff --git a/opentabulate/tests/data/json-split-source.json b/opentabulate/tests/data/json-split-source.json new file mode 100644 index 0000000..313a0cb --- /dev/null +++ b/opentabulate/tests/data/json-split-source.json @@ -0,0 +1,16 @@ +{ + "localfile": "json-split-data.json", + "format": { + "type": "json", + "orient": "split" + }, + "encoding": "utf-8", + "schema_groups": ["label", "coordinates"], + "schema": { + "i" : "name", + "coordinates" : { + "X" : "x", + "Y" : "y" + } + } +} diff --git a/opentabulate/tests/data/json-table-data.json b/opentabulate/tests/data/json-table-data.json new file mode 100644 index 0000000..9ae9a59 --- /dev/null +++ b/opentabulate/tests/data/json-table-data.json @@ -0,0 +1,52 @@ +{ + "schema": { + "fields": [ + { + "name": "index", + "type": "integer" + }, + { + "name": "name", + "type": "string" + }, + { + "name": "x", + "type": "integer" + }, + { + "name": "y", + "type": "integer" + } + ], + "primaryKey": [ + "index" + ], + "pandas_version": "1.4.0" + }, + "data": [ + { + "index": 0, + "name": "A,B", + "x": 0, + "y": -1 + }, + { + "index": 1, + "name": "C D", + "x": 1, + "y": 0 + }, + { + "index": 2, + "name": "A,D", + "x": 0, + "y": 0 + }, + { + "index": 3, + "name": "C C", + "x": -1, + "y": 1 + } + ] +} \ No newline at end of file diff --git a/opentabulate/tests/data/json-table-source.json b/opentabulate/tests/data/json-table-source.json new file mode 100644 index 0000000..4973eab --- /dev/null +++ b/opentabulate/tests/data/json-table-source.json @@ -0,0 +1,16 @@ +{ + "localfile": "json-table-data.json", + "format": { + "type": "json", + "orient": "table" + }, + "encoding": "utf-8", + 
"schema_groups": ["label", "coordinates"], + "schema": { + "i" : "name", + "coordinates" : { + "X" : "x", + "Y" : "y" + } + } +} diff --git a/opentabulate/tests/data/json-target-output.csv b/opentabulate/tests/data/json-target-output.csv new file mode 100644 index 0000000..817411c --- /dev/null +++ b/opentabulate/tests/data/json-target-output.csv @@ -0,0 +1,5 @@ +i,X,Y +"A,B",0,-1 +C D,1,0 +"A,D",0,0 +C C,-1,1 diff --git a/opentabulate/tests/test_algorithm.py b/opentabulate/tests/test_algorithm.py index 3f1469f..0c91594 100644 --- a/opentabulate/tests/test_algorithm.py +++ b/opentabulate/tests/test_algorithm.py @@ -13,7 +13,7 @@ from opentabulate.main.source import Source from opentabulate.main.config import Configuration -from opentabulate.main.algorithm import Algorithm, CSV_Algorithm, XML_Algorithm +from opentabulate.main.algorithms import Algorithm, CSV_Algorithm, XML_Algorithm, GeoJSON_Algorithm, JSON_Algorithm def cmp_output_bytes(path1, path2): @@ -32,6 +32,12 @@ def cmp_output_bytes(path1, path2): return True +# Mock class for testing Algorithm +# Implements empty tabulate +class MockAlgorithm(Algorithm): + def tabulate(self): + pass + class TestAlgorithm(unittest.TestCase): """ Algorithm class unit tests to verify correct output after running extract_labels() @@ -43,6 +49,12 @@ def setUpClass(cls): cls.config_file = data_path + "/opentabulate.conf" + # GeoJSON files for testing + cls.geojson_src_input = data_path + "/geojson-source.json" + cls.geojson_test_input = data_path + "/geojson-data.geojson" + cls.geojson_target_output = data_path + "/geojson-target-output.csv" + cls.geojson_test_output = data_path + "/geojson-test-output.csv" + # XML files for testing cls.xml_src_input = data_path + "/xml-source.json" cls.xml_test_input = data_path + "/xml-data.xml" @@ -55,7 +67,7 @@ def setUpClass(cls): cls.csv_target_output = data_path + "/csv-target-output.csv" cls.csv_test_output = data_path + "/csv-test-output.csv" - cls.a = Algorithm() + cls.a = MockAlgorithm() cls.xa = XML_Algorithm() def test_basic_process_csv(self): @@ -101,7 +113,29 @@ def test_basic_process_xml(self): self.assertTrue( cmp_output_bytes(self.xml_target_output, self.xml_test_output) ) + + def test_basic_process_geojson(self): + """ + OpenTabulate GeoJSON parsing and tabulation test. + """ + config = Configuration(self.config_file) + config.load() + config.validate() + + source = Source(self.geojson_src_input, config=config, default_paths=False) + source.parse() + source.input_path = self.geojson_test_input + source.output_path = self.geojson_test_output + + geojson_alg = GeoJSON_Algorithm(source) + geojson_alg.construct_label_map() + geojson_alg.tabulate() + + self.assertTrue( + cmp_output_bytes(self.geojson_target_output, self.geojson_test_output) + ) + def test__is_row_empty(self): """ Test for Algorithm._isRowEmpty method. @@ -141,6 +175,18 @@ def test__quick_clean_entry(self): self.a.LOWERCASE = None + self.a.TITLECASE = True + + self.assertEqual(self.a._quickCleanEntry('ABCabc123!@$'), 'Abcabc123!@$') + + self.a.TITLECASE = None + + self.a.UPPERCASE = True + + self.assertEqual(self.a._quickCleanEntry('ABCabc123!@$'), 'ABCABC123!@$') + + self.a.UPPERCASE = None + def test__is_force_value(self): """ Test for Algorithm._isForceValue method. @@ -175,5 +221,162 @@ def test__xml_is_element_missing(self): def tearDownClass(cls): pass +class TestJSON_Algorithm(unittest.TestCase): + """ + Algorithm class unit tests to verify correct output after running extract_labels() + and parse() methods. 
+ """ + @classmethod + def setUpClass(cls): + data_path = os.path.join(os.path.dirname(__file__), 'data') + + cls.config_file = data_path + "/opentabulate.conf" + cls.target_output = data_path + "/json-target-output.csv" + + # JSON files - split format + cls.split_src_input = data_path + "/json-split-source.json" + cls.split_test_input = data_path + "/json-split-data.json" + cls.split_test_output = data_path + "/json-split-test-output.csv" + + # JSON files - records format + cls.records_src_input = data_path + "/json-records-source.json" + cls.records_test_input = data_path + "/json-records-data.json" + cls.records_test_output = data_path + "/json-records-test-output.csv" + + # JSON files - index format + cls.index_src_input = data_path + "/json-index-source.json" + cls.index_test_input = data_path + "/json-index-data.json" + cls.index_test_output = data_path + "/json-index-test-output.csv" + + # JSON files - columns format + cls.columns_src_input = data_path + "/json-columns-source.json" + cls.columns_test_input = data_path + "/json-columns-data.json" + cls.columns_test_output = data_path + "/json-columns-test-output.csv" + + # JSON files - table format + cls.table_src_input = data_path + "/json-table-source.json" + cls.table_test_input = data_path + "/json-table-data.json" + cls.table_test_output = data_path + "/json-table-test-output.csv" + + def test_basic_process_split_data(self): + """ + OpenTabulate JSON parsing and tabulation test - split data format + """ + + config = Configuration(self.config_file) + config.load() + config.validate() + + source = Source(self.split_src_input, config=config, default_paths=False) + source.parse() + + source.input_path = self.split_test_input + source.output_path = self.split_test_output + + json_alg = JSON_Algorithm(source) + json_alg.construct_label_map() + json_alg.tabulate() + + self.assertTrue( + cmp_output_bytes(self.target_output, self.split_test_output) + ) + + def test_basic_process_records_data(self): + """ + OpenTabulate JSON parsing and tabulation test - records data format + """ + + config = Configuration(self.config_file) + config.load() + config.validate() + + source = Source(self.records_src_input, config=config, default_paths=False) + source.parse() + + source.input_path = self.records_test_input + source.output_path = self.records_test_output + + json_alg = JSON_Algorithm(source) + json_alg.construct_label_map() + json_alg.tabulate() + + self.assertTrue( + cmp_output_bytes(self.target_output, self.records_test_output) + ) + + def test_basic_process_index_data(self): + """ + OpenTabulate JSON parsing and tabulation test - index data format + """ + + config = Configuration(self.config_file) + config.load() + config.validate() + + source = Source(self.index_src_input, config=config, default_paths=False) + source.parse() + + source.input_path = self.index_test_input + source.output_path = self.index_test_output + + json_alg = JSON_Algorithm(source) + json_alg.construct_label_map() + json_alg.tabulate() + + self.assertTrue( + cmp_output_bytes(self.target_output, self.index_test_output) + ) + + def test_basic_process_columns_data(self): + """ + OpenTabulate JSON parsing and tabulation test - columns data format + """ + + config = Configuration(self.config_file) + config.load() + config.validate() + + source = Source(self.columns_src_input, config=config, default_paths=False) + source.parse() + + source.input_path = self.columns_test_input + source.output_path = self.columns_test_output + + json_alg = JSON_Algorithm(source) + 
json_alg.construct_label_map() + json_alg.tabulate() + + self.assertTrue( + cmp_output_bytes(self.target_output, self.columns_test_output) + ) + + def test_basic_process_table_data(self): + """ + OpenTabulate JSON parsing and tabulation test - table data format + """ + + config = Configuration(self.config_file) + config.load() + config.validate() + + source = Source(self.table_src_input, config=config, default_paths=False) + source.parse() + + source.input_path = self.table_test_input + source.output_path = self.table_test_output + + json_alg = JSON_Algorithm(source) + json_alg.construct_label_map() + json_alg.tabulate() + + self.assertTrue( + cmp_output_bytes(self.target_output, self.table_test_output) + ) + + @classmethod + def tearDownClass(cls): + pass + + if __name__ == '__main__': unittest.main()
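As a usage sketch, the new JSON path can be driven directly, mirroring TestJSON_Algorithm above (the fixture paths assume the repository root as the working directory; the output path is arbitrary):

    from opentabulate.main.config import Configuration
    from opentabulate.main.source import Source
    from opentabulate.main.algorithms import JSON_Algorithm

    # load and validate the OpenTabulate configuration shipped with the tests
    config = Configuration("opentabulate/tests/data/opentabulate.conf")
    config.load()
    config.validate()

    # parse the source metadata, then point it at concrete input/output files
    source = Source("opentabulate/tests/data/json-records-source.json",
                    config=config, default_paths=False)
    source.parse()
    source.input_path = "opentabulate/tests/data/json-records-data.json"
    source.output_path = "json-records-test-output.csv"

    # build the label map and tabulate to CSV
    json_alg = JSON_Algorithm(source)
    json_alg.construct_label_map()
    json_alg.tabulate()

The new case options can be tried by setting titlecase_output = true or uppercase_output = true under [general] in opentabulate.conf; both default to false, lowercase_output takes precedence over titlecase_output, and titlecase_output takes precedence over uppercase_output. Note that the pandas-based JSON and GeoJSON paths do not yet apply these flags (see the TODO comments above).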