diff --git a/.github/workflows/continous_integration.yml b/.github/workflows/continous_integration.yml index d13c203..cd8d326 100644 --- a/.github/workflows/continous_integration.yml +++ b/.github/workflows/continous_integration.yml @@ -15,7 +15,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest, windows-latest, macos-latest ] - python-version: [ '3.11', '3.12', '3.13' ] + python-version: [ '3.11', '3.12', '3.13', '3.14' ] include: - os: ubuntu-22.04 python-version: '3.10' @@ -43,23 +43,11 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies for pypy3 - if: ${{ matrix.python-version == 'pypy3' }} - run: pypy3 -m pip install -e . - - name: Install dependencies for CPython - if: ${{ matrix.python-version != 'pypy3' }} + - name: Set up Rust + uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + cache: false + - name: Install package run: python3 -m pip install -e . - - name: Build c extension for pypy3 - if: ${{ matrix.python-version == 'pypy3' }} - run: pypy3 -m pip install ./c - - name: Build c extension for Cpython - if: ${{ matrix.python-version != 'pypy3' }} - run: python3 -m pip install ./c - - name: Run tests using Cpython - if: ${{ matrix.python-version != 'pypy3' }} + - name: Run tests run: python3 -m pynmrstar.unit_tests - - name: Run tests using Pypy - if: ${{ matrix.python-version == 'pypy3' }} - run: pypy3 -m pynmrstar.unit_tests - #- name: Launch interactive debug - # uses: mxschmitt/action-tmate@v3 diff --git a/.github/workflows/manylinux_wheel_builder.yml b/.github/workflows/manylinux_wheel_builder.yml index 85cbae4..6ec4685 100644 --- a/.github/workflows/manylinux_wheel_builder.yml +++ b/.github/workflows/manylinux_wheel_builder.yml @@ -3,9 +3,6 @@ name: Build and upload to PyPI on: workflow_dispatch: pull_request: - push: - branches: - - v3 release: types: - published @@ -16,17 +13,27 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - # macos-13 is an intel runner, macos-14 is apple silicon - os: [ubuntu-latest, windows-latest, macos-13, macos-14] + os: [ubuntu-latest, windows-latest, macos-15, macos-15-intel] steps: - uses: actions/checkout@v4 + - name: Set up Rust + uses: actions-rust-lang/setup-rust-toolchain@v1 + - name: Build wheels uses: pypa/cibuildwheel@v2.23.3 env: + # Skip PyPy versions below 3.11 (PyO3 requirement) and 32-bit Linux (Rust unsupported) + CIBW_SKIP: "pp37-* pp38-* pp39-* pp310-* *-musllinux_i686 *-manylinux_i686" # Ensure cibuildwheel uses PEP 517 build CIBW_BUILD_FRONTEND: "build" + # Install Rust in the cibuildwheel environment + CIBW_BEFORE_ALL_LINUX: "curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain stable -y && source $HOME/.cargo/env" + CIBW_BEFORE_ALL_MACOS: "curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain stable -y && source $HOME/.cargo/env" + CIBW_BEFORE_ALL_WINDOWS: "rustup default stable" + CIBW_ENVIRONMENT_LINUX: 'PATH="$HOME/.cargo/bin:$PATH"' + CIBW_ENVIRONMENT_MACOS: 'PATH="$HOME/.cargo/bin:$PATH" MACOSX_DEPLOYMENT_TARGET=10.15' - uses: actions/upload-artifact@v4 with: diff --git a/.gitignore b/.gitignore index b5a771f..8ae6fa7 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,9 @@ coverage.xml *,cover .hypothesis/ +# Rust +Cargo.lock + # Translations *.mo *.pot diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..14cafbc --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "pynmrstar_parser" +version = "0.0.0" +edition = "2021" + +[lib] +name = "pynmrstar_parser" +crate-type = ["cdylib"] + 
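The `[lib]` section above names the Rust cdylib that PyO3 exposes to Python; the functions this PR calls on it appear further down in the patch (`parse`, `format_loop`, `format_saveframe`). A minimal sketch of exercising the built module directly — the argument list is copied from the call added to `pynmrstar/loop.py` below, so treat the exact signature as an assumption from this diff rather than documented API:

```python
# Sketch: calling the compiled Rust module directly. Assumes a wheel built
# from this Cargo.toml is installed; the importable name comes from [lib].
import pynmrstar_parser

# Argument order mirrors the call added to Loop.__str__ in this diff:
# tags, category, data rows, skip_empty_loops, STR_CONVERSION_DICT.
star_text = pynmrstar_parser.format_loop(
    ["ID", "Name"],                     # loop tags
    "_Demo",                            # loop category (hypothetical)
    [["1", "first"], ["2", "second"]],  # data rows
    False,                              # skip_empty_loops
    {None: "."},                        # STR_CONVERSION_DICT equivalent
)
print(star_text)
```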
+[dependencies] +pyo3 = { version = "0.27", features = ["extension-module"] } +memchr = "2" diff --git a/c/Makefile b/c/Makefile deleted file mode 100644 index 398c2cf..0000000 --- a/c/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -python3: cnmrstarmodule.c - python3 setup.py build - cp build/*/*.so ../pynmrstar - rm -rf build -clean: - rm -rfv build/ ../pynmrstar/*.so ./pynmrstar/*.so diff --git a/c/cnmrstarmodule.c b/c/cnmrstarmodule.c deleted file mode 100644 index 8ac4492..0000000 --- a/c/cnmrstarmodule.c +++ /dev/null @@ -1,866 +0,0 @@ -#define PY_SSIZE_T_CLEAN -#include -#include -#include -#include -#include -#include - -// Version number. Only need to update when -// API changes. -#define module_version "3.2.0" - -// Use for returning errors -#define err_size 500 -// Use as a special pointer value -#define done_parsing (void *)1 -// Check if a bit is set -#define CHECK_BIT(var,pos) ((var) & (1<<(pos))) - -struct module_state { - PyObject *error; -}; -#define GETSTATE(m) ((struct module_state*)PyModule_GetState(m)) - - -// Our whitespace chars -char whitespace[4] = " \n\t\v"; - -// A parser struct to keep track of state -typedef struct { - char * source; - char * full_data; - char * token; - long index; - long length; - long line_no; - char last_delimiter; -} parser_data; - -// Initialize the parser -parser_data parser = {NULL, NULL, done_parsing, 0, 0, 0, ' '}; - -void reset_parser(parser_data * parser){ - - if (parser->full_data != NULL){ - free(parser->full_data); - parser->full_data = NULL; - } - if (parser->token != done_parsing){ - free(parser->token); - } - parser->source = NULL; - parser->token = NULL; - parser->index = 0; - parser->length = 0; - parser->line_no = 0; - parser->last_delimiter = ' '; -} - -static PyObject * -PARSE_reset(PyObject *self) -{ - reset_parser(&parser); - - Py_INCREF(Py_None); - return Py_None; -} - -/* Return the index of the first match of needle in haystack, or -1 */ -long get_index(char * haystack, char * needle, long start_pos){ - - haystack += sizeof(char) * start_pos; - char * start = strstr(haystack, needle); - - // Return the end if string not found - if (!start){ - return -1; - } - - // Calculate the length into start is the new word - long diff = start - haystack; - return diff; -} - -/* From: http://stackoverflow.com/questions/779875/what-is-the-function-to-replace-string-in-c#answer-779960 */ -// You must free the result if result is non-NULL. -char *str_replace(const char *orig, char *rep, char *with) { - char *result; // the return string - char *ins; // the next insert point - char *tmp; // varies - int len_rep; // length of rep (the string to remove) - int len_with; // length of with (the string to replace rep with) - int len_front; // distance between rep and end of last rep - int count; // number of replacements - - // sanity checks and initialization - if (!orig || !rep) - return NULL; - len_rep = strlen(rep); - if (len_rep == 0) - return NULL; // empty rep causes infinite loop during count - if (!with) - with = ""; - len_with = strlen(with); - - // Count the number of replacements needed - // We explicitly are casting from a const char * to a char *, which is fine because we don't write to any - // value in the char array - just where ins points. 
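In the Rust port, hand-rolled helpers like `str_replace` above collapse to built-ins. Its only caller in this file is `quote_value` (further down), which uses it to escape values that already contain a STAR multiline terminator. A hedged Python sketch of that transformation — the exact padding width is collapsed in this diff's whitespace, so a single space is an assumption:

```python
def escape_embedded_terminator(value: str) -> str:
    # Sketch of what quote_value (below) uses str_replace for: a value
    # containing "\n;" would end a semicolon-delimited STAR block early,
    # so every newline is padded with leading space and the result is
    # forced to start and end with a newline. Pad width is an assumption.
    if "\n;" in value:
        value = value.replace("\n", "\n ")
        if not value.startswith("\n"):
            value = "\n " + value
        if not value.endswith("\n"):
            value += "\n"
    return value
```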
- ins = (char *)orig; - for (count = 0; (tmp = strstr(ins, rep)); ++count) { - ins = tmp + len_rep; - } - - tmp = result = malloc(strlen(orig) + (len_with - len_rep) * count + 1); - - if (!result) - return NULL; - - // first time through the loop, all the variable are set correctly - // from here on, - // tmp points to the end of the result string - // ins points to the next occurrence of rep in orig - // orig points to the remainder of orig after "end of rep" - while (count--) { - ins = strstr(orig, rep); - len_front = ins - orig; - tmp = strncpy(tmp, orig, len_front) + len_front; - tmp = strcpy(tmp, with) + len_with; - orig += len_front + len_rep; // move to next "end of rep" - } - strcpy(tmp, orig); - return result; -} - -// Creates a new string that is all lower-case -// You must free the result if result is non-NULL. -char *str_to_lowercase(const char *orig) { - char *result; // the return string - - // sanity checks and initialization - if (!orig) - return NULL; - - result = malloc(strlen(orig) + 1); - - if (!result) - return NULL; - - for(int i = 0; orig[i]; i++){ - result[i] = tolower(orig[i]); - } - - return result; -} - - - -/* Use to look for common unset bits between strings. -void get_common_bits(void){ - char one[5] = "data_"; - char two[5] = "save_"; - char three[5] = "loop_"; - char four[5] = "stop_"; - char five[5] = "globa"; - char comb[5]; - - int x; - for (x=0; x< 5; x++){ - comb[x] = (char)(one[x] | two[x] | three[x] | four[x] | five[x]); - } - - printf("Comb: \n"); - for (x=0; x< 5; x++){ - int y; - for (y=0; y<7; y++){ - if (CHECK_BIT(comb[x], y)){ - printf("%d.%d: 1\n", x, y); - } else { - printf("%d.%d: 0\n", x, y); - } - } - } - return; -}*/ - - -void get_file(char *fname, parser_data * parser){ - - reset_parser(parser); - - // Open the file - FILE *f = fopen(fname, "rb"); - if (!f){ - PyErr_SetString(PyExc_IOError, "Could not open file."); - return; - } - - // Determine how long it is - fseek(f, 0, SEEK_END); - long fsize = ftell(f); - fseek(f, 0, SEEK_SET); - - // Allocate space for the file in RAM and load the file - char *string = malloc(fsize + 1); - if (fread(string, fsize, 1, f) != 1){ - PyErr_SetString(PyExc_IOError, "Short read of file."); - return; - } - - fclose(f); - // Zero terminate - string[fsize] = 0; - - parser->full_data = string; - parser->length = fsize; - parser->source = fname; -} - -/* Determines if a character is whitespace */ -bool is_whitespace(char test){ - unsigned long int x; - for (x=0; xindex < parser->length) && - (is_whitespace(parser->full_data[parser->index]))){ - - // Keep track of skipped newlines - if (parser->full_data[parser->index] == '\n'){ - parser->line_no++; - //printf("Skipping in pass_whitespace\n"); - } - - parser->index++; - } -} - -bool check_multiline(parser_data * parser, long length){ - long x; - for (x=parser->index; x <= parser->index+length; x++){ - if (parser->full_data[x] == '\n'){ - return true; - } - } - return false; - -} - -void update_line_number(parser_data * parser, long start_pos, long length){ - long x; - for (x=start_pos; x< start_pos + length; x++){ - if (parser->full_data[x] == '\n'){ - parser->line_no++; - //printf("Skipping in update_line_number\n"); - } - } -} - -/* Returns a new token char * */ -char * update_token(parser_data * parser, long length, char delimiter){ - - if (parser->token != done_parsing){ - free(parser->token); - } - - // Allocate space for the token and copy the data into it - parser->token = malloc(length+1); - memcpy(parser->token, &parser->full_data[parser->index], 
length); - parser->token[length] = '\0'; - - // Figure out what to set the last delimiter as - if (parser->index == 0){ - if (delimiter == '#') { - parser->last_delimiter = '#'; - } else { - parser->last_delimiter = ' '; - } - } else { - parser->last_delimiter = delimiter; - } - - // Check if reference - if ((parser->token[0] == '$') && (parser->last_delimiter == ' ') && (length >1)) { - parser->last_delimiter = '$'; - } - - // Update the line number - update_line_number(parser, parser->index, length + 1); - - parser->index += length + 1; - return parser->token; -} - - -// Get the current line number -long get_line_number(parser_data * parser){ - long num_lines = 0; - long x; - for (x = 0; x < parser->index; x++){ - if (parser->full_data[x] == '\n'){ - num_lines++; - } - } - return num_lines + 1; -} - -/* Gets one token from the file/string. Returns NULL on error and - done_parsing if there are no more tokens. */ -char * get_token(parser_data * parser){ - - //printf("Cur index: %ld\n", parser->index + 1); - - // Reset the delimiter - parser->last_delimiter = '?'; - - // Set up a tmp str pointer to use for searches - char * search; - // And an error char array - char err[err_size] = "Unknown error."; - - // Nothing left - if (parser->token == done_parsing){ - return parser->token; - } - - // Skip whitespace - pass_whitespace(parser); - - // Stop if we are at the end - if (parser->index >= parser->length){ - free(parser->token); - parser->token = done_parsing; - return parser->token; - } - - // See if this is a comment - if so skip it - if (parser->full_data[parser->index] == '#'){ - search = "\n"; - long length = get_index(parser->full_data, search, parser->index); - - // Handle the edge case where this is the last line of the file and there is no newline - if (length == -1){ - free(parser->token); - parser->token = done_parsing; - return parser->token; - } - - // Return the comment - return update_token(parser, length, '#'); - } - - // See if this is a multiline value - if ((parser->length - parser->index > 1) && (parser->full_data[parser->index] == ';') && (parser->full_data[parser->index+1] == '\n')){ - search = "\n;"; - long length = get_index(parser->full_data, search, parser->index); - - // Handle the edge case where this is the last line of the file and there is no newline - if (length == -1){ - snprintf(err, sizeof(err), "Invalid file. Semicolon-delineated value was not terminated. Error on line: %ld", get_line_number(parser)); - PyErr_SetString(PyExc_ValueError, err); - free(parser->token); - parser->token = NULL; - return parser->token; - } - - // We started with a newline so make sure to count it - parser->line_no++; - - parser->index += 2; - return update_token(parser, length-1, ';'); - } - - // Handle values quoted with ' - if (parser->full_data[parser->index] == '\''){ - search = "'"; - long end_quote = get_index(parser->full_data, search, parser->index + 1); - - // Handle the case where there is no terminating quote in the file - if (end_quote == -1){ - snprintf(err, sizeof(err), "Invalid file. Single quoted value was not terminated. 
Error on line: %ld", get_line_number(parser)); - PyErr_SetString(PyExc_ValueError, err); - free(parser->token); - parser->token = NULL; - return parser->token; - } - - // Make sure we don't stop for quotes that are not followed by whitespace - while ((parser->index+end_quote+2 < parser->length) && (!is_whitespace(parser->full_data[parser->index+end_quote+2]))){ - long next_index = get_index(parser->full_data, search, parser->index+end_quote+2); - if (next_index == -1){ - PyErr_SetString(PyExc_ValueError, "Invalid file. Single quoted value was never terminated at end of file."); - free(parser->token); - parser->token = NULL; - return parser->token; - } - end_quote += next_index + 1; - } - - // See if the quote has a newline - if (check_multiline(parser, end_quote)){ - snprintf(err, sizeof(err), "Invalid file. Single quoted value was not terminated on the same line it began. Error on line: %ld", get_line_number(parser)); - PyErr_SetString(PyExc_ValueError, err); - free(parser->token); - parser->token = NULL; - return parser->token; - } - - // Move the index 1 to skip the ' - parser->index++; - return update_token(parser, end_quote, '\''); - } - - // Handle values quoted with " - if (parser->full_data[parser->index] == '\"'){ - search = "\""; - long end_quote = get_index(parser->full_data, search, parser->index + 1); - - // Handle the case where there is no terminating quote in the file - if (end_quote == -1){ - snprintf(err, sizeof(err), "Invalid file. Double quoted value was not terminated. Error on line: %ld", get_line_number(parser)); - PyErr_SetString(PyExc_ValueError, err); - free(parser->token); - parser->token = NULL; - return parser->token; - } - - // Make sure we don't stop for quotes that are not followed by whitespace - while ((parser->index+end_quote+2 < parser->length) && (!is_whitespace(parser->full_data[parser->index+end_quote+2]))){ - long next_index = get_index(parser->full_data, search, parser->index+end_quote+2); - if (next_index == -1){ - PyErr_SetString(PyExc_ValueError, "Invalid file. Double quoted value was never terminated at end of file."); - free(parser->token); - parser->token = NULL; - return parser->token; - } - end_quote += next_index + 1; - } - - // See if the quote has a newline - if (check_multiline(parser, end_quote)){ - snprintf(err, sizeof(err), "Invalid file. Double quoted value was not terminated on the same line it began. Error on line: %ld", get_line_number(parser)); - PyErr_SetString(PyExc_ValueError, err); - free(parser->token); - parser->token = NULL; - return parser->token; - } - - // Move the index 1 to skip the " - parser->index++; - return update_token(parser, end_quote, '"'); - } - - // Nothing special. Just get the token - long end_pos = get_next_whitespace(parser->full_data, parser->index); - return update_token(parser, end_pos - parser->index, ' '); -} - -/* IDEA: Implementing the tokenizer following this pattern may - * be slightly faster: - -function readToken() // note: returns only one token each time - while !eof - c = peekChar() - if c in A-Za-z - return readIdentifier() - else if c in 0-9 - return readInteger() - else if c in ' \n\r\t\v\f' - nextChar() - ... 
- return EOF - - * - */ - - - - -// Implements startswith -bool starts_with(const char *a, const char *b) -{ - if(strncmp(a, b, strlen(b)) == 0) return true; - return false; -} - -bool ends_with(const char * str, const char * suffix) -{ - int str_len = strlen(str); - int suffix_len = strlen(suffix); - - return - (str_len >= suffix_len) && - (0 == strcmp(str + (str_len-suffix_len), suffix)); -} - -/* - Automatically quotes the value in the appropriate way. Don't - quote values you send to this method or they will show up in - another set of quotes as part of the actual data. E.g.: - - quote_value('"e. coli"') returns '\'"e. coli"\'' - - while - - quote_value("e. coli") returns "'e. coli'" -*/ -static PyObject * quote_value(PyObject *self, PyObject *args){ - char * format; - PyObject * orig; - PyObject * result; - - // Get the object to clean - if (!PyArg_ParseTuple(args, "O", &orig)){ - PyErr_SetString(PyExc_ValueError, "Failed to parse the input arguments."); - return NULL; - } - - // Convert the python object to a string - PyObject * temp = PyObject_Str(orig); - if (temp == NULL){ - PyErr_SetString(PyExc_ValueError, "Failed to convert the object you passed to a string using __str__()."); - Py_DECREF(temp); - return NULL; - } - - const char * str = PyUnicode_AsUTF8(temp); - - // Figure out how long the string is - long len = strlen(str); - - // Don't allow the empty string - if (len == 0){ - PyErr_SetString(PyExc_ValueError, "Empty strings are not allowed as values. Use the None singleton, or '.' to represent null values."); - Py_DECREF(temp); - return NULL; - } - - // If it is a STAR-format multiline comment already, we need to escape it - if (strstr(str, "\n;") != NULL){ - - // Insert the spaces - char * replaced_string; - replaced_string = str_replace(str, "\n", "\n "); - - // But always newline terminate it - if (!ends_with(replaced_string, "\n")){ - // Must start with newline too - if (replaced_string[0] != '\n'){ - format = "\n %s\n"; - } else { - format = "%s\n"; - } - } else { - if (replaced_string[0] != '\n'){ - format = "\n %s"; - } else { - format = "%s"; - } - } - - PyObject* result = PyUnicode_FromFormat(format, replaced_string); - free(replaced_string); - Py_DECREF(temp); - return(result); - } - - // If it's going on it's own line, don't touch it - if (strstr(str, "\n") != NULL){ - // But always newline terminate it - if (str[len-1] != '\n'){ - result = PyUnicode_FromFormat("%s\n", str); - Py_DECREF(temp); - return result; - } else { - // Return as is if it already ends with a newline - result = PyUnicode_FromString(str); - Py_DECREF(temp); - return result; - } - } - - // If it has single and double quotes it will need to go on its - // own line under certain conditions... 
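The check that follows decides whether single or double quotes can safely wrap the value. The body of that loop is garbled in this rendering of the patch, so here is a hedged Python reconstruction of the rule the surrounding comments describe: a quote character is only usable if no interior occurrence of it is followed by whitespace, since quote-plus-whitespace ends a STAR token early.

```python
WHITESPACE = " \n\t\v"  # mirrors the whitespace[] array defined above

def can_wrap(value: str, quote: str) -> bool:
    # Hedged reconstruction: wrapping with `quote` is safe unless some
    # interior occurrence of that character is followed by whitespace.
    for i, ch in enumerate(value[:-1]):
        if ch == quote and value[i + 1] in WHITESPACE:
            return False
    return True

# If neither can_wrap(value, "'") nor can_wrap(value, '"') holds, the value
# must instead be emitted as a semicolon-delimited multiline block.
```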
- bool has_single = strstr(str, "'") != NULL; - bool has_double = strstr(str, "\"") != NULL; - - bool can_wrap_single = true; - bool can_wrap_double = true; - - if (has_double && has_single){ - // Determine which quote types are appropriate to use - // (Which depends on if the existing quotes are embedded in text - // or are followed by whitespace) - long x; - for (x=0; xlast_delimiter == '#'){ - token = get_token(&parser); - } - - // Pass errors up the chain - if (token == NULL){ - return NULL; - } - - // Unwrap embedded STAR if all lines start with three spaces - if ((my_parser->last_delimiter == ';') && (starts_with(token, "\n "))){ - bool shift_over = true; - - size_t token_len = strlen(token); - unsigned long int c; - for (c=0; cline_no, my_parser->last_delimiter); - } - return Py_BuildValue("slC", token, my_parser->line_no, my_parser->last_delimiter); -} - -static PyObject * -version(PyObject *self) -{ - return PyUnicode_FromString(module_version); -} - -static PyMethodDef cnmrstar_methods[] = { - {"quote_value", (PyCFunction)quote_value, METH_VARARGS, - "Properly quote or encapsulate a value before printing."}, - - {"load", (PyCFunction)PARSE_load, METH_VARARGS, - "Load a file in preparation to tokenize."}, - - {"load_string", (PyCFunction)PARSE_load_string, METH_VARARGS, - "Load a string in preparation to tokenize."}, - - {"get_token_full", (PyCFunction)PARSE_get_token_full, METH_NOARGS, - "Get one token from the file as well as the line number and delimiter."}, - - {"reset", (PyCFunction)PARSE_reset, METH_NOARGS, - "Reset the tokenizer state."}, - - {"version", (PyCFunction)version, METH_NOARGS, - "Returns the version of the module."}, - - {NULL, NULL, 0, NULL} /* Sentinel */ -}; - -static int myextension_traverse(PyObject *m, visitproc visit, void *arg) { - Py_VISIT(GETSTATE(m)->error); - return 0; -} - -static int myextension_clear(PyObject *m) { - Py_CLEAR(GETSTATE(m)->error); - return 0; -} - -static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, - "cnmrstar", - "A NMR-STAR tokenizer implemented in C.", - sizeof(struct module_state), - cnmrstar_methods, - NULL, - myextension_traverse, - myextension_clear, - NULL -}; - -#define INITERROR return NULL - -PyMODINIT_FUNC -PyInit_cnmrstar(void){ - PyObject *module = PyModule_Create(&moduledef); - - if (module == NULL) - INITERROR; - struct module_state *st = GETSTATE(module); - - st->error = PyErr_NewException("cnmrstar.Error", NULL, NULL); - if (st->error == NULL) { - Py_DECREF(module); - INITERROR; - } - - return module; -} diff --git a/c/setup.py b/c/setup.py deleted file mode 100644 index 8718bfc..0000000 --- a/c/setup.py +++ /dev/null @@ -1,10 +0,0 @@ -from distutils.core import setup, Extension - -cnmrstar = Extension('cnmrstar', - sources=['cnmrstarmodule.c'], - extra_compile_args=["-funroll-loops", "-O3"]) - -setup(name='cnmrstar', - version='3.2.0', - description='This contains a really fast NMR-STAR tokenizer and value sanitizer.', - ext_modules=[cnmrstar]) diff --git a/docs/conf.py b/docs/conf.py index d6f3e19..661e816 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,7 +20,7 @@ # -- Project information ----------------------------------------------------- project = 'PyNMR-STAR' -copyright = '2021, Jon Wedell' +copyright = 'UConn Health' author = 'Jon Wedell' # The full version, including alpha/beta/rc tags diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 4786b54..272ae51 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -1,6 +1,22 @@ Release notes ============= +3.4.0b1 
+~~~~~~~
+
+Major improvements:
+
+The C helper module has been reimplemented in Rust, and other performance-critical code (specifically, the
+parsing logic and the loop string formatting logic) has been moved into the Rust module as well. Additional
+work on optimizing the Rust module has yielded large speedups for common operations:
+
+- Parsing a file is now nearly twice as fast.
+- Rendering an Entry to a string is now nearly three times as fast.
+
+This change is tested and believed to be fully backwards compatible, but due to the scale of the change it is
+being released under a new minor version number.
+
 3.3.6
 ~~~~~
 
diff --git a/pynmrstar/__init__.py b/pynmrstar/__init__.py
index 46396d2..e332930 100644
--- a/pynmrstar/__init__.py
+++ b/pynmrstar/__init__.py
@@ -7,38 +7,22 @@
 
 Use python's built in help function for documentation."""
+import platform as _platform
+import sys as _sys
+
+if _platform.python_implementation() == "PyPy" and _sys.version_info < (3, 11):
+    raise ImportError("When using PyPy, pynmrstar requires a version >= 3.11")
+
 import decimal as _decimal
 import logging
-import os
-
-try:
-    import cnmrstar
-except ImportError:
-    try:
-        import pynmrstar.cnmrstar as cnmrstar
-    except ImportError:
-        if os.environ.get('READTHEDOCS'):
-            cnmrstar = None
-        else:
-            raise ImportError('Could not import cnmrstar sub-module! Your installation appears to be broken.')
+import pynmrstar.definitions as definitions
 from pynmrstar import utils
-from pynmrstar._internal import __version__, min_cnmrstar_version
+from pynmrstar._internal import __version__
 from pynmrstar.entry import Entry
 from pynmrstar.loop import Loop
-from pynmrstar.parser import Parser as _Parser
 from pynmrstar.saveframe import Saveframe
 from pynmrstar.schema import Schema
-import pynmrstar.definitions as definitions
-
-if cnmrstar:
-    if "version" not in dir(cnmrstar):
-        raise ImportError(f"Could not determine the version of cnmrstar installed, and version {min_cnmrstar_version} or "
-                          "greater is required.")
-    if cnmrstar.version() < min_cnmrstar_version:
-        raise ImportError("The version of the cnmrstar module installed does not meet the requirements. As this should be "
-                          f"handled automatically, there may be an issue with your installation. Version installed: "
-                          f"{cnmrstar.version()}. 
Version required: {min_cnmrstar_version}") # Set up logging logger = logging.getLogger('pynmrstar') @@ -50,6 +34,5 @@ del entry del saveframe del schema -del parser -__all__ = ['Loop', 'Saveframe', 'Entry', 'Schema', 'definitions', 'utils', '__version__', 'exceptions', 'cnmrstar'] +__all__ = ['Loop', 'Saveframe', 'Entry', 'Schema', 'definitions', 'utils', '__version__', 'exceptions'] diff --git a/pynmrstar/_internal.py b/pynmrstar/_internal.py index 207d40c..b76bc61 100644 --- a/pynmrstar/_internal.py +++ b/pynmrstar/_internal.py @@ -6,6 +6,7 @@ import zlib from datetime import date from gzip import GzipFile +from importlib.metadata import version from io import StringIO, BytesIO from pathlib import Path from typing import Dict, Union, IO, List, Tuple @@ -15,8 +16,7 @@ import pynmrstar -__version__: str = "3.3.6" -min_cnmrstar_version: str = "3.2.0" +__version__: str = version("pynmrstar") # Create a session to reuse for the duration of the program run _session = requests.session() diff --git a/pynmrstar/definitions.py b/pynmrstar/definitions.py index 69595cc..a2e7751 100644 --- a/pynmrstar/definitions.py +++ b/pynmrstar/definitions.py @@ -22,7 +22,6 @@ NULL_VALUES = ['', ".", "?", None] WHITESPACE: str = " \t\n\v" -RESERVED_KEYWORDS = ["data_", "save_", "loop_", "stop_", "global_"] STR_CONVERSION_DICT: dict = {None: "."} API_URL: str = "https://api.bmrb.io/v2" diff --git a/pynmrstar/entry.py b/pynmrstar/entry.py index 6e44cc0..1858728 100644 --- a/pynmrstar/entry.py +++ b/pynmrstar/entry.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import TextIO, BinaryIO, Union, List, Optional, Dict, Any, Tuple -from pynmrstar import definitions, utils, loop as loop_mod, parser as parser_mod, saveframe as saveframe_mod +from pynmrstar import definitions, utils, loop as loop_mod, saveframe as saveframe_mod, parser from pynmrstar._internal import _json_serialize, _interpret_file, _get_entry_from_database, write_to_file from pynmrstar.exceptions import InvalidStateError from pynmrstar.schema import Schema @@ -128,8 +128,10 @@ def __init__(self, **kwargs) -> None: return # Load the BMRB entry from the file - parser: parser_mod.Parser = parser_mod.Parser(entry_to_parse_into=self) - parser.parse(star_buffer.read(), source=self.source, convert_data_types=kwargs.get('convert_data_types', False), + parser.parse(star_buffer.read(), + parse_into=self, + source=self.source, + convert_data_types=kwargs.get('convert_data_types', False), raise_parse_warnings=kwargs.get('raise_parse_warnings', False)) def __iter__(self) -> saveframe_mod.Saveframe: diff --git a/pynmrstar/loop.py b/pynmrstar/loop.py index 5d27842..61646fd 100644 --- a/pynmrstar/loop.py +++ b/pynmrstar/loop.py @@ -5,13 +5,14 @@ from io import StringIO from itertools import chain from pathlib import Path -from typing import TextIO, BinaryIO, Union, List, Optional, Any, Dict, Callable, Tuple +from typing import TextIO, BinaryIO, Union, List, Optional, Any, Dict, Callable, Tuple, Generator -from pynmrstar import definitions, utils, entry as entry_mod +import pynmrstar_parser + +from pynmrstar import definitions, utils, entry as entry_mod, parser from pynmrstar._internal import _json_serialize, _interpret_file from pynmrstar._types import DataInput -from pynmrstar.exceptions import InvalidStateError -from pynmrstar.parser import Parser +from pynmrstar.exceptions import InvalidStateError, ParsingError from pynmrstar.schema import Schema @@ -121,8 +122,8 @@ def __init__(self, **kwargs) -> None: # Load the BMRB entry from the file star_buffer = 
StringIO(f"data_0 save_internaluseyoushouldntseethis_frame _internal.use internal " f"{star_buffer.read()} save_") - parser = Parser(entry_to_parse_into=tmp_entry) parser.parse(star_buffer.read(), + parse_into=tmp_entry, source=self.source, convert_data_types=kwargs.get('convert_data_types', False), raise_parse_warnings=kwargs.get('raise_parse_warnings', False), @@ -130,16 +131,18 @@ def __init__(self, **kwargs) -> None: # Check that there was only one loop here if len(tmp_entry[0].loops) > 1: - raise ValueError("You attempted to parse one loop but the source you provided had more than one loop. " - "Please either parse all loops as a saveframe or only parse one loop. Loops detected: " + - str(tmp_entry[0].loops)) + raise ParsingError("You attempted to parse one loop but the source you provided had more than one loop. " + "Please either parse all loops as a saveframe or only parse one loop. Loops detected: " + + str(tmp_entry[0].loops)) - # Copy the first parsed saveframe into ourself + # Copy the first parsed loop into ourself + if len(tmp_entry[0].loops) == 0: + raise ParsingError("You attempted to parse an empty string as a loop.") self._tags = tmp_entry[0][0].tags self.data = tmp_entry[0][0].data self.category = tmp_entry[0][0].category - def __iter__(self) -> list: + def __iter__(self) -> Generator[List[Any], Any, None]: """ Yields each of the rows contained within the loop. """ for row in self.data: @@ -191,89 +194,33 @@ def __setitem__(self, key: str, item: Any) -> None: def __str__(self, skip_empty_loops: bool = False, skip_empty_tags: bool = False) -> str: """Returns the loop in STAR format as a string.""" - # Check if there is any data in this loop - if len(self.data) == 0: - # They do not want us to print empty loops - if skip_empty_loops: - return "" - else: - # If we have no tags than return the empty loop - if len(self._tags) == 0: - return "\n loop_\n\n stop_\n" - - if len(self._tags) == 0: - raise InvalidStateError("Impossible to print data if there are no associated tags. Error in loop " - f"'{self.category}' which contains data but hasn't had any tags added.") - - # Make sure the tags and data match - self._check_tags_match_data() - - # If skipping null tags, it's easier to filter out a loop with only real tags and then print - if skip_empty_tags: + # If skipping null tags, filter and recurse (this path stays in Python) + if skip_empty_tags and len(self.data) > 0: has_data = [not all([_ in definitions.NULL_VALUES for _ in column]) for column in zip(*self.data)] return self.filter([tag for x, tag in enumerate(self._tags) if has_data[x]]).format() - # Start the loop - return_chunks = ["\n loop_\n"] - # Print the tags - format_string = " %-s\n" - # Check to make sure our category is set - if self.category is None: + if self.category is None and len(self._tags) > 0: raise InvalidStateError("The category was never set for this loop. Either add a tag with the category " "intact, specify it when generating the loop, or set it using Loop.set_category().") - # Print the categories - if self.category is None: - for tag in self._tags: - return_chunks.append(format_string % tag) - else: - for tag in self._tags: - return_chunks.append(format_string % (self.category + "." 
+ tag)) - - return_chunks.append("\n") - - if len(self.data) != 0: - - # Make a copy of the data - working_data = [] - title_widths = [4]*len(self.data[0]) + # Make sure the tags and data match + if len(self.data) > 0: + self._check_tags_match_data() - # Put quotes as needed on the data - for row_pos, row in enumerate(self.data): - clean_row = [] - for col_pos, x in enumerate(row): - try: - clean_val = utils.quote_value(x) - clean_row.append(clean_val) - length = len(clean_val) + 3 - if length > title_widths[col_pos] and "\n" not in clean_val: - title_widths[col_pos] = length - - except ValueError: - raise InvalidStateError('Cannot generate NMR-STAR for entry, as empty strings are not valid ' - 'tag values in NMR-STAR. Please either replace the empty strings with' - ' None objects, or set pynmrstar.definitions.STR_CONVERSION_DICT[' - '\'\'] = None.\n' - f'Loop: {self.category} Row: {row_pos} Column: {col_pos}') - - working_data.append(clean_row) - - # Generate the format string - format_string = " " + "%-*s" * len(self._tags) + " \n" - - # Print the data, with the tags sized appropriately - for datum in working_data: - for pos, item in enumerate(datum): - if "\n" in item: - datum[pos] = "\n;\n%s;\n" % item - - # Print the data (combine the tags' widths with their data) - tag_width_list = [d for d in zip(title_widths, datum)] - return_chunks.append(format_string % tuple(chain.from_iterable(tag_width_list))) - - # Close the loop - return "".join(return_chunks) + "\n stop_\n" + # Use the Rust implementation for the main formatting work + # Pass STR_CONVERSION_DICT so Rust can handle conversions + try: + return pynmrstar_parser.format_loop( + self._tags, + self.category or "", + self.data, + skip_empty_loops, + definitions.STR_CONVERSION_DICT + ) + except ValueError as e: + # Convert ValueError from Rust to InvalidStateError for consistency + raise InvalidStateError(str(e)) @property def _lc_tags(self) -> Dict[str, int]: diff --git a/pynmrstar/parser.py b/pynmrstar/parser.py index 2247b6a..05d0bc8 100644 --- a/pynmrstar/parser.py +++ b/pynmrstar/parser.py @@ -1,287 +1,19 @@ import logging -import re -from typing import Optional -from pynmrstar import definitions, cnmrstar, entry as entry_mod, loop as loop_mod, saveframe as saveframe_mod, schema as schema_mod +import pynmrstar_parser + +from pynmrstar import entry as entry_mod, schema as schema_mod from pynmrstar.exceptions import ParsingError logger = logging.getLogger('pynmrstar') - -class Parser(object): - """Parses an entry. You should not ever use this class directly.""" - - def __init__(self, entry_to_parse_into: 'entry_mod.Entry' = None) -> None: - - # Just make an entry to parse into if called with no entry passed - if entry_to_parse_into is None: - entry_to_parse_into = entry_mod.Entry.from_scratch("") - - self.ent: entry_mod.Entry = entry_to_parse_into - self.full_data: str = "" - self.token: str = "" - self.source: str = "unknown" - self.delimiter: str = " " - self.line_number: int = 0 - - def get_token(self) -> str: - """ Returns the next token in the parsing process.""" - - try: - self.token, self.line_number, self.delimiter = cnmrstar.get_token_full() - except ValueError as err: - raise ParsingError(str(err)) - - return self.token - - @staticmethod - def load_data(data: str) -> None: - """ Loads data in preparation of parsing and cleans up newlines - and massages the data to make parsing work properly when multi-line - values aren't as expected. 
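For call sites, the practical effect of this file's rewrite is that the `Parser` class disappears in favor of the module-level `parse()` defined at the end of this hunk. A small migration sketch (entry contents hypothetical):

```python
import pynmrstar
from pynmrstar import parser

entry = pynmrstar.Entry.from_scratch("demo")   # container to parse into

# Before: parser_mod.Parser(entry_to_parse_into=entry).parse(text, source=...)
# After: one function call; the Rust tokenizer/parser does the work.
parser.parse("data_demo\nsave_demo\n_Demo.Value 1\nsave_\n",
             parse_into=entry,
             source="inline string")
print(entry.get_tag("_Demo.Value"))            # expected: ['1']
```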
Useful for manually getting tokens from - the parser.""" - - # Fix DOS line endings - data = data.replace("\r\n", "\n").replace("\r", "\n") - # Change '\n; data ' started multi-lines to '\n;\ndata' - data = re.sub(r'\n;([^\n]+?)\n', r'\n;\n\1\n', data) - - cnmrstar.load_string(data) - - def parse(self, - data: str, - source: str = "unknown", - raise_parse_warnings: bool = False, - convert_data_types: bool = False, - schema: 'schema_mod.Schema' = None) -> 'entry_mod.Entry': - """ Parses the string provided as data as an NMR-STAR entry - and returns the parsed entry. Raises ParsingError on exceptions. - - Set raise_parse_warnings to raise an exception if the file has - something technically incorrect, but still parsable. - - Following is a list of the types of errors that would trigger - raise_parse_warnings: - - * A loop with no data was found. - * A loop with no tags or values was found. - * A tag with an improper multi-line value was found. - Multi-line values should look like this: - \n;\nThe multi-line\nvalue here.\n;\n - but the tag looked like this: - \n; The multi-line\nvalue here.\n;\n""" - - self.load_data(data) - self.get_token() - - # Make sure this is actually a STAR file - if not self.token.lower().startswith("data_"): - raise ParsingError("Invalid file. NMR-STAR files must start with 'data_' followed by the data name. " - f"Did you accidentally select the wrong file? Your file started with '{self.token}'.", - self.line_number) - - # Make sure there is a data name - elif len(self.token) < 6: - raise ParsingError("'data_' must be followed by data name. Simply 'data_' is not allowed.", - self.line_number) - - if self.delimiter != " ": - raise ParsingError("The data_ keyword may not be quoted or semicolon-delimited.", - self.line_number) - - # Set the entry_id - self.ent._entry_id = self.token[5:] - self.source = source - - # We are expecting to get saveframes - while self.get_token() is not None: - - if not self.token.lower().startswith("save_"): - raise ParsingError(f"Only 'save_NAME' is valid in the body of a NMR-STAR file. Found '{self.token}'.", - self.line_number) - - if len(self.token) < 6: - raise ParsingError("'save_' must be followed by saveframe name. 
You have a 'save_' tag which is " - "illegal without a specified saveframe name.", self.line_number) - - if self.delimiter != " ": - raise ParsingError("The save_ keyword may not be quoted or semicolon-delimited.", - self.line_number) - - # Add the saveframe - cur_frame: Optional[saveframe_mod.Saveframe] = saveframe_mod.Saveframe.from_scratch(self.token[5:], - source=source) - self.ent.add_saveframe(cur_frame) - - # We are in a saveframe - while self.get_token() is not None: - - if self.token.lower() == "loop_": - if self.delimiter != " ": - raise ParsingError("The loop_ keyword may not be quoted or semicolon-delimited.", - self.line_number) - - cur_loop: Optional[loop_mod.Loop] = loop_mod.Loop.from_scratch(source=source) - - # We are in a loop - cur_data = [] - seen_data = False - in_loop = True - while in_loop and self.get_token() is not None: - - # Add a tag if it isn't quoted - if quoted, it should be treated as a data value - if self.token.startswith("_") and self.delimiter == " ": - try: - cur_loop.add_tag(self.token) - except ValueError as err: - raise ParsingError(str(err), self.line_number) - - # On to data - else: - - # Now that we have the tags we can add the loop - # to the current saveframe - try: - cur_frame.add_loop(cur_loop) - except ValueError as err: - raise ParsingError(str(err), self.line_number) - - # We are in the data block of a loop - while self.token is not None: - if self.token.lower() == "stop_": - if self.delimiter != " ": - raise ParsingError( - "The stop_ keyword may not be quoted or semicolon-delimited.", - self.line_number) - if len(cur_loop.tags) == 0: - if raise_parse_warnings: - raise ParsingError("Loop with no tags.", self.line_number) - else: - logger.warning('Loop with no tags in parsed file on line: %s', - self.line_number) - cur_loop = None - if not seen_data: - if raise_parse_warnings: - raise ParsingError("Loop with no data.", self.line_number) - else: - logger.warning("Loop with no data on line: %s", self.line_number) - - if len(cur_data) > 0: - if len(cur_data) % len(cur_loop.tags) != 0: - raise ParsingError(f"The loop being parsed, '{cur_loop.category}' does " - f"not have the expected number of data elements. This " - f"indicates that either one or more tag values are " - f"either missing from or duplicated in this loop.", - self.line_number) - try: - cur_loop.add_data(cur_data, - rearrange=True, - convert_data_types=convert_data_types, - schema=schema) - # If there is an issue with the loops during parsing, raise a parse error - # rather than the ValueError that would be raised if they made the mistake - # directly - except ValueError as e: - raise ParsingError(str(e)) - cur_data = [] - - cur_loop = None - in_loop = False - break - elif self.token.startswith("_") and self.delimiter == " ": - raise ParsingError("Cannot have more loop tags after loop data. Or perhaps this " - f"was a data value which was not quoted (but must be, " - f"if it starts with '_')? Value: '{self.token}'.", - self.line_number) - else: - if len(cur_loop.tags) == 0: - raise ParsingError("Data value found in loop before any loop tags were " - "defined. Value: '{self.token}'", - self.line_number) - - if self.token.lower() in definitions.RESERVED_KEYWORDS and self.delimiter == " ": - error = "Cannot use keywords as data values unless quoted or semi-colon " \ - "delimited. Perhaps this is a loop that wasn't properly terminated " \ - "with a 'stop_' keyword before the saveframe ended or another loop " \ - f"began? 
Value found where 'stop_' or another data value expected: " \ - f"'{self.token}'." - if len(cur_data) > 0: - error += f" Last loop data element parsed: '{cur_data[-1]}'." - raise ParsingError(error, self.line_number) - cur_data.append(self.token) - seen_data = True - - # Get the next token - self.get_token() - - if not self.token: - raise ParsingError(f"Loop improperly terminated at end of file. Loops must end with the " - f"'stop_' token, but the file ended without the stop token.", - self.line_number) - if self.token.lower() != 'stop_': - raise ParsingError(f"Loop improperly terminated at end of file. Loops must end with the " - f"'stop_' token, but the token '{self.token}' was found instead.", - self.line_number) - - # Close saveframe - elif self.token.lower() == "save_": - if self.delimiter not in " ;": - raise ParsingError("The save_ keyword may not be quoted or semicolon-delimited.", - self.line_number) - - if cur_frame.tag_prefix is None: - raise ParsingError("The tag prefix was never set! Either the saveframe had no tags, you " - "tried to read a version 2.1 file, or there is something else wrong with " - f"your file. Saveframe error occurred within: '{cur_frame.name}'", - line_number=self.line_number) - break - - # Invalid content in saveframe - elif not self.token.startswith("_"): - if cur_frame.name == 'internaluseyoushouldntseethis_frame': - raise ParsingError(f"Invalid token found in loop contents. Expecting 'loop_' " - f"but found: '{self.token}'", line_number=self.line_number) - else: - raise ParsingError(f"Invalid token found in saveframe '{cur_frame.name}'. Expecting a tag, " - f"loop, or 'save_' token but found: '{self.token}'", - line_number=self.line_number) - - # Add a tag - else: - if self.delimiter != " ": - raise ParsingError(f"Saveframe tags may not be quoted or semicolon-delimited. Quoted tag: '" - f"{self.token}'.", - self.line_number) - cur_tag: Optional[str] = self.token - - # We are in a saveframe and waiting for the saveframe tag - self.get_token() - if self.delimiter == " ": - if self.token.lower() in definitions.RESERVED_KEYWORDS: - raise ParsingError("Cannot use keywords as data values unless quoted or semi-colon " - f"delimited. Illegal value: '{self.token}'", self.line_number) - if self.token.startswith("_"): - raise ParsingError( - "Cannot have a tag value start with an underscore unless the entire value " - "is quoted. You may be missing a data value on the previous line. " - f"Illegal value: '{self.token}'", self.line_number) - try: - cur_frame.add_tag(cur_tag, - self.token, - convert_data_types=convert_data_types, - schema=schema) - except ValueError as err: - raise ParsingError(str(err), line_number=self.line_number) - - if not self.token or self.token.lower() != "save_": - raise ParsingError("Saveframe improperly terminated at end of file. 
Saveframes must be terminated " - "with the 'save_' token.", - self.line_number) - - # Free the memory of the original copy of the data we parsed - self.full_data = None - - # Reset the parser - cnmrstar.reset() - - return self.ent +def parse(data: str, + parse_into: 'entry_mod.Entry', + source: str = "unknown", + raise_parse_warnings: bool = False, + convert_data_types: bool = False, + schema: 'schema_mod.Schema' = None) -> None: + try: + pynmrstar_parser.parse(data, parse_into, source, raise_parse_warnings, convert_data_types, schema) + except ValueError as e: + raise ParsingError(str(e)) diff --git a/pynmrstar/saveframe.py b/pynmrstar/saveframe.py index bace162..976c7c1 100644 --- a/pynmrstar/saveframe.py +++ b/pynmrstar/saveframe.py @@ -5,7 +5,9 @@ from pathlib import Path from typing import TextIO, BinaryIO, Union, List, Optional, Any, Dict, Iterable, Tuple -from pynmrstar import definitions, entry as entry_mod, loop as loop_mod, parser as parser_mod, utils +from pynmrstar_parser import pynmrstar_parser + +from pynmrstar import definitions, entry as entry_mod, loop as loop_mod, utils, parser from pynmrstar._internal import _get_comments, _json_serialize, _interpret_file, get_clean_tag_list, write_to_file from pynmrstar.exceptions import InvalidStateError from pynmrstar.schema import Schema @@ -230,8 +232,10 @@ def __init__(self, **kwargs) -> None: # Load the BMRB entry from the file star_buffer = StringIO("data_1 " + star_buffer.read()) - parser = parser_mod.Parser(entry_to_parse_into=tmp_entry) - parser.parse(star_buffer.read(), source=self.source, convert_data_types=kwargs.get('convert_data_types', False), + parser.parse(star_buffer.read(), + parse_into=tmp_entry, + source=self.source, + convert_data_types=kwargs.get('convert_data_types', False), raise_parse_warnings=kwargs.get('raise_parse_warnings', False)) # Copy the first parsed saveframe into ourself @@ -502,47 +506,33 @@ def __str__(self, if self.tag_prefix is None: raise InvalidStateError(f"The tag prefix was never set! Error in saveframe named '{self.name}'.") - return_chunks = [] - - # Insert the comment if not disabled + # Handle comments in Python (simple, not performance-critical) + comment_prefix = "" if show_comments: if self._category in _get_comments(): this_comment = _get_comments()[self._category] if first_in_category or this_comment['every_flag']: - return_chunks.append(_get_comments()[self._category]['comment']) - - # Print the saveframe - return_chunks.append(f"save_{self.name}\n") + comment_prefix = _get_comments()[self._category]['comment'] - if len(self._tags) > 0: - width = max([len(self.tag_prefix + "." + x[0]) for x in self._tags]) - pstring = " %%-%ds %%s\n" % width - mstring = " %%-%ds\n;\n%%s;\n" % width + # Format the loops first (each loop's format() already uses Rust) + formatted_loops = [each_loop.format(skip_empty_loops=skip_empty_loops, skip_empty_tags=skip_empty_tags) + for each_loop in self._loops] - # Print the tags - for each_tag in self._tags: - if skip_empty_tags and each_tag[1] in definitions.NULL_VALUES: - continue - try: - clean_tag = utils.quote_value(each_tag[1]) - except ValueError: - raise InvalidStateError('Cannot generate NMR-STAR for entry, as empty strings are not valid tag' - ' values in NMR-STAR. Please either replace the empty strings with None ' - 'objects, or set pynmrstar.definitions.STR_CONVERSION_DICT[\'\'] = None. ' - f'Saveframe: {self.name} Tag: {each_tag[0]}') - - formatted_tag = self.tag_prefix + "." 
+ each_tag[0] - if "\n" in clean_tag: - return_chunks.append(mstring % (formatted_tag, clean_tag)) - else: - return_chunks.append(pstring % (formatted_tag, clean_tag)) - - # Print any loops - for each_loop in self._loops: - return_chunks.append(each_loop.format(skip_empty_loops=skip_empty_loops, skip_empty_tags=skip_empty_tags)) - - # Close the saveframe - return "".join(return_chunks) + "\nsave_\n" + # Use the Rust implementation for the main formatting work + try: + result = pynmrstar_parser.format_saveframe( + self.name, + self.tag_prefix, + self._tags, + formatted_loops, + skip_empty_tags, + definitions.STR_CONVERSION_DICT, + definitions.NULL_VALUES + ) + return comment_prefix + result + except ValueError as e: + # Convert ValueError from Rust to InvalidStateError for consistency + raise InvalidStateError(str(e)) def add_loop(self, loop_to_add: 'loop_mod.Loop') -> None: """Add a loop to the saveframe loops.""" @@ -636,7 +626,10 @@ def add_tag(self, f'conflicts with the saveframe name {self._name}.') self._tags.append(new_tag) - def add_tags(self, tag_list: list, update: bool = False) -> None: + def add_tags(self, tag_list: list, + update: bool = False, + convert_data_types: bool = False, + schema: Schema = None) -> None: """Adds multiple tags to the list. Input should be a list of tuples that are either [key, value] or [key]. In the latter case the value will be set to ".". Set update to true to update a @@ -644,9 +637,9 @@ def add_tags(self, tag_list: list, update: bool = False) -> None: for tag_pair in tag_list: if len(tag_pair) == 2: - self.add_tag(tag_pair[0], tag_pair[1], update=update) + self.add_tag(tag_pair[0], tag_pair[1], update=update, convert_data_types=convert_data_types, schema=schema) elif len(tag_pair) == 1: - self.add_tag(tag_pair[0], ".", update=update) + self.add_tag(tag_pair[0], ".", update=update, convert_data_types=convert_data_types, schema=schema) else: raise ValueError(f"You provided an invalid tag/value to add: '{tag_pair}'.") diff --git a/pynmrstar/unit_tests/__init__.py b/pynmrstar/unit_tests/__init__.py index 72df212..1bc1dec 100644 --- a/pynmrstar/unit_tests/__init__.py +++ b/pynmrstar/unit_tests/__init__.py @@ -1,1001 +1,16 @@ #!/usr/bin/env python3 -import json import logging -import os -import random -import tempfile import unittest -from copy import deepcopy as copy -from decimal import Decimal -from pathlib import Path -from pynmrstar import utils, definitions, Saveframe, Entry, Schema, Loop, _Parser -from pynmrstar._internal import _interpret_file -from pynmrstar.exceptions import ParsingError - -logging.getLogger('pynmrstar').setLevel(logging.ERROR) - -our_path = os.path.dirname(os.path.realpath(__file__)) -database_entry = Entry.from_database(15000) -sample_file_location = os.path.join(our_path, "sample_files", "bmr15000_3.str") -sample_saveframe_location = os.path.join(our_path, "sample_files", "saveframe.txt") -sample_loop_location = os.path.join(our_path, "sample_files", "loop.txt") -file_entry = Entry.from_file(sample_file_location) - - -class TestPyNMRSTAR(unittest.TestCase): - - def setUp(self): - self.file_entry = copy(file_entry) - self.maxDiff = None - - def test_clean_val(self): - # Check tag cleaning - self.assertEqual(utils.quote_value("single quote test"), "'single quote test'") - self.assertEqual(utils.quote_value("double quote' test"), '"double quote\' test"') - self.assertEqual(utils.quote_value("loop_"), "'loop_'") - self.assertEqual(utils.quote_value("#comment"), "'#comment'") - self.assertEqual(utils.quote_value("_tag"), 
"'_tag'") - self.assertEqual(utils.quote_value("simple"), "simple") - self.assertEqual(utils.quote_value(" "), "' '") - self.assertEqual(utils.quote_value("\nnewline\n"), "\nnewline\n") - self.assertEqual(utils.quote_value(None), ".") - self.assertRaises(ValueError, utils.quote_value, "") - - definitions.STR_CONVERSION_DICT = {"loop_": "noloop_"} - utils.quote_value.cache_clear() - self.assertEqual(utils.quote_value("loop_"), "noloop_") - definitions.STR_CONVERSION_DICT = {None: "."} - - def test_odd_strings(self): - """ Make sure the library can handle odd strings. """ - - # Don't run the naughty strings test in GitHub, since it won't - # recursively checkout the "naughty strings" module on platforms - # other than linux. - if "GITHUB_WORKFLOW" in os.environ: - return - - saveframe = Saveframe.from_scratch('test', 'citations') - with open(os.path.join(our_path, 'naughty-strings/blns.json')) as odd_string_file: - odd_strings = json.load(odd_string_file) - for x, string in enumerate(odd_strings): - if string == '': - continue - saveframe.add_tag(str(x), string) - - self.assertEqual(saveframe, Saveframe.from_string(str(saveframe))) - - def test_edge_cases(self): - """ Make sure that the various types of edge cases are properly handled. """ - - Entry.from_file(os.path.join(our_path, 'sample_files', 'edge_cases.str')) - Entry.from_file(os.path.join(our_path, 'sample_files', 'dos.str')) - Entry.from_file(os.path.join(our_path, 'sample_files', 'nonewlines.str')) - Entry.from_file(os.path.join(our_path, 'sample_files', 'onlynewlines.str')) - - def test__format_category(self): - self.assertEqual(utils.format_category("test"), "_test") - self.assertEqual(utils.format_category("_test"), "_test") - self.assertEqual(utils.format_category("test.test"), "_test") - - def test__format_tag(self): - self.assertEqual(utils.format_tag("test"), "test") - self.assertEqual(utils.format_tag("_test.test"), "test") - self.assertEqual(utils.format_tag("test.test"), "test") - - def test__InterpretFile(self): - with open(sample_file_location, "r") as local_file: - local_version = local_file.read() - - # Test reading file from local locations - self.assertEqual(_interpret_file(sample_file_location).read(), local_version) - with open(sample_file_location, "rb") as tmp: - self.assertEqual(_interpret_file(tmp).read(), local_version) - with open(os.path.join(our_path, "sample_files", "bmr15000_3.str.gz"), "rb") as tmp: - self.assertEqual(_interpret_file(tmp).read(), local_version) - - # Test reading from http (ftp doesn't work on TravisCI) - entry_url = 'https://bmrb.io/ftp/pub/bmrb/entry_directories/bmr15000/bmr15000_3.str' - self.assertEqual(Entry.from_string(_interpret_file(entry_url).read()), database_entry) - - # Test reading from https locations - raw_api_url = "https://api.bmrb.io/v2/entry/15000?format=rawnmrstar" - self.assertEqual(Entry.from_string(_interpret_file(raw_api_url).read()), database_entry) - - # Test the parser - def test___Parser(self): - - # Check for error when reserved token present in data value - self.assertRaises(ParsingError, Entry.from_string, "data_1\nsave_1\n_tag.example loop_\nsave_\n") - self.assertRaises(ParsingError, Entry.from_string, "data_1\nsave_1\n_tag.example data_\nsave_\n") - self.assertRaises(ParsingError, Entry.from_string, "data_1\nsave_1\n_tag.example save_\nsave_\n") - self.assertRaises(ParsingError, Entry.from_string, "data_1\nsave_1\nloop_\n_tag.tag\nloop_\nstop_\nsave_\n") - self.assertRaises(ParsingError, Entry.from_string, 
"data_1\nsave_1\nloop_\n_tag.tag\nsave_\nstop_\nsave_\n") - self.assertRaises(ParsingError, Entry.from_string, "data_1\nsave_1\nloop_\n_tag.tag\nglobal_\nstop_\nsave_\n") - - # Check for error when reserved token quoted - self.assertRaises(ParsingError, Entry.from_string, "'data_1'\nsave_1\nloop_\n_tag.tag\ndata_\nstop_\nsave_\n") - self.assertRaises(ParsingError, Entry.from_string, "data_1\n'save_1'\nloop_\n_tag.tag\ndata_\nstop_\nsave_\n") - self.assertRaises(ParsingError, Entry.from_string, 'data_1\nsave_1\n"loop"_\n_tag.tag\ndata_\nstop_\nsave_\n') - self.assertRaises(ParsingError, Entry.from_string, - "data_1\nsave_1\nloop_\n_tag.tag\ndata_\n;\nstop_\n;\nsave_\n") - self.assertRaises(ParsingError, Saveframe.from_string, "save_1\n_tag.1 _tag.2") - - def test_Schema(self): - default = Schema() - - self.assertEqual(default.headers, - ['Dictionary sequence', 'SFCategory', 'ADIT category mandatory', 'ADIT category view type', - 'ADIT super category ID', 'ADIT super category', 'ADIT category group ID', - 'ADIT category view name', 'Tag', 'BMRB current', 'Query prompt', 'Query interface', - 'SG Mandatory', '', 'ADIT exists', 'User full view', 'Metabolomics', 'Metabolites', 'SENCI', - 'Fragment library', 'Item enumerated', 'Item enumeration closed', 'Enum parent SFcategory', - 'Enum parent tag', 'Derived enumeration mantable', 'Derived enumeration', - 'ADIT item view name', 'Data Type', 'Nullable', 'Non-public', 'ManDBTableName', - 'ManDBColumnName', 'Row Index Key', 'Saveframe ID tag', 'Source Key', 'Table Primary Key', - 'Foreign Key Group', 'Foreign Table', 'Foreign Column', 'Secondary index', 'Sub category', - 'Units', 'Loopflag', 'Seq', 'Adit initial rows', 'Enumeration ties', - 'Mandatory code overides', 'Overide value', 'Overide view value', 'ADIT auto insert', - 'Example', 'Prompt', 'Interface', 'bmrbPdbMatchID', 'bmrbPdbTransFunc', 'STAR flag', - 'DB flag', 'SfNamelFlg', 'Sf category flag', 'Sf pointer', 'Natural primary key', - 'Natural foreign key', 'Redundant keys', 'Parent tag', 'public', 'internal', 'small molecule', - 'small molecule', 'metabolomics', 'Entry completeness', 'Overide public', 'internal', - 'small molecule', 'small molecule', 'metabolomic', 'metabolomic', 'default value', - 'Adit form code', 'Tag category', 'Tag field', 'Local key', 'Datum count flag', - 'NEF equivalent', 'mmCIF equivalent', 'Meta data', 'Tag delete', 'BMRB data type', - 'STAR vs Curated DB', 'Key group', 'Reference table', 'Reference column', - 'Dictionary description', 'variableTypeMatch', 'entryIdFlg', 'outputMapExistsFlg', - 'lclSfIdFlg', 'Met ADIT category view name', 'Met Example', 'Met Prompt', 'Met Description', - 'SM Struct ADIT-NMR category view name', 'SM Struct Example', 'SM Struct Prompt', - 'SM Struct Description', 'Met default value', 'SM default value']) - - self.assertEqual(default.val_type("_Entity.ID", 1), []) - self.assertEqual(default.val_type("_Entity.ID", "test"), [ - "Value does not match specification: '_Entity.ID':'test'.\n Type specified: int\n " - "Regular expression for type: '^(?:-?[0-9]*)?$'"]) - self.assertEqual(default.val_type("_Atom_chem_shift.Val", float(1.2)), []) - self.assertEqual(default.val_type("_Atom_chem_shift.Val", "invalid"), [ - "Value does not match specification: '_Atom_chem_shift.Val':'invalid'.\n Type " - "specified: float\n Regular expression for type: '^(?:-?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?)?$'"]) - - self.assertEqual(default.val_type("_Entry.ID", "this should be far too long - much too long"), [ - "Length of '43' is too long for 'CHAR(12)': 
-
-    def test_entry_delitem(self):
-        tmp_entry = copy(self.file_entry)
-        tmp_entry.frame_list.pop(0)
-        del self.file_entry[0]
-        self.assertEqual(self.file_entry, tmp_entry)
-
-    def test_duplicate_saveframe_errors(self):
-        tmp_entry = copy(self.file_entry)
-        self.assertRaises(ValueError, tmp_entry.add_saveframe, tmp_entry[0])
-        tmp_entry.frame_list.append(tmp_entry[0])
-        self.assertRaises(ValueError, tmp_entry.__getattribute__, 'frame_dict')
-
-    def test_entry_eq(self):
-        # Normalize them both first
-        db_copy = copy(database_entry)
-        db_copy.normalize()
-        self.file_entry.normalize()
-        self.assertEqual(self.file_entry, db_copy)
-
-    def test_getitem(self):
-        self.assertEqual(self.file_entry['entry_information'],
-                         self.file_entry.get_saveframe_by_name("entry_information"))
-        self.assertEqual(self.file_entry[0], self.file_entry.get_saveframe_by_name("entry_information"))
-
-    def test_init(self):
-        # Make sure the correct errors are raised
-        self.assertRaises(ValueError, Entry)
-        self.assertRaises(ParsingError, Entry, the_string="test", entry_num="test")
-        # Make sure string parsing is correct
-        self.assertEqual(self.file_entry, Entry.from_string(str(self.file_entry)))
-        self.assertEqual(str(self.file_entry), str(Entry.from_string(str(self.file_entry))))
-        self.assertRaises(IOError, Entry.from_database, 0)
-
-        self.assertEqual(str(Entry.from_scratch(15000)), "data_15000\n\n")
-        self.assertEqual(Entry.from_file(os.path.join(our_path, "sample_files", "bmr15000_3.str.gz")), self.file_entry)
-
-    def test_from_file_path_support(self):
-        """Test that from_file methods support pathlib.Path objects."""
-
-        # Test Entry.from_file with Path object
-        path_obj = Path(sample_file_location)
-        entry_from_path = Entry.from_file(path_obj)
-        self.assertEqual(entry_from_path, self.file_entry)
-
-        # Test Saveframe.from_file with Path object
-        saveframe_from_path = Saveframe.from_file(Path(sample_saveframe_location))
-        self.assertEqual(saveframe_from_path, self.file_entry[0])
-
-        # Test Loop.from_file with Path object
-        loop_from_str = Loop.from_file(sample_loop_location)
-        loop_from_path = Loop.from_file(Path(sample_loop_location))
-        self.assertEqual(loop_from_str, loop_from_path)
-        self.assertEqual(loop_from_path.category, '_Test')
-        self.assertEqual(loop_from_path.tags, ['ID', 'Name'])
-        self.assertEqual(loop_from_path.data, [['1', 'First'], ['2', 'Second']])
-
-
-    def test___setitem(self):
-        tmp_entry = copy(self.file_entry)
-        tmp_entry[0] = tmp_entry.get_saveframe_by_name('entry_information')
-        self.assertEqual(tmp_entry, self.file_entry)
-        tmp_entry['entry_information'] = tmp_entry.get_saveframe_by_name('entry_information')
-        self.assertEqual(tmp_entry, self.file_entry)
-
-        self.assertRaises(ValueError, tmp_entry.__setitem__, 'entry_informations',
-                          tmp_entry.get_saveframe_by_name('entry_information'))
-        self.assertRaises(ValueError, tmp_entry.__setitem__, 'entry_information', 1)
-
-    def test_compare(self):
-        self.assertEqual(self.file_entry.compare(str(self.file_entry)), [])
-        self.assertEqual(self.file_entry.compare(self.file_entry), [])
-
-        mutated = copy(self.file_entry)
-        mutated.frame_list.pop()
-        self.assertEqual(self.file_entry.compare(mutated),
-                         ["The number of saveframes in the entries are not equal: '25' vs '24'.",
-                          "No saveframe with name 'assigned_chem_shift_list_1' in other entry."])
-
-    def test_getmethods(self):
-        self.assertEqual(5, len(self.file_entry.get_loops_by_category("_Vendor")))
-        self.assertEqual(5, len(self.file_entry.get_loops_by_category("vendor")))
-
-        self.assertEqual(self.file_entry.get_saveframe_by_name('assigned_chem_shift_list_1'), self.file_entry[-1])
-        self.assertRaises(KeyError, self.file_entry.get_saveframe_by_name, 'no such saveframe')
-
-        self.assertEqual(len(self.file_entry.get_saveframes_by_category("NMR_spectrometer")), 6)
-        self.assertEqual(len(self.file_entry.get_saveframes_by_category("nmr_SPectrometer")), 0)
-        self.assertEqual(self.file_entry.get_saveframes_by_category('no such category'), [])
-
-        self.assertEqual(self.file_entry.get_saveframes_by_tag_and_value('Submission_date', '2006-09-07'),
-                         [self.file_entry[0]])
-        self.assertEqual(self.file_entry.get_saveframes_by_tag_and_value('submission_Date', '2006-09-07'),
-                         [self.file_entry[0]])
-        self.assertEqual(self.file_entry.get_saveframes_by_tag_and_value('test.submission_date', '2006-09-07'), [])
-
-        self.assertRaises(ValueError, self.file_entry.get_tag, 'bad_tag')
-        self.assertEqual(self.file_entry.get_tag("entry.Submission_date"), ['2006-09-07'])
-        self.assertEqual(self.file_entry.get_tag("entry.Submission_date", whole_tag=True),
-                         [[u'Submission_date', u'2006-09-07']])
-
-    def test_validate(self):
-        validation = []
-        self.assertEqual(self.file_entry.validate(), [])
-        self.file_entry[-1][-1][0][0] = 'a'
-        validation.append(
-            "Value does not match specification: '_Atom_chem_shift.ID':'a'.\n "
-            "Type specified: int\n Regular expression for type: '^(?:-?[0-9]*)?$'")
-        self.assertEqual(self.file_entry.validate(), validation)
-        self.file_entry[-1][-1][0][0] = '1'
-
-    def test_saveframe(self):
-        frame = self.file_entry[0]
-
-        # Check __delitem__
-        frame.remove_tag('DEtails')
-        self.assertEqual([[x[0], x[1]] for x in frame.tags],
-                         [['Sf_category', 'entry_information'],
-                          ['Sf_framecode', 'entry_information'],
-                          ['ID', '15000'],
-                          ['Title',
-                           'Solution structure of chicken villin headpiece subdomain containing a '
-                           'fluorinated side chain in the core\n'],
-                          ['Type', 'macromolecule'],
-                          ['Version_type', 'original'],
-                          ['Submission_date', '2006-09-07'],
-                          ['Accession_date', '2006-09-07'],
-                          ['Last_release_date', '2006-09-07'],
-                          ['Original_release_date', '2006-09-07'],
-                          ['Origination', 'author'],
-                          ['Format_name', '.'],
-                          ['NMR_STAR_version', '3.2.6.0'],
-                          ['NMR_STAR_dict_location', '.'],
-                          ['Original_NMR_STAR_version', '3.2.6.0'],
-                          ['Experimental_method', 'NMR'],
-                          ['Experimental_method_subtype', 'solution'],
-                          ['Source_data_format', '.'],
-                          ['Source_data_format_version', '.'],
-                          ['Generated_software_name', '.'],
-                          ['Generated_software_version', '.'],
-                          ['Generated_software_ID', '.'],
-                          ['Generated_software_label', '.'],
-                          ['Generated_date', '.'],
-                          ['DOI', '.'],
-                          ['UUID', '.'],
-                          ['Related_coordinate_file_name', '.'],
-                          ['BMRB_internal_directory_name', '.']])
-        self.assertEqual(len(frame), 7)
-        del frame[0]
-        self.assertEqual(len(frame), 6)
-        del frame[frame.get_loop('RElease')]
-        self.assertEqual(len(frame), 5)
-        self.assertRaises(KeyError, frame.get_loop, 'RElease')
-
-        # Check __getitem__
-        self.assertEqual(frame.get_tag('NMR_STAR_version'), ['3.2.6.0'])
-        self.assertEqual(frame[0], frame.loops[0])
-        self.assertEqual(frame.get_loop('_SG_project'), frame.loops[0])
-
-        # Check __lt__
-        self.assertEqual(frame[-3] > frame[-1], False)
-
-        # Check __init__
-        self.assertRaises(ValueError, Saveframe)
-        self.assertEqual(Saveframe.from_string(str(frame)), frame)
-        self.assertEqual(str(Saveframe.from_scratch("test", tag_prefix="test")), "save_test\n\nsave_\n")
-        tmp = copy(frame)
-        tmp._loops = []
-        self.assertEqual(Saveframe.from_string(frame.get_data_as_csv(frame), csv=True).compare(tmp), [])
-        self.assertRaises(ValueError, Saveframe.from_string, "test.1,test.2\n2,3,4", csv=True)
-
-        # Check __repr__
-        self.assertEqual(repr(frame), "<pynmrstar.Saveframe 'entry_information'>")
-
-        # Check __setitem__
-        frame['test'] = 1
-        self.assertEqual(frame.tags[-1][1], 1)
-        frame['tESt'] = 2
-        self.assertEqual(frame.tags[-1][1], 2)
-        frame[4] = frame[3]
-        self.assertEqual(frame.loops[3], frame.loops[4])
-
-        # Check add_loop
-        self.assertRaises(ValueError, frame.add_loop, frame.loops[0])
-
-        # Check add_tag
-        self.assertRaises(ValueError, frame.add_tag, "test", 1)
-        self.assertRaises(ValueError, frame.add_tag, "invalid test", 1)
-        self.assertRaises(ValueError, frame.add_tag, "invalid.test.test", 1)
-        self.assertRaises(ValueError, frame.add_tag, "invalid.test", 1, update=True)
-        frame.add_tag("test", 3, update=True)
-        self.assertEqual(frame.get_tag('test'), [3])
-
-        # Check add_tags
-        frame.add_tags([['example1'], ['example2']])
-        self.assertEqual(frame.tags[-2], ['example1', "."])
-        frame.add_tags([['example1', 5], ['example2']], update=True)
-        self.assertEqual(frame.tags[-2], ['example1', 5])
-
-        # Check compare
-        self.assertEqual(frame.compare(frame), [])
-        self.assertEqual(frame.compare(self.file_entry[1]),
-                         ["\tSaveframe names do not match: 'entry_information' vs 'citation_1'."])
-        tmp = copy(frame)
-        tmp.tag_prefix = "test"
-        self.assertEqual(frame.compare(tmp), ["\tTag prefix does not match: '_Entry' vs 'test'."])
-        tmp = copy(frame)
-        tmp.tags[0][0] = "broken"
-        self.assertEqual(frame.compare(tmp), ["\tNo tag with name '_Entry.Sf_category' in compared entry."])
-
-        # Test remove_tag
-        self.assertRaises(KeyError, frame.remove_tag, "this_tag_will_not_exist")
-        frame.remove_tag("test")
-        self.assertEqual(frame.get_tag("test"), [])
-
-        # Test get_data_as_csv
-        self.assertEqual(frame.get_data_as_csv(),
-                         '''_Entry.Sf_category,_Entry.Sf_framecode,_Entry.ID,_Entry.Title,_Entry.Type,_Entry.Version_type,_Entry.Submission_date,_Entry.Accession_date,_Entry.Last_release_date,_Entry.Original_release_date,_Entry.Origination,_Entry.Format_name,_Entry.NMR_STAR_version,_Entry.NMR_STAR_dict_location,_Entry.Original_NMR_STAR_version,_Entry.Experimental_method,_Entry.Experimental_method_subtype,_Entry.Source_data_format,_Entry.Source_data_format_version,_Entry.Generated_software_name,_Entry.Generated_software_version,_Entry.Generated_software_ID,_Entry.Generated_software_label,_Entry.Generated_date,_Entry.DOI,_Entry.UUID,_Entry.Related_coordinate_file_name,_Entry.BMRB_internal_directory_name,_Entry.example1,_Entry.example2
-entry_information,entry_information,15000,"Solution structure of chicken villin headpiece subdomain containing a fluorinated side chain in the core
-",macromolecule,original,2006-09-07,2006-09-07,2006-09-07,2006-09-07,author,.,3.2.6.0,.,3.2.6.0,NMR,solution,.,.,.,.,.,.,.,.,.,.,.,5,.
-''')
-        self.assertEqual(frame.get_data_as_csv(show_category=False),
-                         '''Sf_category,Sf_framecode,ID,Title,Type,Version_type,Submission_date,Accession_date,Last_release_date,Original_release_date,Origination,Format_name,NMR_STAR_version,NMR_STAR_dict_location,Original_NMR_STAR_version,Experimental_method,Experimental_method_subtype,Source_data_format,Source_data_format_version,Generated_software_name,Generated_software_version,Generated_software_ID,Generated_software_label,Generated_date,DOI,UUID,Related_coordinate_file_name,BMRB_internal_directory_name,example1,example2
-entry_information,entry_information,15000,"Solution structure of chicken villin headpiece subdomain containing a fluorinated side chain in the core
-",macromolecule,original,2006-09-07,2006-09-07,2006-09-07,2006-09-07,author,.,3.2.6.0,.,3.2.6.0,NMR,solution,.,.,.,.,.,.,.,.,.,.,.,5,.
-''')
-        self.assertEqual(frame.get_data_as_csv(header=False),
-                         '''entry_information,entry_information,15000,"Solution structure of chicken villin headpiece subdomain containing a fluorinated side chain in the core
-",macromolecule,original,2006-09-07,2006-09-07,2006-09-07,2006-09-07,author,.,3.2.6.0,.,3.2.6.0,NMR,solution,.,.,.,.,.,.,.,.,.,.,.,5,.
-''')
-        self.assertEqual(frame.get_data_as_csv(show_category=False, header=False),
-                         '''entry_information,entry_information,15000,"Solution structure of chicken villin headpiece subdomain containing a fluorinated side chain in the core
-",macromolecule,original,2006-09-07,2006-09-07,2006-09-07,2006-09-07,author,.,3.2.6.0,.,3.2.6.0,NMR,solution,.,.,.,.,.,.,.,.,.,.,.,5,.
-''')
-
-        # Test get_loop
-        self.assertEqual(repr(frame.get_loop("_SG_projecT")), "<pynmrstar.Loop '_SG_project'>")
-        self.assertRaises(KeyError, frame.get_loop, 'this_loop_wont_be_found')
-
-        # Test get_tag - this is really already tested in the other tests here
-        self.assertEqual(frame.get_tag("sf_category"), ['entry_information'])
-        self.assertEqual(frame.get_tag("entry.sf_category"), ['entry_information'])
-        self.assertEqual(frame.get_tag("entry.sf_category", whole_tag=True), [['Sf_category', 'entry_information']])
-
-        # Test sort
-        self.assertEqual([[x[0], x[1]] for x in frame.tags], [['Sf_category', 'entry_information'],
-                                                              ['Sf_framecode', 'entry_information'],
-                                                              ['ID', '15000'],
-                                                              ['Title',
-                                                               'Solution structure of chicken villin headpiece subdomain containing a '
-                                                               'fluorinated side chain in the core\n'],
-                                                              ['Type', 'macromolecule'],
-                                                              ['Version_type', 'original'],
-                                                              ['Submission_date', '2006-09-07'],
-                                                              ['Accession_date', '2006-09-07'],
-                                                              ['Last_release_date', '2006-09-07'],
-                                                              ['Original_release_date', '2006-09-07'],
-                                                              ['Origination', 'author'],
-                                                              ['Format_name', '.'],
-                                                              ['NMR_STAR_version', '3.2.6.0'],
-                                                              ['NMR_STAR_dict_location', '.'],
-                                                              ['Original_NMR_STAR_version', '3.2.6.0'],
-                                                              ['Experimental_method', 'NMR'],
-                                                              ['Experimental_method_subtype', 'solution'],
-                                                              ['Source_data_format', '.'],
-                                                              ['Source_data_format_version', '.'],
-                                                              ['Generated_software_name', '.'],
-                                                              ['Generated_software_version', '.'],
-                                                              ['Generated_software_ID', '.'],
-                                                              ['Generated_software_label', '.'],
-                                                              ['Generated_date', '.'],
-                                                              ['DOI', '.'],
-                                                              ['UUID', '.'],
-                                                              ['Related_coordinate_file_name', '.'],
-                                                              ['BMRB_internal_directory_name', '.'],
-                                                              ['example1', 5],
-                                                              ['example2', '.']])
-
-        frame.remove_tag(['example2', 'example1'])
-        frame.tags.append(frame.tags.pop(0))
-        frame.sort_tags()
-        self.assertEqual([[x[0], x[1]] for x in frame.tags], [['Sf_category', 'entry_information'],
-                                                              ['Sf_framecode', 'entry_information'],
-                                                              ['ID', '15000'],
-                                                              ['Title',
-                                                               'Solution structure of chicken villin headpiece subdomain containing a '
-                                                               'fluorinated side chain in the core\n'],
-                                                              ['Type', 'macromolecule'],
-                                                              ['Version_type', 'original'],
-                                                              ['Submission_date', '2006-09-07'],
-                                                              ['Accession_date', '2006-09-07'],
-                                                              ['Last_release_date', '2006-09-07'],
-                                                              ['Original_release_date', '2006-09-07'],
-                                                              ['Origination', 'author'],
-                                                              ['Format_name', '.'],
-                                                              ['NMR_STAR_version', '3.2.6.0'],
-                                                              ['NMR_STAR_dict_location', '.'],
-                                                              ['Original_NMR_STAR_version', '3.2.6.0'],
-                                                              ['Experimental_method', 'NMR'],
-                                                              ['Experimental_method_subtype', 'solution'],
-                                                              ['Source_data_format', '.'],
-                                                              ['Source_data_format_version', '.'],
-                                                              ['Generated_software_name', '.'],
-                                                              ['Generated_software_version', '.'],
-                                                              ['Generated_software_ID', '.'],
-                                                              ['Generated_software_label', '.'],
-                                                              ['Generated_date', '.'],
-                                                              ['DOI', '.'],
-                                                              ['UUID', '.'],
-                                                              ['Related_coordinate_file_name', '.'],
-                                                              ['BMRB_internal_directory_name', '.']])
-
-        # Test validate
-        self.assertEqual(self.file_entry['assigned_chem_shift_list_1'].validate(), [])
-
-        # Test set_tag_prefix
-        frame.set_tag_prefix("new_prefix")
-        self.assertEqual(frame.tag_prefix, "_new_prefix")
-
-    def test_Saveframe_add_tag(self):
-        """ Test the add_tag functionality of a saveframe. """
-
-        # Test that you cannot set the framecode to a null value
-        test_sf = Saveframe.from_scratch('test')
-
-        # Test that the initial setter can't set a null value
-        with self.assertRaises(ValueError):
-            test_sf.add_tag('sf_framecode', None)
-        test_sf.add_tag('sf_framecode', 'test')
-
-        # Test that updating both via add_tag(update=True) and .name= don't
-        # allow for setting a null value
-        for val in definitions.NULL_VALUES:
-            with self.assertRaises(ValueError):
-                test_sf.add_tag('sf_framecode', val)
-            with self.assertRaises(ValueError):
-                test_sf.name = val
-
-        # Test that adding an sf_framecode with a different value than the
-        # saveframe name throws an exception
-        with self.assertRaises(ValueError):
-            test_sf_two = Saveframe.from_scratch('test')
-            test_sf_two.add_tag('sf_framecode', 'different')
-
-    def test_Entry___setitem__(self):
-        """ Test the setting a tag functionality of an entry. """
-
-        test_entry = Entry.from_scratch('test')
-        test_saveframe = Saveframe.from_scratch('test', 'test')
-        test_entry._frame_list = [test_saveframe, test_saveframe]
-        with self.assertRaises(ValueError):
-            test_entry['test'] = test_saveframe
-
-    def test_category_list(self):
-        """ Test the category list property.
""" - - tmp = copy(self.file_entry) - self.assertEqual(tmp.category_list, ['entry_information', 'citations', 'assembly', 'entity', 'natural_source', - 'experimental_source', 'chem_comp', 'sample', 'sample_conditions', - 'software', 'NMR_spectrometer', 'NMR_spectrometer_list', 'experiment_list', - 'chem_shift_reference', 'assigned_chemical_shifts']) - tmp.add_saveframe(Saveframe.from_scratch("test", None)) - self.assertEqual(tmp.category_list, ['entry_information', 'citations', 'assembly', 'entity', 'natural_source', - 'experimental_source', 'chem_comp', 'sample', 'sample_conditions', - 'software', 'NMR_spectrometer', 'NMR_spectrometer_list', 'experiment_list', - 'chem_shift_reference', 'assigned_chemical_shifts']) - - def test_loop_parsing(self): - with self.assertRaises(ParsingError): - Loop.from_string("loop_ _test.one _test.two 1 loop_") - with self.assertRaises(ParsingError): - Loop.from_string("loop_ _test.one _test.two 1 stop_") - with self.assertRaises(ParsingError): - Loop.from_string("loop_ _test.one _test.two 1 2 3 stop_") - with self.assertRaises(ParsingError): - Loop.from_string("loop_ _test.one _test.two 1 2 3") - with self.assertRaises(ParsingError): - Loop.from_string("loop_ _test.one _test.two 1 save_ stop_") - - def test_loop(self): - test_loop = self.file_entry[0][0] - - # Check filter - self.assertEqual(test_loop.filter(['_Entry_author.Ordinal', '_Entry_author.Middle_initials']), - Loop.from_string( - "loop_ _Entry_author.Ordinal _Entry_author.Middle_initials 1 C. 2 . 3 B. 4 H. 5 L. stop_")) - # Check eq - self.assertEqual(test_loop == self.file_entry[0][0], True) - self.assertEqual(test_loop != self.file_entry[0][1], True) - # Check __getitem__ - self.assertEqual(test_loop['_Entry_author.Ordinal'], ['1', '2', '3', '4', '5']) - self.assertEqual(test_loop[['_Entry_author.Ordinal', '_Entry_author.Middle_initials']], - [['1', 'C.'], ['2', '.'], ['3', 'B.'], ['4', 'H.'], ['5', 'L.']]) - # Test __setitem__ - test_loop['_Entry_author.Ordinal'] = [1] * 5 - self.assertEqual(test_loop['_Entry_author.Ordinal'], [1, 1, 1, 1, 1]) - test_loop['_Entry_author.Ordinal'] = ['1', '2', '3', '4', '5'] - self.assertRaises(ValueError, test_loop.__setitem__, '_Entry_author.Ordinal', [1]) - self.assertRaises(ValueError, test_loop.__setitem__, '_Wrong_loop.Ordinal', [1, 2, 3, 4, 5]) - # Check __init__ - self.assertRaises(ValueError, Loop) - test = Loop.from_scratch(category="test") - self.assertEqual(test.category, "_test") - self.assertEqual(Loop.from_string(str(test_loop)), test_loop) - self.assertEqual(test_loop, Loop.from_string(test_loop.get_data_as_csv(), csv=True)) - # Check len - self.assertEqual(len(test_loop), len(test_loop.data)) - # Check lt - self.assertEqual(test_loop < self.file_entry[0][1], True) - # Check __str__ - self.assertEqual(Loop.from_scratch().format(skip_empty_loops=False), "\n loop_\n\n stop_\n") - self.assertEqual(Loop.from_scratch().format(skip_empty_loops=True), "") - tmp_loop = Loop.from_scratch() - tmp_loop.data = [[1, 2, 3]] - self.assertRaises(ValueError, tmp_loop.__str__) - tmp_loop.add_tag("tag1") - self.assertRaises(ValueError, tmp_loop.__str__) - tmp_loop.add_tag("tag2") - tmp_loop.add_tag("tag3") - self.assertRaises(ValueError, tmp_loop.__str__) - tmp_loop.set_category("test") - self.assertEqual(str(tmp_loop), "\n loop_\n _test.tag1\n _test.tag2\n _test.tag3\n\n " - "1 2 3 \n\n stop_\n") - self.assertEqual(tmp_loop.category, "_test") - # Check different category - self.assertRaises(ValueError, tmp_loop.add_tag, "invalid.tag") - # Check duplicate tag - 
self.assertRaises(ValueError, tmp_loop.add_tag, "test.tag3") - self.assertEqual(tmp_loop.add_tag("test.tag3", ignore_duplicates=True), None) - # Check space and period in tag - self.assertRaises(ValueError, tmp_loop.add_tag, "test. tag") - self.assertRaises(ValueError, tmp_loop.add_tag, "test.tag.test") - - # Check add_data - self.assertRaises(ValueError, tmp_loop.add_data, [1, 2, 3, 4]) - tmp_loop.add_data([4, 5, 6]) - tmp_loop.add_data([7, 8, 9]) - self.assertEqual(tmp_loop.data, [[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - - # Test delete_data_by_tag_value - self.assertEqual(tmp_loop.remove_data_by_tag_value("tag1", 1, index_tag=0), [[1, 2, 3]]) - self.assertRaises(ValueError, tmp_loop.remove_data_by_tag_value, "tag4", "data") - self.assertEqual(tmp_loop.data, [[1, 5, 6], [2, 8, 9]]) - - # Test get_data_as_csv() - self.assertEqual(tmp_loop.get_data_as_csv(), "_test.tag1,_test.tag2,_test.tag3\n1,5,6\n2,8,9\n") - self.assertEqual(tmp_loop.get_data_as_csv(show_category=False), "tag1,tag2,tag3\n1,5,6\n2,8,9\n") - self.assertEqual(tmp_loop.get_data_as_csv(header=False), "1,5,6\n2,8,9\n") - self.assertEqual(tmp_loop.get_data_as_csv(show_category=False, header=False), "1,5,6\n2,8,9\n") - - # Test get_tag - self.assertRaises(ValueError, tmp_loop.get_tag, "invalid.tag1") - self.assertEqual(tmp_loop.get_tag("tag1"), [1, 2]) - self.assertEqual(tmp_loop.get_tag(["tag1", "tag2"]), [[1, 5], [2, 8]]) - self.assertEqual(tmp_loop.get_tag("tag1", whole_tag=True), [['_test.tag1', 1], ['_test.tag1', 2]]) - - self.assertEqual( - test_loop.get_tag(['_Entry_author.Ordinal', '_Entry_author.Middle_initials'], dict_result=True), - [{'_Entry_author.Middle_initials': 'C.', '_Entry_author.Ordinal': '1'}, - {'_Entry_author.Middle_initials': '.', '_Entry_author.Ordinal': '2'}, - {'_Entry_author.Middle_initials': 'B.', '_Entry_author.Ordinal': '3'}, - {'_Entry_author.Middle_initials': 'H.', '_Entry_author.Ordinal': '4'}, - {'_Entry_author.Middle_initials': 'L.', '_Entry_author.Ordinal': '5'}]) - - self.assertEqual( - test_loop.get_tag(['ORdinal', 'MIddle_initials'], dict_result=True), - [{'MIddle_initials': 'C.', 'ORdinal': '1'}, - {'MIddle_initials': '.', 'ORdinal': '2'}, - {'MIddle_initials': 'B.', 'ORdinal': '3'}, - {'MIddle_initials': 'H.', 'ORdinal': '4'}, - {'MIddle_initials': 'L.', 'ORdinal': '5'}]) - - self.assertEqual( - test_loop.get_tag(['Ordinal', 'Middle_initials'], dict_result=True, whole_tag=True), - [{'_Entry_author.Middle_initials': 'C.', '_Entry_author.Ordinal': '1'}, - {'_Entry_author.Middle_initials': '.', '_Entry_author.Ordinal': '2'}, - {'_Entry_author.Middle_initials': 'B.', '_Entry_author.Ordinal': '3'}, - {'_Entry_author.Middle_initials': 'H.', '_Entry_author.Ordinal': '4'}, - {'_Entry_author.Middle_initials': 'L.', '_Entry_author.Ordinal': '5'}]) - - self.assertEqual(test_loop.get_tag(['_Entry_author.Ordinal', '_Entry_author.Middle_initials'], dict_result=True, - whole_tag=True), - [{'_Entry_author.Middle_initials': 'C.', '_Entry_author.Ordinal': '1'}, - {'_Entry_author.Middle_initials': '.', '_Entry_author.Ordinal': '2'}, - {'_Entry_author.Middle_initials': 'B.', '_Entry_author.Ordinal': '3'}, - {'_Entry_author.Middle_initials': 'H.', '_Entry_author.Ordinal': '4'}, - {'_Entry_author.Middle_initials': 'L.', '_Entry_author.Ordinal': '5'}]) - - def simple_key(x): - return -int(x[2]) - - # Test sort_rows - tmp_loop.sort_rows(["tag2"], key=simple_key) - self.assertEqual(tmp_loop.data, [[2, 8, 9], [1, 5, 6]]) - tmp_loop.sort_rows(["tag2"]) - self.assertEqual(tmp_loop.data, [[1, 5, 6], [2, 8, 9]]) - - # 
-        tmp_loop.clear_data()
-        self.assertEqual(tmp_loop.data, [])
-
-        # Test that the from_template method works
-        self.assertEqual(Loop.from_template("atom_chem_shift", all_tags=False),
-                         Loop.from_string("""
-loop_
- _Atom_chem_shift.ID
- _Atom_chem_shift.Assembly_atom_ID
- _Atom_chem_shift.Entity_assembly_ID
- _Atom_chem_shift.Entity_assembly_asym_ID
- _Atom_chem_shift.Entity_ID
- _Atom_chem_shift.Comp_index_ID
- _Atom_chem_shift.Seq_ID
- _Atom_chem_shift.Comp_ID
- _Atom_chem_shift.Atom_ID
- _Atom_chem_shift.Atom_type
- _Atom_chem_shift.Atom_isotope_number
- _Atom_chem_shift.Val
- _Atom_chem_shift.Val_err
- _Atom_chem_shift.Assign_fig_of_merit
- _Atom_chem_shift.Ambiguity_code
- _Atom_chem_shift.Ambiguity_set_ID
- _Atom_chem_shift.Occupancy
- _Atom_chem_shift.Resonance_ID
- _Atom_chem_shift.Auth_entity_assembly_ID
- _Atom_chem_shift.Auth_asym_ID
- _Atom_chem_shift.Auth_seq_ID
- _Atom_chem_shift.Auth_comp_ID
- _Atom_chem_shift.Auth_atom_ID
- _Atom_chem_shift.Original_PDB_strand_ID
- _Atom_chem_shift.Original_PDB_residue_no
- _Atom_chem_shift.Original_PDB_residue_name
- _Atom_chem_shift.Original_PDB_atom_name
- _Atom_chem_shift.Details
- _Atom_chem_shift.Entry_ID
- _Atom_chem_shift.Assigned_chem_shift_list_ID
-
-
- stop_
-"""))
-
-        self.assertEqual(Loop.from_template("atom_chem_shift", all_tags=True),
-                         Loop.from_string("""
- loop_
-  _Atom_chem_shift.ID
-  _Atom_chem_shift.Assembly_atom_ID
-  _Atom_chem_shift.Entity_assembly_ID
-  _Atom_chem_shift.Entity_assembly_asym_ID
-  _Atom_chem_shift.Entity_ID
-  _Atom_chem_shift.Comp_index_ID
-  _Atom_chem_shift.Seq_ID
-  _Atom_chem_shift.Comp_ID
-  _Atom_chem_shift.Atom_ID
-  _Atom_chem_shift.Atom_type
-  _Atom_chem_shift.Atom_isotope_number
-  _Atom_chem_shift.Val
-  _Atom_chem_shift.Val_err
-  _Atom_chem_shift.Assign_fig_of_merit
-  _Atom_chem_shift.Ambiguity_code
-  _Atom_chem_shift.Ambiguity_set_ID
-  _Atom_chem_shift.Occupancy
-  _Atom_chem_shift.Resonance_ID
-  _Atom_chem_shift.Auth_entity_assembly_ID
-  _Atom_chem_shift.Auth_asym_ID
-  _Atom_chem_shift.Auth_seq_ID
-  _Atom_chem_shift.Auth_comp_ID
-  _Atom_chem_shift.Auth_atom_ID
-  _Atom_chem_shift.PDB_record_ID
-  _Atom_chem_shift.PDB_model_num
-  _Atom_chem_shift.PDB_strand_ID
-  _Atom_chem_shift.PDB_ins_code
-  _Atom_chem_shift.PDB_residue_no
-  _Atom_chem_shift.PDB_residue_name
-  _Atom_chem_shift.PDB_atom_name
-  _Atom_chem_shift.Original_PDB_strand_ID
-  _Atom_chem_shift.Original_PDB_residue_no
-  _Atom_chem_shift.Original_PDB_residue_name
-  _Atom_chem_shift.Original_PDB_atom_name
-  _Atom_chem_shift.Details
-  _Atom_chem_shift.Sf_ID
-  _Atom_chem_shift.Entry_ID
-  _Atom_chem_shift.Assigned_chem_shift_list_ID
-
-
- stop_
-"""))
-
-        # Test adding a tag to the schema
-        my_schem = Schema()
-        my_schem.add_tag("_Atom_chem_shift.New_Tag", "VARCHAR(100)", True, "assigned_chemical_shifts", True,
-                         "_Atom_chem_shift.Atom_ID")
-        self.assertEqual(Loop.from_template("atom_chem_shift", all_tags=True, schema=my_schem),
-                         Loop.from_string(
-                             "loop_ _Atom_chem_shift.ID _Atom_chem_shift.Assembly_atom_ID "
-                             "_Atom_chem_shift.Entity_assembly_ID _Atom_chem_shift.Entity_ID "
-                             "_Atom_chem_shift.Comp_index_ID _Atom_chem_shift.Seq_ID "
-                             "_Atom_chem_shift.Comp_ID _Atom_chem_shift.Atom_ID _Atom_chem_shift.New_Tag "
-                             "_Atom_chem_shift.Atom_type _Atom_chem_shift.Atom_isotope_number "
-                             "_Atom_chem_shift.Val _Atom_chem_shift.Val_err _Atom_chem_shift.Assign_fig_of_merit "
-                             "_Atom_chem_shift.Ambiguity_code _Atom_chem_shift.Ambiguity_set_ID "
-                             "_Atom_chem_shift.Occupancy _Atom_chem_shift.Resonance_ID "
"_Atom_chem_shift.Auth_entity_assembly_ID _Atom_chem_shift.Auth_asym_ID " - "_Atom_chem_shift.Auth_seq_ID _Atom_chem_shift.Auth_comp_ID " - "_Atom_chem_shift.Auth_atom_ID _Atom_chem_shift.PDB_record_ID " - "_Atom_chem_shift.PDB_model_num _Atom_chem_shift.PDB_strand_ID " - "_Atom_chem_shift.PDB_ins_code _Atom_chem_shift.PDB_residue_no " - "_Atom_chem_shift.PDB_residue_name _Atom_chem_shift.PDB_atom_name " - "_Atom_chem_shift.Original_PDB_strand_ID _Atom_chem_shift.Original_PDB_residue_no " - "_Atom_chem_shift.Original_PDB_residue_name _Atom_chem_shift.Original_PDB_atom_name " - "_Atom_chem_shift.Details _Atom_chem_shift.Sf_ID _Atom_chem_shift.Entry_ID " - "_Atom_chem_shift.Assigned_chem_shift_list_ID stop_ ")) - - # Make sure adding data with a tag works - tmp_loop = Loop.from_string("loop_ _Atom_chem_shift.ID stop_") - tmp_loop.data = [[1]] - tmp_loop.add_tag("Assembly_atom_ID", update_data=True) - self.assertEqual(tmp_loop.data, [[1, None]]) - self.assertEqual(tmp_loop.tags, ["ID", "Assembly_atom_ID"]) - - # Make sure the add missing tags loop is working - tmp_loop = Loop.from_string("loop_ _Atom_chem_shift.ID stop_") - tmp_loop.add_missing_tags() - self.assertEqual(tmp_loop, Loop.from_template("atom_chem_shift")) - - def test_loop_add_data(self): - test1 = Loop.from_scratch('test') - test1.add_tag(['Name', 'Location']) - self.assertRaises(ValueError, test1.add_data, None) - self.assertRaises(ValueError, test1.add_data, []) - self.assertRaises(ValueError, test1.add_data, {}) - self.assertRaises(ValueError, test1.add_data, {'not_present': 1}) - test1.add_data([{'name': 'Jeff', 'location': 'Connecticut'}, {'name': 'Chad', 'location': 'Madison'}]) - - test2 = Loop.from_scratch('test') - test2.add_tag(['Name', 'Location']) - test2.add_data({'name': ['Jeff', 'Chad'], 'location': ['Connecticut', 'Madison']}) - - test3 = Loop.from_scratch('test') - test3.add_tag(['Name', 'Location']) - self.assertRaises(ValueError, test3.add_data, [['Jeff', 'Connecticut'], ['Chad']]) - test3.add_data([['Jeff', 'Connecticut'], ['Chad', 'Madison']]) - - test4 = Loop.from_scratch('test') - test4.add_tag(['Name', 'Location']) - self.assertRaises(ValueError, test4.add_data, ['Jeff', 'Connecticut', 'Chad', 'Madison']) - test4.add_data(['Jeff', 'Connecticut', 'Chad', 'Madison'], rearrange=True) - - self.assertEqual(test1, test2) - self.assertEqual(test2, test3) - self.assertEqual(test3, test4) - - # Now check the 'convert_data_types' argument and the raw data present in the loop - test = Loop.from_scratch('_Atom_chem_shift') - test.add_tag(['Val', 'Entry_ID', 'Details']) - test.add_data([{'details': 'none', 'vAL': '1.2'}, {'val': 5, 'details': '.'}], convert_data_types=True) - self.assertEqual(test.data, [[Decimal('1.2'), None, 'none'], [Decimal(5), None, None]]) - test.clear_data() - test.add_data([{'details': 'none', 'vAL': '1.2'}, {'val': 5, 'details': '.'}]) - self.assertEqual(test.data, [['1.2', None, 'none'], [5, None, '.']]) - - def test_rename_saveframe(self): - tmp = copy(self.file_entry) - tmp.rename_saveframe('F5-Phe-cVHP', 'jons_frame') - tmp.rename_saveframe('jons_frame', 'F5-Phe-cVHP') - self.assertEqual(tmp, self.file_entry) - - def test_duplicate_loop_detection(self): - one = Loop.from_scratch(category="duplicate") - two = Loop.from_scratch(category="duplicate") - frame = Saveframe.from_scratch('1') - frame.add_loop(one) - self.assertRaises(ValueError, frame.add_loop, two) - - def test_normalize(self): - - db_tmp = copy(self.file_entry) - denormalized = Entry.from_file(os.path.join(our_path, 
"sample_files", "bmr15000_3_denormalized.str")) - denormalized.normalize() - self.assertEqual(db_tmp.compare(denormalized), []) - - # Shuffle our local entry - random.shuffle(db_tmp.frame_list) - for frame in db_tmp: - random.shuffle(frame.loops) - random.shuffle(frame.tags) - - # Might as well test equality testing while shuffled: - self.assertEqual(db_tmp.compare(self.file_entry), []) - - # Test that the frames are in a different order - self.assertNotEqual(db_tmp.frame_list, self.file_entry.frame_list) - db_tmp.normalize() - - self.assertEqual(db_tmp.frame_list, self.file_entry.frame_list) - - # Now test ordering of saveframes when tags may be missing - b = Saveframe.from_scratch('not_real2') - b.add_tag('_help.Sf_category', 'a') - b.add_tag('_help.ID', 1) - c = Saveframe.from_scratch('not_real') - c.add_tag('_help.Sf_category', 'a') - c.add_tag('_help.ID', 'a') - d = Saveframe.from_scratch('not_real3') - d.add_tag('_help.borg', 'a') - - db_tmp.add_saveframe(b) - db_tmp.add_saveframe(c) - db_tmp.add_saveframe(d) - - correct_order = db_tmp.frame_list[:] - random.shuffle(db_tmp.frame_list) - db_tmp.normalize() - self.assertEqual(db_tmp.frame_list, correct_order) - - def test_syntax_outliers(self): - """ Make sure the case of semi-colon delineated data in a data - value is properly escaped. """ - - ml = copy(self.file_entry[0][0]) - # Should always work once - ml[0][0] = str(ml) - self.assertEqual(ml, Loop.from_string(str(ml))) - # Twice could trigger bug - ml[0][0] = str(ml) - self.assertEqual(ml, Loop.from_string(str(ml))) - self.assertEqual(ml[0][0], Loop.from_string(str(ml))[0][0]) - # Third time is a charm - ml[0][0] = str(ml) - self.assertEqual(ml, Loop.from_string(str(ml))) - # Check the data too - this should never fail (the previous test would - # have already failed.) - self.assertEqual(ml[0][0], Loop.from_string(str(ml))[0][0]) - - def test_parse_outliers(self): - """ Make sure the parser handles edge cases. """ - - parser = _Parser() - parser.load_data("""data_#pound -save_entry_information _Entry.Sf_category entry_information _Entry.Sf_framecode entry_information -_Entry.sameline_comment value #ignore this all -_Entry.ID \".-!?\" -_Entry.Invalid_tag "This tag doesn't exist." -_Entry.Title -; Solution structure of chicken villin headpiece subdomain contain;ing a fluorinated side chain in the cores; -; -_Entry.Submi#ssion_date "check inn"er "quoted vals" -_Entry.Accession_date 'check inner quoted vals' -_Entry.Original_NMR_STAR_version '_.' 
- _Entry.Experimental_method $
- _Entry.Details "1#"
- _Entry.Experimental_method_subtype solution
- _Entry.BMRB_internal_directory_name ;data;
-_Entry.pointer $it
-_Entry.multi
-;
-
- nothing
- to shift
-;
-_Entry.multi2
-;
-
- ;
- something
- to shift
-;
-""")
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('data_#pound', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('save_entry_information', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.Sf_category', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('entry_information', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.Sf_framecode', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('entry_information', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.sameline_comment', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('value', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.ID', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('.-!?', '"'))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.Invalid_tag', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ("This tag doesn't exist.", '"'))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.Title', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), (" Solution structure of chicken villin headpiece subdomain"
-                                                            " contain;ing a fluorinated side chain in the cores;\n",
-                                                            ';'))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.Submi#ssion_date', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('check inn"er "quoted vals', '"'))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.Accession_date', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('check inner quoted vals', '\''))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.Original_NMR_STAR_version', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_.', '\''))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.Experimental_method', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('$', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.Details', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('1#', '"'))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.Experimental_method_subtype', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('solution', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.BMRB_internal_directory_name', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), (';data;', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('_Entry.pointer', ' '))
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ('$it', '$'))
-        parser.get_token()
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ("\n nothing\n to shift\n", ';'))
-        parser.get_token()
-        parser.get_token()
-        self.assertEqual((parser.token, parser.delimiter), ("\n;\nsomething\nto shift", ';'))
+logging.getLogger('pynmrstar').setLevel(logging.FATAL)
+# Import all test classes
+from .test_entry import TestEntry
+from .test_saveframe import TestSaveframe
+from .test_loop import TestLoop
+from .test_parser import TestParser
+from .test_utils import TestUtils
+from .test_schema import TestSchema
 
 # Allow unit testing from other modules
diff --git a/pynmrstar/unit_tests/test_entry.py b/pynmrstar/unit_tests/test_entry.py
new file mode 100644
index 0000000..835b5be
--- /dev/null
+++ b/pynmrstar/unit_tests/test_entry.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+import os
+import random
+import unittest
+from copy import deepcopy as copy
+from pathlib import Path
+
+from pynmrstar import Entry, Saveframe, Loop
+from pynmrstar.exceptions import ParsingError
+
+our_path = os.path.dirname(os.path.realpath(__file__))
+database_entry = Entry.from_database(15000)
+sample_file_location = os.path.join(our_path, "sample_files", "bmr15000_3.str")
+sample_saveframe_location = os.path.join(our_path, "sample_files", "saveframe.txt")
+sample_loop_location = os.path.join(our_path, "sample_files", "loop.txt")
+file_entry = Entry.from_file(sample_file_location)
+
+
+class TestEntry(unittest.TestCase):
+
+    def setUp(self):
+        self.file_entry = copy(file_entry)
+        self.maxDiff = None
+
+    def test_edge_cases(self):
+        """ Make sure that the various types of edge cases are properly handled. """
+
+        Entry.from_file(os.path.join(our_path, 'sample_files', 'edge_cases.str'))
+        Entry.from_file(os.path.join(our_path, 'sample_files', 'dos.str'))
+        Entry.from_file(os.path.join(our_path, 'sample_files', 'nonewlines.str'))
+        Entry.from_file(os.path.join(our_path, 'sample_files', 'onlynewlines.str'))
+
+    def test_entry_delitem(self):
+        tmp_entry = copy(self.file_entry)
+        tmp_entry.frame_list.pop(0)
+        del self.file_entry[0]
+        self.assertEqual(self.file_entry, tmp_entry)
+
+    def test_duplicate_saveframe_errors(self):
+        tmp_entry = copy(self.file_entry)
+        self.assertRaises(ValueError, tmp_entry.add_saveframe, tmp_entry[0])
+        tmp_entry.frame_list.append(tmp_entry[0])
+        self.assertRaises(ValueError, tmp_entry.__getattribute__, 'frame_dict')
+
+    def test_entry_eq(self):
+        # Normalize them both first
+        db_copy = copy(database_entry)
+        db_copy.normalize()
+        self.file_entry.normalize()
+        self.assertEqual(self.file_entry, db_copy)
+
+    def test_getitem(self):
+        self.assertEqual(self.file_entry['entry_information'],
+                         self.file_entry.get_saveframe_by_name("entry_information"))
+        self.assertEqual(self.file_entry[0], self.file_entry.get_saveframe_by_name("entry_information"))
+
+    def test_init(self):
+        # Make sure the correct errors are raised
+        self.assertRaises(ValueError, Entry)
+        self.assertRaises(ParsingError, Entry, the_string="test", entry_num="test")
+        # Make sure string parsing is correct
+        self.assertEqual(self.file_entry, Entry.from_string(str(self.file_entry)))
+        self.assertEqual(str(self.file_entry), str(Entry.from_string(str(self.file_entry))))
+        self.assertRaises(IOError, Entry.from_database, 0)
+
+        self.assertEqual(str(Entry.from_scratch(15000)), "data_15000\n\n")
+        self.assertEqual(Entry.from_file(os.path.join(our_path, "sample_files", "bmr15000_3.str.gz")), self.file_entry)
+
+    def test_from_file_path_support(self):
+        """Test that from_file methods support pathlib.Path objects."""
+
+        # Test Entry.from_file with Path object
+        path_obj = Path(sample_file_location)
+        entry_from_path = Entry.from_file(path_obj)
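+        # Illustrative aside, not part of the original test: a str path and a
+        # pathlib.Path pointing at the same file should parse to equal
+        # entries, so the object built above can also be checked against a
+        # fresh parse from the plain string path.
+        self.assertEqual(Entry.from_file(sample_file_location), entry_from_path)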
+        self.assertEqual(entry_from_path, self.file_entry)
+
+    def test___setitem(self):
+        tmp_entry = copy(self.file_entry)
+        tmp_entry[0] = tmp_entry.get_saveframe_by_name('entry_information')
+        self.assertEqual(tmp_entry, self.file_entry)
+        tmp_entry['entry_information'] = tmp_entry.get_saveframe_by_name('entry_information')
+        self.assertEqual(tmp_entry, self.file_entry)
+
+        self.assertRaises(ValueError, tmp_entry.__setitem__, 'entry_informations',
+                          tmp_entry.get_saveframe_by_name('entry_information'))
+        self.assertRaises(ValueError, tmp_entry.__setitem__, 'entry_information', 1)
+
+    def test_compare(self):
+        self.assertEqual(self.file_entry.compare(str(self.file_entry)), [])
+        self.assertEqual(self.file_entry.compare(self.file_entry), [])
+
+        mutated = copy(self.file_entry)
+        mutated.frame_list.pop()
+        self.assertEqual(self.file_entry.compare(mutated),
+                         ["The number of saveframes in the entries are not equal: '25' vs '24'.",
+                          "No saveframe with name 'assigned_chem_shift_list_1' in other entry."])
+
+    def test_getmethods(self):
+        self.assertEqual(5, len(self.file_entry.get_loops_by_category("_Vendor")))
+        self.assertEqual(5, len(self.file_entry.get_loops_by_category("vendor")))
+
+        self.assertEqual(self.file_entry.get_saveframe_by_name('assigned_chem_shift_list_1'), self.file_entry[-1])
+        self.assertRaises(KeyError, self.file_entry.get_saveframe_by_name, 'no such saveframe')
+
+        self.assertEqual(len(self.file_entry.get_saveframes_by_category("NMR_spectrometer")), 6)
+        self.assertEqual(len(self.file_entry.get_saveframes_by_category("nmr_SPectrometer")), 0)
+        self.assertEqual(self.file_entry.get_saveframes_by_category('no such category'), [])
+
+        self.assertEqual(self.file_entry.get_saveframes_by_tag_and_value('Submission_date', '2006-09-07'),
+                         [self.file_entry[0]])
+        self.assertEqual(self.file_entry.get_saveframes_by_tag_and_value('submission_Date', '2006-09-07'),
+                         [self.file_entry[0]])
+        self.assertEqual(self.file_entry.get_saveframes_by_tag_and_value('test.submission_date', '2006-09-07'), [])
+
+        self.assertRaises(ValueError, self.file_entry.get_tag, 'bad_tag')
+        self.assertEqual(self.file_entry.get_tag("entry.Submission_date"), ['2006-09-07'])
+        self.assertEqual(self.file_entry.get_tag("entry.Submission_date", whole_tag=True),
+                         [[u'Submission_date', u'2006-09-07']])
+
+    def test_validate(self):
+        validation = []
+        self.assertEqual(self.file_entry.validate(), [])
+        self.file_entry[-1][-1][0][0] = 'a'
+        validation.append(
+            "Value does not match specification: '_Atom_chem_shift.ID':'a'.\n "
+            "Type specified: int\n Regular expression for type: '^(?:-?[0-9]*)?$'")
+        self.assertEqual(self.file_entry.validate(), validation)
+        self.file_entry[-1][-1][0][0] = '1'
+
+    def test_Entry___setitem__(self):
+        """ Test the setting a tag functionality of an entry. """
+
+        test_entry = Entry.from_scratch('test')
+        test_saveframe = Saveframe.from_scratch('test', 'test')
+        test_entry._frame_list = [test_saveframe, test_saveframe]
+        with self.assertRaises(ValueError):
+            test_entry['test'] = test_saveframe
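+
+    # Illustrative aside, not part of the original suite (hypothetical test
+    # name): Entry.get_tag requires a category-qualified name - a bare
+    # 'bad_tag' raises ValueError in test_getmethods above - but appears to
+    # match the name case-insensitively, so both spellings below should
+    # return the same result.
+    def test_get_tag_case_insensitive_aside(self):
+        self.assertEqual(self.file_entry.get_tag("entry.Submission_date"),
+                         self.file_entry.get_tag("ENTRY.SUBMISSION_DATE"))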
""" + + tmp = copy(self.file_entry) + self.assertEqual(tmp.category_list, ['entry_information', 'citations', 'assembly', 'entity', 'natural_source', + 'experimental_source', 'chem_comp', 'sample', 'sample_conditions', + 'software', 'NMR_spectrometer', 'NMR_spectrometer_list', 'experiment_list', + 'chem_shift_reference', 'assigned_chemical_shifts']) + tmp.add_saveframe(Saveframe.from_scratch("test", None)) + self.assertEqual(tmp.category_list, ['entry_information', 'citations', 'assembly', 'entity', 'natural_source', + 'experimental_source', 'chem_comp', 'sample', 'sample_conditions', + 'software', 'NMR_spectrometer', 'NMR_spectrometer_list', 'experiment_list', + 'chem_shift_reference', 'assigned_chemical_shifts']) + + def test_rename_saveframe(self): + tmp = copy(self.file_entry) + tmp.rename_saveframe('F5-Phe-cVHP', 'jons_frame') + tmp.rename_saveframe('jons_frame', 'F5-Phe-cVHP') + self.assertEqual(tmp, self.file_entry) + + def test_normalize(self): + + db_tmp = copy(self.file_entry) + denormalized = Entry.from_file(os.path.join(our_path, "sample_files", "bmr15000_3_denormalized.str")) + denormalized.normalize() + self.assertEqual(db_tmp.compare(denormalized), []) + + # Shuffle our local entry + random.shuffle(db_tmp.frame_list) + for frame in db_tmp: + random.shuffle(frame.loops) + random.shuffle(frame.tags) + + # Might as well test equality testing while shuffled: + self.assertEqual(db_tmp.compare(self.file_entry), []) + + # Test that the frames are in a different order + self.assertNotEqual(db_tmp.frame_list, self.file_entry.frame_list) + db_tmp.normalize() + + self.assertEqual(db_tmp.frame_list, self.file_entry.frame_list) + + # Now test ordering of saveframes when tags may be missing + b = Saveframe.from_scratch('not_real2') + b.add_tag('_help.Sf_category', 'a') + b.add_tag('_help.ID', 1) + c = Saveframe.from_scratch('not_real') + c.add_tag('_help.Sf_category', 'a') + c.add_tag('_help.ID', 'a') + d = Saveframe.from_scratch('not_real3') + d.add_tag('_help.borg', 'a') + + db_tmp.add_saveframe(b) + db_tmp.add_saveframe(c) + db_tmp.add_saveframe(d) + + correct_order = db_tmp.frame_list[:] + random.shuffle(db_tmp.frame_list) + db_tmp.normalize() + self.assertEqual(db_tmp.frame_list, correct_order) diff --git a/pynmrstar/unit_tests/test_loop.py b/pynmrstar/unit_tests/test_loop.py new file mode 100644 index 0000000..8e988b1 --- /dev/null +++ b/pynmrstar/unit_tests/test_loop.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +import os +import unittest +from copy import deepcopy as copy +from decimal import Decimal +from pathlib import Path + +from pynmrstar import Loop, Entry, Schema +from pynmrstar.exceptions import ParsingError + +our_path = os.path.dirname(os.path.realpath(__file__)) +sample_file_location = os.path.join(our_path, "sample_files", "bmr15000_3.str") +sample_loop_location = os.path.join(our_path, "sample_files", "loop.txt") +file_entry = Entry.from_file(sample_file_location) + + +class TestLoop(unittest.TestCase): + + def setUp(self): + self.file_entry = copy(file_entry) + self.maxDiff = None + + def test_from_file_path_support(self): + """Test that from_file methods support pathlib.Path objects.""" + + # Test Loop.from_file with Path object + loop_from_str = Loop.from_file(sample_loop_location) + loop_from_path = Loop.from_file(Path(sample_loop_location)) + self.assertEqual(loop_from_str, loop_from_path) + self.assertEqual(loop_from_path.category, '_Test') + self.assertEqual(loop_from_path.tags, ['ID', 'Name']) + self.assertEqual(loop_from_path.data, [['1', 'First'], ['2', 
+
+    def test_loop_parsing(self):
+        with self.assertRaises(ParsingError):
+            Loop.from_string("loop_ _test.one _test.two 1 loop_")
+        with self.assertRaises(ParsingError):
+            Loop.from_string("loop_ _test.one _test.two 1 stop_")
+        with self.assertRaises(ParsingError):
+            Loop.from_string("loop_ _test.one _test.two 1 2 3 stop_")
+        with self.assertRaises(ParsingError):
+            Loop.from_string("loop_ _test.one _test.two 1 2 3")
+        with self.assertRaises(ParsingError):
+            Loop.from_string("loop_ _test.one _test.two 1 save_ stop_")
+
+    def test_loop(self):
+        test_loop = self.file_entry[0][0]
+
+        # Check filter
+        self.assertEqual(test_loop.filter(['_Entry_author.Ordinal', '_Entry_author.Middle_initials']),
+                         Loop.from_string(
+                             "loop_ _Entry_author.Ordinal _Entry_author.Middle_initials 1 C. 2 . 3 B. 4 H. 5 L. stop_"))
+        # Check eq
+        self.assertEqual(test_loop == self.file_entry[0][0], True)
+        self.assertEqual(test_loop != self.file_entry[0][1], True)
+        # Check __getitem__
+        self.assertEqual(test_loop['_Entry_author.Ordinal'], ['1', '2', '3', '4', '5'])
+        self.assertEqual(test_loop[['_Entry_author.Ordinal', '_Entry_author.Middle_initials']],
+                         [['1', 'C.'], ['2', '.'], ['3', 'B.'], ['4', 'H.'], ['5', 'L.']])
+        # Test __setitem__
+        test_loop['_Entry_author.Ordinal'] = [1] * 5
+        self.assertEqual(test_loop['_Entry_author.Ordinal'], [1, 1, 1, 1, 1])
+        test_loop['_Entry_author.Ordinal'] = ['1', '2', '3', '4', '5']
+        self.assertRaises(ValueError, test_loop.__setitem__, '_Entry_author.Ordinal', [1])
+        self.assertRaises(ValueError, test_loop.__setitem__, '_Wrong_loop.Ordinal', [1, 2, 3, 4, 5])
+        # Check __init__
+        self.assertRaises(ValueError, Loop)
+        test = Loop.from_scratch(category="test")
+        self.assertEqual(test.category, "_test")
+        self.assertEqual(Loop.from_string(str(test_loop)), test_loop)
+        self.assertEqual(test_loop, Loop.from_string(test_loop.get_data_as_csv(), csv=True))
+        # Check len
+        self.assertEqual(len(test_loop), len(test_loop.data))
+        # Check lt
+        self.assertEqual(test_loop < self.file_entry[0][1], True)
+        # Check __str__
+        self.assertEqual(Loop.from_scratch().format(skip_empty_loops=False), "\n loop_\n\n stop_\n")
+        self.assertEqual(Loop.from_scratch().format(skip_empty_loops=True), "")
+        tmp_loop = Loop.from_scratch()
+        tmp_loop.data = [[1, 2, 3]]
+        self.assertRaises(ValueError, tmp_loop.__str__)
+        tmp_loop.add_tag("tag1")
+        self.assertRaises(ValueError, tmp_loop.__str__)
+        tmp_loop.add_tag("tag2")
+        tmp_loop.add_tag("tag3")
+        self.assertRaises(ValueError, tmp_loop.__str__)
+        tmp_loop.set_category("test")
+        self.assertEqual(str(tmp_loop), "\n loop_\n _test.tag1\n _test.tag2\n _test.tag3\n\n "
+                                        "1 2 3 \n\n stop_\n")
+        self.assertEqual(tmp_loop.category, "_test")
+        # Check different category
+        self.assertRaises(ValueError, tmp_loop.add_tag, "invalid.tag")
+        # Check duplicate tag
+        self.assertRaises(ValueError, tmp_loop.add_tag, "test.tag3")
+        self.assertEqual(tmp_loop.add_tag("test.tag3", ignore_duplicates=True), None)
+        # Check space and period in tag
+        self.assertRaises(ValueError, tmp_loop.add_tag, "test. tag")
+        self.assertRaises(ValueError, tmp_loop.add_tag, "test.tag.test")
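+        # Illustrative aside, not part of the original test: once the loop
+        # category is set, a bare tag name and a category-qualified one refer
+        # to the same tag, and (as the assertions on tmp_loop.tags elsewhere
+        # in this file suggest) tags are stored without the category prefix.
+        aside_loop = Loop.from_scratch(category="test")
+        aside_loop.add_tag("test.tag1")  # should be equivalent to add_tag("tag1")
+        self.assertEqual(aside_loop.tags, ["tag1"])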
tag") + self.assertRaises(ValueError, tmp_loop.add_tag, "test.tag.test") + + # Check add_data + self.assertRaises(ValueError, tmp_loop.add_data, [1, 2, 3, 4]) + tmp_loop.add_data([4, 5, 6]) + tmp_loop.add_data([7, 8, 9]) + self.assertEqual(tmp_loop.data, [[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + + # Test delete_data_by_tag_value + self.assertEqual(tmp_loop.remove_data_by_tag_value("tag1", 1, index_tag=0), [[1, 2, 3]]) + self.assertRaises(ValueError, tmp_loop.remove_data_by_tag_value, "tag4", "data") + self.assertEqual(tmp_loop.data, [[1, 5, 6], [2, 8, 9]]) + + # Test get_data_as_csv() + self.assertEqual(tmp_loop.get_data_as_csv(), "_test.tag1,_test.tag2,_test.tag3\n1,5,6\n2,8,9\n") + self.assertEqual(tmp_loop.get_data_as_csv(show_category=False), "tag1,tag2,tag3\n1,5,6\n2,8,9\n") + self.assertEqual(tmp_loop.get_data_as_csv(header=False), "1,5,6\n2,8,9\n") + self.assertEqual(tmp_loop.get_data_as_csv(show_category=False, header=False), "1,5,6\n2,8,9\n") + + # Test get_tag + self.assertRaises(ValueError, tmp_loop.get_tag, "invalid.tag1") + self.assertEqual(tmp_loop.get_tag("tag1"), [1, 2]) + self.assertEqual(tmp_loop.get_tag(["tag1", "tag2"]), [[1, 5], [2, 8]]) + self.assertEqual(tmp_loop.get_tag("tag1", whole_tag=True), [['_test.tag1', 1], ['_test.tag1', 2]]) + + self.assertEqual( + test_loop.get_tag(['_Entry_author.Ordinal', '_Entry_author.Middle_initials'], dict_result=True), + [{'_Entry_author.Middle_initials': 'C.', '_Entry_author.Ordinal': '1'}, + {'_Entry_author.Middle_initials': '.', '_Entry_author.Ordinal': '2'}, + {'_Entry_author.Middle_initials': 'B.', '_Entry_author.Ordinal': '3'}, + {'_Entry_author.Middle_initials': 'H.', '_Entry_author.Ordinal': '4'}, + {'_Entry_author.Middle_initials': 'L.', '_Entry_author.Ordinal': '5'}]) + + self.assertEqual( + test_loop.get_tag(['ORdinal', 'MIddle_initials'], dict_result=True), + [{'MIddle_initials': 'C.', 'ORdinal': '1'}, + {'MIddle_initials': '.', 'ORdinal': '2'}, + {'MIddle_initials': 'B.', 'ORdinal': '3'}, + {'MIddle_initials': 'H.', 'ORdinal': '4'}, + {'MIddle_initials': 'L.', 'ORdinal': '5'}]) + + self.assertEqual( + test_loop.get_tag(['Ordinal', 'Middle_initials'], dict_result=True, whole_tag=True), + [{'_Entry_author.Middle_initials': 'C.', '_Entry_author.Ordinal': '1'}, + {'_Entry_author.Middle_initials': '.', '_Entry_author.Ordinal': '2'}, + {'_Entry_author.Middle_initials': 'B.', '_Entry_author.Ordinal': '3'}, + {'_Entry_author.Middle_initials': 'H.', '_Entry_author.Ordinal': '4'}, + {'_Entry_author.Middle_initials': 'L.', '_Entry_author.Ordinal': '5'}]) + + self.assertEqual(test_loop.get_tag(['_Entry_author.Ordinal', '_Entry_author.Middle_initials'], dict_result=True, + whole_tag=True), + [{'_Entry_author.Middle_initials': 'C.', '_Entry_author.Ordinal': '1'}, + {'_Entry_author.Middle_initials': '.', '_Entry_author.Ordinal': '2'}, + {'_Entry_author.Middle_initials': 'B.', '_Entry_author.Ordinal': '3'}, + {'_Entry_author.Middle_initials': 'H.', '_Entry_author.Ordinal': '4'}, + {'_Entry_author.Middle_initials': 'L.', '_Entry_author.Ordinal': '5'}]) + + def simple_key(x): + return -int(x[2]) + + # Test sort_rows + tmp_loop.sort_rows(["tag2"], key=simple_key) + self.assertEqual(tmp_loop.data, [[2, 8, 9], [1, 5, 6]]) + tmp_loop.sort_rows(["tag2"]) + self.assertEqual(tmp_loop.data, [[1, 5, 6], [2, 8, 9]]) + + # Test clear data + tmp_loop.clear_data() + self.assertEqual(tmp_loop.data, []) + + # Test that the from_template method works + self.assertEqual(Loop.from_template("atom_chem_shift", all_tags=False), + Loop.from_string(""" +loop_ + 
+ _Atom_chem_shift.ID
+ _Atom_chem_shift.Assembly_atom_ID
+ _Atom_chem_shift.Entity_assembly_ID
+ _Atom_chem_shift.Entity_assembly_asym_ID
+ _Atom_chem_shift.Entity_ID
+ _Atom_chem_shift.Comp_index_ID
+ _Atom_chem_shift.Seq_ID
+ _Atom_chem_shift.Comp_ID
+ _Atom_chem_shift.Atom_ID
+ _Atom_chem_shift.Atom_type
+ _Atom_chem_shift.Atom_isotope_number
+ _Atom_chem_shift.Val
+ _Atom_chem_shift.Val_err
+ _Atom_chem_shift.Assign_fig_of_merit
+ _Atom_chem_shift.Ambiguity_code
+ _Atom_chem_shift.Ambiguity_set_ID
+ _Atom_chem_shift.Occupancy
+ _Atom_chem_shift.Resonance_ID
+ _Atom_chem_shift.Auth_entity_assembly_ID
+ _Atom_chem_shift.Auth_asym_ID
+ _Atom_chem_shift.Auth_seq_ID
+ _Atom_chem_shift.Auth_comp_ID
+ _Atom_chem_shift.Auth_atom_ID
+ _Atom_chem_shift.Original_PDB_strand_ID
+ _Atom_chem_shift.Original_PDB_residue_no
+ _Atom_chem_shift.Original_PDB_residue_name
+ _Atom_chem_shift.Original_PDB_atom_name
+ _Atom_chem_shift.Details
+ _Atom_chem_shift.Entry_ID
+ _Atom_chem_shift.Assigned_chem_shift_list_ID
+
+
+ stop_
+"""))
+
+        self.assertEqual(Loop.from_template("atom_chem_shift", all_tags=True),
+                         Loop.from_string("""
+ loop_
+  _Atom_chem_shift.ID
+  _Atom_chem_shift.Assembly_atom_ID
+  _Atom_chem_shift.Entity_assembly_ID
+  _Atom_chem_shift.Entity_assembly_asym_ID
+  _Atom_chem_shift.Entity_ID
+  _Atom_chem_shift.Comp_index_ID
+  _Atom_chem_shift.Seq_ID
+  _Atom_chem_shift.Comp_ID
+  _Atom_chem_shift.Atom_ID
+  _Atom_chem_shift.Atom_type
+  _Atom_chem_shift.Atom_isotope_number
+  _Atom_chem_shift.Val
+  _Atom_chem_shift.Val_err
+  _Atom_chem_shift.Assign_fig_of_merit
+  _Atom_chem_shift.Ambiguity_code
+  _Atom_chem_shift.Ambiguity_set_ID
+  _Atom_chem_shift.Occupancy
+  _Atom_chem_shift.Resonance_ID
+  _Atom_chem_shift.Auth_entity_assembly_ID
+  _Atom_chem_shift.Auth_asym_ID
+  _Atom_chem_shift.Auth_seq_ID
+  _Atom_chem_shift.Auth_comp_ID
+  _Atom_chem_shift.Auth_atom_ID
+  _Atom_chem_shift.PDB_record_ID
+  _Atom_chem_shift.PDB_model_num
+  _Atom_chem_shift.PDB_strand_ID
+  _Atom_chem_shift.PDB_ins_code
+  _Atom_chem_shift.PDB_residue_no
+  _Atom_chem_shift.PDB_residue_name
+  _Atom_chem_shift.PDB_atom_name
+  _Atom_chem_shift.Original_PDB_strand_ID
+  _Atom_chem_shift.Original_PDB_residue_no
+  _Atom_chem_shift.Original_PDB_residue_name
+  _Atom_chem_shift.Original_PDB_atom_name
+  _Atom_chem_shift.Details
+  _Atom_chem_shift.Sf_ID
+  _Atom_chem_shift.Entry_ID
+  _Atom_chem_shift.Assigned_chem_shift_list_ID
+
+
+ stop_
+"""))
+
+        # Test adding a tag to the schema
+        my_schem = Schema()
+        my_schem.add_tag("_Atom_chem_shift.New_Tag", "VARCHAR(100)", True, "assigned_chemical_shifts", True,
+                         "_Atom_chem_shift.Atom_ID")
+        self.assertEqual(Loop.from_template("atom_chem_shift", all_tags=True, schema=my_schem),
+                         Loop.from_string(
+                             "loop_ _Atom_chem_shift.ID _Atom_chem_shift.Assembly_atom_ID "
+                             "_Atom_chem_shift.Entity_assembly_ID _Atom_chem_shift.Entity_ID "
+                             "_Atom_chem_shift.Comp_index_ID _Atom_chem_shift.Seq_ID "
+                             "_Atom_chem_shift.Comp_ID _Atom_chem_shift.Atom_ID _Atom_chem_shift.New_Tag "
+                             "_Atom_chem_shift.Atom_type _Atom_chem_shift.Atom_isotope_number "
+                             "_Atom_chem_shift.Val _Atom_chem_shift.Val_err _Atom_chem_shift.Assign_fig_of_merit "
+                             "_Atom_chem_shift.Ambiguity_code _Atom_chem_shift.Ambiguity_set_ID "
+                             "_Atom_chem_shift.Occupancy _Atom_chem_shift.Resonance_ID "
+                             "_Atom_chem_shift.Auth_entity_assembly_ID _Atom_chem_shift.Auth_asym_ID "
+                             "_Atom_chem_shift.Auth_seq_ID _Atom_chem_shift.Auth_comp_ID "
+                             "_Atom_chem_shift.Auth_atom_ID _Atom_chem_shift.PDB_record_ID "
+                             "_Atom_chem_shift.PDB_model_num _Atom_chem_shift.PDB_strand_ID "
+                             "_Atom_chem_shift.PDB_ins_code _Atom_chem_shift.PDB_residue_no "
+                             "_Atom_chem_shift.PDB_residue_name _Atom_chem_shift.PDB_atom_name "
+                             "_Atom_chem_shift.Original_PDB_strand_ID _Atom_chem_shift.Original_PDB_residue_no "
+                             "_Atom_chem_shift.Original_PDB_residue_name _Atom_chem_shift.Original_PDB_atom_name "
+                             "_Atom_chem_shift.Details _Atom_chem_shift.Sf_ID _Atom_chem_shift.Entry_ID "
+                             "_Atom_chem_shift.Assigned_chem_shift_list_ID stop_ "))
+
+        # Make sure adding data with a tag works
+        tmp_loop = Loop.from_string("loop_ _Atom_chem_shift.ID stop_")
+        tmp_loop.data = [[1]]
+        tmp_loop.add_tag("Assembly_atom_ID", update_data=True)
+        self.assertEqual(tmp_loop.data, [[1, None]])
+        self.assertEqual(tmp_loop.tags, ["ID", "Assembly_atom_ID"])
+
+        # Make sure the add_missing_tags method is working
+        tmp_loop = Loop.from_string("loop_ _Atom_chem_shift.ID stop_")
+        tmp_loop.add_missing_tags()
+        self.assertEqual(tmp_loop, Loop.from_template("atom_chem_shift"))
+
+    def test_loop_add_data(self):
+        test1 = Loop.from_scratch('test')
+        test1.add_tag(['Name', 'Location'])
+        self.assertRaises(ValueError, test1.add_data, None)
+        self.assertRaises(ValueError, test1.add_data, [])
+        self.assertRaises(ValueError, test1.add_data, {})
+        self.assertRaises(ValueError, test1.add_data, {'not_present': 1})
+        test1.add_data([{'name': 'Jeff', 'location': 'Connecticut'}, {'name': 'Chad', 'location': 'Madison'}])
+
+        test2 = Loop.from_scratch('test')
+        test2.add_tag(['Name', 'Location'])
+        test2.add_data({'name': ['Jeff', 'Chad'], 'location': ['Connecticut', 'Madison']})
+
+        test3 = Loop.from_scratch('test')
+        test3.add_tag(['Name', 'Location'])
+        self.assertRaises(ValueError, test3.add_data, [['Jeff', 'Connecticut'], ['Chad']])
+        test3.add_data([['Jeff', 'Connecticut'], ['Chad', 'Madison']])
+
+        test4 = Loop.from_scratch('test')
+        test4.add_tag(['Name', 'Location'])
+        self.assertRaises(ValueError, test4.add_data, ['Jeff', 'Connecticut', 'Chad', 'Madison'])
+        test4.add_data(['Jeff', 'Connecticut', 'Chad', 'Madison'], rearrange=True)
+
+        self.assertEqual(test1, test2)
+        self.assertEqual(test2, test3)
+        self.assertEqual(test3, test4)
+
+        # Now check the 'convert_data_types' argument and the raw data present in the loop
+        test = Loop.from_scratch('_Atom_chem_shift')
+        test.add_tag(['Val', 'Entry_ID', 'Details'])
+        test.add_data([{'details': 'none', 'vAL': '1.2'}, {'val': 5, 'details': '.'}], convert_data_types=True)
+        self.assertEqual(test.data, [[Decimal('1.2'), None, 'none'], [Decimal(5), None, None]])
+        test.clear_data()
+        test.add_data([{'details': 'none', 'vAL': '1.2'}, {'val': 5, 'details': '.'}])
+        self.assertEqual(test.data, [['1.2', None, 'none'], [5, None, '.']])
+
+    def test_syntax_outliers(self):
+        """ Make sure semicolon-delimited data in a data
+        value is properly escaped. """
+
+        ml = copy(self.file_entry[0][0])
+        # Should always work once
+        ml[0][0] = str(ml)
+        self.assertEqual(ml, Loop.from_string(str(ml)))
+        # Twice could trigger bug
+        ml[0][0] = str(ml)
+        self.assertEqual(ml, Loop.from_string(str(ml)))
+        self.assertEqual(ml[0][0], Loop.from_string(str(ml))[0][0])
+        # Third time is a charm
+        ml[0][0] = str(ml)
+        self.assertEqual(ml, Loop.from_string(str(ml)))
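+        # Illustrative aside, not part of the original test: the escaping
+        # exercised above should hold for any value with a semicolon at the
+        # start of a line, checked here on a copy so the assertions below
+        # still see the original object.
+        aside = copy(ml)
+        aside[0][0] = "first line\n;second line starts with a semicolon"
+        self.assertEqual(Loop.from_string(str(aside))[0][0], aside[0][0])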
+ self.assertEqual(ml[0][0], Loop.from_string(str(ml))[0][0]) diff --git a/pynmrstar/unit_tests/test_parser.py b/pynmrstar/unit_tests/test_parser.py new file mode 100644 index 0000000..b2ab342 --- /dev/null +++ b/pynmrstar/unit_tests/test_parser.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +import unittest + +from pynmrstar import Entry, Saveframe, Loop +from pynmrstar.exceptions import ParsingError + + +class TestParser(unittest.TestCase): + + def test__Parser(self): + """ Test that the various parsing Errors that can be raised are raised. """ + + # These checks match the order of the parser code at the time they were written. + self.assertRaises(ParsingError, Entry.from_string, 'data_1\nsave_1\n"loop"_\n_tag.tag\ndata_\nstop_\nsave_\n') + + # STAR/file start checks + self.assertRaises(ParsingError, Entry.from_string, "whatever test") + self.assertRaises(ParsingError, Entry.from_string, "data_") + self.assertRaises(ParsingError, Entry.from_string, "'data_1'\nsave_1\nloop_\n_tag.tag\ndata_\nstop_\nsave_\n") + + # Saveframe checks + self.assertRaises(ParsingError, Entry.from_string, "data_frame invalid") + self.assertRaises(ParsingError, Entry.from_string, "data_frame save_ invalid") + self.assertRaises(ParsingError, Entry.from_string, "data_1\n'save_1'\nloop_\n_tag.tag\ndata_\nstop_\nsave_\n") + + # Loop checks + self.assertRaises(ParsingError, Entry.from_string, "data_1\nsave_1\n'loop_'\n_tag.tag\ndata_\nstop_\nsave_\n") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 loop_ _tag.one _tag2.one stop_") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 loop_ _tag.one stop_ loop_ _tag.one stop_") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 loop_ _tag.one 'stop_'") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 _saveframe.tag value loop_ stop_ save_", raise_parse_warnings=True) + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 _saveframe.tag value loop_ _tag.one stop_ save_", raise_parse_warnings=True) + with self.assertLogs('pynmrstar', level='WARNING'): + Entry.from_string("data_1 save_1 _saveframe.tag value loop_ stop_ save_", raise_parse_warnings=False) + with self.assertLogs('pynmrstar', level='WARNING'): + Entry.from_string("data_1 save_1 _saveframe.tag value loop_ _tag.one stop_ save_", raise_parse_warnings=False) + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 _saveframe.tag value loop_ _tag.one _tag.two data stop_ save_") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 _saveframe.tag value loop_ _tag.one _tag.two data _tag.three stop_ save_") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 _saveframe.tag value loop_ data stop_ save_") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 _saveframe.tag value loop_ _tag.one _tag.two data data2 loop_ stop_ save_") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 _saveframe.tag value loop_ _tag.one _tag.two data data2") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 _saveframe.tag value 'save_'") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 save_") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 stop_") + + # Back to saveframes + self.assertRaises(ParsingError, Entry.from_string, "data_1\nsave_1\n'_tag.example' save_\nsave_\n") + self.assertRaises(ParsingError, Entry.from_string, "data_1\nsave_1\n_tag.example save_\nsave_\n") + self.assertRaises(ParsingError, Loop.from_string, "d") + 
self.assertRaises(ParsingError, Saveframe.from_string, "save_1\n_tag.1 _tag.2") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 _savef.rame.tag value save_") + self.assertRaises(ParsingError, Entry.from_string, "data_1 save_1 _saveframe.tag value") + + def test_parse_outliers(self): + """ Make sure the parser handles edge cases. """ + + test_string = """data_#pound +save_entry_information _Entry.Sf_category entry_information _Entry.Sf_framecode entry_information +_Entry.sameline_comment value #ignore this all +_Entry.ID \".-!?\" +_Entry.Invalid_tag "This tag doesn't exist." +_Entry.Title +; Solution structure of chicken villin headpiece subdomain contain;ing a fluorinated side chain in the cores; +; +_Entry.Submi#ssion_date "check inn"er "quoted vals" +_Entry.Accession_date 'check inner quoted vals' +_Entry.Original_NMR_STAR_version '_.' + _Entry.Experimental_method $ + _Entry.Details "1#" + _Entry.Experimental_method_subtype solution + _Entry.BMRB_internal_directory_name ;data; +_Entry.pointer $it +_Entry.multi +; + + nothing + to shift +; +_Entry.multi2 +; + + ; + something + to shift +; +save_ +""" + #self.assertEqual(test_string, str(Entry.from_string(test_string))) diff --git a/pynmrstar/unit_tests/test_saveframe.py b/pynmrstar/unit_tests/test_saveframe.py new file mode 100644 index 0000000..43fdf5c --- /dev/null +++ b/pynmrstar/unit_tests/test_saveframe.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +import json +import os +import unittest +from copy import deepcopy as copy +from pathlib import Path + +from pynmrstar import Saveframe, Loop, definitions, Entry +from pynmrstar.exceptions import ParsingError + +our_path = os.path.dirname(os.path.realpath(__file__)) +sample_file_location = os.path.join(our_path, "sample_files", "bmr15000_3.str") +sample_saveframe_location = os.path.join(our_path, "sample_files", "saveframe.txt") +file_entry = Entry.from_file(sample_file_location) + + +class TestSaveframe(unittest.TestCase): + + def setUp(self): + self.file_entry = copy(file_entry) + self.maxDiff = None + + def test_odd_strings(self): + """ Make sure the library can handle odd strings. """ + + # Don't run the naughty strings test in GitHub, since it won't + # recursively checkout the "naughty strings" module on platforms + # other than linux. 
+        if "GITHUB_WORKFLOW" in os.environ:
+            return
+
+        saveframe = Saveframe.from_scratch('test', 'citations')
+        with open(os.path.join(our_path, 'naughty-strings/blns.json')) as odd_string_file:
+            odd_strings = json.load(odd_string_file)
+        for x, string in enumerate(odd_strings):
+            if string == '':
+                continue
+            saveframe.add_tag(str(x), string)
+
+        self.assertEqual(saveframe, Saveframe.from_string(str(saveframe)))
+
+    def test_from_file_path_support(self):
+        """Test that from_file methods support pathlib.Path objects."""
+
+        # Test Saveframe.from_file with Path object
+        saveframe_from_path = Saveframe.from_file(Path(sample_saveframe_location))
+        self.assertEqual(saveframe_from_path, self.file_entry[0])
+
+    def test_saveframe(self):
+        frame = self.file_entry[0]
+
+        # Check __delitem__
+        frame.remove_tag('DEtails')
+        self.assertEqual([[x[0], x[1]] for x in frame.tags],
+                         [['Sf_category', 'entry_information'],
+                          ['Sf_framecode', 'entry_information'],
+                          ['ID', '15000'],
+                          ['Title',
+                           'Solution structure of chicken villin headpiece subdomain containing a '
+                           'fluorinated side chain in the core\n'],
+                          ['Type', 'macromolecule'],
+                          ['Version_type', 'original'],
+                          ['Submission_date', '2006-09-07'],
+                          ['Accession_date', '2006-09-07'],
+                          ['Last_release_date', '2006-09-07'],
+                          ['Original_release_date', '2006-09-07'],
+                          ['Origination', 'author'],
+                          ['Format_name', '.'],
+                          ['NMR_STAR_version', '3.2.6.0'],
+                          ['NMR_STAR_dict_location', '.'],
+                          ['Original_NMR_STAR_version', '3.2.6.0'],
+                          ['Experimental_method', 'NMR'],
+                          ['Experimental_method_subtype', 'solution'],
+                          ['Source_data_format', '.'],
+                          ['Source_data_format_version', '.'],
+                          ['Generated_software_name', '.'],
+                          ['Generated_software_version', '.'],
+                          ['Generated_software_ID', '.'],
+                          ['Generated_software_label', '.'],
+                          ['Generated_date', '.'],
+                          ['DOI', '.'],
+                          ['UUID', '.'],
+                          ['Related_coordinate_file_name', '.'],
+                          ['BMRB_internal_directory_name', '.']])
+        self.assertEqual(len(frame), 7)
+        del frame[0]
+        self.assertEqual(len(frame), 6)
+        del frame[frame.get_loop('RElease')]
+        self.assertEqual(len(frame), 5)
+        self.assertRaises(KeyError, frame.get_loop, 'RElease')
+
+        # Check __getitem__
+        self.assertEqual(frame.get_tag('NMR_STAR_version'), ['3.2.6.0'])
+        self.assertEqual(frame[0], frame.loops[0])
+        self.assertEqual(frame.get_loop('_SG_project'), frame.loops[0])
+
+        # Check __lt__
+        self.assertEqual(frame[-3] > frame[-1], False)
+
+        # Check __init__
+        self.assertRaises(ValueError, Saveframe)
+        self.assertEqual(Saveframe.from_string(str(frame)), frame)
+        self.assertEqual(str(Saveframe.from_scratch("test", tag_prefix="test")), "save_test\n\nsave_\n")
+        tmp = copy(frame)
+        tmp._loops = []
+        self.assertEqual(Saveframe.from_string(frame.get_data_as_csv(frame), csv=True).compare(tmp), [])
+        self.assertRaises(ValueError, Saveframe.from_string, "test.1,test.2\n2,3,4", csv=True)
+
+        # Check __repr__
+        self.assertEqual(repr(frame), "<pynmrstar.Saveframe 'entry_information'>")
+
+        # Check __setitem__
+        frame['test'] = 1
+        self.assertEqual(frame.tags[-1][1], 1)
+        frame['tESt'] = 2
+        self.assertEqual(frame.tags[-1][1], 2)
+        frame[4] = frame[3]
+        self.assertEqual(frame.loops[3], frame.loops[4])
+
+        # Check add_loop
+        self.assertRaises(ValueError, frame.add_loop, frame.loops[0])
+
+        # Check add_tag
+        self.assertRaises(ValueError, frame.add_tag, "test", 1)
+        self.assertRaises(ValueError, frame.add_tag, "invalid test", 1)
+        self.assertRaises(ValueError, frame.add_tag, "invalid.test.test", 1)
+        self.assertRaises(ValueError, frame.add_tag, "invalid.test", 1, update=True)
+        frame.add_tag("test", 3,
update=True) + self.assertEqual(frame.get_tag('test'), [3]) + + # Check add_tags + frame.add_tags([['example1'], ['example2']]) + self.assertEqual(frame.tags[-2], ['example1', "."]) + frame.add_tags([['example1', 5], ['example2']], update=True) + self.assertEqual(frame.tags[-2], ['example1', 5]) + + # Check compare + self.assertEqual(frame.compare(frame), []) + self.assertEqual(frame.compare(self.file_entry[1]), + ["\tSaveframe names do not match: 'entry_information' vs 'citation_1'."]) + tmp = copy(frame) + tmp.tag_prefix = "test" + self.assertEqual(frame.compare(tmp), ["\tTag prefix does not match: '_Entry' vs 'test'."]) + tmp = copy(frame) + tmp.tags[0][0] = "broken" + self.assertEqual(frame.compare(tmp), ["\tNo tag with name '_Entry.Sf_category' in compared entry."]) + + # Test remove_tag + self.assertRaises(KeyError, frame.remove_tag, "this_tag_will_not_exist") + frame.remove_tag("test") + self.assertEqual(frame.get_tag("test"), []) + + # Test get_data_as_csv + self.assertEqual(frame.get_data_as_csv(), + '''_Entry.Sf_category,_Entry.Sf_framecode,_Entry.ID,_Entry.Title,_Entry.Type,_Entry.Version_type,_Entry.Submission_date,_Entry.Accession_date,_Entry.Last_release_date,_Entry.Original_release_date,_Entry.Origination,_Entry.Format_name,_Entry.NMR_STAR_version,_Entry.NMR_STAR_dict_location,_Entry.Original_NMR_STAR_version,_Entry.Experimental_method,_Entry.Experimental_method_subtype,_Entry.Source_data_format,_Entry.Source_data_format_version,_Entry.Generated_software_name,_Entry.Generated_software_version,_Entry.Generated_software_ID,_Entry.Generated_software_label,_Entry.Generated_date,_Entry.DOI,_Entry.UUID,_Entry.Related_coordinate_file_name,_Entry.BMRB_internal_directory_name,_Entry.example1,_Entry.example2 +entry_information,entry_information,15000,"Solution structure of chicken villin headpiece subdomain containing a fluorinated side chain in the core +",macromolecule,original,2006-09-07,2006-09-07,2006-09-07,2006-09-07,author,.,3.2.6.0,.,3.2.6.0,NMR,solution,.,.,.,.,.,.,.,.,.,.,.,5,. +''') + self.assertEqual(frame.get_data_as_csv(show_category=False), + '''Sf_category,Sf_framecode,ID,Title,Type,Version_type,Submission_date,Accession_date,Last_release_date,Original_release_date,Origination,Format_name,NMR_STAR_version,NMR_STAR_dict_location,Original_NMR_STAR_version,Experimental_method,Experimental_method_subtype,Source_data_format,Source_data_format_version,Generated_software_name,Generated_software_version,Generated_software_ID,Generated_software_label,Generated_date,DOI,UUID,Related_coordinate_file_name,BMRB_internal_directory_name,example1,example2 +entry_information,entry_information,15000,"Solution structure of chicken villin headpiece subdomain containing a fluorinated side chain in the core +",macromolecule,original,2006-09-07,2006-09-07,2006-09-07,2006-09-07,author,.,3.2.6.0,.,3.2.6.0,NMR,solution,.,.,.,.,.,.,.,.,.,.,.,5,. +''') + self.assertEqual(frame.get_data_as_csv(header=False), + '''entry_information,entry_information,15000,"Solution structure of chicken villin headpiece subdomain containing a fluorinated side chain in the core +",macromolecule,original,2006-09-07,2006-09-07,2006-09-07,2006-09-07,author,.,3.2.6.0,.,3.2.6.0,NMR,solution,.,.,.,.,.,.,.,.,.,.,.,5,. 
+''')
+        self.assertEqual(frame.get_data_as_csv(show_category=False, header=False),
+                         '''entry_information,entry_information,15000,"Solution structure of chicken villin headpiece subdomain containing a fluorinated side chain in the core
+",macromolecule,original,2006-09-07,2006-09-07,2006-09-07,2006-09-07,author,.,3.2.6.0,.,3.2.6.0,NMR,solution,.,.,.,.,.,.,.,.,.,.,.,5,.
+''')
+
+        # Test get_loop
+        self.assertEqual(repr(frame.get_loop("_SG_projecT")), "<pynmrstar.Loop '_SG_project'>")
+        self.assertRaises(KeyError, frame.get_loop, 'this_loop_wont_be_found')
+
+        # Test get_tag - this is really already tested in the other tests here
+        self.assertEqual(frame.get_tag("sf_category"), ['entry_information'])
+        self.assertEqual(frame.get_tag("entry.sf_category"), ['entry_information'])
+        self.assertEqual(frame.get_tag("entry.sf_category", whole_tag=True), [['Sf_category', 'entry_information']])
+
+        # Test sort
+        self.assertEqual([[x[0], x[1]] for x in frame.tags], [['Sf_category', 'entry_information'],
+                                                              ['Sf_framecode', 'entry_information'],
+                                                              ['ID', '15000'],
+                                                              ['Title',
+                                                               'Solution structure of chicken villin headpiece subdomain containing a '
+                                                               'fluorinated side chain in the core\n'],
+                                                              ['Type', 'macromolecule'],
+                                                              ['Version_type', 'original'],
+                                                              ['Submission_date', '2006-09-07'],
+                                                              ['Accession_date', '2006-09-07'],
+                                                              ['Last_release_date', '2006-09-07'],
+                                                              ['Original_release_date', '2006-09-07'],
+                                                              ['Origination', 'author'],
+                                                              ['Format_name', '.'],
+                                                              ['NMR_STAR_version', '3.2.6.0'],
+                                                              ['NMR_STAR_dict_location', '.'],
+                                                              ['Original_NMR_STAR_version', '3.2.6.0'],
+                                                              ['Experimental_method', 'NMR'],
+                                                              ['Experimental_method_subtype', 'solution'],
+                                                              ['Source_data_format', '.'],
+                                                              ['Source_data_format_version', '.'],
+                                                              ['Generated_software_name', '.'],
+                                                              ['Generated_software_version', '.'],
+                                                              ['Generated_software_ID', '.'],
+                                                              ['Generated_software_label', '.'],
+                                                              ['Generated_date', '.'],
+                                                              ['DOI', '.'],
+                                                              ['UUID', '.'],
+                                                              ['Related_coordinate_file_name', '.'],
+                                                              ['BMRB_internal_directory_name', '.'],
+                                                              ['example1', 5],
+                                                              ['example2', '.']])
+
+        frame.remove_tag(['example2', 'example1'])
+        frame.tags.append(frame.tags.pop(0))
+        frame.sort_tags()
+        self.assertEqual([[x[0], x[1]] for x in frame.tags], [['Sf_category', 'entry_information'],
+                                                              ['Sf_framecode', 'entry_information'],
+                                                              ['ID', '15000'],
+                                                              ['Title',
+                                                               'Solution structure of chicken villin headpiece subdomain containing a '
+                                                               'fluorinated side chain in the core\n'],
+                                                              ['Type', 'macromolecule'],
+                                                              ['Version_type', 'original'],
+                                                              ['Submission_date', '2006-09-07'],
+                                                              ['Accession_date', '2006-09-07'],
+                                                              ['Last_release_date', '2006-09-07'],
+                                                              ['Original_release_date', '2006-09-07'],
+                                                              ['Origination', 'author'],
+                                                              ['Format_name', '.'],
+                                                              ['NMR_STAR_version', '3.2.6.0'],
+                                                              ['NMR_STAR_dict_location', '.'],
+                                                              ['Original_NMR_STAR_version', '3.2.6.0'],
+                                                              ['Experimental_method', 'NMR'],
+                                                              ['Experimental_method_subtype', 'solution'],
+                                                              ['Source_data_format', '.'],
+                                                              ['Source_data_format_version', '.'],
+                                                              ['Generated_software_name', '.'],
+                                                              ['Generated_software_version', '.'],
+                                                              ['Generated_software_ID', '.'],
+                                                              ['Generated_software_label', '.'],
+                                                              ['Generated_date', '.'],
+                                                              ['DOI', '.'],
+                                                              ['UUID', '.'],
+                                                              ['Related_coordinate_file_name', '.'],
+                                                              ['BMRB_internal_directory_name', '.']])
+
+        # Test validate
+        self.assertEqual(self.file_entry['assigned_chem_shift_list_1'].validate(), [])
+
+        # Test set_tag_prefix
+        frame.set_tag_prefix("new_prefix")
+        self.assertEqual(frame.tag_prefix, "_new_prefix")
+
+    def test_Saveframe_add_tag(self):
+        """ Test the add_tag functionality of a saveframe.
""" + + # Test that you cannot set the framecode to a null value + test_sf = Saveframe.from_scratch('test') + + # Test that the initial setter can't set a null value + with self.assertRaises(ValueError): + test_sf.add_tag('sf_framecode', None) + test_sf.add_tag('sf_framecode', 'test') + + # Test that updating both via add_tag(update=True) and .name= don't + # allow for setting a null value + for val in definitions.NULL_VALUES: + with self.assertRaises(ValueError): + test_sf.add_tag('sf_framecode', val) + with self.assertRaises(ValueError): + test_sf.name = val + + # Test that adding an sf_framecode with a different value than the + # saveframe name throws an exception + with self.assertRaises(ValueError): + test_sf_two = Saveframe.from_scratch('test') + test_sf_two.add_tag('sf_framecode', 'different') + + def test_duplicate_loop_detection(self): + one = Loop.from_scratch(category="duplicate") + two = Loop.from_scratch(category="duplicate") + frame = Saveframe.from_scratch('1') + frame.add_loop(one) + self.assertRaises(ValueError, frame.add_loop, two) diff --git a/pynmrstar/unit_tests/test_schema.py b/pynmrstar/unit_tests/test_schema.py new file mode 100644 index 0000000..ee3ce3f --- /dev/null +++ b/pynmrstar/unit_tests/test_schema.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +import unittest + +from pynmrstar import Schema + + +class TestSchema(unittest.TestCase): + + def test_Schema(self): + default = Schema() + + self.assertEqual(default.headers, + ['Dictionary sequence', 'SFCategory', 'ADIT category mandatory', 'ADIT category view type', + 'ADIT super category ID', 'ADIT super category', 'ADIT category group ID', + 'ADIT category view name', 'Tag', 'BMRB current', 'Query prompt', 'Query interface', + 'SG Mandatory', '', 'ADIT exists', 'User full view', 'Metabolomics', 'Metabolites', 'SENCI', + 'Fragment library', 'Item enumerated', 'Item enumeration closed', 'Enum parent SFcategory', + 'Enum parent tag', 'Derived enumeration mantable', 'Derived enumeration', + 'ADIT item view name', 'Data Type', 'Nullable', 'Non-public', 'ManDBTableName', + 'ManDBColumnName', 'Row Index Key', 'Saveframe ID tag', 'Source Key', 'Table Primary Key', + 'Foreign Key Group', 'Foreign Table', 'Foreign Column', 'Secondary index', 'Sub category', + 'Units', 'Loopflag', 'Seq', 'Adit initial rows', 'Enumeration ties', + 'Mandatory code overides', 'Overide value', 'Overide view value', 'ADIT auto insert', + 'Example', 'Prompt', 'Interface', 'bmrbPdbMatchID', 'bmrbPdbTransFunc', 'STAR flag', + 'DB flag', 'SfNamelFlg', 'Sf category flag', 'Sf pointer', 'Natural primary key', + 'Natural foreign key', 'Redundant keys', 'Parent tag', 'public', 'internal', 'small molecule', + 'small molecule', 'metabolomics', 'Entry completeness', 'Overide public', 'internal', + 'small molecule', 'small molecule', 'metabolomic', 'metabolomic', 'default value', + 'Adit form code', 'Tag category', 'Tag field', 'Local key', 'Datum count flag', + 'NEF equivalent', 'mmCIF equivalent', 'Meta data', 'Tag delete', 'BMRB data type', + 'STAR vs Curated DB', 'Key group', 'Reference table', 'Reference column', + 'Dictionary description', 'variableTypeMatch', 'entryIdFlg', 'outputMapExistsFlg', + 'lclSfIdFlg', 'Met ADIT category view name', 'Met Example', 'Met Prompt', 'Met Description', + 'SM Struct ADIT-NMR category view name', 'SM Struct Example', 'SM Struct Prompt', + 'SM Struct Description', 'Met default value', 'SM default value']) + + self.assertEqual(default.val_type("_Entity.ID", 1), []) + self.assertEqual(default.val_type("_Entity.ID", 
"test"), [ + "Value does not match specification: '_Entity.ID':'test'.\n Type specified: int\n " + "Regular expression for type: '^(?:-?[0-9]*)?$'"]) + self.assertEqual(default.val_type("_Atom_chem_shift.Val", float(1.2)), []) + self.assertEqual(default.val_type("_Atom_chem_shift.Val", "invalid"), [ + "Value does not match specification: '_Atom_chem_shift.Val':'invalid'.\n Type " + "specified: float\n Regular expression for type: '^(?:-?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?)?$'"]) + + self.assertEqual(default.val_type("_Entry.ID", "this should be far too long - much too long"), [ + "Length of '43' is too long for 'CHAR(12)': '_Entry.ID':'this should be far too long - much too long'."]) diff --git a/pynmrstar/unit_tests/test_utils.py b/pynmrstar/unit_tests/test_utils.py new file mode 100644 index 0000000..421c883 --- /dev/null +++ b/pynmrstar/unit_tests/test_utils.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +import os +import unittest + +from pynmrstar import utils, definitions, Entry +from pynmrstar._internal import _interpret_file + +our_path = os.path.dirname(os.path.realpath(__file__)) +database_entry = Entry.from_database(15000) +sample_file_location = os.path.join(our_path, "sample_files", "bmr15000_3.str") + + +class TestUtils(unittest.TestCase): + + def test_clean_val(self): + # Check tag cleaning + self.assertEqual(utils.quote_value("single quote test"), "'single quote test'") + self.assertEqual(utils.quote_value("double quote' test"), '"double quote\' test"') + self.assertEqual(utils.quote_value("loop_"), "'loop_'") + self.assertEqual(utils.quote_value("#comment"), "'#comment'") + self.assertEqual(utils.quote_value("_tag"), "'_tag'") + self.assertEqual(utils.quote_value("simple"), "simple") + self.assertEqual(utils.quote_value(" "), "' '") + self.assertEqual(utils.quote_value("\nnewline\n"), "\nnewline\n") + self.assertEqual(utils.quote_value(None), ".") + self.assertRaises(ValueError, utils.quote_value, "") + + definitions.STR_CONVERSION_DICT = {"loop_": "noloop_"} + utils.quote_value.cache_clear() + self.assertEqual(utils.quote_value("loop_"), "noloop_") + definitions.STR_CONVERSION_DICT = {None: "."} + + def test__format_category(self): + self.assertEqual(utils.format_category("test"), "_test") + self.assertEqual(utils.format_category("_test"), "_test") + self.assertEqual(utils.format_category("test.test"), "_test") + + def test__format_tag(self): + self.assertEqual(utils.format_tag("test"), "test") + self.assertEqual(utils.format_tag("_test.test"), "test") + self.assertEqual(utils.format_tag("test.test"), "test") + + def test__InterpretFile(self): + with open(sample_file_location, "r") as local_file: + local_version = local_file.read() + + # Test reading file from local locations + self.assertEqual(_interpret_file(sample_file_location).read(), local_version) + with open(sample_file_location, "rb") as tmp: + self.assertEqual(_interpret_file(tmp).read(), local_version) + with open(os.path.join(our_path, "sample_files", "bmr15000_3.str.gz"), "rb") as tmp: + self.assertEqual(_interpret_file(tmp).read(), local_version) + + # Test reading from http (ftp doesn't work on TravisCI) + entry_url = 'https://bmrb.io/ftp/pub/bmrb/entry_directories/bmr15000/bmr15000_3.str' + self.assertEqual(Entry.from_string(_interpret_file(entry_url).read()), database_entry) + + # Test reading from https locations + raw_api_url = "https://api.bmrb.io/v2/entry/15000?format=rawnmrstar" + self.assertEqual(Entry.from_string(_interpret_file(raw_api_url).read()), database_entry) diff --git a/pynmrstar/utils.py 
b/pynmrstar/utils.py index 1c38824..b1f2f98 100755 --- a/pynmrstar/utils.py +++ b/pynmrstar/utils.py @@ -7,7 +7,9 @@ from typing import Iterable, Any, Dict from urllib.error import HTTPError, URLError -from pynmrstar import definitions, cnmrstar, entry as entry_mod +import pynmrstar_parser + +from pynmrstar import definitions, entry as entry_mod from pynmrstar._internal import _interpret_file from pynmrstar.schema import Schema @@ -111,17 +113,12 @@ def quote_value(value: Any) -> str: This will automatically be called on all values when you use a str() method (so don't call it before inserting values into tags or loops). - Be mindful of the value of STR_CONVERSION_DICT as it will effect the + Be mindful of the value of STR_CONVERSION_DICT as it will affect the way the value is converted to a string. """ - # Allow manual specification of conversions for booleans, Nones, etc. - if value in definitions.STR_CONVERSION_DICT: - if any(isinstance(value, type(x)) for x in definitions.STR_CONVERSION_DICT): - value = definitions.STR_CONVERSION_DICT[value] - - return cnmrstar.quote_value(value) + return pynmrstar_parser.quote_value(value, definitions.STR_CONVERSION_DICT) def validate(entry_to_validate: 'entry_mod.Entry', schema: 'Schema' = None) -> None: diff --git a/pyproject.toml b/pyproject.toml index e79be09..f43d457 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,48 +1,49 @@ -[build-system] -requires = ["setuptools>=42", "wheel"] -build-backend = "setuptools.build_meta" - -[project] -name = "pynmrstar" -dynamic = ["version"] -description = "PyNMR-STAR provides tools for reading, writing, modifying, and interacting with NMR-STAR files. Maintained by the BMRB." -readme = "README.rst" -requires-python = ">3.7" -license = {text = "MIT"} -authors = [ - {name = "Jon Wedell", email = "wedell@uchc.edu"} -] -keywords = ["bmrb", "parser", "nmr", "nmrstar", "biomagresbank", "biological magnetic resonance bank"] -classifiers = [ - "Development Status :: 6 - Mature", - "Environment :: Console", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", - "Natural Language :: English", - "Operating System :: MacOS", - "Operating System :: POSIX :: Linux", - "Operating System :: Microsoft :: Windows", - "Topic :: Scientific/Engineering :: Bio-Informatics", - "Topic :: Software Development :: Libraries", - "Topic :: Software Development :: Libraries :: Python Modules" -] -dependencies = [ - "requests>=2.21.0,<=3" -] - -[project.urls] +[build-system] +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" + +[project] +name = "pynmrstar" +version = "3.4.0b1" +description = "PyNMR-STAR provides tools for reading, writing, modifying, and interacting with NMR-STAR files. Maintained by the BMRB." 
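+# Note: the version above is declared statically rather than via the previous
+# setuptools dynamic lookup of pynmrstar._internal.__version__, since the
+# maturin backend does not evaluate Python attributes at build time.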
+readme = "README.rst" +requires-python = ">3.7" +license = {text = "MIT"} +authors = [ + {name = "Jon Wedell", email = "wedell@uchc.edu"} +] +keywords = ["bmrb", "parser", "nmr", "nmrstar", "biomagresbank", "biological magnetic resonance bank"] +classifiers = [ + "Development Status :: 6 - Mature", + "Environment :: Console", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Operating System :: MacOS", + "Operating System :: POSIX :: Linux", + "Operating System :: Microsoft :: Windows", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules" +] +dependencies = [ + "requests>=2.21.0,<=3" +] + +[project.urls] Homepage = "https://github.com/bmrb-io/PyNMRSTAR" - -[tool.setuptools] -packages = ["pynmrstar"] -package-data = {pynmrstar = ["reference_files/schema.csv", "reference_files/comments.str", "reference_files/data_types.csv"]} - -[tool.setuptools.dynamic] -version = {attr = "pynmrstar._internal.__version__"} + +[tool.setuptools] +packages = ["pynmrstar"] +package-data = {pynmrstar = ["reference_files/schema.csv", "reference_files/comments.str", "reference_files/data_types.csv"]} + +[tool.maturin] +module-name = "pynmrstar_parser" diff --git a/setup.py b/setup.py deleted file mode 100644 index 29a35ad..0000000 --- a/setup.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python3 - -import os -from setuptools import setup, Extension - - -def get_version(): - internal_file_location = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'pynmrstar', '_internal.py') - - with open(internal_file_location, 'r') as internal_file: - for line in internal_file: - if line.startswith('__version__'): - delim = '"' if '"' in line else "'" - return line.split(delim)[1] - else: - raise RuntimeError("Unable to find version string.") - - -# Should fail if the readme is missing -long_des = open('README.rst', 'r').read() - -cnmrstar = Extension('cnmrstar', - sources=['c/cnmrstarmodule.c'], - extra_compile_args=["-funroll-loops", "-O3"], - optional=True) - -setup(name='pynmrstar', - version=get_version(), - packages=['pynmrstar'], - ext_modules=[cnmrstar], - install_requires=['requests>=2.21.0,<=3'], - python_requires='>=3.7', - author='Jon Wedell', - author_email='wedell@uchc.edu', - description='PyNMR-STAR provides tools for reading, writing, modifying, and interacting with NMR-STAR files. 
'
-                  'Maintained by the BMRB.',
-      long_description=long_des,
-      long_description_content_type='text/x-rst',
-      keywords=['bmrb', 'parser', 'nmr', 'nmrstar', 'biomagresbank', 'biological magnetic resonance bank'],
-      url='https://github.com/uwbmrb/PyNMRSTAR',
-      license='MIT',
-      package_data={'pynmrstar': ['reference_files/schema.csv',
-                                  'reference_files/comments.str',
-                                  'reference_files/data_types.csv']},
-      classifiers=[
-          'Development Status :: 6 - Mature',
-          'Environment :: Console',
-          'Programming Language :: Python :: 3 :: Only',
-          'Programming Language :: Python :: 3.8',
-          'Programming Language :: Python :: 3.9',
-          'Programming Language :: Python :: 3.10',
-          'Programming Language :: Python :: 3.11',
-          'Programming Language :: Python :: 3.12',
-          'Programming Language :: Python :: 3.13',
-          'Intended Audience :: Developers',
-          'License :: OSI Approved :: MIT License',
-          'Natural Language :: English',
-          'Operating System :: MacOS',
-          'Operating System :: POSIX :: Linux',
-          'Operating System :: Microsoft :: Windows',
-          'Topic :: Scientific/Engineering :: Bio-Informatics',
-          'Topic :: Software Development :: Libraries',
-          'Topic :: Software Development :: Libraries :: Python Modules'
-      ]
-      )
diff --git a/src/accelerators.rs b/src/accelerators.rs
new file mode 100644
index 0000000..964b376
--- /dev/null
+++ b/src/accelerators.rs
@@ -0,0 +1,263 @@
+use pyo3::prelude::*;
+use pyo3::exceptions::PyValueError;
+
+use crate::utils::quote_value_str;
+
+/// Format a saveframe in NMR-STAR format.
+/// Comments are handled by Python; this focuses on the heavy lifting of tag/loop formatting.
+#[pyfunction]
+#[pyo3(signature = (name, tag_prefix, tags, formatted_loops, skip_empty_tags=false, str_conversion_dict=None, null_values=None))]
+pub fn format_saveframe(
+    _py: Python,
+    name: &str,
+    tag_prefix: &str,
+    tags: Vec<Vec<Bound<'_, PyAny>>>, // List of [tag_name, tag_value]
+    formatted_loops: Vec<String>,
+    skip_empty_tags: bool,
+    str_conversion_dict: Option<&Bound<'_, PyAny>>,
+    null_values: Option<&Bound<'_, PyAny>>,
+) -> PyResult<String> {
+    // Estimate capacity for result string
+    let estimated_size = 100 + tags.len() * (tag_prefix.len() + 50)
+        + formatted_loops.iter().map(|s| s.len()).sum::<usize>();
+    let mut result = String::with_capacity(estimated_size);
+
+    // Print the saveframe header
+    result.push_str("save_");
+    result.push_str(name);
+    result.push('\n');
+
+    // Print the tags if there are any
+    if !tags.is_empty() {
+        // Calculate maximum tag width for formatting
+        let mut max_width = 0;
+        for tag in &tags {
+            if tag.is_empty() {
+                continue;
+            }
+            let tag_name: String = tag[0].extract().unwrap_or_default();
+            let full_tag_len = tag_prefix.len() + 1 + tag_name.len();
+            if full_tag_len > max_width {
+                max_width = full_tag_len;
+            }
+        }
+
+        // Process and print each tag
+        for tag in &tags {
+            if tag.len() < 2 {
+                continue;
+            }
+
+            let tag_name: String = tag[0].extract().unwrap_or_default();
+            let tag_value = &tag[1];
+
+            // Skip empty tags if requested
+            if skip_empty_tags {
+                if let Some(null_set) = null_values {
+                    if null_set.contains(tag_value).unwrap_or(false) {
+                        continue;
+                    }
+                }
+            }
+
+            // Apply STR_CONVERSION_DICT if provided
+            let converted_value: std::borrow::Cow<'_, Bound<'_, PyAny>> = if let Some(conv_dict) = str_conversion_dict {
+                if conv_dict.contains(tag_value)? {
+                    std::borrow::Cow::Owned(conv_dict.get_item(tag_value)?)
+                } else {
+                    std::borrow::Cow::Borrowed(tag_value)
+                }
+            } else {
+                std::borrow::Cow::Borrowed(tag_value)
+            };
+
+            // Convert to string
+            let string_val: String = if converted_value.is_none() {
+                ".".to_string()
+            } else {
+                converted_value.str()?.to_str()?.to_string()
+            };
+
+            // Quote the value
+            let quoted = if string_val.is_empty() {
+                return Err(PyValueError::new_err(format!(
+                    "Cannot generate NMR-STAR for entry, as empty strings are not valid tag values in NMR-STAR. Please either replace the empty strings with None objects, or set pynmrstar.definitions.STR_CONVERSION_DICT[''] = None. Saveframe: {} Tag: {}",
+                    name, tag_name
+                )));
+            } else {
+                quote_value_str(&string_val)
+            };
+
+            // Format the tag
+            let formatted_tag = format!("{}.{}", tag_prefix, tag_name);
+
+            if quoted.contains('\n') {
+                // Multiline value format
+                result.push_str("   ");
+                result.push_str(&formatted_tag);
+                // Pad to max_width
+                for _ in formatted_tag.len()..max_width {
+                    result.push(' ');
+                }
+                result.push_str("\n;\n");
+                result.push_str(&quoted);
+                result.push_str(";\n");
+            } else {
+                // Single line format
+                result.push_str("   ");
+                result.push_str(&formatted_tag);
+                // Pad to max_width + 2 spaces
+                for _ in formatted_tag.len()..max_width {
+                    result.push(' ');
+                }
+                result.push_str("  ");
+                result.push_str(&quoted);
+                result.push('\n');
+            }
+        }
+    }
+
+    // Append all formatted loops
+    for loop_str in formatted_loops {
+        result.push_str(&loop_str);
+    }
+
+    // Close the saveframe
+    result.push_str("\nsave_\n");
+
+    Ok(result)
+}
+
+/// Format a loop in NMR-STAR format.
+/// This is the performance-critical function that formats all the data in a loop.
+#[pyfunction]
+#[pyo3(signature = (tags, category, data, skip_empty_loops=false, str_conversion_dict=None))]
+pub fn format_loop(
+    _py: Python,
+    tags: Vec<String>,
+    category: &str,
+    data: Vec<Vec<Bound<'_, PyAny>>>,
+    skip_empty_loops: bool,
+    str_conversion_dict: Option<&Bound<'_, PyAny>>,
+) -> PyResult<String> {
+    // Handle empty data case
+    if data.is_empty() {
+        if skip_empty_loops {
+            return Ok(String::new());
+        } else {
+            if tags.is_empty() {
+                return Ok("\n   loop_\n\n   stop_\n".to_string());
+            }
+            // Fall through to print tags with no data
+        }
+    }
+
+    if tags.is_empty() && !data.is_empty() {
+        return Err(PyValueError::new_err(format!(
+            "Impossible to print data if there are no associated tags. Error in loop '{}' which contains data but hasn't had any tags added.",
+            category
+        )));
+    }
+
+    // Pre-allocate for the result
+    // Estimate: header (~50) + tags (category.len + tag.len + 10 per tag) + data
+    let estimated_size = 100 + tags.len() * (category.len() + 20) + data.len() * tags.len() * 15;
+    let mut result = String::with_capacity(estimated_size);
+
+    // Start the loop
+    result.push_str("\n   loop_\n");
+
+    // Print the tags
+    for tag in &tags {
+        result.push_str("      ");
+        result.push_str(category);
+        result.push('.');
+        result.push_str(tag);
+        result.push('\n');
+    }
+    result.push('\n');
+
+    if data.is_empty() {
+        result.push_str("\n   stop_\n");
+        return Ok(result);
+    }
+
+    // Convert and quote all data, tracking column widths
+    let num_cols = tags.len();
+    let mut quoted_data: Vec<Vec<String>> = Vec::with_capacity(data.len());
+    let mut col_widths: Vec<usize> = vec![4; num_cols]; // minimum width of 4
+
+    for (row_idx, row) in data.iter().enumerate() {
+        let mut quoted_row: Vec<String> = Vec::with_capacity(num_cols);
+
+        for (col_idx, cell) in row.iter().enumerate() {
+            // Apply STR_CONVERSION_DICT if provided
+            let converted_cell: std::borrow::Cow<'_, Bound<'_, PyAny>> = if let Some(conv_dict) = str_conversion_dict {
+                // Check if the cell value is a key in the conversion dict
+                if conv_dict.contains(cell)? {
+                    // Get the converted value
+                    std::borrow::Cow::Owned(conv_dict.get_item(cell)?)
+                } else {
+                    std::borrow::Cow::Borrowed(cell)
+                }
+            } else {
+                std::borrow::Cow::Borrowed(cell)
+            };
+
+            // Convert to string, handling None specially (default conversion)
+            let string_val: String = if converted_cell.is_none() {
+                ".".to_string()
+            } else {
+                converted_cell.str()?.to_str()?.to_string()
+            };
+
+            // Quote the value
+            let quoted = if string_val.is_empty() {
+                // Empty strings are not allowed - return error
+                return Err(PyValueError::new_err(format!(
+                    "Cannot generate NMR-STAR for entry, as empty strings are not valid tag values in NMR-STAR. Please either replace the empty strings with None objects, or set pynmrstar.definitions.STR_CONVERSION_DICT[''] = None.\nLoop: {} Row: {} Column: {}",
+                    category, row_idx, col_idx
+                )));
+            } else {
+                quote_value_str(&string_val)
+            };
+
+            // Track width (but not for multiline values)
+            if !quoted.contains('\n') {
+                let width = quoted.len() + 3; // +3 for spacing
+                if width > col_widths[col_idx] {
+                    col_widths[col_idx] = width;
+                }
+            }
+
+            quoted_row.push(quoted);
+        }
+        quoted_data.push(quoted_row);
+    }
+
+    // Print the data rows
+    for row in &quoted_data {
+        result.push_str("     ");
+        for (col_idx, val) in row.iter().enumerate() {
+            if val.contains('\n') {
+                // Multiline value - format specially
+                result.push_str("\n;\n");
+                result.push_str(val);
+                result.push_str(";\n");
+            } else {
+                // Pad to column width
+                result.push_str(val);
+                let padding = col_widths[col_idx].saturating_sub(val.len());
+                for _ in 0..padding {
+                    result.push(' ');
+                }
+            }
+        }
+        result.push_str(" \n");
+    }
+
+    // Close the loop
+    result.push_str("\n   stop_\n");
+
+    Ok(result)
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..d47da42
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,14 @@
+use pyo3::prelude::*;
+
+mod accelerators;
+mod parser;
+mod utils;
+
+#[pymodule]
+fn pynmrstar_parser(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_function(wrap_pyfunction!(parser::parse, m)?)?;
+    m.add_function(wrap_pyfunction!(utils::quote_value, m)?)?;
+    m.add_function(wrap_pyfunction!(accelerators::format_loop, m)?)?;
+    m.add_function(wrap_pyfunction!(accelerators::format_saveframe, m)?)?;
+    Ok(())
+}
diff --git a/src/parser.rs b/src/parser.rs
new file mode 100644
index 0000000..fda41cc
--- /dev/null
+++ b/src/parser.rs
@@ -0,0 +1,931 @@
+use pyo3::prelude::*;
+use pyo3::types::{IntoPyDict, PyDict};
+use pyo3::import_exception;
+
+use crate::utils::{fix_multiline_semicolons, is_reserved_keyword, starts_with_ignore_case};
+
+// Import the ParsingError exception from pynmrstar.exceptions
+import_exception!(pynmrstar.exceptions, ParsingError);
+
+// Tokenizer state
+pub struct TokenizerState {
+    pub full_data: String,
+    index: usize,
+    pub line_no: usize,
+    pub last_delimiter: char,
+}
+
+impl TokenizerState {
+    pub fn new() -> Self {
+        TokenizerState {
+            full_data: String::new(),
+            index: 0,
+            line_no: 0,
+            last_delimiter: ' ',
+        }
+    }
+
+    fn reset(&mut self) {
+        self.full_data.clear();
+        self.index = 0;
+        self.line_no = 0;
+        self.last_delimiter = ' ';
+    }
+
+    pub fn load_string(&mut self, data: String) {
+        self.reset();
+        self.full_data = data;
+    }
+
+    fn is_whitespace(c: char) -> bool {
+        matches!(c, ' ' | '\n' | '\t' | '\x0B')
+    }
+
+    fn pass_whitespace(&mut self) {
+        let bytes = self.full_data.as_bytes();
+        while self.index < bytes.len() {
+            let c = bytes[self.index] as char;
+            if Self::is_whitespace(c) {
+                if c == '\n' {
+                    self.line_no += 1;
+                }
+                self.index += 1;
+            } else {
+                break;
+            }
+        }
+    }
+
+    fn check_multiline(&self, length: usize) -> bool {
+        let end = (self.index + length).min(self.full_data.len());
+        let bytes = self.full_data.as_bytes();
+        for i in self.index..end {
+            if bytes[i] == b'\n' {
+                return true;
+            }
+        }
+        false
+    }
+
+    fn update_line_number(&mut self, start_pos: usize, length: usize) {
+        let end = (start_pos + length).min(self.full_data.len());
+        let bytes = self.full_data.as_bytes();
+        for i in start_pos..end {
+            if bytes[i] == b'\n' {
+                self.line_no += 1;
+            }
+        }
+    }
+
+    fn find_substring(&self, needle: &str, start_pos: usize) -> Option<usize> {
+        if start_pos >= self.full_data.len() {
+            return None;
+        }
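+        // Note: the memchr crate is already a dependency (utils.rs pulls in
+        // memchr_iter), so the single-byte fast path below could likely be written
+        // as memchr::memchr(needle.as_bytes()[0], &self.full_data.as_bytes()[start_pos..]);
+        // the hand-rolled loops are kept here for clarity.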
+
+        // Optimize single-byte searches to work directly with bytes
+        if needle.len() == 1 {
+            let needle_byte = needle.as_bytes()[0];
+            let bytes = self.full_data.as_bytes();
+            for i in start_pos..bytes.len() {
+                if bytes[i] == needle_byte {
+                    return Some(i - start_pos);
+                }
+            }
+            return None;
+        }
+
+        // Optimize two-byte searches (e.g., "\n;")
+        if needle.len() == 2 {
+            let needle_bytes = needle.as_bytes();
+            let bytes = self.full_data.as_bytes();
+            for i in start_pos..bytes.len().saturating_sub(1) {
+                if bytes[i] == needle_bytes[0] && bytes[i + 1] == needle_bytes[1] {
+                    return Some(i - start_pos);
+                }
+            }
+            return None;
+        }
+
+        // Fall back to str::find for longer patterns
+        self.full_data[start_pos..].find(needle)
+    }
+
+    fn get_next_whitespace(&self, start_pos: usize) -> usize {
+        let bytes = self.full_data.as_bytes();
+        let mut pos = start_pos;
+        while pos < bytes.len() {
+            if Self::is_whitespace(bytes[pos] as char) {
+                return pos;
+            }
+            pos += 1;
+        }
+        pos
+    }
+
+    pub fn get_token(&mut self) -> Result<Option<(usize, usize)>, String> {
+        // Reset delimiter
+        self.last_delimiter = '?';
+
+        // Skip whitespace
+        self.pass_whitespace();
+
+        // Check if we're at the end
+        if self.index >= self.full_data.len() {
+            return Ok(None);
+        }
+
+        let bytes = self.full_data.as_bytes();
+
+        // Handle comments
+        if bytes[self.index] == b'#' {
+            if let Some(length) = self.find_substring("\n", self.index) {
+                let start = self.index;
+                let end = self.index + length;
+                self.last_delimiter = '#';
+                self.update_line_number(self.index, length + 1);
+                self.index += length + 1;
+                return Ok(Some((start, end)));
+            } else {
+                // Comment at end of file with no newline
+                return Ok(None);
+            }
+        }
+
+        // Handle multiline values (semicolon-delimited)
+        if self.index + 1 < bytes.len() && bytes[self.index] == b';' && bytes[self.index + 1] == b'\n' {
+            if let Some(length) = self.find_substring("\n;", self.index) {
+                // We started with a newline so count it
+                self.line_no += 1;
+                self.index += 2;
+
+                let start = self.index;
+                let end = self.index + length - 1;
+                self.last_delimiter = ';';
+                self.update_line_number(self.index, length);
+                self.index += length;
+                return Ok(Some((start, end)));
+            } else {
+                return Err(format!("Invalid file. Semicolon-delineated value was not terminated. Error on line: {}", self.line_no + 1));
+            }
+        }
+
+        // Handle single-quoted values
+        if bytes[self.index] == b'\'' {
+            if let Some(mut end_quote) = self.find_substring("'", self.index + 1) {
+                // Make sure we don't stop for quotes not followed by whitespace
+                loop {
+                    let absolute_quote_pos = self.index + end_quote + 1;
+                    if absolute_quote_pos + 1 < bytes.len() && !Self::is_whitespace(bytes[absolute_quote_pos + 1] as char) {
+                        // Search for next quote starting after this one
+                        if let Some(next_relative_idx) = self.find_substring("'", absolute_quote_pos + 1) {
+                            // Update end_quote to be relative to self.index + 1 (after opening quote)
+                            end_quote = (absolute_quote_pos - self.index - 1) + next_relative_idx + 1;
+                        } else {
+                            return Err("Invalid file. Single quoted value was never terminated at end of file.".to_string());
+                        }
+                    } else {
+                        break;
+                    }
+                }
+
+                // Check for newlines
+                if self.check_multiline(end_quote + 1) {
+                    return Err(format!("Invalid file. Single quoted value was not terminated on the same line it began. Error on line: {}", self.line_no + 1));
+                }
+
+                self.index += 1;
+                let start = self.index;
+                let end = self.index + end_quote;
+                self.last_delimiter = '\'';
+                self.update_line_number(self.index, end_quote + 1);
+                self.index += end_quote + 1;
+                return Ok(Some((start, end)));
+            } else {
+                return Err(format!("Invalid file. Single quoted value was not terminated. Error on line: {}", self.line_no + 1));
+            }
+        }
+
+        // Handle double-quoted values
+        if bytes[self.index] == b'"' {
+            if let Some(mut end_quote) = self.find_substring("\"", self.index + 1) {
+                // Make sure we don't stop for quotes not followed by whitespace
+                loop {
+                    let absolute_quote_pos = self.index + end_quote + 1;
+                    if absolute_quote_pos + 1 < bytes.len() && !Self::is_whitespace(bytes[absolute_quote_pos + 1] as char) {
+                        // Search for next quote starting after this one
+                        if let Some(next_relative_idx) = self.find_substring("\"", absolute_quote_pos + 1) {
+                            // Update end_quote to be relative to self.index + 1 (after opening quote)
+                            end_quote = (absolute_quote_pos - self.index - 1) + next_relative_idx + 1;
+                        } else {
+                            return Err("Invalid file. Double quoted value was never terminated at end of file.".to_string());
+                        }
+                    } else {
+                        break;
+                    }
+                }
+
+                // Check for newlines
+                if self.check_multiline(end_quote + 1) {
+                    return Err(format!("Invalid file. Double quoted value was not terminated on the same line it began. Error on line: {}", self.line_no + 1));
+                }
+
+                self.index += 1;
+                let start = self.index;
+                let end = self.index + end_quote;
+                self.last_delimiter = '"';
+                self.update_line_number(self.index, end_quote + 1);
+                self.index += end_quote + 1;
+                return Ok(Some((start, end)));
+            } else {
+                return Err(format!("Invalid file. Double quoted value was not terminated. Error on line: {}", self.line_no + 1));
+            }
+        }
+
+        // Handle normal unquoted tokens
+        let end_pos = self.get_next_whitespace(self.index);
+        let start = self.index;
+        let end = end_pos;
+
+        // Unquoted tokens always use the space delimiter
+        self.last_delimiter = ' ';
+
+        // Check if it's a reference (starts with $)
+        let token_slice = &self.full_data[start..end];
+        if token_slice.starts_with('$') && token_slice.len() > 1 {
+            self.last_delimiter = '$';
+        }
+
+        self.update_line_number(self.index, end_pos - self.index + 1);
+        self.index = end_pos + 1;
+        Ok(Some((start, end)))
+    }
+}
+
+// Represents a token either as indices into tokenizer.full_data or as a materialized string
+// (for rare processed tokens from embedded STAR format)
+enum TokenValue {
+    Indexed(usize, usize),  // Indices into ctx.tokenizer.full_data
+    Materialized(String),   // Pre-materialized string (< 0.1% of tokens)
+}
+
+impl TokenValue {
+    fn to_string(&self, full_data: &str) -> String {
+        match self {
+            TokenValue::Indexed(start, end) => full_data[*start..*end].to_string(),
+            TokenValue::Materialized(s) => s.clone(),
+        }
+    }
+
+    fn as_str<'a>(&'a self, full_data: &'a str) -> std::borrow::Cow<'a, str> {
+        match self {
+            TokenValue::Indexed(start, end) => std::borrow::Cow::Borrowed(&full_data[*start..*end]),
+            TokenValue::Materialized(s) => std::borrow::Cow::Borrowed(s),
+        }
+    }
+}
+
+struct LoopStatistics {
+    total_data_items: usize,
+    loop_count: usize,
+}
+
+struct ParserContext {
+    tokenizer: TokenizerState,
+    line_number: usize,
+    delimiter: char,
+    token: Option<(usize, usize)>,
+    processed_token: Option<String>, // For rare cases where we need to process the token
+    entry: Py<PyAny>,
+    current_saveframe: Option<Py<PyAny>>,
+    current_loop: Option<Py<PyAny>>,
+    loop_data: Vec<TokenValue>, // Store indices instead of materialized strings
+    seen_data: bool,
+    in_loop: bool,
+    _source: String,
+    raise_parse_warnings: bool,
+    _convert_data_types: bool,
+    _schema: Option<Py<PyAny>>,
+    saveframe_class: Py<PyAny>,
+    loop_class: Py<PyAny>,
+    source_dict: Py<PyDict>,
+    add_tags_kwargs: Py<PyDict>,
+    add_data_kwargs: Py<PyDict>,
+    // Loop pre-allocation tracking by loop type
+    loop_statistics: std::collections::HashMap<String, LoopStatistics>,
+    current_loop_type: Option<String>,
+}
+
+impl ParserContext {
+    fn new(py: Python, entry: Py<PyAny>, source: String, raise_parse_warnings: bool,
+           convert_data_types: bool, schema: Option<Py<PyAny>>, tokenizer: TokenizerState) -> PyResult<Self> {
+        // Cache module/class lookups at initialization
+        let saveframe_mod = py.import("pynmrstar.saveframe")?;
+        let saveframe_class = saveframe_mod.getattr("Saveframe")?.into();
+
+        let loop_mod = py.import("pynmrstar.loop")?;
+        let loop_class = loop_mod.getattr("Loop")?.into();
+
+        // Pre-create reusable dictionaries for kwargs
+        let source_dict = [("source", source.as_str())].into_py_dict(py)?.into();
+
+        let add_tags_kwargs = if schema.is_some() {
+            [
+                ("convert_data_types", convert_data_types.into_pyobject(py)?.to_owned().into_any().unbind()),
+                ("schema", schema.as_ref().unwrap().clone_ref(py))
+            ].into_py_dict(py)?.into()
+        } else {
+            [("convert_data_types", convert_data_types.into_pyobject(py)?.to_owned().into_any().unbind())]
+                .into_py_dict(py)?.into()
+        };
+
+        let add_data_kwargs = if schema.is_some() {
+            [
+                ("rearrange", true.into_pyobject(py)?.to_owned().into_any().unbind()),
+                ("convert_data_types", convert_data_types.into_pyobject(py)?.to_owned().into_any().unbind()),
+                ("schema", schema.as_ref().unwrap().clone_ref(py))
+            ].into_py_dict(py)?.into()
+        } else {
+            [
+                ("rearrange", true.into_pyobject(py)?.to_owned().into_any().unbind()),
+                ("convert_data_types", convert_data_types.into_pyobject(py)?.to_owned().into_any().unbind())
+            ].into_py_dict(py)?.into()
+        };
+
+        Ok(ParserContext {
+            tokenizer,
+            line_number: 0,
+            delimiter: ' ',
+            token: None,
+            processed_token: None,
+            entry,
+            current_saveframe: None,
+            current_loop: None,
+            loop_data: Vec::new(),
+            seen_data: false,
+            in_loop: false,
+            _source: source,
+            raise_parse_warnings,
+            _convert_data_types: convert_data_types,
+            _schema: schema,
+            saveframe_class,
+            loop_class,
+            source_dict,
+            add_tags_kwargs,
+            add_data_kwargs,
+            loop_statistics: std::collections::HashMap::new(),
+            current_loop_type: None,
+        })
+    }
+
+    fn get_token(&mut self) -> PyResult<bool> {
+        // Clear any previous processed token
+        self.processed_token = None;
+
+        // Get token and skip comments
+        loop {
+            match self.tokenizer.get_token() {
+                Ok(Some((start, end))) => {
+                    if self.tokenizer.last_delimiter != '#' {
+                        let token_str = &self.tokenizer.full_data[start..end];
+
+                        // Handle embedded STAR unwrapping for semicolon-delimited tokens
+                        if self.tokenizer.last_delimiter == ';' && token_str.starts_with("\n   ") {
+                            let mut shift_over = true;
+                            let lines: Vec<&str> = token_str.split('\n').collect();
+
+                            for line in &lines[1..] { // Skip first empty line
+                                if !line.is_empty() && !line.starts_with("   ") {
+                                    shift_over = false;
+                                    break;
+                                }
+                            }
+
+                            if shift_over && token_str.contains("\n   ;") {
+                                // Process the string and store it separately
+                                let mut processed = token_str.trim_end_matches('\n').to_string();
+                                processed = processed.replace("\n   ", "\n");
+                                self.processed_token = Some(processed);
+                            }
+                        }
+
+                        self.token = Some((start, end));
+                        self.line_number = self.tokenizer.line_no;
+                        self.delimiter = self.tokenizer.last_delimiter;
+                        return Ok(true);
+                    }
+                    // If it's a comment, continue to get the next token
+                }
+                Ok(None) => {
+                    self.token = None;
+                    return Ok(false);
+                }
+                Err(e) => return Err(ParsingError::new_err(e)),
+            }
+        }
+    }
+
+    fn token_str(&self) -> &str {
+        // Return processed token if available
+        if let Some(ref processed) = self.processed_token {
+            processed
+        } else if let Some((start, end)) = self.token {
+            &self.tokenizer.full_data[start..end]
+        } else {
+            ""
+        }
+    }
+
+    fn raise_error(&self, message: &str) -> PyErr {
+        ParsingError::new_err(format!("{} (line {})", message, self.line_number))
+    }
+}
+
+fn parse_initial(py: Python, ctx: &mut ParserContext) -> PyResult<()> {
+    // Get first token
+    if !ctx.get_token()? {
+        return Err(ctx.raise_error("Empty file"));
+    }
+
+    let token = ctx.token_str();
+
+    // Validate data_ token
+    if !starts_with_ignore_case(token, "data_") {
+        return Err(ctx.raise_error(&format!(
+            "Invalid file. NMR-STAR files must start with 'data_' followed by the data name. \
+             Did you accidentally select the wrong file? Your file started with '{}'.",
+            token
+        )));
+    }
+
+    if token.len() < 6 {
+        return Err(ctx.raise_error(
+            "'data_' must be followed by data name. Simply 'data_' is not allowed."
+        ));
+    }
+
+    if ctx.delimiter != ' ' {
+        return Err(ctx.raise_error("The data_ keyword may not be quoted or semicolon-delimited."));
+    }
+
+    // Set entry_id
+    let entry_id = &token[5..];
+    ctx.entry.setattr(py, "_entry_id", entry_id)?;
+
+    Ok(())
+}
+
+fn parse_entry_body(py: Python, ctx: &mut ParserContext) -> PyResult<()> {
+    while ctx.get_token()? {
+        let token = ctx.token_str();
+
+        if !starts_with_ignore_case(token, "save_") {
+            return Err(ctx.raise_error(&format!(
+                "Only 'save_NAME' is valid in the body of a NMR-STAR file. Found '{}'.",
+                token
+            )));
+        }
+
+        if token.len() < 6 {
+            return Err(ctx.raise_error(
+                "'save_' must be followed by saveframe name. You have a 'save_' tag which is \
+                 illegal without a specified saveframe name."
+            ));
+        }
+
+        if ctx.delimiter != ' ' {
+            return Err(ctx.raise_error("The save_ keyword may not be quoted or semicolon-delimited."));
+        }
+
+        // Create new saveframe using cached class
+        let saveframe_name = &token[5..];
+        let saveframe = ctx.saveframe_class.bind(py).call_method(
+            "from_scratch",
+            (saveframe_name,),
+            Some(ctx.source_dict.bind(py).cast()?)
+        )?;
+
+        ctx.current_saveframe = Some(saveframe.into());
+        ctx.entry.call_method1(py, "add_saveframe", (ctx.current_saveframe.as_ref().unwrap(),))?;
+
+        // Parse saveframe body
+        parse_saveframe_body(py, ctx)?;
+    }
+
+    Ok(())
+}
+
+fn parse_saveframe_body(py: Python, ctx: &mut ParserContext) -> PyResult<()> {
+    let mut pending_tags: Vec<(TokenValue, TokenValue)> = Vec::new();
+
+    // Helper to flush pending tags
+    let flush_tags = |ctx: &ParserContext, pending: &mut Vec<(TokenValue, TokenValue)>| -> PyResult<()> {
+        if !pending.is_empty() {
+            let saveframe = ctx.current_saveframe.as_ref().unwrap();
+            // Materialize TokenValues into (String, String) tuples for Python
+            let tags_to_add = std::mem::take(pending);
+            let materialized: Vec<(String, String)> = tags_to_add
+                .iter()
+                .map(|(tag, value)| (tag.to_string(&ctx.tokenizer.full_data), value.to_string(&ctx.tokenizer.full_data)))
+                .collect();
+            saveframe.call_method(py, "add_tags", (materialized,), Some(ctx.add_tags_kwargs.bind(py).cast()?))?;
+        }
+        Ok(())
+    };
+
+    while ctx.get_token()? {
+        let token = ctx.token_str();
+
+        if token.eq_ignore_ascii_case("loop_") {
+            // Flush any pending tags before processing loop
+            flush_tags(ctx, &mut pending_tags)?;
+            if ctx.delimiter != ' ' {
+                return Err(ctx.raise_error("The loop_ keyword may not be quoted or semicolon-delimited."));
+            }
+
+            // Create new loop using cached class
+            let new_loop = ctx.loop_class.bind(py).call_method(
+                "from_scratch",
+                (),
+                Some(ctx.source_dict.bind(py).cast()?)
+            )?;
+
+            ctx.current_loop = Some(new_loop.into());
+            ctx.loop_data.clear();
+            ctx.seen_data = false;
+            ctx.in_loop = true;
+
+            parse_loop_tags(py, ctx)?;
+
+        } else if token.eq_ignore_ascii_case("save_") {
+            // Flush any pending tags before exiting saveframe
+            flush_tags(ctx, &mut pending_tags)?;
+
+            if ctx.delimiter != ' ' && ctx.delimiter != ';' {
+                return Err(ctx.raise_error("The save_ keyword may not be quoted or semicolon-delimited."));
+            }
+
+            // Check tag_prefix is set
+            let saveframe = ctx.current_saveframe.as_ref().unwrap();
+            let tag_prefix = saveframe.getattr(py, "tag_prefix")?;
+            if tag_prefix.is_none(py) {
+                let frame_name = saveframe.getattr(py, "name")?;
+                return Err(ctx.raise_error(&format!(
+                    "The tag prefix was never set! Either the saveframe had no tags, you \
+                     tried to read a version 2.1 file, or there is something else wrong with \
+                     your file. Saveframe error occurred within: '{}'",
+                    frame_name.extract::<String>(py)?
+                )));
+            }
+
+            break; // Exit saveframe
+
+        } else if token.starts_with('_') {
+            if ctx.delimiter != ' ' {
+                return Err(ctx.raise_error(&format!(
+                    "Saveframe tags may not be quoted or semicolon-delimited. Quoted tag: '{}'.",
+                    token
+                )));
+            }
+
+            // Capture tag name as TokenValue
+            let tag_name = if let Some(ref processed) = ctx.processed_token {
+                TokenValue::Materialized(processed.clone())
+            } else if let Some((start, end)) = ctx.token {
+                TokenValue::Indexed(start, end)
+            } else {
+                TokenValue::Materialized(String::new())
+            };
+
+            // Get tag value
+            if !ctx.get_token()? {
+                return Err(ctx.raise_error("Tag without value"));
+            }
+            let value = ctx.token_str();
+
+            if ctx.delimiter == ' ' {
+                if is_reserved_keyword(value) {
+                    return Err(ctx.raise_error(&format!(
+                        "Cannot use keywords as data values unless quoted or semi-colon \
+                         delimited. Illegal value: '{}'",
+                        value
+                    )));
+                }
+                if value.starts_with('_') {
+                    return Err(ctx.raise_error(&format!(
+                        "Cannot have a tag value start with an underscore unless the entire value \
+                         is quoted. You may be missing a data value on the previous line. \
+                         Illegal value: '{}'",
+                        value
+                    )));
+                }
+            }
+
+            // Capture value as TokenValue
+            let value_token = if let Some(ref processed) = ctx.processed_token {
+                TokenValue::Materialized(processed.clone())
+            } else if let Some((start, end)) = ctx.token {
+                TokenValue::Indexed(start, end)
+            } else {
+                TokenValue::Materialized(String::new())
+            };
+
+            // Collect tag-value pair for batch addition
+            pending_tags.push((tag_name, value_token));
+        } else {
+            // Invalid token in saveframe
+            let frame_name = ctx.current_saveframe.as_ref().unwrap()
+                .getattr(py, "name")?
+                .extract::<String>(py)?;
+
+            if frame_name == "internaluseyoushouldntseethis_frame" {
+                return Err(ctx.raise_error(&format!(
+                    "Invalid token found in loop contents. Expecting 'loop_' but found: '{}'",
+                    token
+                )));
+            } else {
+                return Err(ctx.raise_error(&format!(
+                    "Invalid token found in saveframe '{}'. Expecting a tag, \
+                     loop, or 'save_' token but found: '{}'",
+                    frame_name, token
+                )));
+            }
+        }
+    }
+
+    // Validate saveframe was properly closed
+    if ctx.token.is_none() || !ctx.token_str().eq_ignore_ascii_case("save_") {
+        return Err(ctx.raise_error(
+            "Saveframe improperly terminated at end of file. Saveframes must be terminated \
+             with the 'save_' token."
+        ));
+    }
+
+    Ok(())
+}
+
+fn parse_loop_tags(py: Python, ctx: &mut ParserContext) -> PyResult<()> {
+    let mut tags: Vec<TokenValue> = Vec::new();
+
+    while ctx.in_loop && ctx.get_token()? {
+        let token = ctx.token_str();
+
+        // Check if this is a tag
+        if token.starts_with('_') && ctx.delimiter == ' ' {
+            // Extract loop type from first tag (e.g., "_Entry_author.Ordinal" -> "_Entry_author")
+            if tags.is_empty() {
+                let loop_type = if let Some(dot_pos) = token.find('.') {
+                    token[..dot_pos].to_string()
+                } else {
+                    // Tag without a dot - use the whole tag as loop type
+                    token.to_string()
+                };
+
+                // Store loop type first
+                ctx.current_loop_type = Some(loop_type);
+            }
+
+            // Capture tag as TokenValue
+            let tag_value = if let Some(ref processed) = ctx.processed_token {
+                TokenValue::Materialized(processed.clone())
+            } else if let Some((start, end)) = ctx.token {
+                TokenValue::Indexed(start, end)
+            } else {
+                TokenValue::Materialized(String::new())
+            };
+
+            // Collect tag for batch addition
+            tags.push(tag_value);
+        } else {
+            // First non-tag token, batch add all tags to loop
+            let loop_obj = ctx.current_loop.as_ref().unwrap();
+
+            // Batch add all collected tags (materialize to strings for Python)
+            if !tags.is_empty() {
+                let materialized: Vec<String> = tags
+                    .iter()
+                    .map(|tv| tv.to_string(&ctx.tokenizer.full_data))
+                    .collect();
+                loop_obj.call_method1(py, "add_tag", (materialized.as_slice(),))?;
+            }
+
+            let saveframe = ctx.current_saveframe.as_ref().unwrap();
+            saveframe.call_method1(py, "add_loop", (loop_obj,))?;
+
+            // Preallocate loop_data Vec based on number of tags
+            // Use adaptive sizing based on loop type statistics
+            let tags_len = tags.len();
+            if tags_len > 0 {
+                let estimated_rows = if let Some(loop_type) = &ctx.current_loop_type {
+                    // Look up statistics for this specific loop type
+                    if let Some(stats) = ctx.loop_statistics.get(loop_type) {
+                        let avg_items_per_loop = stats.total_data_items / stats.loop_count;
+                        let avg_rows = (avg_items_per_loop / tags_len).max(10); // At least 10 rows
+                        avg_rows
+                    } else {
+                        // First time seeing this loop type: use reasonable default
+                        100
+                    }
+                } else {
+                    // No loop type detected (shouldn't happen): use default
+                    100
+                };
+                ctx.loop_data.reserve(tags_len * estimated_rows);
+            }
+
+            // Parse loop data (without consuming current token)
+
+fn parse_loop_tags(py: Python, ctx: &mut ParserContext) -> PyResult<()> {
+    let mut tags: Vec<TokenValue> = Vec::new();
+
+    while ctx.in_loop && ctx.get_token()? {
+        let token = ctx.token_str();
+
+        // Check whether this token is a tag
+        if token.starts_with('_') && ctx.delimiter == ' ' {
+            // Extract the loop type from the first tag (e.g., "_Entry_author.Ordinal" -> "_Entry_author")
+            if tags.is_empty() {
+                let loop_type = if let Some(dot_pos) = token.find('.') {
+                    token[..dot_pos].to_string()
+                } else {
+                    // Tag without a dot - use the whole tag as the loop type
+                    token.to_string()
+                };
+
+                // Store the loop type first
+                ctx.current_loop_type = Some(loop_type);
+            }
+
+            // Capture the tag as a TokenValue
+            let tag_value = if let Some(ref processed) = ctx.processed_token {
+                TokenValue::Materialized(processed.clone())
+            } else if let Some((start, end)) = ctx.token {
+                TokenValue::Indexed(start, end)
+            } else {
+                TokenValue::Materialized(String::new())
+            };
+
+            // Collect the tag for batch addition
+            tags.push(tag_value);
+        } else {
+            // First non-tag token: batch-add all collected tags to the loop
+            let loop_obj = ctx.current_loop.as_ref().unwrap();
+
+            // Materialize the tags to strings for Python
+            if !tags.is_empty() {
+                let materialized: Vec<String> = tags
+                    .iter()
+                    .map(|tv| tv.to_string(&ctx.tokenizer.full_data))
+                    .collect();
+                loop_obj.call_method1(py, "add_tag", (materialized.as_slice(),))?;
+            }
+
+            let saveframe = ctx.current_saveframe.as_ref().unwrap();
+            saveframe.call_method1(py, "add_loop", (loop_obj,))?;
+
+            // Preallocate the loop_data Vec based on the number of tags,
+            // using adaptive sizing from per-loop-type statistics
+            let tags_len = tags.len();
+            if tags_len > 0 {
+                let estimated_rows = if let Some(loop_type) = &ctx.current_loop_type {
+                    // Look up statistics for this specific loop type
+                    if let Some(stats) = ctx.loop_statistics.get(loop_type) {
+                        let avg_items_per_loop = stats.total_data_items / stats.loop_count;
+                        let avg_rows = (avg_items_per_loop / tags_len).max(10); // At least 10 rows
+                        avg_rows
+                    } else {
+                        // First time seeing this loop type: use a reasonable default
+                        100
+                    }
+                } else {
+                    // No loop type detected (shouldn't happen): use the default
+                    100
+                };
+                ctx.loop_data.reserve(tags_len * estimated_rows);
+            }
+
+            // Parse the loop data (without consuming the current token)
+            parse_loop_data(py, ctx)?;
+            break;
+        }
+    }
+
+    Ok(())
+}
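+
+// Worked example of the adaptive pre-allocation above, with hypothetical numbers:
+// if two previously parsed "_Atom_chem_shift" loops with 24 tags each contributed
+// 48_000 data items in total, the next "_Atom_chem_shift" loop is estimated at
+// (48_000 / 2) / 24 = 1_000 rows, so `loop_data` reserves 24 * 1_000 slots up
+// front instead of growing incrementally while rows stream in. The statistics it
+// draws on are updated in parse_loop_data below.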
+
+fn parse_loop_data(py: Python, ctx: &mut ParserContext) -> PyResult<()> {
+    loop {
+        if ctx.token.is_none() {
+            return Err(ctx.raise_error("Loop improperly terminated at end of file. \
+                                        Loops must end with the 'stop_' token, but the \
+                                        file ended without the stop token."));
+        }
+
+        let token = ctx.token_str();
+
+        if token.eq_ignore_ascii_case("stop_") {
+            if ctx.delimiter != ' ' {
+                return Err(ctx.raise_error("The stop_ keyword may not be quoted or semicolon-delimited."));
+            }
+
+            let loop_obj = ctx.current_loop.as_ref().unwrap();
+            let tags = loop_obj.bind(py).getattr("tags")?;
+            let tags_len = tags.len()?;
+
+            // Warnings/errors for empty loops
+            if tags_len == 0 {
+                if ctx.raise_parse_warnings {
+                    return Err(ctx.raise_error("Loop with no tags."));
+                } else {
+                    // Log a warning via the Python logger
+                    let logging = py.import("logging")?;
+                    let logger = logging.call_method1("getLogger", ("pynmrstar",))?;
+                    logger.call_method1("warning", (format!("Loop with no tags in parsed file on line: {}", ctx.line_number),))?;
+                }
+            }
+
+            if !ctx.seen_data {
+                if ctx.raise_parse_warnings {
+                    return Err(ctx.raise_error("Loop with no data."));
+                } else {
+                    // Log a warning via the Python logger
+                    let logging = py.import("logging")?;
+                    let logger = logging.call_method1("getLogger", ("pynmrstar",))?;
+                    logger.call_method1("warning", (format!("Loop with no data on line: {}", ctx.line_number),))?;
+                }
+            }
+
+            // Add the data to the loop
+            if !ctx.loop_data.is_empty() {
+                if ctx.loop_data.len() % tags_len != 0 {
+                    let category = loop_obj.getattr(py, "category")?;
+                    return Err(ctx.raise_error(&format!(
+                        "The loop being parsed, '{}', does not have the expected number of data elements. \
+                         This indicates that one or more tag values are either missing from or \
+                         duplicated in this loop.",
+                        category.extract::<String>(py)?
+                    )));
+                }
+
+                // Materialize TokenValues into Strings only when passing them to Python
+                let loop_data_to_add = std::mem::take(&mut ctx.loop_data);
+                let data_items_count = loop_data_to_add.len();
+                let materialized: Vec<String> = loop_data_to_add
+                    .iter()
+                    .map(|tv| tv.to_string(&ctx.tokenizer.full_data))
+                    .collect();
+                loop_obj.call_method(py, "add_data", (materialized,), Some(ctx.add_data_kwargs.bind(py).cast()?))?;
+
+                // Track statistics for adaptive pre-allocation by loop type
+                if let Some(loop_type) = &ctx.current_loop_type {
+                    let stats = ctx.loop_statistics.entry(loop_type.clone()).or_insert(LoopStatistics {
+                        total_data_items: 0,
+                        loop_count: 0,
+                    });
+                    stats.total_data_items += data_items_count;
+                    stats.loop_count += 1;
+                }
+            } else {
+                // Track empty loops too (for accurate statistics)
+                if let Some(loop_type) = &ctx.current_loop_type {
+                    let stats = ctx.loop_statistics.entry(loop_type.clone()).or_insert(LoopStatistics {
+                        total_data_items: 0,
+                        loop_count: 0,
+                    });
+                    stats.loop_count += 1;
+                }
+            }
+
+            ctx.loop_data.clear();
+            ctx.current_loop = None;
+            ctx.current_loop_type = None;
+            ctx.in_loop = false;
+            break;
+
+        } else if token.starts_with('_') && ctx.delimiter == ' ' {
+            return Err(ctx.raise_error(&format!(
+                "Cannot have more loop tags after loop data. Or perhaps this \
+                 was a data value which was not quoted (but must be, \
+                 if it starts with '_')? Value: '{}'.",
+                token
+            )));
+
+        } else {
+            // Data value
+            let loop_obj = ctx.current_loop.as_ref().unwrap();
+            let tags = loop_obj.bind(py).getattr("tags")?;
+            let tags_len = tags.len()?;
+
+            if tags_len == 0 {
+                return Err(ctx.raise_error(&format!(
+                    "Data value found in loop before any loop tags were defined. Value: '{}'",
+                    token
+                )));
+            }
+
+            if is_reserved_keyword(token) && ctx.delimiter == ' ' {
+                let mut error = format!(
+                    "Cannot use keywords as data values unless quoted or semi-colon \
+                     delimited. Perhaps this is a loop that wasn't properly terminated \
+                     with a 'stop_' keyword before the saveframe ended or another loop \
+                     began? Value found where 'stop_' or another data value expected: '{}'.",
+                    token
+                );
+
+                if !ctx.loop_data.is_empty() {
+                    let last_value = ctx.loop_data.last().unwrap().as_str(&ctx.tokenizer.full_data);
+                    error.push_str(&format!(" Last loop data element parsed: '{}'.", last_value));
+                }
+
+                return Err(ctx.raise_error(&error));
+            }
+
+            // Store the token as indices or as a materialized string
+            let token_value = if let Some(ref processed) = ctx.processed_token {
+                TokenValue::Materialized(processed.clone())
+            } else if let Some((start, end)) = ctx.token {
+                TokenValue::Indexed(start, end)
+            } else {
+                // Should never happen
+                TokenValue::Materialized(String::new())
+            };
+            ctx.loop_data.push(token_value);
+            ctx.seen_data = true;
+        }
+
+        // Get the next token
+        if !ctx.get_token()? {
+            return Err(ctx.raise_error("Loop improperly terminated at end of file. \
+                                        Loops must end with the 'stop_' token, but the \
+                                        file ended without the stop token."));
+        }
+    }
+
+    Ok(())
+}
+
+#[pyfunction]
+#[pyo3(signature = (data, entry, source, raise_parse_warnings, convert_data_types, schema=None))]
+pub fn parse(
+    py: Python,
+    data: String,
+    entry: Py<PyAny>,
+    source: String,
+    raise_parse_warnings: bool,
+    convert_data_types: bool,
+    schema: Option<&Bound<'_, PyAny>>,
+) -> PyResult<Py<PyAny>> {
+    // Convert the schema to a PyObject if Some
+    let schema = schema.map(|s| s.clone().into());
+
+    // Preprocess the data (same as Python's Parser.load_data):
+    // fix DOS line endings
+    let data = data.replace("\r\n", "\n").replace("\r", "\n");
+    // Move data that starts on the same line as a ';' multi-line opener
+    // onto the next line ('\n;data' -> '\n;\ndata')
+    let data = fix_multiline_semicolons(&data);
+
+    // Create the tokenizer and load the data
+    let mut tokenizer = TokenizerState::new();
+    tokenizer.load_string(data);
+
+    // Create the parser context
+    let mut ctx = ParserContext::new(py, entry.clone_ref(py), source,
+                                     raise_parse_warnings, convert_data_types, schema, tokenizer)?;
+    // Parse
+    parse_initial(py, &mut ctx)?;
+    parse_entry_body(py, &mut ctx)?;
+
+    Ok(entry)
+}
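+
+// For reference, the `ParserContext` fields assumed by the functions in this file
+// (the struct itself is defined elsewhere in the crate; this is a sketch inferred
+// from usage, not the authoritative definition):
+//
+//     struct ParserContext {
+//         entry: Py<PyAny>,                  // the pynmrstar Entry being populated
+//         current_saveframe: Option<Py<PyAny>>,
+//         current_loop: Option<Py<PyAny>>,
+//         current_loop_type: Option<String>,
+//         loop_data: Vec<TokenValue>,        // pending values for the open loop
+//         loop_statistics: HashMap<String, LoopStatistics>,
+//         tokenizer: TokenizerState,
+//         token: Option<(usize, usize)>,     // current token as byte offsets
+//         processed_token: Option<String>,   // set when the token needed rewriting
+//         delimiter: char,
+//         line_number: usize,
+//         seen_data: bool,
+//         in_loop: bool,
+//         raise_parse_warnings: bool,
+//         loop_class: Py<PyAny>,             // cached pynmrstar.Loop class
+//         source_dict: Py<PyAny>,            // cached kwargs dict holding the source
+//         add_tags_kwargs: Py<PyAny>,
+//         add_data_kwargs: Py<PyAny>,
+//     }
+//
+// plus the get_token() / token_str() / raise_error() helper methods used above.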
diff --git a/src/utils.rs b/src/utils.rs
new file mode 100644
index 0000000..f3dc37b
--- /dev/null
+++ b/src/utils.rs
@@ -0,0 +1,231 @@
+use pyo3::prelude::*;
+use pyo3::exceptions::PyValueError;
+use memchr::memchr_iter;
+
+pub const RESERVED_KEYWORDS: [&str; 5] = ["data_", "save_", "loop_", "stop_", "global_"];
+
+/// Fix multiline semicolon values where content appears on the same line as the
+/// opening semicolon. Transforms patterns like `\n;content\n` into `\n;\ncontent\n`.
+/// This is equivalent to the regex replacement `\n;([^\n]+?)\n` -> `\n;\n$1\n`.
+pub fn fix_multiline_semicolons(data: &str) -> String {
+    let bytes = data.as_bytes();
+    let len = bytes.len();
+
+    // Quick check: if no semicolons exist, return the input as-is
+    if memchr::memchr(b';', bytes).is_none() {
+        return data.to_string();
+    }
+
+    let mut result = String::with_capacity(len + 64);
+    let mut last_end = 0;
+
+    // Find all newlines and check whether a `\n;[^\n]+\n` pattern follows
+    for nl_pos in memchr_iter(b'\n', bytes) {
+        // Check for a `\n;` pattern (we need at least 2 more chars: ';' and something)
+        if nl_pos + 2 < len && bytes[nl_pos + 1] == b';' {
+            let after_semi = nl_pos + 2;
+            // Check that the next char is NOT a newline (i.e. there is content on the same line)
+            if bytes[after_semi] != b'\n' {
+                // Find the next newline after the semicolon
+                if memchr::memchr(b'\n', &bytes[after_semi..]).is_some() {
+                    // We found the pattern \n;[content]\n:
+                    // copy everything up to and including the "\n;"
+                    result.push_str(&data[last_end..after_semi]);
+                    // Insert the extra newline
+                    result.push('\n');
+                    // Continue from the start of the content
+                    last_end = after_semi;
+                }
+            }
+        }
+    }
+
+    // Append the remaining data
+    result.push_str(&data[last_end..]);
+    result
+}
+
+pub fn is_reserved_keyword(token: &str) -> bool {
+    RESERVED_KEYWORDS.iter().any(|&kw| token.eq_ignore_ascii_case(kw))
+}
+
+pub fn starts_with_ignore_case(s: &str, prefix: &str) -> bool {
+    // For ASCII-only prefixes, we can safely compare byte-by-byte
+    if s.len() < prefix.len() {
+        return false;
+    }
+
+    let s_bytes = s.as_bytes();
+    let prefix_bytes = prefix.as_bytes();
+
+    for i in 0..prefix.len() {
+        if !s_bytes[i].eq_ignore_ascii_case(&prefix_bytes[i]) {
+            return false;
+        }
+    }
+    true
+}
+
+#[pyfunction]
+#[pyo3(signature = (orig, str_conversion_dict=None))]
+pub fn quote_value(orig: &Bound<'_, PyAny>, str_conversion_dict: Option<&Bound<'_, PyAny>>) -> PyResult<String> {
+    // Apply the STR_CONVERSION_DICT if provided
+    let converted: std::borrow::Cow<'_, Bound<'_, PyAny>> = if let Some(conv_dict) = str_conversion_dict {
+        if conv_dict.contains(orig)? {
+            std::borrow::Cow::Owned(conv_dict.get_item(orig)?)
+        } else {
+            std::borrow::Cow::Borrowed(orig)
+        }
+    } else {
+        std::borrow::Cow::Borrowed(orig)
+    };
+
+    // Convert to a string
+    let str_obj = converted.str()?;
+    let s = str_obj.to_str()?;
+
+    // Don't allow the empty string
+    if s.is_empty() {
+        return Err(PyValueError::new_err("Empty strings are not allowed as values. Use the None singleton, or '.' to represent null values."));
+    }
+
+    // Delegate to the internal implementation
+    Ok(quote_value_str(s))
+}
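+
+// Example of the embedded "\n;" handling in quote_value_str below: a value line
+// beginning with ';' would prematurely terminate a semicolon-delimited block, so
+// every line of the value is indented (three spaces assumed here, matching the
+// `newline_count * 3` capacity estimate in the implementation):
+//
+//     "abc\n;def"  ->  "\n   abc\n   ;def\n"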
+
+/// Internal function to quote a value for NMR-STAR format. This is a simpler
+/// version that takes a string directly; it is also used by format_loop.
+pub fn quote_value_str(s: &str) -> String {
+    let len = s.len();
+
+    // Don't allow the empty string - return it as-is; the caller handles this error
+    if len == 0 {
+        return String::new();
+    }
+
+    // Handle embedded STAR multi-line ("\n;") delimiters by indenting each line
+    // with three spaces so that no line of the value starts with ';'
+    if s.contains("\n;") {
+        let starts_with_newline = s.starts_with('\n');
+        let newline_count = s.bytes().filter(|&b| b == b'\n').count();
+        let mut result = String::with_capacity(len + newline_count * 3 + 8);
+
+        if !starts_with_newline {
+            result.push_str("\n   ");
+        }
+
+        let bytes = s.as_bytes();
+        let mut last_end = 0;
+        for i in memchr_iter(b'\n', bytes) {
+            result.push_str(&s[last_end..=i]);
+            result.push_str("   ");
+            last_end = i + 1;
+        }
+        result.push_str(&s[last_end..]);
+        result.push('\n');
+
+        return result;
+    }
+
+    // If it has newlines but no "\n;", emit it as a multi-line value
+    if s.contains('\n') {
+        if s.ends_with('\n') {
+            return s.to_string();
+        } else {
+            let mut result = String::with_capacity(len + 1);
+            result.push_str(s);
+            result.push('\n');
+            return result;
+        }
+    }
+
+    // Check for quotes
+    let has_single = s.contains('\'');
+    let has_double = s.contains('"');
+
+    // If it contains both single and double quotes, special handling is needed
+    if has_single && has_double {
+        let mut can_wrap_single = true;
+        let mut can_wrap_double = true;
+
+        // A quote character followed by whitespace would terminate that quoting style
+        let bytes = s.as_bytes();
+        for i in 0..bytes.len() - 1 {
+            let next = bytes[i + 1];
+            let next_is_ws = matches!(next, b' ' | b'\t' | b'\x0B');
+            if next_is_ws {
+                match bytes[i] {
+                    b'\'' => can_wrap_single = false,
+                    b'"' => can_wrap_double = false,
+                    _ => {}
+                }
+            }
+        }
+
+        if !can_wrap_single && !can_wrap_double {
+            let mut result = String::with_capacity(len + 1);
+            result.push_str(s);
+            result.push('\n');
+            return result;
+        }
+        if can_wrap_single {
+            let mut result = String::with_capacity(len + 2);
+            result.push('\'');
+            result.push_str(s);
+            result.push('\'');
+            return result;
+        }
+        if can_wrap_double {
+            let mut result = String::with_capacity(len + 2);
+            result.push('"');
+            result.push_str(s);
+            result.push('"');
+            return result;
+        }
+    }
+
+    // Check whether we need wrapping at all
+    let mut needs_wrapping = false;
+
+    if s.starts_with('_') || s.starts_with('"') || s.starts_with('\'') {
+        needs_wrapping = true;
+    }
+
+    if !needs_wrapping {
+        if starts_with_ignore_case(s, "data_") || starts_with_ignore_case(s, "save_") ||
+           starts_with_ignore_case(s, "loop_") || starts_with_ignore_case(s, "stop_") ||
+           starts_with_ignore_case(s, "global_") {
+            needs_wrapping = true;
+        }
+
+        if !needs_wrapping {
+            // Wrap if the value contains whitespace, or a '#' that would start a comment
+            let bytes = s.as_bytes();
+            let mut prev_is_ws = true;
+            for &b in bytes {
+                if matches!(b, b' ' | b'\t' | b'\x0B') {
+                    needs_wrapping = true;
+                    break;
+                }
+                if b == b'#' && prev_is_ws {
+                    needs_wrapping = true;
+                    break;
+                }
+                prev_is_ws = matches!(b, b' ' | b'\t' | b'\x0B');
+            }
+        }
+    }
+
+    if needs_wrapping {
+        let mut result = String::with_capacity(len + 2);
+        if has_single {
+            result.push('"');
+            result.push_str(s);
+            result.push('"');
+        } else {
+            result.push('\'');
+            result.push_str(s);
+            result.push('\'');
+        }
+        return result;
+    }
+
+    s.to_string()
+}
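+
+// A minimal unit-test sketch for the pure-Rust helpers above; the expected values
+// follow directly from the implementations in this file. Note that none of these
+// functions touch Python objects, but `cargo test` can fail to link when PyO3's
+// extension-module feature is enabled, so that feature is typically made optional
+// (per the PyO3 FAQ) before running tests.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn moves_semicolon_line_content_to_next_line() {
+        assert_eq!(fix_multiline_semicolons("a\n;text\n"), "a\n;\ntext\n");
+        // Already well-formed input is returned unchanged
+        assert_eq!(fix_multiline_semicolons("a\n;\ntext\n"), "a\n;\ntext\n");
+        assert_eq!(fix_multiline_semicolons("no semicolon here"), "no semicolon here");
+    }
+
+    #[test]
+    fn keywords_are_case_insensitive() {
+        assert!(is_reserved_keyword("STOP_"));
+        assert!(is_reserved_keyword("data_"));
+        assert!(!is_reserved_keyword("stop"));
+    }
+
+    #[test]
+    fn quoting_rules() {
+        assert_eq!(quote_value_str("simple"), "simple");
+        assert_eq!(quote_value_str("has space"), "'has space'");
+        assert_eq!(quote_value_str("_underscore"), "'_underscore'");
+        assert_eq!(quote_value_str("it's quoted"), "\"it's quoted\"");
+        assert_eq!(quote_value_str("multi\nline"), "multi\nline\n");
+    }
+}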