From 123e101bf3021785ce52bdc2737edeb81b656fc9 Mon Sep 17 00:00:00 2001
From: Mingxun Wang
Date: Thu, 24 Jun 2021 17:48:54 -0700
Subject: [PATCH 1/4] updating ability to have or operation

---
 msql.ebnf        |  7 +++++--
 msql_engine.py   | 30 +++++++++++++++++++-----------
 msql_parser.py   | 11 ++++++++++-
 test.py          | 13 ++++++++++++-
 test_queries.txt |  3 ++-
 5 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/msql.ebnf b/msql.ebnf
index bb67fa9..f893efb 100644
--- a/msql.ebnf
+++ b/msql.ebnf
@@ -20,10 +20,13 @@ filterfullcondition: filterfullcondition booleanconjunction filterfullcondition
     | condition ":" qualifier
     | condition
 
-condition: conditionfields "=" floating
-    | conditionfields equal variable
+// Conditions
+condition: conditionfields equal conditionvalue
     | conditionfields equal "(" statement ")"
 
+conditionvalue: conditionvalue "," conditionvalue
+    | variable
+    | floating
 
 qualifier: qualifier ":" qualifier
     | qualifierfields equal floating
diff --git a/msql_engine.py b/msql_engine.py
index 31197b5..c985678 100644
--- a/msql_engine.py
+++ b/msql_engine.py
@@ -461,23 +461,31 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
         # Filtering MS2 Product Ions
         if condition["type"] == "ms2productcondition":
-            mz = condition["value"][0]
-            mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
-            mz_min = mz - mz_tol
-            mz_max = mz + mz_tol
+            filtered_scans = set()
+            for mz in condition["value"]:
+                mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
+                mz_min = mz - mz_tol
+                mz_max = mz + mz_tol
 
-            min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
+                min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
 
-            ms2_filtered_df = ms2_df[(ms2_df["mz"] > mz_min) & (ms2_df["mz"] < mz_max) & (ms2_df["i"] > min_int) & (ms2_df["i_norm"] > min_intpercent)]
+                ms2_filtered_df = ms2_df[
+                    (ms2_df["mz"] > mz_min) &
+                    (ms2_df["mz"] < mz_max) &
+                    (ms2_df["i"] > min_int) &
+                    (ms2_df["i_norm"] > min_intpercent)
+                ]
 
-            # Setting the intensity match register
-            _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)
+                # Setting the intensity match register
+                _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)
 
-            # Applying the intensity match
-            ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+                # Applying the intensity match
+                ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+
+                # Getting union of all scans
+                filtered_scans = filtered_scans.union(set(ms2_filtered_df["scan"]))
 
             # Filtering the actual data structures
-            filtered_scans = set(ms2_filtered_df["scan"])
             ms2_df = ms2_df[ms2_df["scan"].isin(filtered_scans)]
 
             # Filtering the MS1 data now
diff --git a/msql_parser.py b/msql_parser.py
index 27eac6a..378ec30 100644
--- a/msql_parser.py
+++ b/msql_parser.py
@@ -87,7 +87,7 @@ def qualifier(self, items):
     def condition(self, items):
         condition_dict = {}
         condition_dict["type"] = items[0].children[0]
-        condition_dict["value"] = [items[-1]]
+        condition_dict["value"] = items[-1]
         return condition_dict
 
     def wherefullcondition(self, items):
@@ -159,7 +159,16 @@ def filterfullcondition(self, items):
             merged_list += items[-1]
 
             return merged_list
+
+    def conditionvalue(self, items):
+        if len(items) == 1:
+            return items
+        if len(items) == 2:
+            merged_list = []
+            merged_list += items[0]
+            merged_list += items[-1]
+            return merged_list
 
     def querytype(self, items):
         query_dict = {}
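
Note on the parser change above: the new conditionvalue rule in msql.ebnf is left-recursive, and the conditionvalue() method added to the transformer flattens the nested matches into a single Python list of values. The snippet below is a standalone sketch of that pattern with Lark; it is not code from this patch, and the mini-grammar, the NUMBER terminal, and the float() conversion are illustrative assumptions.

from lark import Lark, Transformer

# Illustrative mini-grammar (assumption): only the comma rule, with NUMBER
# standing in for the real variable/floating alternatives in msql.ebnf.
GRAMMAR = r"""
conditionvalue: conditionvalue "," conditionvalue
    | NUMBER

%import common.NUMBER
%import common.WS
%ignore WS
"""

class ConditionValueFlattener(Transformer):
    def conditionvalue(self, items):
        # A single child is a NUMBER token: wrap it in a list.
        if len(items) == 1:
            return [float(items[0])]
        # Two children are already lists from deeper matches: concatenate them.
        return items[0] + items[1]

parser = Lark(GRAMMAR, start="conditionvalue")
tree = parser.parse("271.06, 217.1")
print(ConditionValueFlattener().transform(tree))  # [271.06, 217.1]

With PATCH 2/4 below, condition() then stores such a list directly and only wraps a subquery result (a dict) in a one-element list.
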
diff --git a/test.py b/test.py
index 225aa11..445780f 100644
--- a/test.py
+++ b/test.py
@@ -48,6 +48,8 @@ def test_qc_ms1_ms2peak():
     print(set(results_df["scan"]))
     assert(len(results_df) > 1000)
 
+
+
 def test_diphen():
     query = "QUERY scannum(MS2DATA) WHERE MS2PROD=167.0857:TOLERANCEPPM=5"
     print(msql_parser.parse_msql(query))
@@ -280,6 +282,14 @@ def test_gnps_full_library():
     results_df = msql_engine.process_query(query, "test/gnps.json")
     print(results_df)
 
+def test_multiple_mz():
+    query = "QUERY scaninfo(MS2DATA) WHERE \
+        MS2PROD=271.06,217.1"
+    parse_obj = msql_parser.parse_msql(query)
+    print(parse_obj)
+
+    results_df = msql_engine.process_query(query, "test/GNPS00002_A3_p.mzML")
+    print(results_df)
 
 def test_networking_mgf_library():
     query = "QUERY scaninfo(MS2DATA) WHERE \
@@ -395,12 +405,13 @@ def main():
     #test_ms1_cu()
     #test_neutral_loss_intensity()
     #test_gnps_library()
-    test_gnps_full_library()
+    #test_gnps_full_library()
     #test_networking_mgf_library()
     #test_swath()
     #test_albicidin_tag()
     #test_double_brominated()
     #test_agilent()
+    test_multiple_mz()
 
 if __name__ == "__main__":
     main()
diff --git a/test_queries.txt b/test_queries.txt
index 0813b5e..a10885f 100644
--- a/test_queries.txt
+++ b/test_queries.txt
@@ -10,4 +10,5 @@ QUERY scannum(MS2DATA) WHERE MS2PROD=88:TOLERANCEMZ=0.1:INTENSITYPERCENT>10 AND
 QUERY scannum(MS2DATA) WHERE MS2NL=163
 QUERY scaninfo(MS1DATA) WHERE MS1MZ=425.2898:TOLERANCEMZ=0.1:INTENSITYPERCENT>0.1
 QUERY scaninfo(MS1DATA) WHERE MS1MZ=425.2898:TOLERANCEMZ=0.1:INTENSITYPERCENT>1
-QUERY scaninfo(MS1DATA) WHERE RTMIN=50
\ No newline at end of file
+QUERY scaninfo(MS1DATA) WHERE RTMIN=50
+QUERY scaninfo(MS1DATA) WHERE MS1MZ=425.2898,426.289
\ No newline at end of file

From dba7ad1f828b97c911a9b4d6204b3adf6f40bbe9 Mon Sep 17 00:00:00 2001
From: Mingxun Wang
Date: Thu, 24 Jun 2021 18:01:21 -0700
Subject: [PATCH 2/4] fixes for subquery

---
 msql_engine.py | 2 +-
 msql_parser.py | 5 ++++-
 test.py        | 9 +++++----
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/msql_engine.py b/msql_engine.py
index c985678..cad861e 100644
--- a/msql_engine.py
+++ b/msql_engine.py
@@ -591,7 +591,7 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
         if not condition["conditiontype"] == "filter":
             continue
 
-        logging.error("FILTER CONDITION", condition)
+        #logging.error("FILTER CONDITION", condition)
 
         # filtering MS1 peaks
         if condition["type"] == "ms1mzcondition":
diff --git a/msql_parser.py b/msql_parser.py
index 378ec30..5ff1ba6 100644
--- a/msql_parser.py
+++ b/msql_parser.py
@@ -87,7 +87,10 @@ def qualifier(self, items):
     def condition(self, items):
         condition_dict = {}
         condition_dict["type"] = items[0].children[0]
-        condition_dict["value"] = items[-1]
+        if type(items[-1]) is dict:
+            condition_dict["value"] = [items[-1]]
+        else:
+            condition_dict["value"] = items[-1]
         return condition_dict
 
     def wherefullcondition(self, items):
diff --git a/test.py b/test.py
index 445780f..15d4f5f 100644
--- a/test.py
+++ b/test.py
@@ -107,8 +107,8 @@ def test_variable_ms1():
 def test_subquery():
     #query = "QUERY scanrangesum(MS1DATA, TOLERANCE=0.1) WHERE MS1MZ=(QUERY scanmz(MS2DATA) WHERE MS2NL=176.0321 AND MS2PROD=85.02915)"
     query = "QUERY MS1DATA WHERE MS1MZ=(QUERY scanmz(MS2DATA) WHERE MS2NL=176.0321 AND MS2PROD=85.02915)"
-    results_df = msql_engine.process_query(query, "test/GNPS00002_A3_p.mzML")
     print(json.dumps(msql_parser.parse_msql(query), indent=4))
+    results_df = msql_engine.process_query(query, "test/GNPS00002_A3_p.mzML")
     print(results_df)
 
 def test_filter():
@@ -358,7 +358,8 @@ def test_parse():
     for line in open("test_queries.txt"):
         test_query = line.rstrip()
         print(test_query)
-        msql_parser.parse_msql(test_query)
+        parsed_result = msql_parser.parse_msql(test_query)
+        assert(parsed_result is not None)
 
 def test_query():
     for line in open("test_queries.txt"):
@@ -382,7 +383,7 @@ def main():
     #test_parse()
     #test_query()
     #test_xic()
-    #test_subquery()
+    test_subquery()
     #test_variable_parse()
     #test_variable()
     #test_variable_ms1()
@@ -411,7 +412,7 @@ def main():
     #test_albicidin_tag()
     #test_double_brominated()
     #test_agilent()
-    test_multiple_mz()
+    #test_multiple_mz()
 
 if __name__ == "__main__":
     main()

From 6646126f4faf21a0ffcf7fe4dc2088a84674fab2 Mon Sep 17 00:00:00 2001
From: Mingxun Wang
Date: Thu, 24 Jun 2021 18:04:22 -0700
Subject: [PATCH 3/4] updating for mz and neutral loss

---
 msql_engine.py | 74 ++++++++++++++++++++++++++++------------------------
 1 file changed, 39 insertions(+), 35 deletions(-)

diff --git a/msql_engine.py b/msql_engine.py
index cad861e..58c62ee 100644
--- a/msql_engine.py
+++ b/msql_engine.py
@@ -514,28 +514,31 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
         # Filtering MS2 Neutral Loss
         if condition["type"] == "ms2neutrallosscondition":
-            mz = condition["value"][0]
-            mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
-            nl_min = mz - mz_tol
-            nl_max = mz + mz_tol
+            filtered_scans = set()
+            for mz in condition["value"]:
+                mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
+                nl_min = mz - mz_tol
+                nl_max = mz + mz_tol
 
-            min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
+                min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
 
-            ms2_filtered_df = ms2_df[
-                ((ms2_df["precmz"] - ms2_df["mz"]) > nl_min) &
-                ((ms2_df["precmz"] - ms2_df["mz"]) < nl_max) &
-                (ms2_df["i"] > min_int) &
-                (ms2_df["i_norm"] > min_intpercent)
-            ]
+                ms2_filtered_df = ms2_df[
+                    ((ms2_df["precmz"] - ms2_df["mz"]) > nl_min) &
+                    ((ms2_df["precmz"] - ms2_df["mz"]) < nl_max) &
+                    (ms2_df["i"] > min_int) &
+                    (ms2_df["i_norm"] > min_intpercent)
+                ]
 
-            # Setting the intensity match register
-            _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)
+                # Setting the intensity match register
+                _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)
 
-            # Applying the intensity match
-            ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+                # Applying the intensity match
+                ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+
+                # Getting union of all scans
+                filtered_scans = filtered_scans.union(set(ms2_filtered_df["scan"]))
 
             # Filtering the actual data structures
-            filtered_scans = set(ms2_filtered_df["scan"])
             ms2_df = ms2_df[ms2_df["scan"].isin(filtered_scans)]
 
             # Filtering the MS1 data now
@@ -546,33 +549,34 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
         # finding MS1 peaks
         if condition["type"] == "ms1mzcondition":
-            mz = condition["value"][0]
-            mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
-            mz_min = mz - mz_tol
-            mz_max = mz + mz_tol
+            filtered_scans = set()
+            for mz in condition["value"]:
+                mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
+                mz_min = mz - mz_tol
+                mz_max = mz + mz_tol
 
-            min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
-            ms1_filtered_df = ms1_df[
-                (ms1_df["mz"] > mz_min) &
-                (ms1_df["mz"] < mz_max) &
-                (ms1_df["i"] > min_int) &
-                (ms1_df["i_norm"] > min_intpercent)]
-
-            #print("YYY", mz_min, mz_max, min_int, min_intpercent, len(ms1_filtered_df))
+                min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
+                ms1_filtered_df = ms1_df[
+                    (ms1_df["mz"] > mz_min) &
+                    (ms1_df["mz"] < mz_max) &
+                    (ms1_df["i"] > min_int) &
+                    (ms1_df["i_norm"] > min_intpercent)]
+
+                #print("YYY", mz_min, mz_max, min_int, min_intpercent, len(ms1_filtered_df))
 
-            # Setting the intensity match register
-            _set_intensity_register(ms1_filtered_df, reference_conditions_register, condition)
+                # Setting the intensity match register
+                _set_intensity_register(ms1_filtered_df, reference_conditions_register, condition)
 
-            # Applying the intensity match
-            ms1_filtered_df = _filter_intensitymatch(ms1_filtered_df, reference_conditions_register, condition)
+                # Applying the intensity match
+                ms1_filtered_df = _filter_intensitymatch(ms1_filtered_df, reference_conditions_register, condition)
 
-            #print(ms1_filtered_df)
+                # Getting union of all scans
+                filtered_scans = filtered_scans.union(set(ms1_filtered_df["scan"]))
 
-            if len(ms1_filtered_df) == 0:
+            if len(filtered_scans) == 0:
                 return pd.DataFrame(), pd.DataFrame()
 
             # Filtering the actual data structures
-            filtered_scans = set(ms1_filtered_df["scan"])
             ms1_df = ms1_df[ms1_df["scan"].isin(filtered_scans)]
             ms2_df = ms2_df[ms2_df["ms1scan"].isin(filtered_scans)]

From f5d16e3cefe792b325ff7afcbfcd676d6d280d19 Mon Sep 17 00:00:00 2001
From: Mingxun Wang
Date: Fri, 25 Jun 2021 09:00:25 -0700
Subject: [PATCH 4/4] making sure the scans are filtered

---
 msql_engine.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/msql_engine.py b/msql_engine.py
index 58c62ee..d04e021 100644
--- a/msql_engine.py
+++ b/msql_engine.py
@@ -482,8 +482,9 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
                 # Applying the intensity match
                 ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
 
-                # Getting union of all scans
-                filtered_scans = filtered_scans.union(set(ms2_filtered_df["scan"]))
+                if len(ms2_filtered_df) > 0:
+                    # Getting union of all scans
+                    filtered_scans = filtered_scans.union(set(ms2_filtered_df["scan"]))
 
             # Filtering the actual data structures
             ms2_df = ms2_df[ms2_df["scan"].isin(filtered_scans)]
@@ -535,8 +536,9 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
                 # Applying the intensity match
                 ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
 
-                # Getting union of all scans
-                filtered_scans = filtered_scans.union(set(ms2_filtered_df["scan"]))
+                if len(ms2_filtered_df) > 0:
+                    # Getting union of all scans
+                    filtered_scans = filtered_scans.union(set(ms2_filtered_df["scan"]))
 
             # Filtering the actual data structures
             ms2_df = ms2_df[ms2_df["scan"].isin(filtered_scans)]
@@ -570,8 +572,9 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
                 # Applying the intensity match
                 ms1_filtered_df = _filter_intensitymatch(ms1_filtered_df, reference_conditions_register, condition)
 
-                # Getting union of all scans
-                filtered_scans = filtered_scans.union(set(ms1_filtered_df["scan"]))
+                if len(ms1_filtered_df) > 0:
+                    # Getting union of all scans
+                    filtered_scans = filtered_scans.union(set(ms1_filtered_df["scan"]))
 
             if len(filtered_scans) == 0:
                 return pd.DataFrame(), pd.DataFrame()
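
Taken together, the four patches make a comma-separated list of values behave as an OR within a single condition: each target m/z contributes the scans it matches, the per-value scan sets are unioned, and the MS1/MS2 peak tables are then restricted to that union. The following is a small self-contained sketch of that union-of-scans pattern with pandas; the scan/mz/i column names follow msql_engine.py, but the toy dataframe and the fixed tolerance are made up for illustration.

import pandas as pd

# Toy MS2 peak table using the column names from msql_engine.py.
ms2_df = pd.DataFrame({
    "scan": [1, 1, 2, 2, 3],
    "mz":   [271.06, 100.00, 217.10, 50.00, 400.00],
    "i":    [1000.0, 50.0, 800.0, 20.0, 10.0],
})

target_mzs = [271.06, 217.1]   # e.g. the parsed values of MS2PROD=271.06,217.1
mz_tol = 0.1                   # assumed absolute tolerance for this sketch

filtered_scans = set()
for mz in target_mzs:
    matches = ms2_df[(ms2_df["mz"] > mz - mz_tol) & (ms2_df["mz"] < mz + mz_tol)]
    if len(matches) > 0:
        # OR semantics: a scan is kept if it matches any one of the target values.
        filtered_scans = filtered_scans.union(set(matches["scan"]))

if len(filtered_scans) == 0:
    ms2_df = pd.DataFrame()
else:
    ms2_df = ms2_df[ms2_df["scan"].isin(filtered_scans)]

print(sorted(filtered_scans))  # [1, 2]

Different conditions joined by AND still intersect, because each condition filters the already-reduced dataframes in turn.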