From 4ad3ee0c4384840d63d297d7720788f512bd1234 Mon Sep 17 00:00:00 2001 From: antoinefa Date: Sun, 21 Jul 2024 19:59:14 -0400 Subject: [PATCH 01/10] prepare_numerical: remove applymap (obsolete) and optimize function --- code/recipes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/code/recipes.py b/code/recipes.py index 95df6db..73ae36d 100644 --- a/code/recipes.py +++ b/code/recipes.py @@ -1588,9 +1588,8 @@ def prepare_categorical(self, df=None): return df def prepare_numerical(self, df=None): - df = df[self.numerical].fillna("") - df = df.applymap(lambda x: 0 if ( - (str(x) == "") | (x == None)) else float(x)) + df = df[self.numerical].fillna("0") + df = df.apply(lambda col: pd.to_numeric(col, errors='coerce').fillna(0)) return df def internal_fillna(self, df=None, desc=None): From faa083de68c7f4de66a525635b0026f9b05e37af Mon Sep 17 00:00:00 2001 From: antoinefa Date: Sun, 21 Jul 2024 20:26:14 -0400 Subject: [PATCH 02/10] build_model: applymap removal & optim --- code/recipes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/recipes.py b/code/recipes.py index 73ae36d..002bd13 100644 --- a/code/recipes.py +++ b/code/recipes.py @@ -1755,7 +1755,7 @@ def internal_build_model(self, df=None, desc=None): # for debug: self.log.write("{} {} {} {} # {}".format(X.shape,len(self.numerical),Xn.shape,len(self.categorical),Xc.shape)) - Y = df[self.target].applymap(lambda x: 1 if x else 0) + Y = np.where(df[self.target], 1, 0) # prep = DictVectorizer() # X=X.to_dict().values() # X = prep.fit_transform(X).toarray() From 33ad4fd2f1f241695d4315d9592721c9f5ca96aa Mon Sep 17 00:00:00 2001 From: antoinefa Date: Sun, 21 Jul 2024 20:34:50 -0400 Subject: [PATCH 03/10] internal_to_integer: applymap removal and optim --- code/recipes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/recipes.py b/code/recipes.py index 002bd13..cc935a3 100644 --- a/code/recipes.py +++ b/code/recipes.py @@ -1890,8 +1890,8 @@ def 
internal_to_integer(self, df=None, desc=None): # keep only selected columns self.select_columns(df=df) try: - df[self.cols] = df[self.cols].applymap( - lambda x: np.nan if (str(x) == "") else int(x)) + # Convert columns to integers, keeping NaN as is + df[self.cols] = df[self.cols].apply(lambda col: pd.to_numeric(col, errors='coerce')) return df except SystemExit: return df From f1965819945ce324c350f9c477c731307efc634d Mon Sep 17 00:00:00 2001 From: antoinefa Date: Sun, 21 Jul 2024 20:38:19 -0400 Subject: [PATCH 04/10] internal_list_to_tuple and internal_tuple_to_list: applymap removal and optim --- code/recipes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/code/recipes.py b/code/recipes.py index cc935a3..5bc4c99 100644 --- a/code/recipes.py +++ b/code/recipes.py @@ -1903,8 +1903,7 @@ def internal_list_to_tuple(self, df=None, desc=None): # keep only selected columns self.select_columns(df=df) try: - df[self.cols] = df[self.cols].applymap( - lambda x: tuple(x) if (type(x) == list) else x) + df[self.cols] = df[self.cols].apply(lambda col: col.apply(lambda x: tuple(x) if isinstance(x, list) else x)) return df except SystemExit: return df @@ -1916,8 +1915,7 @@ def internal_tuple_to_list(self, df=None, desc=None): # keep only selected columns self.select_columns(df=df) try: - df[self.cols] = df[self.cols].applymap( - lambda x: list(x) if (type(x) == tuple) else x) + df[self.cols] = df[self.cols].apply(lambda col: col.apply(lambda x: list(x) if isinstance(x, tuple) else x)) return df except SystemExit: return df From cfa077c08facfb82aa16893e59a72a2fc00735cc Mon Sep 17 00:00:00 2001 From: antoinefa Date: Sun, 21 Jul 2024 20:43:43 -0400 Subject: [PATCH 05/10] internal_to_float: applymap removal and optim --- code/recipes.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/code/recipes.py b/code/recipes.py index 5bc4c99..d666b32 100644 --- a/code/recipes.py +++ b/code/recipes.py @@ -1926,13 +1926,11 @@ def 
internal_tuple_to_list(self, df=None, desc=None): def internal_to_float(self, df=None, desc=None): # keep only selected columns self.select_columns(df=df) + na_value = self.args.get("na_value", np.nan) + try: - na_value = self.args["na_value"] - except: - na_value = np.nan - try: - df[self.cols] = df[self.cols].applymap( - lambda x: na_value if (str(x) == "") else float(x)) + # Convert columns to floats, setting non-convertible values to na_value + df[self.cols] = df[self.cols].apply(lambda col: pd.to_numeric(col, errors='coerce').fillna(na_value)) return df except SystemExit: return df From 59588c66801a11151d9cd5e9b076bb48f6f1bd1d Mon Sep 17 00:00:00 2001 From: antoinefa Date: Sun, 21 Jul 2024 20:49:10 -0400 Subject: [PATCH 06/10] internal_ngram: applymap removal and optim --- code/recipes.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/code/recipes.py b/code/recipes.py index d666b32..f028fad 100644 --- a/code/recipes.py +++ b/code/recipes.py @@ -1941,13 +1941,13 @@ def internal_to_float(self, df=None, desc=None): def internal_ngram(self, df=None, desc=None): # keep only selected columns self.select_columns(df=df) - if ("n" in list(self.args.keys())): - n = self.args['n'] - else: - n = list([2, 3]) + n = self.args.get('n', [2, 3]) # Use get with a default value for simplification + try: - df[self.cols] = df[self.cols].applymap( - lambda x: ngrams(tokenize(normalize(x)), n)) + # Apply n-gram generation to each column + df[self.cols] = df[self.cols].apply( + lambda col: col.apply(lambda x: ngrams(tokenize(normalize(x)), n)) + ) return df except SystemExit: return df From 7502849f0a383a42594d532c828a992205552b55 Mon Sep 17 00:00:00 2001 From: antoinefa Date: Sun, 21 Jul 2024 20:53:15 -0400 Subject: [PATCH 07/10] internal_join: applymap removal and optim --- code/recipes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/code/recipes.py b/code/recipes.py index f028fad..d8a6bf4 100644 --- a/code/recipes.py +++ 
b/code/recipes.py @@ -2243,8 +2243,7 @@ def internal_join(self, df=None, desc=None): if True: m_res = [] - rest = df.applymap(lambda x: "" if x is None else x) - rest.fillna("", inplace=True) + rest = df.fillna("") # elasticsearch bulk search while rest.shape[0] > 0: From f93df24eb69fb0df9e2b9a85e12368d05f891543 Mon Sep 17 00:00:00 2001 From: antoinefa Date: Sun, 21 Jul 2024 20:53:40 -0400 Subject: [PATCH 08/10] internal_parsedate: lint --- code/recipes.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/code/recipes.py b/code/recipes.py index d8a6bf4..36d407a 100644 --- a/code/recipes.py +++ b/code/recipes.py @@ -2421,9 +2421,6 @@ def internal_parsedate(self, df=None, desc=None): for col in self.cols: df[col] = pd.to_datetime( df[col], errors='coerce', format=self.args["format"]) - # df[self.cols]=df[self.cols].applymap(lambda x: - # parsedate(x,self.args["format"])) - return df def internal_replace(self, df=None, desc=None): From 63a12162c055cd7d4ee9aaf7e073c37ad69c9098 Mon Sep 17 00:00:00 2001 From: antoinefa Date: Sun, 21 Jul 2024 21:01:15 -0400 Subject: [PATCH 09/10] internal_replace: applymap removal --- code/recipes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/code/recipes.py b/code/recipes.py index 36d407a..7c5bfd1 100644 --- a/code/recipes.py +++ b/code/recipes.py @@ -2432,8 +2432,7 @@ def internal_replace(self, df=None, desc=None): for r in self.args["regex"]: regex.append([re.compile(list(r.keys())[0]), r[list(r.keys())[0]]]) pd.options.mode.chained_assignment = None - df[self.cols] = df[self.cols].applymap( - lambda x: replace_regex(x, regex)) + df[self.cols] = df[self.cols].apply(lambda col: col.apply(lambda x: replace_regex(x, regex))) return df else: return df From 671ea4df1c6ad360e495934664de9f5606333047 Mon Sep 17 00:00:00 2001 From: antoinefa Date: Sun, 21 Jul 2024 21:05:19 -0400 Subject: [PATCH 10/10] api.py: applymap removal --- Makefile | 2 +- code/api.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile 
b/Makefile index b2c4a1d..2834ee7 100644 --- a/Makefile +++ b/Makefile @@ -108,7 +108,7 @@ export APP_VERSION = ${tag}-${version} commit = ${APP_VERSION} lastcommit := $(shell touch .lastcommit && cat .lastcommit) date := $(shell date -I) -id := $(shell openssl rand -base64 8) +id:=myid vm_max_count := $(shell cat /etc/sysctl.conf | egrep vm.max_map_count\s*=\s*262144 && echo true) diff --git a/code/api.py b/code/api.py index 1b1822e..f3f89c9 100755 --- a/code/api.py +++ b/code/api.py @@ -610,7 +610,7 @@ def post(self, dataset): except: return {"data": [{"error": "error: no such table {}".format(ds.table)}]} df = df.head(n=size).reset_index(drop=True) - df = df.applymap(lambda x: unicode_safe(x)) + df = df.apply(lambda col: col.map(unicode_safe)) if (format_type == 'json'): return {"data": list(df.fillna("").T.to_dict().values()), "schema": schema} elif (format_type == 'csv'): @@ -957,7 +957,7 @@ def post(self, recipe, action): try: return jsonify({"data": list(df.T.to_dict().values()), "log": str(r.log.writer.getvalue())}) except: - df = df.applymap(lambda x: str(x)) + df = df.apply(lambda col: col.astype(str)) return jsonify({"data": list(df.T.to_dict().values()), "log": str(r.log.writer.getvalue())}) else: return {"log": r.log.writer.getvalue()} @@ -993,7 +993,7 @@ def put(self, recipe, action): try: return jsonify({"data": list(df.T.to_dict().values()), "log": r.callback["log"]}) except: - df = df.applymap(lambda x: unicode_safe(x)) + df = df.apply(lambda col: col.map(unicode_safe)) return jsonify({"data": list(df.T.to_dict().values()), "log": r.callback["log"]}) else: return {"data": [{"result": "empty"}], "log": r.callback["log"]}