diff --git a/Makefile b/Makefile index b2c4a1d..2834ee7 100644 --- a/Makefile +++ b/Makefile @@ -108,7 +108,7 @@ export APP_VERSION = ${tag}-${version} commit = ${APP_VERSION} lastcommit := $(shell touch .lastcommit && cat .lastcommit) date := $(shell date -I) -id := $(shell openssl rand -base64 8) +id:=myid vm_max_count := $(shell cat /etc/sysctl.conf | egrep vm.max_map_count\s*=\s*262144 && echo true) diff --git a/code/api.py b/code/api.py index 1b1822e..f3f89c9 100755 --- a/code/api.py +++ b/code/api.py @@ -610,7 +610,7 @@ def post(self, dataset): except: return {"data": [{"error": "error: no such table {}".format(ds.table)}]} df = df.head(n=size).reset_index(drop=True) - df = df.applymap(lambda x: unicode_safe(x)) + df = df.apply(lambda col: col.map(unicode_safe)) if (format_type == 'json'): return {"data": list(df.fillna("").T.to_dict().values()), "schema": schema} elif (format_type == 'csv'): @@ -957,7 +957,7 @@ def post(self, recipe, action): try: return jsonify({"data": list(df.T.to_dict().values()), "log": str(r.log.writer.getvalue())}) except: - df = df.applymap(lambda x: str(x)) + df = df.apply(lambda col: col.astype(str)) return jsonify({"data": list(df.T.to_dict().values()), "log": str(r.log.writer.getvalue())}) else: return {"log": r.log.writer.getvalue()} @@ -993,7 +993,7 @@ def put(self, recipe, action): try: return jsonify({"data": list(df.T.to_dict().values()), "log": r.callback["log"]}) except: - df = df.applymap(lambda x: unicode_safe(x)) + df = df.apply(lambda col: col.map(unicode_safe)) return jsonify({"data": list(df.T.to_dict().values()), "log": r.callback["log"]}) else: return {"data": [{"result": "empty"}], "log": r.callback["log"]} diff --git a/code/recipes.py b/code/recipes.py index 95df6db..7c5bfd1 100644 --- a/code/recipes.py +++ b/code/recipes.py @@ -1588,9 +1588,8 @@ def prepare_categorical(self, df=None): return df def prepare_numerical(self, df=None): - df = df[self.numerical].fillna("") - df = df.applymap(lambda x: 0 if ( - 
(str(x) == "") | (x == None)) else float(x)) + df = df[self.numerical].fillna("0") + df = df.apply(lambda col: pd.to_numeric(col, errors='coerce').fillna(0)) return df def internal_fillna(self, df=None, desc=None): @@ -1756,7 +1755,7 @@ def internal_build_model(self, df=None, desc=None): # for debug: self.log.write("{} {} {} {} # {}".format(X.shape,len(self.numerical),Xn.shape,len(self.categorical),Xc.shape)) - Y = df[self.target].applymap(lambda x: 1 if x else 0) + Y = np.where(df[self.target], 1, 0) # prep = DictVectorizer() # X=X.to_dict().values() # X = prep.fit_transform(X).toarray() @@ -1891,8 +1890,8 @@ def internal_to_integer(self, df=None, desc=None): # keep only selected columns self.select_columns(df=df) try: - df[self.cols] = df[self.cols].applymap( - lambda x: np.nan if (str(x) == "") else int(x)) + # Convert columns to integers, keeping NaN as is + df[self.cols] = df[self.cols].apply(lambda col: pd.to_numeric(col, errors='coerce')) return df except SystemExit: return df @@ -1904,8 +1903,7 @@ def internal_list_to_tuple(self, df=None, desc=None): # keep only selected columns self.select_columns(df=df) try: - df[self.cols] = df[self.cols].applymap( - lambda x: tuple(x) if (type(x) == list) else x) + df[self.cols] = df[self.cols].apply(lambda col: col.apply(lambda x: tuple(x) if isinstance(x, list) else x)) return df except SystemExit: return df @@ -1917,8 +1915,7 @@ def internal_tuple_to_list(self, df=None, desc=None): # keep only selected columns self.select_columns(df=df) try: - df[self.cols] = df[self.cols].applymap( - lambda x: list(x) if (type(x) == tuple) else x) + df[self.cols] = df[self.cols].apply(lambda col: col.apply(lambda x: list(x) if isinstance(x, tuple) else x)) return df except SystemExit: return df @@ -1929,13 +1926,11 @@ def internal_tuple_to_list(self, df=None, desc=None): def internal_to_float(self, df=None, desc=None): # keep only selected columns self.select_columns(df=df) + na_value = self.args.get("na_value", np.nan) + try: - 
na_value = self.args["na_value"] - except: - na_value = np.nan - try: - df[self.cols] = df[self.cols].applymap( - lambda x: na_value if (str(x) == "") else float(x)) + # Convert columns to floats, setting non-convertible values to na_value + df[self.cols] = df[self.cols].apply(lambda col: pd.to_numeric(col, errors='coerce').fillna(na_value)) return df except SystemExit: return df @@ -1946,13 +1941,13 @@ def internal_to_float(self, df=None, desc=None): def internal_ngram(self, df=None, desc=None): # keep only selected columns self.select_columns(df=df) - if ("n" in list(self.args.keys())): - n = self.args['n'] - else: - n = list([2, 3]) + n = self.args.get('n', [2, 3]) # Use get with a default value for simplification + try: - df[self.cols] = df[self.cols].applymap( - lambda x: ngrams(tokenize(normalize(x)), n)) + # Apply n-gram generation to each column + df[self.cols] = df[self.cols].apply( + lambda col: col.apply(lambda x: ngrams(tokenize(normalize(x)), n)) + ) return df except SystemExit: return df @@ -2248,8 +2243,7 @@ def internal_join(self, df=None, desc=None): if True: m_res = [] - rest = df.applymap(lambda x: "" if x is None else x) - rest.fillna("", inplace=True) + rest = df.fillna("") # elasticsearch bulk search while rest.shape[0] > 0: @@ -2427,9 +2421,6 @@ def internal_parsedate(self, df=None, desc=None): for col in self.cols: df[col] = pd.to_datetime( df[col], errors='coerce', format=self.args["format"]) - # df[self.cols]=df[self.cols].applymap(lambda x: - # parsedate(x,self.args["format"])) - return df def internal_replace(self, df=None, desc=None): @@ -2441,8 +2432,7 @@ def internal_replace(self, df=None, desc=None): for r in self.args["regex"]: regex.append([re.compile(list(r.keys())[0]), r[list(r.keys())[0]]]) pd.options.mode.chained_assignment = None - df[self.cols] = df[self.cols].applymap( - lambda x: replace_regex(x, regex)) + df[self.cols] = df[self.cols].apply(lambda col: col.apply(lambda x: replace_regex(x, regex))) return df else: return df