From eba21a5ccabb69e28a8d306b4c21e1ac2b2946f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karol=20=C5=BBak?= Date: Tue, 20 Aug 2019 15:01:15 +0200 Subject: [PATCH 1/2] update as_batch to work correctly with multiclass labels There was a missing type() call needed to determine whether the label is of type np.ndarray. This is crucial for multiclass labels. --- mPyPl/keras.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mPyPl/keras.py b/mPyPl/keras.py index fda7757..e27d119 100644 --- a/mPyPl/keras.py +++ b/mPyPl/keras.py @@ -27,7 +27,7 @@ def as_batch(flow, feature_field_name='features', label_field_name='label', batc batch = [np.zeros((batchsize,)+flds[i].shape) for i in feature_field_name] else: batch = np.zeros((batchsize,)+flds[feature_field_name].shape) - lbls_shape = lbls.shape if lbls is np.ndarray else (1,) + lbls_shape = lbls.shape if type(lbls) is np.ndarray else (1,) labels = np.zeros((batchsize,)+lbls_shape) if isinstance(feature_field_name, list): for j,n in enumerate(feature_field_name): From 49c9155d872a6547ea1f2ba2cb64c545cdb687e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karol=20=C5=BBak?= Date: Wed, 21 Aug 2019 09:57:49 +0200 Subject: [PATCH 2/2] added output dtype parameters to as_batch function Now it takes the dtype either from the provided parameter or, if none is given, from the original data source. --- mPyPl/keras.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mPyPl/keras.py b/mPyPl/keras.py index e27d119..390914a 100644 --- a/mPyPl/keras.py +++ b/mPyPl/keras.py @@ -5,7 +5,7 @@ import numpy as np @Pipe -def as_batch(flow, feature_field_name='features', label_field_name='label', batchsize=16): +def as_batch(flow, feature_field_name='features', label_field_name='label', batchsize=16, out_features_dtype=None, out_labels_dtype=None): """ Split input datastream into a sequence of batches suitable for keras training. 
:param flow: input datastream @@ -22,18 +22,21 @@ def as_batch(flow, feature_field_name='features', label_field_name='label', batc # explicitly compute all fields - this is needed for all fields to be computed only once for on-demand evaluation flds = { i : data[i] for i in (feature_field_name if isinstance(feature_field_name, list) else [feature_field_name])} lbls = data[label_field_name] # TODO: what happens when label_field_name is a list? + if batch is None: if isinstance(feature_field_name, list): - batch = [np.zeros((batchsize,)+flds[i].shape) for i in feature_field_name] + batch = [np.zeros((batchsize,)+flds[i].shape, dtype=flds[i].dtype if out_features_dtype is None else out_features_dtype) for i in feature_field_name] else: - batch = np.zeros((batchsize,)+flds[feature_field_name].shape) + batch = np.zeros((batchsize,)+flds[feature_field_name].shape, dtype=flds[feature_field_name].dtype if out_features_dtype is None else out_features_dtype) + lbls_shape = lbls.shape if type(lbls) is np.ndarray else (1,) - labels = np.zeros((batchsize,)+lbls_shape) + out_labels_dtype = out_labels_dtype if out_labels_dtype is not None else lbls.dtype if type(lbls) is np.ndarray else None + labels = np.zeros((batchsize,)+lbls_shape, dtype=out_labels_dtype) if isinstance(feature_field_name, list): for j,n in enumerate(feature_field_name): batch[j][i] = flds[n] else: batch[i] = flds[feature_field_name] - labels[i] = data[label_field_name] + labels[i] = lbls yield (batch, labels) batch = labels = None