diff --git a/machine_learning/point_cloud_segmentation/point_cloud_segmentation.ipynb b/machine_learning/point_cloud_segmentation/point_cloud_segmentation.ipynb new file mode 100644 index 0000000..329e18a --- /dev/null +++ b/machine_learning/point_cloud_segmentation/point_cloud_segmentation.ipynb @@ -0,0 +1,636 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Implementation of PointNet for ModelNet10 classification was taken from https://github.com/keras-team/keras-io/blob/master/examples/vision/pointnet.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import glob\n", + "import trimesh\n", + "import numpy as np\n", + "import tiledb\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from tensorflow.keras import layers\n", + "from matplotlib import pyplot as plt\n", + "\n", + "tf.random.set_seed(1234)\n", + "\n", + "NUM_POINTS = 2048\n", + "NUM_CLASSES = 10\n", + "BATCH_SIZE = 32" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Load dataset\n", + "We use the ModelNet10 model dataset, the smaller 10 class version of the ModelNet40\n", + "dataset. 
First download the data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "DATA_DIR = tf.keras.utils.get_file(\n", + " \"modelnet.zip\",\n", + " \"http://3dvision.princeton.edu/projects/2014/3DShapeNets/ModelNet10.zip\",\n", + " extract=True,\n", + ")\n", + "DATA_DIR = os.path.join(os.path.dirname(DATA_DIR), \"ModelNet10\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Where all our data live.\n", + "DATA_PATH = \"data/\"\n", + "\n", + "# Where our tileDB arrays live.\n", + "TILEDB_PATH = \"data/tiledb\"\n", + "\n", + "# Where trained models live\n", + "MODEL_PATH = \"data/trained_models\"\n", + "\n", + "if not os.path.exists(DATA_PATH):\n", + " os.mkdir(DATA_PATH)\n", + "\n", + "if not os.path.exists(TILEDB_PATH):\n", + " os.mkdir(TILEDB_PATH)\n", + "\n", + "if not os.path.exists(MODEL_PATH):\n", + " os.mkdir(MODEL_PATH)\n", + "\n", + "os.system(\"mv ~/.keras/datasets/ModelNet10 ./data\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Function for ingestion in TileDB\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def ingest_in_tiledb(num_points=2048):\n", + "\n", + " train_points = []\n", + " train_labels = []\n", + " test_points = []\n", + " test_labels = []\n", + " class_map = {}\n", + " folders = glob.glob(os.path.join(\"data/ModelNet10\", \"[!README]*\"))\n", + "\n", + " for i, folder in enumerate(folders):\n", + " print(\"processing class: {}\".format(os.path.basename(folder)))\n", + " # store folder name with ID so we can retrieve later\n", + " class_map[i] = folder.split(\"/\")[-1]\n", + " # gather all files\n", + " train_files = glob.glob(os.path.join(folder, 
\"train/*\"))\n", + " test_files = glob.glob(os.path.join(folder, \"test/*\"))\n", + "\n", + " for f in train_files:\n", + " train_points.append(trimesh.load(f).sample(num_points))\n", + " train_labels.append(i)\n", + "\n", + " for f in test_files:\n", + " test_points.append(trimesh.load(f).sample(num_points))\n", + " test_labels.append(i)\n", + "\n", + " train_points = np.stack(train_points, axis=0).astype(np.float32)\n", + " train_labels = np.array(train_labels).astype(np.float32)\n", + "\n", + " test_points = np.stack(test_points, axis=0).astype(np.float32)\n", + " test_labels = np.array(test_labels).astype(np.float32)\n", + "\n", + " # Shuffle point and label data in the same manner\n", + "\n", + " # TRAIN DATA\n", + " randomize = np.arange(train_points.shape[0])\n", + " np.random.shuffle(randomize)\n", + "\n", + " train_points = train_points[randomize]\n", + " train_labels = train_labels[randomize]\n", + "\n", + " # TEST DATA\n", + " randomize = np.arange(test_points.shape[0])\n", + " np.random.shuffle(randomize)\n", + "\n", + " test_points = test_points[randomize]\n", + " test_labels = test_labels[randomize]\n", + "\n", + " # Ingest data into TileDB\n", + "\n", + " # Define dimensions, Schema and write TileDB array for point cloud data\n", + " train_point_cloud_id = tiledb.Dim(name=\"point_cloud_id\", domain=(0, train_points.shape[0] - 1), tile=BATCH_SIZE, dtype=np.int32)\n", + " validate_point_cloud_id = tiledb.Dim(name=\"point_cloud_id\", domain=(0, test_points.shape[0] - 1), tile=BATCH_SIZE, dtype=np.int32)\n", + "\n", + " # The following dimensions are common\n", + " samples = tiledb.Dim(name=\"mesh_samples\", domain=(0, train_points.shape[1] - 1), tile=train_points.shape[1], dtype=np.int32)\n", + "\n", + " # Two different schemas for train and validate\n", + " train_point_cloud_schema = tiledb.ArraySchema(domain=tiledb.Domain(train_point_cloud_id, samples),\n", + " sparse=False,\n", + " attrs=[tiledb.Attr(name=\"features\", dtype=[(\"\", np.float32),\n", 
+ " (\"\", np.float32),\n", + " (\"\", np.float32)])])\n", + "\n", + " validate_point_cloud_schema = tiledb.ArraySchema(domain=tiledb.Domain(validate_point_cloud_id, samples),\n", + " sparse=False,\n", + " attrs=[tiledb.Attr(name=\"features\", dtype=[(\"\", np.float32),\n", + " (\"\", np.float32),\n", + " (\"\", np.float32)])])\n", + "\n", + " tiledb.Array.create(TILEDB_PATH + \"/train_point_cloud_array\", train_point_cloud_schema)\n", + " tiledb.Array.create(TILEDB_PATH + \"/validate_point_cloud_array\", validate_point_cloud_schema)\n", + "\n", + " train_view = train_points.view([(\"\", np.float32), (\"\", np.float32), (\"\", np.float32)])\n", + " validate_view = test_points.view([(\"\", np.float32), (\"\", np.float32), (\"\", np.float32)])\n", + "\n", + " with tiledb.open(TILEDB_PATH + \"/train_point_cloud_array\", 'w') as train_tiledb:\n", + " train_tiledb[:] = train_view\n", + "\n", + " with tiledb.open(TILEDB_PATH + \"/validate_point_cloud_array\", 'w') as validate_tiledb:\n", + " validate_tiledb[:] = validate_view\n", + "\n", + " print(\"[STATUS] point cloud TileDB arrays are ready.\")\n", + "\n", + " ######################################################\n", + "\n", + " # Similarly for label arrays.\n", + " train_label_id = tiledb.Dim(name=\"label_id\", domain=(0, train_labels.shape[0] - 1), tile=BATCH_SIZE, dtype=np.int32)\n", + " validate_label_id = tiledb.Dim(name=\"label_id\", domain=(0, test_labels.shape[0] - 1), tile=BATCH_SIZE,\n", + " dtype=np.int32)\n", + "\n", + " train_labels_schema = tiledb.ArraySchema(domain=tiledb.Domain(train_label_id),\n", + " sparse=False,\n", + " attrs=[tiledb.Attr(name=\"label\",\n", + " dtype=[(\"\", np.float32)])])\n", + "\n", + " validate_labels_schema = tiledb.ArraySchema(domain=tiledb.Domain(validate_label_id),\n", + " sparse=False,\n", + " attrs=[tiledb.Attr(name=\"label\",\n", + " dtype=[(\"\", np.float32)])])\n", + "\n", + " tiledb.Array.create(TILEDB_PATH + \"/train_label_array\", train_labels_schema)\n", + " 
tiledb.Array.create(TILEDB_PATH + \"/validate_label_array\", validate_labels_schema)\n", + "\n", + " train_labels_view = train_labels.view([(\"\", np.float32)])\n", + " validate_labels_view = test_labels.view([(\"\", np.float32)])\n", + "\n", + " with tiledb.open(TILEDB_PATH + \"/train_label_array\", 'w') as train_labels_tiledb:\n", + " train_labels_tiledb[:] = train_labels_view\n", + "\n", + " with tiledb.open(TILEDB_PATH + \"/validate_label_array\", 'w') as validate_labels_tiledb:\n", + " validate_labels_tiledb[:] = validate_labels_view\n", + "\n", + " print(\"[STATUS] labels TileDB arrays are ready.\")\n", + "\n", + " return class_map" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Run ingestion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "CLASS_MAP = ingest_in_tiledb(NUM_POINTS)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will need a data generator that will feed training and validation data into the model while training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def generator(tiledb_images_obj, tiledb_labels_obj, shape, batch_size=BATCH_SIZE):\n", + " \"\"\"\n", + " Yields the next training batch.\n", + " \"\"\"\n", + "\n", + " while True: # Loop forever so the generator never terminates\n", + "\n", + " # Get index to start each batch\n", + " for offset in range(0, shape, batch_size):\n", + "\n", + " # Get the samples you'll use in this batch. 
We have to convert structured numpy arrays to\n", + " # numpy arrays.\n", + "\n", + " # Avoid reshaping error in last batch\n", + " if offset + batch_size > shape:\n", + " batch_size = shape - offset\n", + "\n", + " x_train = tiledb_images_obj[offset:offset + batch_size]['features'].view(np.float32).reshape(batch_size, NUM_POINTS, 3)\n", + "\n", + " y_train = tiledb_labels_obj[offset:offset + batch_size]['label'].view(np.float32).reshape(batch_size, 1)\n", + "\n", + " # Augment points...jitter\n", + " augment = lambda x: x + tf.random.uniform(x.shape, -0.005, 0.005, dtype=tf.float64)\n", + " x_train = augment(x_train)\n", + "\n", + " # The generator-y part: yield the next training batch\n", + " yield x_train, y_train" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We will create generators for train and validation data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Open TileDB image and label arrays.\n", + "train_point_clouds_tiledb = tiledb.open(TILEDB_PATH + \"/train_point_cloud_array\")\n", + "train_labels_tiledb = tiledb.open(TILEDB_PATH + \"/train_label_array\")\n", + "\n", + "validate_point_clouds_tiledb = tiledb.open(TILEDB_PATH + \"/validate_point_cloud_array\")\n", + "validate_labels_tiledb = tiledb.open(TILEDB_PATH + \"/validate_label_array\")\n", + "\n", + "# Create generators\n", + "train_generator = generator(tiledb_images_obj=train_point_clouds_tiledb,\n", + " tiledb_labels_obj=train_labels_tiledb,\n", + " shape=train_point_clouds_tiledb.domain.shape[0],\n", + " batch_size=BATCH_SIZE)\n", + "\n", + "\n", + "validate_generator = generator(tiledb_images_obj=validate_point_clouds_tiledb,\n", + " tiledb_labels_obj=validate_labels_tiledb,\n", + " shape=validate_point_clouds_tiledb.domain.shape[0],\n", + " batch_size=BATCH_SIZE)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Define model. Each convolution and fully-connected layer (with the exception of the end layers) consists of\n", + "Convolution / Dense -> Batch Normalization -> ReLU Activation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def conv_bn(x, filters):\n", + " x = layers.Conv1D(filters, kernel_size=1, padding=\"valid\")(x)\n", + " x = layers.BatchNormalization(momentum=0.0)(x)\n", + " return layers.Activation(\"relu\")(x)\n", + "\n", + "\n", + "def dense_bn(x, filters):\n", + " x = layers.Dense(filters)(x)\n", + " x = layers.BatchNormalization(momentum=0.0)(x)\n", + " return layers.Activation(\"relu\")(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "PointNet consists of two core components. The primary MLP network, and the transformer\n", + "net (T-net). The T-net aims to learn an affine transformation matrix by its own mini\n", + "network. The T-net is used twice. The first time to transform the input features (n, 3)\n", + "into a canonical representation. The second is an affine transformation for alignment in\n", + "feature space (n, 32). As per the original paper we constrain the transformation to be\n", + "close to an orthogonal matrix (i.e. ||X*X^T - I|| = 0)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "class OrthogonalRegularizer(keras.regularizers.Regularizer):\n", + " def __init__(self, num_features, l2reg=0.001):\n", + " self.num_features = num_features\n", + " self.l2reg = l2reg\n", + " self.eye = tf.eye(num_features)\n", + "\n", + " def __call__(self, x):\n", + " x = tf.reshape(x, (-1, self.num_features, self.num_features))\n", + " xxt = tf.tensordot(x, x, axes=(2, 2))\n", + " xxt = tf.reshape(xxt, (-1, self.num_features, self.num_features))\n", + " return tf.reduce_sum(self.l2reg * tf.square(xxt - self.eye))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can then define a general function to build T-net layers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def tnet(inputs, num_features):\n", + "\n", + " # Initalise bias as the indentity matrix\n", + " bias = keras.initializers.Constant(np.eye(num_features).flatten())\n", + " reg = OrthogonalRegularizer(num_features)\n", + "\n", + " x = conv_bn(inputs, 32)\n", + " x = conv_bn(x, 64)\n", + " x = conv_bn(x, 512)\n", + " x = layers.GlobalMaxPooling1D()(x)\n", + " x = dense_bn(x, 256)\n", + " x = dense_bn(x, 128)\n", + " x = layers.Dense(\n", + " num_features * num_features,\n", + " kernel_initializer=\"zeros\",\n", + " bias_initializer=bias,\n", + " activity_regularizer=reg,\n", + " )(x)\n", + " feat_T = layers.Reshape((num_features, num_features))(x)\n", + " # Apply affine transformation to input features\n", + " return layers.Dot(axes=(2, 1))([inputs, feat_T])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "The main network can be then implemented in the same manner where the t-net mini models\n", + "can 
be dropped in as layers in the graph. Here we replicate the network architecture\n", + "published in the original paper but with half the number of weights at each layer as we\n", + "are using the smaller 10 class ModelNet dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "inputs = keras.Input(shape=(NUM_POINTS, 3))\n", + "\n", + "x = tnet(inputs, 3)\n", + "x = conv_bn(x, 32)\n", + "x = conv_bn(x, 32)\n", + "x = tnet(x, 32)\n", + "x = conv_bn(x, 32)\n", + "x = conv_bn(x, 64)\n", + "x = conv_bn(x, 512)\n", + "x = layers.GlobalMaxPooling1D()(x)\n", + "x = dense_bn(x, 256)\n", + "x = layers.Dropout(0.3)(x)\n", + "x = dense_bn(x, 128)\n", + "x = layers.Dropout(0.3)(x)\n", + "\n", + "outputs = layers.Dense(NUM_CLASSES, activation=\"softmax\")(x)\n", + "\n", + "model = keras.Model(inputs=inputs, outputs=outputs, name=\"pointnet\")\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Train model\n", + "Once the model is defined it can be trained like any other standard classification model\n", + "using `.compile()` and `.fit()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "model.compile(\n", + " loss=\"sparse_categorical_crossentropy\",\n", + " optimizer=keras.optimizers.Adam(learning_rate=0.001),\n", + " metrics=[\"sparse_categorical_accuracy\"],\n", + ")\n", + "\n", + "model.fit(train_generator,\n", + " steps_per_epoch=train_point_clouds_tiledb.domain.shape[0] // BATCH_SIZE,\n", + " epochs=1,\n", + " validation_data=validate_generator,\n", + " validation_steps=validate_point_clouds_tiledb.domain.shape[0] // BATCH_SIZE)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Visualize some predictions\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "num_of_points = 8\n", + "points = validate_point_clouds_tiledb[:num_of_points]['features'].view(np.float32).reshape(num_of_points, NUM_POINTS, 3)\n", + "labels = validate_labels_tiledb[:num_of_points]['label'].view(np.float32).reshape(num_of_points,) \n", + "\n", + "# run test data through model\n", + "preds = model.predict(points)\n", + "preds = tf.math.argmax(preds, -1)\n", + "\n", + "# plot points with predicted class and label\n", + "fig = plt.figure(figsize=(15, 10))\n", + "for i in range(num_of_points):\n", + " ax = fig.add_subplot(2, 4, i + 1, projection=\"3d\")\n", + " ax.scatter(points[i, :, 0], points[i, :, 1], points[i, :, 2])\n", + " ax.set_title(\n", + " \"pred: {:}, label: {:}\".format(\n", + " CLASS_MAP[preds.numpy()[i]], CLASS_MAP[labels[i]]\n", + " )\n", + " )\n", + " ax.set_axis_off()\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}