colabs/pix2seq_inference_multitask.ipynb (71 changes: 37 additions & 34 deletions)
@@ -2,6 +2,12 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"cellView": "form",
"id": "0BkmPh5WJDHX"
},
"outputs": [],
"source": [
"#@title License\n",
"# Copyright 2022 The Pix2Seq Authors.\n",
@@ -18,16 +24,13 @@
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# =============================================================================="
],
"metadata": {
"cellView": "form",
"id": "0BkmPh5WJDHX"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "iGuRf1kFH0-H"
},
"source": [
"## A Unified Sequence Interface for Vision Tasks\n",
"<a href=\"https://colab.research.google.com/github/google-research/pix2seq/blob/master/colabs/pix2seq_inference_multitask.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
@@ -39,10 +42,7 @@
"-------------: | ---------------: | ---------: | --------: | -----------:\n",
"ViT-B | 115.2 | 640x640 | 44.2 | [gs://pix2seq/multi_task/ckpt/vit_b_640x640](https://console.cloud.google.com/storage/browser/pix2seq/multi_task/ckpt/vit_b_640x640)\n",
"ViT-B | 115.2 | 1024x1024 | 46.5 | [gs://pix2seq/multi_task/ckpt/vit_b_1024x1024](https://console.cloud.google.com/storage/browser/pix2seq/multi_task/ckpt/vit_b_1024x1024)"
],
"metadata": {
"id": "iGuRf1kFH0-H"
}
]
},
{
"cell_type": "code",
@@ -88,6 +88,11 @@
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "MUY8cSCOVhb2"
+},
+"outputs": [],
 "source": [
 "## Download coco annotations\n",
 "!mkdir /tmp/coco_annotations\n",
@@ -97,12 +102,7 @@
"!wget https://storage.googleapis.com/pix2seq/multi_task/data/coco/json/instances_val2017.json -P /tmp/coco_annotations/\n",
"!wget https://storage.googleapis.com/pix2seq/multi_task/data/coco/json/person_keypoints_train2017.json -P /tmp/coco_annotations/\n",
"!wget https://storage.googleapis.com/pix2seq/multi_task/data/coco/json/person_keypoints_val2017.json -P /tmp/coco_annotations/"
],
"metadata": {
"id": "MUY8cSCOVhb2"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
@@ -563,6 +563,11 @@
 },
 {
 "cell_type": "code",
+"execution_count": null,
+"metadata": {
+"id": "DQBbSG4fmVcb"
+},
+"outputs": [],
 "source": [
 "# Object detection.\n",
 "preprocessed_outputs = get_preprocessed_outputs_detection(\n",
@@ -572,15 +577,15 @@
"\n",
"vis_det = visualize_detection(results_det, np.asarray(im), task_det._category_names)\n",
"Image.fromarray(vis_det)"
],
"metadata": {
"id": "DQBbSG4fmVcb"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dZc6CmgeusrO"
},
"outputs": [],
"source": [
"# Get a bbox for person and use it for keypoint detection and instance segmentation.\n",
"pred_bboxes_rescaled = results_det[3].numpy().reshape((-1, 4))\n",
@@ -589,12 +594,7 @@
"bbox = list(pred_bboxes_rescaled[person_idx])\n",
"# You can also specify a custom box in (ymin, xmin, ymax, xmax) format.\n",
"# bbox = [148.5505782847104, 149.26714806698848, 325.032441173339, 294.49976727245297]"
],
"metadata": {
"id": "dZc6CmgeusrO"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
@@ -643,27 +643,30 @@
"# Captioning.\n",
"preprocessed_outputs = get_preprocessed_outputs_captioning(\n",
" im, image_id, config_cap)\n",
"captions = tf.expand_dims(task_cap._tokenizer.string_to_ids([\"dummy\"])[0],0)\n",
"preprocessed_outputs[2][1]['captions']=captions\n",
"infer_outputs = infer_cap(model, preprocessed_outputs)\n",
"outputs = task_cap.postprocess_tpu(*infer_outputs)\n",
"examples={**infer_outputs[0][0],**infer_outputs[0][1]}\n",
"outputs = task_cap.postprocess_tpu(examples,infer_outputs[1],infer_outputs[2])\n",
"print_captioning_result(outputs, task_cap._tokenizer.tokenizer)"
]
}
],
"metadata": {
"accelerator": "TPU",
"colab": {
"private_outputs": true,
"provenance": []
},
"gpuClass": "standard",
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
},
"accelerator": "TPU",
"gpuClass": "standard"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
}
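
For reference, the one functional change in this diff is the captioning cell's call convention for `task_cap.postprocess_tpu`: instead of splat-unpacking `infer_outputs`, the features and labels dicts are now merged into a single `examples` dict, and a placeholder caption is injected before decoding. Below is a minimal sketch of the updated flow, assuming (as the indexing in the diff implies) that `infer_outputs` is a nested tuple `((features, labels), pred_seq, logits)`, and reusing the helpers defined earlier in the notebook (`get_preprocessed_outputs_captioning`, `infer_cap`, `print_captioning_result`, `task_cap`); it is not a self-contained program.

```python
import tensorflow as tf

# Build a placeholder caption so preprocessing carries the 'captions' feature
# it expects; the placeholder text itself is not used at inference time
# (assumption based on the "dummy" string in the diff).
captions = tf.expand_dims(task_cap._tokenizer.string_to_ids(["dummy"])[0], 0)
preprocessed_outputs[2][1]["captions"] = captions

# Run autoregressive decoding.
infer_outputs = infer_cap(model, preprocessed_outputs)

# Assumed structure: infer_outputs == ((features, labels), pred_seq, logits).
# postprocess_tpu now takes one merged examples dict, not *infer_outputs.
features, labels = infer_outputs[0]
examples = {**features, **labels}
outputs = task_cap.postprocess_tpu(examples, infer_outputs[1], infer_outputs[2])

# Decode the predicted token ids back into a caption string.
print_captioning_result(outputs, task_cap._tokenizer.tokenizer)
```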