From b7a5423a5a86a501ae806216d10c2cc29fd94425 Mon Sep 17 00:00:00 2001 From: bruce2233 <864427717@qq.com> Date: Tue, 7 Nov 2023 08:09:03 +0000 Subject: [PATCH] fix: captioning task --- colabs/pix2seq_inference_multitask.ipynb | 71 ++++++++++++------------ 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/colabs/pix2seq_inference_multitask.ipynb b/colabs/pix2seq_inference_multitask.ipynb index b344d6e..f7374cd 100644 --- a/colabs/pix2seq_inference_multitask.ipynb +++ b/colabs/pix2seq_inference_multitask.ipynb @@ -2,6 +2,12 @@ "cells": [ { "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "0BkmPh5WJDHX" + }, + "outputs": [], "source": [ "#@title License\n", "# Copyright 2022 The Pix2Seq Authors.\n", @@ -18,16 +24,13 @@ "# See the License for the specific language governing permissions and\n", "# limitations under the License.\n", "# ==============================================================================" - ], - "metadata": { - "cellView": "form", - "id": "0BkmPh5WJDHX" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "iGuRf1kFH0-H" + }, "source": [ "## A Unified Sequence Interface for Vision Tasks\n", "\"Open\n", @@ -39,10 +42,7 @@ "-------------: | ---------------: | ---------: | --------: | -----------:\n", "ViT-B | 115.2 | 640x640 | 44.2 | [gs://pix2seq/multi_task/ckpt/vit_b_640x640](https://console.cloud.google.com/storage/browser/pix2seq/multi_task/ckpt/vit_b_640x640)\n", "ViT-B | 115.2 | 1024x1024 | 46.5 | [gs://pix2seq/multi_task/ckpt/vit_b_1024x1024](https://console.cloud.google.com/storage/browser/pix2seq/multi_task/ckpt/vit_b_1024x1024)" - ], - "metadata": { - "id": "iGuRf1kFH0-H" - } + ] }, { "cell_type": "code", @@ -88,6 +88,11 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MUY8cSCOVhb2" + }, + "outputs": [], "source": [ "## Download coco annotations\n", "!mkdir /tmp/coco_annotations\n", @@ -97,12 +102,7 @@ "!wget https://storage.googleapis.com/pix2seq/multi_task/data/coco/json/instances_val2017.json -P /tmp/coco_annotations/\n", "!wget https://storage.googleapis.com/pix2seq/multi_task/data/coco/json/person_keypoints_train2017.json -P /tmp/coco_annotations/\n", "!wget https://storage.googleapis.com/pix2seq/multi_task/data/coco/json/person_keypoints_val2017.json -P /tmp/coco_annotations/" - ], - "metadata": { - "id": "MUY8cSCOVhb2" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", @@ -563,6 +563,11 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DQBbSG4fmVcb" + }, + "outputs": [], "source": [ "# Object detection.\n", "preprocessed_outputs = get_preprocessed_outputs_detection(\n", @@ -572,15 +577,15 @@ "\n", "vis_det = visualize_detection(results_det, np.asarray(im), task_det._category_names)\n", "Image.fromarray(vis_det)" - ], - "metadata": { - "id": "DQBbSG4fmVcb" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "dZc6CmgeusrO" + }, + "outputs": [], "source": [ "# Get a bbox for person and use it for keypoint detection and instance segmentation.\n", "pred_bboxes_rescaled = results_det[3].numpy().reshape((-1, 4))\n", @@ -589,12 +594,7 @@ "bbox = list(pred_bboxes_rescaled[person_idx])\n", "# You can also specify a custom box in (ymin, xmin, ymax, xmax) format.\n", "# bbox = [148.5505782847104, 149.26714806698848, 325.032441173339, 294.49976727245297]" - ], - "metadata": { - "id": "dZc6CmgeusrO" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", @@ -643,27 +643,30 @@ "# Captioning.\n", "preprocessed_outputs = get_preprocessed_outputs_captioning(\n", " im, image_id, config_cap)\n", + "captions = tf.expand_dims(task_cap._tokenizer.string_to_ids([\"dummy\"])[0],0)\n", + "preprocessed_outputs[2][1]['captions']=captions\n", "infer_outputs = infer_cap(model, preprocessed_outputs)\n", - "outputs = task_cap.postprocess_tpu(*infer_outputs)\n", + "examples={**infer_outputs[0][0],**infer_outputs[0][1]}\n", + "outputs = task_cap.postprocess_tpu(examples,infer_outputs[1],infer_outputs[2])\n", "print_captioning_result(outputs, task_cap._tokenizer.tokenizer)" ] } ], "metadata": { + "accelerator": "TPU", "colab": { "private_outputs": true, "provenance": [] }, + "gpuClass": "standard", "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" - }, - "accelerator": "TPU", - "gpuClass": "standard" + } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +}