diff --git a/.aitk/configs/checks.json b/.aitk/configs/checks.json index 4b219bcd..c80ae155 100644 --- a/.aitk/configs/checks.json +++ b/.aitk/configs/checks.json @@ -1,16 +1,16 @@ { - "configCheck": 139, + "configCheck": 140, "copyCheck": 179, "extensionCheck": 1, - "gitignoreCheck": 38, + "gitignoreCheck": 39, "inferenceModelCheck": 25, - "ipynbCheck": 38, - "licenseCheck": 37, - "modelProjectCheck": 39, + "ipynbCheck": 39, + "licenseCheck": 38, + "modelProjectCheck": 40, "oliveCheck": 45, - "oliveJsonCheck": 139, - "pathCheck": 1153, + "oliveJsonCheck": 140, + "pathCheck": 1170, "requirementsCheck": 37, "templateCheck": 1, - "venvRequirementsCheck": 13 + "venvRequirementsCheck": 14 } diff --git a/.aitk/configs/model_list.json b/.aitk/configs/model_list.json index 0ef10cf3..a6d85da0 100644 --- a/.aitk/configs/model_list.json +++ b/.aitk/configs/model_list.json @@ -647,6 +647,19 @@ "status": "Ready", "relativePath": "Qwen-Qwen2.5-Coder-14B-Instruct/aitk", "version": 3 + }, + { + "displayName": "stable-diffusion-v1-5/stable-diffusion-v1-5", + "icon": "HuggingFace", + "modelLink": "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5", + "id": "huggingface/stable-diffusion-v1-5/stable-diffusion-v1-5", + "runtimes": [ + "QNN" + ], + "architecture": "Transformer", + "status": "Hide", + "relativePath": "sd-legacy-stable-diffusion-v1-5/aitk", + "version": 1 } ], "template_models": [ @@ -692,6 +705,7 @@ "google-research-datasets/conceptual_captions": "https://huggingface.co/datasets/google-research-datasets/conceptual_captions", "AIMClab-RUC/COCO-CN": "https://huggingface.co/datasets/AIMClab-RUC/COCO-CN", "librispeech_asr": "https://huggingface.co/datasets/openslr/librispeech_asr", + "phiyodr/coco2017": "https://huggingface.co/datasets/phiyodr/coco2017", "pileval_for_awq_benchmark": "https://huggingface.co/datasets/mit-han-lab/pile-val-backup" }, "LoginRequiredDatasets": [ diff --git a/.aitk/requirements/requirements-WCR-SD.txt 
b/.aitk/requirements/requirements-WCR-SD.txt new file mode 100644 index 00000000..74b5790f --- /dev/null +++ b/.aitk/requirements/requirements-WCR-SD.txt @@ -0,0 +1,3 @@ +accelerate==1.12.0 +diffusers==0.35.0 +torch-fidelity==0.3.0 diff --git a/.aitk/scripts/project_processor.py b/.aitk/scripts/project_processor.py index d36c3f82..fb04a69f 100644 --- a/.aitk/scripts/project_processor.py +++ b/.aitk/scripts/project_processor.py @@ -28,6 +28,7 @@ "mistralai": IconEnum.mistralai, # TODO add "OFA-Sys": IconEnum.HuggingFace, + "stable-diffusion-v1-5": IconEnum.HuggingFace, } diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/.gitignore b/sd-legacy-stable-diffusion-v1-5/aitk/.gitignore new file mode 100644 index 00000000..ccd91a69 --- /dev/null +++ b/sd-legacy-stable-diffusion-v1-5/aitk/.gitignore @@ -0,0 +1,8 @@ +__pycache__ +/cache +/history/*/* +!/history/*/history.config +!/history/*/olive_config.json +/footprints +/result_*.png +/*data*/ diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/README.md b/sd-legacy-stable-diffusion-v1-5/aitk/README.md new file mode 100644 index 00000000..1a0a7045 --- /dev/null +++ b/sd-legacy-stable-diffusion-v1-5/aitk/README.md @@ -0,0 +1,31 @@ +## Stable Diffusion Optimization with ONNX Runtime QNN EP + +### Generate data for static quantization + +To get better results, we need to generate real data from the original model instead of using random data for static quantization. + +First, generate the unoptimized ONNX model: + +`python stable_diffusion.py --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --script_dir .\ --provider cpu --format qdq --optimize --only_conversion` + +Then generate data: + +`python .\evaluation.py --save_data --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --num_inference_steps 25 --seed 0 --num_data 100 --guidance_scale 7.5` + +### Optimize + +Optimize the ONNX models for performance improvements. vae_decoder and unet are per-channel quantized, and text_encoder runs in fp16 precision. 
+ +`python stable_diffusion.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --provider qnn --format qdq --optimize` + +### Test and evaluate + +`python .\evaluation.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --num_inference_steps 25 --seed 0 --num_data 100 --guidance_scale 7.5 --provider QNNExecutionProvider --model_dir optimized-qnn_qdq` + +To generate one image: + +`python stable_diffusion.py --script_dir .\ --model_id stable-diffusion-v1-5/stable-diffusion-v1-5 --provider qnn --format qdq --guidance_scale 7.5 --seed 0 --num_inference_steps 25 --prompt "A baby is laying down with a teddy bear"` + +### References + +[stable-diffusion-v1-4](https://github.com/microsoft/olive-recipes/tree/main/compvis-stable-diffusion-v1-4/olive#readme) diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/assets/dog.png b/sd-legacy-stable-diffusion-v1-5/aitk/assets/dog.png new file mode 100644 index 00000000..a8bef282 Binary files /dev/null and b/sd-legacy-stable-diffusion-v1-5/aitk/assets/dog.png differ diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/config_safety_checker.json b/sd-legacy-stable-diffusion-v1-5/aitk/config_safety_checker.json new file mode 100644 index 00000000..8b2d063c --- /dev/null +++ b/sd-legacy-stable-diffusion-v1-5/aitk/config_safety_checker.json @@ -0,0 +1,98 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5", + "model_loader": "safety_checker_load", + "model_script": "user_script.py", + "io_config": { + "input_names": [ "clip_input", "images" ], + "output_names": [ "out_images", "has_nsfw_concepts" ], + "dynamic_axes": { + "clip_input": { "0": "batch", "1": "channels", "2": "height", "3": "width" }, + "images": { "0": "batch", "1": "height", "2": "width", "3": "channels" } + } + }, + "dummy_inputs_func": "safety_checker_conversion_inputs" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", 
"execution_providers": [ "CUDAExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "latency_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "local_dataset" }, + "dataloader_config": { "type": "safety_checker_data_loader", "batch_size": 1 } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "latency_data_config", + "sub_types": [ { "name": "avg" } ] + } + ] + } + }, + "passes": { + "convert": { "type": "OnnxConversion", "target_opset": 14 }, + "ov_convert": { + "type": "OpenVINOConversion", + "user_script": "user_script.py", + "example_input_func": "safety_checker_conversion_inputs", + "output_model": "safety_checker" + }, + "optimize": { + "type": "OrtTransformersOptimization", + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": [ "RandomNormalLike" ], + "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + } + }, + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + 
"cache_dir": "cache", + "output_dir": "footprints/safety_checker" +} diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/config_text_encoder.json b/sd-legacy-stable-diffusion-v1-5/aitk/config_text_encoder.json new file mode 100644 index 00000000..ce2559d2 --- /dev/null +++ b/sd-legacy-stable-diffusion-v1-5/aitk/config_text_encoder.json @@ -0,0 +1,147 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5", + "model_loader": "text_encoder_load", + "model_script": "user_script.py", + "io_config": { + "input_names": [ "input_ids" ], + "output_names": [ "last_hidden_state", "pooler_output" ], + "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } } + }, + "dummy_inputs_func": "text_encoder_conversion_inputs" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "latency_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "local_dataset" }, + "dataloader_config": { "type": "text_encoder_data_loader", "batch_size": 1 } + }, + { + "name": "quantize_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "local_dataset" }, + "dataloader_config": { "type": "text_encoder_quantize_data_loader", "data_num": 100 } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "latency_data_config", + "sub_types": [ { "name": "avg" } ] + } + ] + } + }, + "passes": { + "convert": { "type": "OnnxConversion", "target_opset": 17 }, + "ov_convert": { + "type": "OpenVINOConversion", + "user_script": "user_script.py", + "example_input_func": "text_encoder_conversion_inputs", + "output_model": "text_encoder" + }, + "optimize": { + "type": "OrtTransformersOptimization", + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + 
"keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": [ "RandomNormalLike" ], + "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "model_type": "clip", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + }, + "dynamic_shape_to_fixed": { + "type": "DynamicToFixedShape", + "dim_param": [ "batch", "sequence" ], + "dim_value": [ 1, 77 ] + }, + "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "ReplaceAttentionMaskValue", "replacement": -200.0 } ] }, + "optimize_qdq": { + "type": "OrtTransformersOptimization", + "model_type": "clip", + "opt_level": 0, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": false, + "use_multi_head_attention": false, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": false, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": false, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": false, + "enable_bias_splitgelu": false, + "enable_packed_qkv": false, + "enable_packed_kv": false, + "enable_bias_add": false, + "group_norm_channels_last": false + } + }, + "quantization": { + "type": 
"OnnxStaticQuantization", + "data_config": "quantize_data_config", + "activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax", + "quant_preprocess": true + }, + "cb": { + "type": "EPContextBinaryGenerator", + "provider_options": { + "htp_graph_finalization_optimization_mode": "3" + } + } + }, + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_dir": "footprints/text_encoder" +} diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/config_unet.json b/sd-legacy-stable-diffusion-v1-5/aitk/config_unet.json new file mode 100644 index 00000000..1f631b3f --- /dev/null +++ b/sd-legacy-stable-diffusion-v1-5/aitk/config_unet.json @@ -0,0 +1,170 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5", + "model_loader": "unet_load", + "model_script": "user_script.py", + "io_config": { + "input_names": [ "sample", "timestep", "encoder_hidden_states", "return_dict" ], + "output_names": [ "out_sample" ], + "dynamic_axes": { + "sample": { + "0": "unet_sample_batch", + "1": "unet_sample_channels", + "2": "unet_sample_height", + "3": "unet_sample_width" + }, + "timestep": { "0": "unet_time_batch" }, + "encoder_hidden_states": { "0": "unet_hidden_batch", "1": "unet_hidden_sequence" } + } + }, + "dummy_inputs_func": "unet_conversion_inputs" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "latency_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "local_dataset" }, + "dataloader_config": { "type": "unet_data_loader", "batch_size": 1 } + }, + { + "name": "quantize_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "local_dataset" }, + "dataloader_config": { 
"type": "unet_quantize_data_loader", "data_num": 200 } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "latency_data_config", + "sub_types": [ { "name": "avg" } ] + } + ] + } + }, + "passes": { + "convert": { + "type": "OnnxConversion", + "target_opset": 17, + "save_as_external_data": true, + "all_tensors_to_one_file": true, + "external_data_name": "weights.pb" + }, + "ov_convert": { + "type": "OpenVINOConversion", + "user_script": "user_script.py", + "example_input_func": "get_unet_ov_example_input", + "output_model": "unet" + }, + "optimize": { + "type": "OrtTransformersOptimization", + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": [ "RandomNormalLike" ], + "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "model_type": "unet", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + }, + "dynamic_shape_to_fixed": { + "type": "DynamicToFixedShape", + "dim_param": [ + "unet_sample_batch", + "unet_sample_channels", + "unet_sample_height", + "unet_sample_width", + "unet_time_batch", + "unet_hidden_batch", + "unet_hidden_sequence" + ], + "dim_value": [ 1, 4, 64, 64, 1, 1, 77 ], + 
"save_as_external_data": true + }, + "optimize_qdq": { + "type": "OrtTransformersOptimization", + "model_type": "unet", + "opt_level": 0, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": false, + "use_multi_head_attention": false, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": false, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": false, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": false, + "enable_bias_splitgelu": false, + "enable_packed_qkv": false, + "enable_packed_kv": false, + "enable_bias_add": false, + "group_norm_channels_last": false + } + }, + "quantization": { + "type": "OnnxStaticQuantization", + "data_config": "quantize_data_config", + "activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax", + "quant_preprocess": true + }, + "cb": { + "type": "EPContextBinaryGenerator", + "provider_options": { + "htp_graph_finalization_optimization_mode": "3" + } + } + }, + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_dir": "footprints/unet" +} diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_decoder.json b/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_decoder.json new file mode 100644 index 00000000..ab59bdd1 --- /dev/null +++ b/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_decoder.json @@ -0,0 +1,127 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5", + "model_loader": "vae_decoder_load", + "model_script": "user_script.py", + "io_config": { + "input_names": [ "latent_sample", "return_dict" ], + "output_names": [ "sample" ], + "dynamic_axes": { + "latent_sample": { + "0": "decoder_batch", + "1": 
"decoder_channels", + "2": "decoder_height", + "3": "decoder_width" + } + } + }, + "dummy_inputs_func": "vae_decoder_conversion_inputs" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ] + } + }, + "data_configs": [ + { + "name": "latency_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "local_dataset" }, + "dataloader_config": { "type": "vae_decoder_data_loader", "batch_size": 1 } + }, + { + "name": "quantize_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "local_dataset" }, + "dataloader_config": { "type": "vae_decoder_quantize_data_loader", "data_num": 100 } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "latency_data_config", + "sub_types": [ { "name": "avg" } ] + } + ] + } + }, + "passes": { + "convert": { "type": "OnnxConversion", "target_opset": 17 }, + "ov_convert": { + "type": "OpenVINOConversion", + "user_script": "user_script.py", + "example_input_func": "vae_decoder_conversion_inputs", + "output_model": "vae_decoder" + }, + "optimize": { + "type": "OrtTransformersOptimization", + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + 
"group_norm_channels_last": false + }, + "force_fp32_ops": [ "RandomNormalLike" ], + "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + }, + "dynamic_shape_to_fixed": { + "type": "DynamicToFixedShape", + "dim_param": [ "decoder_batch", "decoder_channels", "decoder_height", "decoder_width" ], + "dim_value": [ 1, 4, 64, 64 ] + }, + "quantization": { + "type": "OnnxStaticQuantization", + "data_config": "quantize_data_config", + "activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax", + "quant_preprocess": true + }, + "cb": { + "type": "EPContextBinaryGenerator", + "provider_options": { + "htp_graph_finalization_optimization_mode": "3" + } + } + }, + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_dir": "footprints/vae_decoder" +} diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_encoder.json b/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_encoder.json new file mode 100644 index 00000000..a1b09a1c --- /dev/null +++ b/sd-legacy-stable-diffusion-v1-5/aitk/config_vae_encoder.json @@ -0,0 +1,125 @@ +{ + "input_model": { + "type": "PyTorchModel", + "model_path": "stable-diffusion-v1-5/stable-diffusion-v1-5", + "model_loader": "vae_encoder_load", + "model_script": "user_script.py", + "io_config": { + "input_names": [ "sample", "return_dict" ], + "output_names": [ "latent_sample" ], + "dynamic_axes": { + "sample": { "0": "encoder_batch", "1": "encoder_channels", "2": "encoder_height", "3": "encoder_width" } + } + }, + "dummy_inputs_func": "vae_encoder_conversion_inputs" + }, + "systems": { + "local_system": { + "type": "LocalSystem", + "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ] + } + }, + 
"data_configs": [ + { + "name": "latency_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "local_dataset" }, + "dataloader_config": { "type": "vae_encoder_data_loader", "batch_size": 1 } + }, + { + "name": "quantize_data_config", + "user_script": "user_script.py", + "load_dataset_config": { "type": "local_dataset" }, + "dataloader_config": { "type": "vae_encoder_quantize_data_loader", "data_num": 100 } + } + ], + "evaluators": { + "common_evaluator": { + "metrics": [ + { + "name": "latency", + "type": "latency", + "data_config": "latency_data_config", + "sub_types": [ { "name": "avg" } ] + } + ] + } + }, + "passes": { + "convert": { "type": "OnnxConversion", "target_opset": 17 }, + "ov_convert": { + "type": "OpenVINOConversion", + "user_script": "user_script.py", + "example_input_func": "vae_encoder_conversion_inputs", + "output_model": "vae_encoder" + }, + "optimize": { + "type": "OrtTransformersOptimization", + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false, + "optimization_options": { + "enable_gelu": true, + "enable_layer_norm": true, + "enable_attention": true, + "use_multi_head_attention": true, + "enable_skip_layer_norm": false, + "enable_embed_layer_norm": true, + "enable_bias_skip_layer_norm": false, + "enable_bias_gelu": true, + "enable_gelu_approximation": false, + "enable_qordered_matmul": false, + "enable_shape_inference": true, + "enable_gemm_fast_gelu": false, + "enable_nhwc_conv": false, + "enable_group_norm": true, + "enable_bias_splitgelu": false, + "enable_packed_qkv": true, + "enable_packed_kv": true, + "enable_bias_add": false, + "group_norm_channels_last": false + }, + "force_fp32_ops": [ "RandomNormalLike" ], + "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] } + }, + "optimize_cuda": { + "type": "OrtTransformersOptimization", + "model_type": "vae", + "opt_level": 0, + "float16": true, + "use_gpu": true, + "keep_io_types": false + }, + 
"dynamic_shape_to_fixed": { + "type": "DynamicToFixedShape", + "dim_param": [ + "encoder_batch", + "encoder_channels", + "encoder_height", + "encoder_width", + "Addlatent_sample_dim_0", + "Addlatent_sample_dim_1", + "Addlatent_sample_dim_2", + "Addlatent_sample_dim_3" + ], + "dim_value": [ 1, 3, 512, 512, 1, 4, 64, 64 ] + }, + "quantization": { + "type": "OnnxStaticQuantization", + "data_config": "quantize_data_config", + "activation_type": "uint16", + "precision": "uint8", + "calibrate_method": "MinMax", + "quant_preprocess": true + } + }, + "log_severity_level": 0, + "evaluator": "common_evaluator", + "evaluate_input_model": false, + "host": "local_system", + "target": "local_system", + "cache_dir": "cache", + "output_dir": "footprints/vae_encoder" +} diff --git a/sd-legacy-stable-diffusion-v1-5/aitk/evaluation.py b/sd-legacy-stable-diffusion-v1-5/aitk/evaluation.py new file mode 100644 index 00000000..c224ddca --- /dev/null +++ b/sd-legacy-stable-diffusion-v1-5/aitk/evaluation.py @@ -0,0 +1,304 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +import logging +import math +import os +import re +import sys +from functools import partial +from pathlib import Path +from typing import Callable + +import numpy as np +import requests +import torch +from datasets import load_dataset +from PIL import Image +from sd_utils.qdq import OnnxStableDiffusionPipelineWithSave +from torchmetrics.functional.multimodal import clip_score +from torchmetrics.image.fid import FrechetInceptionDistance +from torchvision.transforms import functional as F + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logger.addHandler(logging.StreamHandler(sys.stdout)) + +# Clip score + + +def get_clip_score(prompt: str, path: Path, clip_score_fn): + with torch.no_grad(): + image = Image.open(path / f"{prompt}.png") + image = torch.tensor(np.array(image), dtype=torch.uint8).permute(2, 0, 1) + score = clip_score_fn(image, prompt) + return score.detach() + + +def get_clip_scores(prompts: list[str], path: Path, clip_score_fn): + scores = [] + with open(path / "clip_scores.txt", "w") as f: + f.write("| Prompt | Score |\n") + for prompt in prompts: + score = get_clip_score(prompt, path, clip_score_fn) + scores.append(score) + f.write(f"| {prompt} | {score} |\n") + + logger.info("CLIP Scores avg: %s", np.mean(np.array(scores))) + f.write(f"| Avg | {np.mean(np.array(scores))} |\n") + + +# MSE score + + +def calc_error(image1, image2): + image1 = Image.open(image1) + image2 = Image.open(image2) + image1 = np.array(image1, dtype=np.float32) + image2 = np.array(image2, dtype=np.float32) + return np.mean((image1 - image2) ** 2) + + +def get_mse_scores(prompts: list[str], unoptimized_path: Path, optimized_path: Path, train_num: int): + train_error = [] + test_error = [] + with open(optimized_path / "mse_scores.txt", "w") as f: + f.write("| Prompt | Error |\n") + for i, prompt in enumerate(prompts): + error = calc_error(unoptimized_path / f"{prompt}.png", 
optimized_path / f"{prompt}.png") + f.write(f"| {prompt} | {error} |\n") + if i < train_num: + train_error.append(error) + else: + test_error.append(error) + + train_error = np.mean(np.array(train_error)) + test_error = np.mean(np.array(test_error)) + logger.info("Average train error %f", train_error) + logger.info("Average test error %f", test_error) + f.write(f"| Avg Train | {train_error} |\n") + f.write(f"| Avg Test | {test_error} |\n") + + +# FID score + + +def get_fid_scores(prompts: list[str], path: Path, real_images): + with torch.no_grad(), open(path / "fid_scores.txt", "w") as f: + f.write("| Prompt | Score |\n") + images = [] + for prompt in prompts: + image = Image.open(path / f"{prompt}.png") + image = torch.tensor(np.array(image), dtype=torch.uint8).permute(2, 0, 1).unsqueeze(0) + images.append(image) + images = torch.cat(images) + + fid = FrechetInceptionDistance() + fid.update(real_images, real=True) + fid.update(images, real=False) + + score = fid.compute() + logger.info("FID: %f", score) + f.write(f"| FID | {score} |\n") + + +# hpsv2 score + + +def get_hpsv2_scores(path: Path, generate_image: Callable[[str, str], None], prompt_style: str): + if prompt_style is None: + return + import hpsv2 + + # Get benchmark prompts (