From 393ed0cb910c4716c928d47ca38f455ca944e74a Mon Sep 17 00:00:00 2001
From: wuhuxiao
Date: Mon, 15 Dec 2025 11:54:32 +0800
Subject: [PATCH] modify docs and comments

---
 docs/source/user-guide/sparse-attention/cacheblend.md | 4 ++--
 docs/source/user-guide/sparse-attention/index.md      | 1 +
 examples/offline_inference_blend.py                   | 2 +-
 ucm/sparse/blend/blend.py                             | 2 ++
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/source/user-guide/sparse-attention/cacheblend.md b/docs/source/user-guide/sparse-attention/cacheblend.md
index 0f5d8e819..f95f3d359 100644
--- a/docs/source/user-guide/sparse-attention/cacheblend.md
+++ b/docs/source/user-guide/sparse-attention/cacheblend.md
@@ -3,7 +3,7 @@

 ![blend_scheme.jpg](../../_static/images/blend_scheme.jpg)

-**🚀 Knowledge Cached Fusion Algorithm | 📄 EuroSys 2025 Paper **
+**🚀 Knowledge Cached Fusion Algorithm | 📄 EuroSys 2025 Paper**

 [![License](https://img.shields.io/badge/License-MIT-green.svg)](https://github.com/ModelEngine-Group/unified-cache-management/blob/main/LICENSE)
 [![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)](https://python.org)
@@ -31,7 +31,7 @@ CacheBlend reduces TTFT by 2.2 ~ 3.3× and increases throughput by 2.8 ~ 5× und
 1. **🔐 Chunk Hash Encoding**: Similar as prefix hash encoder, hash all blocks in each chunk from the same hash meta beginning.
 2. **⚡ Combine Prefix Cache and Chunk Cache**: Since chunk cache and native prefix cache share the same hash space, ucm first performs prefix cache lookup to fetch fully reused cache and then conduct chunk cache lookup to fetch the candidate cache for blending.
 3. **🎯 Delta-Rope PostProcess**: Rectify loaded chunk cache according to their position in the new request.
-3. **🔍 Integrate Cache Blend and First Token Generation**: Construct compute mask and attention meta according to HKVD tokens, cache miss tokens and suffix tokens, then compute their kv cache in a single model forward stage.
+3. **🔍 Integrate Cache Blend and First Token Generation**: Construct the compute mask and attention metadata according to the HKVD tokens, cache-miss tokens, and suffix tokens, then compute their KV cache in a single model forward pass.
 4. **🚀 Comprehensive Hook for LLM Forward Pipeline**: Based on ucm sparse module, blend module sparse the prefill tokens not only in attention stage but also in ffn, layer stage.

 ## 🚀 Quick Start
diff --git a/docs/source/user-guide/sparse-attention/index.md b/docs/source/user-guide/sparse-attention/index.md
index 6c1f3d209..822917604 100644
--- a/docs/source/user-guide/sparse-attention/index.md
+++ b/docs/source/user-guide/sparse-attention/index.md
@@ -41,4 +41,5 @@ esa
 gsa
 kvcomp
 kvstar
+cacheblend
 :::
diff --git a/examples/offline_inference_blend.py b/examples/offline_inference_blend.py
index 0de105f55..bdc2b211b 100644
--- a/examples/offline_inference_blend.py
+++ b/examples/offline_inference_blend.py
@@ -186,7 +186,7 @@ def main():
     # choose one data row in LongBenchV1 (wikimqa)
     assert os.path.isfile(
         path_to_dataset
-    ), f"Incorrect dataset path. Please specify the dataset path by `export DATASET_PATH=/path/to/longbench/multifieldqa_zh.jsonl`"
+    ), f"Incorrect dataset path. Please specify the dataset path by `export DATASET_PATH=/home/data/Longbench/data/2wikimqa.jsonl`"
     with open(path_to_dataset, "r") as f:
         lines = f.readlines()
     dataset_row = json.loads(lines[0])
diff --git a/ucm/sparse/blend/blend.py b/ucm/sparse/blend/blend.py
index 7cc945674..ee9d8aa07 100644
--- a/ucm/sparse/blend/blend.py
+++ b/ucm/sparse/blend/blend.py
@@ -189,6 +189,8 @@ def build_sparse_meta(

     def _update_attn_metadata(self):
         # update attn_metadata, cause we sparse the prefill tokens
+        # Golden KV caches are available in the current blend layer, so we may want to cache all of them;
+        # if so, we may need to modify slot_mapping at the beginning of the next layer/attention step.
         self.attn_metadata.slot_mapping = self.attn_metadata.slot_mapping[
             self.blend_req_metas.compute_mask
         ]
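
For step 3 of the algorithm list in the cacheblend.md hunk above (Delta-Rope PostProcess), a minimal sketch of the idea may help: because RoPE is a rotation, keys loaded from a chunk cache can be rectified for their new position by rotating them through the position delta. The helper below is illustrative only and is not the actual `ucm` implementation; it assumes the common half-split RoPE layout and a `[num_tokens, head_dim]` key tensor.

```python
import torch


def delta_rope(k: torch.Tensor, delta: int, theta: float = 10000.0) -> torch.Tensor:
    """Re-rotate cached keys by a position offset (hypothetical helper).

    k:     [num_tokens, head_dim] keys loaded from the chunk cache, already
           rotated for the positions they were cached at.
    delta: offset between the chunk's cached position and its position in
           the new request.
    """
    half = k.shape[-1] // 2
    inv_freq = theta ** (-torch.arange(half, dtype=torch.float32, device=k.device) / half)
    angle = delta * inv_freq  # rotation angle per frequency band
    cos, sin = angle.cos().to(k.dtype), angle.sin().to(k.dtype)
    k1, k2 = k[..., :half], k[..., half:]  # half-split RoPE layout
    # Rotating already-rotated keys by `delta` composes the rotations,
    # yielding keys as if they had been computed at (cached position + delta).
    return torch.cat([k1 * cos - k2 * sin, k1 * sin + k2 * cos], dim=-1)
```

Because the rectification depends only on the delta, positionally shifted occurrences of the same chunk can share a single cache entry.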
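The blend.py hunk itself is easiest to read with concrete values: `compute_mask` is a boolean mask over the prefill tokens, and indexing `slot_mapping` with it keeps only the slots of tokens that are actually recomputed (the HKVD, cache-miss, and suffix tokens). A self-contained illustration with made-up values:

```python
import torch

# slot_mapping assigns each prefill token a KV-cache slot; compute_mask
# marks the tokens the blend layer actually recomputes (values made up).
slot_mapping = torch.tensor([40, 41, 42, 43, 44, 45])
compute_mask = torch.tensor([True, False, False, True, True, True])

# Boolean indexing drops the reused tokens, so the attention kernel only
# writes KV entries for the recomputed tokens and leaves the blended
# cache entries untouched.
slot_mapping = slot_mapping[compute_mask]
print(slot_mapping)  # tensor([40, 43, 44, 45])
```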