From ff67955186bfc80f52dba167b3274841fdebe0d2 Mon Sep 17 00:00:00 2001 From: Billy McFall <22157057+Billy99@users.noreply.github.com> Date: Fri, 12 Dec 2025 16:50:26 -0500 Subject: [PATCH] fix: Model Cache feature under Gen instead of Pred The Local Model Cache feature is only implemented for the Predictive AI use case, but in the docs on the website, the feature is documented under the Generative AI headings. This PR leaves current feature description as is, just moves the documentation links from Generative to Predictive. Signed-off-by: Billy McFall <22157057+Billy99@users.noreply.github.com> --- docs/concepts/resources/index.md | 2 +- docs/getting-started/genai-first-isvc.md | 1 - docs/getting-started/predictive-first-isvc.md | 1 + docs/intro.md | 2 +- docs/model-serving/generative-inference/overview.md | 1 - .../sdk-integration/sdk-integration.md | 3 +-- .../generative-inference/tasks/embedding/embedding.md | 2 +- .../generative-inference/tasks/reranking/rerank.md | 2 +- .../tasks/text-generation/text-generation.md | 2 +- .../tasks/text2text-generation/text2text-generation.md | 2 +- .../predictive-inference/frameworks/overview.md | 1 + .../modelcache/localmodel.md | 0 sidebars.ts | 2 +- src/components/HomepageBenefits/index.tsx | 8 ++++---- 14 files changed, 14 insertions(+), 15 deletions(-) rename docs/model-serving/{generative-inference => predictive-inference}/modelcache/localmodel.md (100%) diff --git a/docs/concepts/resources/index.md b/docs/concepts/resources/index.md index 437f19de7..09447df94 100644 --- a/docs/concepts/resources/index.md +++ b/docs/concepts/resources/index.md @@ -46,7 +46,7 @@ Manage model storage and access patterns: ### LocalModel & LocalModelNode Enables local model caching and management: -- **[Concepts](../../model-serving/generative-inference/modelcache/localmodel.md)**: Overview of local model caching in KServe. 
+- **[Concepts](../../model-serving/predictive-inference/modelcache/localmodel.md)**: Overview of local model caching in KServe. - **[LocalModelCache](../../reference/crd-api.mdx)**: CRD that Defines local model caching requirements and policies - **[LocalModelNode](../../reference/crd-api.mdx)**: CRD that handles Node-level model caching management - **[LocalModelNodeGroup](../../reference/crd-api.mdx)**: CRD for Grouping of local model nodes for management and orchestration of cached models diff --git a/docs/getting-started/genai-first-isvc.md b/docs/getting-started/genai-first-isvc.md index 29c378d30..fed0d591d 100644 --- a/docs/getting-started/genai-first-isvc.md +++ b/docs/getting-started/genai-first-isvc.md @@ -319,5 +319,4 @@ Now that you have successfully deployed a generative AI service using KServe, yo - 📖 **[Supported Tasks](../model-serving/generative-inference/overview.md#supported-generative-tasks)** - Discover the various tasks that KServe can handle. - 📖 **[Autoscaling](../model-serving/generative-inference/autoscaling/autoscaling.md)**: Automatically scale your service based on traffic and resource usage / metrics. - 📖 **[KV Cache Offloading](../model-serving/generative-inference/kvcache-offloading/kvcache-offloading.md)** - Learn how to offload key-value caches to external storage for improved performance and reduced latency. -- 📖 **[Model Caching](../model-serving/generative-inference/modelcache/localmodel.md)** - Learn how to cache models for faster startup time. - 📖 **[Token Rate Limiting](../model-serving/generative-inference/ai-gateway/envoy-ai-gateway.md)** - Rate limit users based on token usage. 
diff --git a/docs/getting-started/predictive-first-isvc.md b/docs/getting-started/predictive-first-isvc.md index adad6eb82..df3551e70 100644 --- a/docs/getting-started/predictive-first-isvc.md +++ b/docs/getting-started/predictive-first-isvc.md @@ -297,3 +297,4 @@ Now that you have successfully deployed your first Predictive InferenceService, - 📖 **[Supported Frameworks](../model-serving/predictive-inference/frameworks/overview.md)** - Explore Supported Frameworks. - 📖 **[Batch InferenceService](../model-serving/predictive-inference/batcher/batcher.md)** - Deploy your first Batch InferenceService. - 📖 **[Canary Deployments](../model-serving/predictive-inference/rollout-strategies/canary-example.md)**: Gradually roll out new model versions to test their performance before full deployment. +- 📖 **[Model Caching](../model-serving/predictive-inference/modelcache/localmodel.md)** - Learn how to cache models for faster startup time. diff --git a/docs/intro.md b/docs/intro.md index ea5ae413b..82ba92e0e 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -35,7 +35,6 @@ Enterprise authentication, network policies, and compliance features built-in. D #### Generative Inference Benefits ✅ **LLM Multi-framework Support** - Deploy LLMs from Hugging Face, vLLM, and custom generative models ✅ **OpenAI-Compatible APIs** - Chat completion, completion, streaming, and embedding endpoints -✅ **LocalModelCache for LLMs** - Cache large models locally to reduce startup time from 15-20 minutes to ~1 minute ✅ **KV Cache Offloading** - Optimized memory management for long conversations and large contexts ✅ **Multi-node Inference** - Distributed LLM serving ✅ **Envoy AI Gateway Integration** - Enterprise-grade API management and routing for AI workloads @@ -50,6 +49,7 @@ Enterprise authentication, network policies, and compliance features built-in. 
D ✅ **Real-time Scoring** - Low-latency prediction serving for real-time applications ✅ **Production ML Monitoring** - Comprehensive observability, drift detection, and explainability ✅ **Standard Inference Protocols** - Support for Open Inference Protocol (V1/V2) across frameworks +✅ **LocalModelCache** - Cache large models locally to reduce startup time from 15-20 minutes to ~1 minute #### Universal Benefits (Both Inference Types) ✅ **Serverless Inference Workloads** - Automatic scaling including scale-to-zero on both CPU and GPU diff --git a/docs/model-serving/generative-inference/overview.md b/docs/model-serving/generative-inference/overview.md index 6229321ae..e7be0107d 100644 --- a/docs/model-serving/generative-inference/overview.md +++ b/docs/model-serving/generative-inference/overview.md @@ -156,7 +156,6 @@ The following examples demonstrate how to deploy and perform inference using the ## Advanced Features The Hugging Face runtime supports several advanced features to enhance model serving capabilities: -- [**Model Caching**](./modelcache/localmodel.md): Cache models on local storage for faster loading and reduced latency. This is particularly useful for large models that are frequently accessed. - [**KV Cache Offloading**](./kvcache-offloading/kvcache-offloading.md): Offload key-value caches to CPU memory to reduce GPU memory usage, allowing larger models to be served on GPUs with limited memory. - [**Distributed LLM Serving**](./multi-node/multi-node.md): Scale model serving across multiple nodes and GPUs for high throughput and low latency. This is useful for serving large models or handling high request volumes. - [**AI Gateway**](./ai-gateway/envoy-ai-gateway.md): Use the AI Gateway to manage rate-limiting based on tokens and route requests to different models, providing a unified API for various generative tasks. 
diff --git a/docs/model-serving/generative-inference/sdk-integration/sdk-integration.md b/docs/model-serving/generative-inference/sdk-integration/sdk-integration.md index ae401aff9..588f1d2ce 100644 --- a/docs/model-serving/generative-inference/sdk-integration/sdk-integration.md +++ b/docs/model-serving/generative-inference/sdk-integration/sdk-integration.md @@ -276,7 +276,6 @@ After integrating your LLM with an SDK, consider exploring: 1. **Advanced serving options** like [multi-node inference](../multi-node/multi-node.md) for large models 2. **Exploring other inference tasks** such as [text-to-text generation](../tasks/text2text-generation/text2text-generation.md) and [embeddings](../tasks/embedding/embedding.md) -3. **Optimizing performance** with features like [model caching](../modelcache/localmodel.md) and [KV cache offloading](../kvcache-offloading/kvcache-offloading.md) -4. **Auto-scaling** your inference services based on traffic patterns using [KServe's auto-scaling capabilities](../autoscaling/autoscaling.md) +3. **Auto-scaling** your inference services based on traffic patterns using [KServe's auto-scaling capabilities](../autoscaling/autoscaling.md) By connecting your KServe-deployed models with these popular SDKs, you can quickly build sophisticated AI applications while maintaining control over your model infrastructure. 
diff --git a/docs/model-serving/generative-inference/tasks/embedding/embedding.md b/docs/model-serving/generative-inference/tasks/embedding/embedding.md index 856f21f97..501883913 100644 --- a/docs/model-serving/generative-inference/tasks/embedding/embedding.md +++ b/docs/model-serving/generative-inference/tasks/embedding/embedding.md @@ -262,7 +262,7 @@ Once you've successfully deployed your embedding model, consider: - **Advanced serving options** like [multi-node inference](../../multi-node/multi-node.md) for large models - **Exploring other inference tasks** such as [text-to-text generation](../text2text-generation/text2text-generation.md) and [reranking](../reranking/rerank.md) -- **Optimizing performance** with features like [model caching](../../modelcache/localmodel.md) and [KV cache offloading](../../kvcache-offloading/kvcache-offloading.md) +- **Optimizing performance** with features like [KV cache offloading](../../kvcache-offloading/kvcache-offloading.md) - **Auto-scaling** your inference services based on traffic patterns using [KServe's auto-scaling capabilities](../../autoscaling/autoscaling.md) - **Token based rate limiting** to control usage with [AI Gateway](../../ai-gateway/envoy-ai-gateway.md) for serving models. 
diff --git a/docs/model-serving/generative-inference/tasks/reranking/rerank.md b/docs/model-serving/generative-inference/tasks/reranking/rerank.md index e39307358..629deb32d 100644 --- a/docs/model-serving/generative-inference/tasks/reranking/rerank.md +++ b/docs/model-serving/generative-inference/tasks/reranking/rerank.md @@ -221,7 +221,7 @@ Once you've successfully deployed your reranker model, consider: - **Advanced serving options** like [multi-node inference](../../multi-node/multi-node.md) for large models - **Exploring other inference tasks** such as [text-to-text generation](../text2text-generation/text2text-generation.md) and [embedding](../embedding/embedding.md) -- **Optimizing performance** with features like [model caching](../../modelcache/localmodel.md) and [KV cache offloading](../../kvcache-offloading/kvcache-offloading.md) +- **Optimizing performance** with features like [KV cache offloading](../../kvcache-offloading/kvcache-offloading.md) - **Auto-scaling** your inference services based on traffic patterns using [KServe's auto-scaling capabilities](../../autoscaling/autoscaling.md) - **Token based rate limiting** to control usage with [AI Gateway](../../ai-gateway/envoy-ai-gateway.md) for serving models. 
diff --git a/docs/model-serving/generative-inference/tasks/text-generation/text-generation.md b/docs/model-serving/generative-inference/tasks/text-generation/text-generation.md index 6a72f40bc..d15ae25c6 100644 --- a/docs/model-serving/generative-inference/tasks/text-generation/text-generation.md +++ b/docs/model-serving/generative-inference/tasks/text-generation/text-generation.md @@ -326,7 +326,7 @@ Once you've successfully deployed your text generation model, consider: - **Advanced serving options** like [multi-node inference](../../multi-node/multi-node.md) for large models - **Exploring other inference tasks** such as [text-to-text generation](../text2text-generation/text2text-generation.md) and [embedding](../embedding/embedding.md) -- **Optimizing performance** with features like [model caching](../../modelcache/localmodel.md) and [KV cache offloading](../../kvcache-offloading/kvcache-offloading.md) +- **Optimizing performance** with features like [KV cache offloading](../../kvcache-offloading/kvcache-offloading.md) - **Auto-scaling** your inference services based on traffic patterns using [KServe's auto-scaling capabilities](../../autoscaling/autoscaling.md) - **Token based rate limiting** to control usage with [AI Gateway](../../ai-gateway/envoy-ai-gateway.md) for serving models. 
diff --git a/docs/model-serving/generative-inference/tasks/text2text-generation/text2text-generation.md b/docs/model-serving/generative-inference/tasks/text2text-generation/text2text-generation.md index a18699142..6e1e17eba 100644 --- a/docs/model-serving/generative-inference/tasks/text2text-generation/text2text-generation.md +++ b/docs/model-serving/generative-inference/tasks/text2text-generation/text2text-generation.md @@ -202,7 +202,7 @@ Once you've successfully deployed your text generation model, consider: - **Advanced serving options** like [multi-node inference](../../multi-node/multi-node.md) for large models - **Exploring other inference tasks** such as [reranking](../reranking/rerank.md) and [embedding](../embedding/embedding.md) -- **Optimizing performance** with features like [model caching](../../modelcache/localmodel.md) and [KV cache offloading](../../kvcache-offloading/kvcache-offloading.md) +- **Optimizing performance** with features like [KV cache offloading](../../kvcache-offloading/kvcache-offloading.md) - **Auto-scaling** your inference services based on traffic patterns using [KServe's auto-scaling capabilities](../../autoscaling/autoscaling.md) - **Token based rate limiting** to control usage with [AI Gateway](../../ai-gateway/envoy-ai-gateway.md) for serving models. 
diff --git a/docs/model-serving/predictive-inference/frameworks/overview.md b/docs/model-serving/predictive-inference/frameworks/overview.md index a228e80b0..babf6fd19 100644 --- a/docs/model-serving/predictive-inference/frameworks/overview.md +++ b/docs/model-serving/predictive-inference/frameworks/overview.md @@ -130,3 +130,4 @@ spec: - Learn about [custom model serving](https://github.com/kserve/kserve/tree/master/docs/samples/v1beta1/custom) - Check out the [sample implementations](https://github.com/kserve/kserve/tree/master/docs/samples/v1beta1) for hands-on tutorials - Read the [KServe developer guide](https://github.com/kserve/kserve/blob/master/docs/DEVELOPER_GUIDE.md) +- Optimizing performance with features like [model caching](../modelcache/localmodel.md) diff --git a/docs/model-serving/generative-inference/modelcache/localmodel.md b/docs/model-serving/predictive-inference/modelcache/localmodel.md similarity index 100% rename from docs/model-serving/generative-inference/modelcache/localmodel.md rename to docs/model-serving/predictive-inference/modelcache/localmodel.md diff --git a/sidebars.ts b/sidebars.ts index 7165ac635..737198c8e 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -113,7 +113,6 @@ const sidebars: SidebarsConfig = { }, "model-serving/generative-inference/sdk-integration/sdk-integration", "model-serving/generative-inference/kvcache-offloading/kvcache-offloading", - "model-serving/generative-inference/modelcache/localmodel", "model-serving/generative-inference/autoscaling/autoscaling", "model-serving/generative-inference/multi-node/multi-node", "model-serving/generative-inference/ai-gateway/envoy-ai-gateway", @@ -181,6 +180,7 @@ const sidebars: SidebarsConfig = { "model-serving/predictive-inference/transformers/feast-feature-store/feast-feature-store", ] }, + "model-serving/predictive-inference/modelcache/localmodel", { type: 'category', label: 'Model Explainability', diff --git a/src/components/HomepageBenefits/index.tsx 
b/src/components/HomepageBenefits/index.tsx index 904b9f101..df2b6aa7d 100644 --- a/src/components/HomepageBenefits/index.tsx +++ b/src/components/HomepageBenefits/index.tsx @@ -31,10 +31,6 @@ export default function HomepageBenefits() {
High-performance serving with GPU support and optimized memory management for large models
-Intelligent model caching to reduce loading times and improve response latency for frequently used models
-Advanced memory management with KV cache offloading to CPU/disk for handling longer sequences efficiently
@@ -71,6 +67,10 @@ export default function HomepageBenefits() {Request-based autoscaling with scale-to-zero for predictive workloads
Intelligent model caching to reduce loading times and improve response latency for frequently used models
+Built-in support for model explanations and feature attribution to understand prediction reasoning