
Commit aa820bd

Move function to functions.py and use multi-vector search
1 parent fb04748 commit aa820bd

4 files changed: +162 -67 lines

examples/image_search_colpali/README.md

Lines changed: 13 additions & 2 deletions
@@ -10,8 +10,8 @@ We appreciate a star ⭐ at [CocoIndex Github](https://github.com/cocoindex-io/c
 
 ## Technologies
 - CocoIndex for ETL and live update
-- **ColPali** - Multimodal Embeddings Model for images and query
-- Qdrant for Vector Storage (supports both gRPC and HTTP)
+- **ColPali** - Multimodal Embeddings Model for images and query with multi-vector late interaction
+- Qdrant for Vector Storage with multi-vector support and MaxSim scoring (supports both gRPC and HTTP)
 - FastAPI for backend
 - Ollama (Optional) for generating image captions using `gemma3` or other models
 
@@ -55,6 +55,17 @@ export OLLAMA_MODEL="gemma3" # Optional, for caption generation
 ```
 pip install -e .
 ```
+Note: ColPali embedding support is included in the cocoindex library with the `[embeddings]` extra.
+
+- The app automatically detects the ColPali model dimension and uses multi-vector embeddings with MaxSim scoring for optimal search performance.
+
+## Supported Models
+- Default: `vidore/colpali-v1.2` (128-dimensional embeddings)
+- Also supports: `vidore/colpali-v1.1`, `vidore/colpali-v1.3`, and other ColPali variants
+- To use a different model, set the environment variable:
+```sh
+export COLPALI_MODEL="vidore/colpali-v1.3"
+```
 
 - Run Backend
 ```
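For readers unfamiliar with the late-interaction scoring referenced above, the sketch below shows what MaxSim computes over multi-vector embeddings: for each query token vector, take the maximum similarity over all image patch vectors, then sum those maxima. This is an illustrative numpy sketch, not code from this commit.

```python
import numpy as np

def maxsim_score(query_vecs: np.ndarray, image_vecs: np.ndarray) -> float:
    """Late-interaction (MaxSim) score between a multi-vector query and image.

    query_vecs: [num_query_tokens, dim] -- one vector per query token.
    image_vecs: [num_patches, dim]      -- one vector per image patch.
    """
    # Pairwise similarities between every query token and every image patch.
    sims = query_vecs @ image_vecs.T  # [num_query_tokens, num_patches]
    # Each query token keeps only its best-matching patch; the score is the sum.
    return float(sims.max(axis=1).sum())
```

Qdrant can apply this comparator server-side when the vector field is configured for multi-vectors, so the application does not compute it itself (see the collection sketch after the main.py diff below).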

examples/image_search_colpali/main.py

Lines changed: 20 additions & 64 deletions
@@ -2,6 +2,7 @@
 import functools
 import io
 import os
+import typing
 from contextlib import asynccontextmanager
 from typing import Any, Literal
 
@@ -13,7 +14,6 @@
 from fastapi.staticfiles import StaticFiles
 from PIL import Image
 from qdrant_client import QdrantClient
-from colpali_engine.models import ColPali, ColPaliProcessor
 
 
 # --- Config ---
@@ -29,76 +29,28 @@
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/")
 QDRANT_COLLECTION = "ImageSearchColpali"
 COLPALI_MODEL_NAME = os.getenv("COLPALI_MODEL", "vidore/colpali-v1.2")
-COLPALI_MODEL_DIMENSION = 1031 # Set to match ColPali's output
+# Get ColPali embedding dimension dynamically from model
+HIDDEN_DIM = cocoindex.functions.get_colpali_dimension(COLPALI_MODEL_NAME)
+print(f"📐 Using ColPali model {COLPALI_MODEL_NAME} with {HIDDEN_DIM} hidden dimensions")
 
-# --- ColPali model cache and embedding functions ---
-_colpali_model_cache = {}
 
 
-def get_colpali_model(model: str = COLPALI_MODEL_NAME):
-    global _colpali_model_cache
-    if model not in _colpali_model_cache:
-        print(f"Loading ColPali model: {model}")
-        _colpali_model_cache[model] = {
-            "model": ColPali.from_pretrained(model),
-            "processor": ColPaliProcessor.from_pretrained(model),
-        }
-    return _colpali_model_cache[model]["model"], _colpali_model_cache[model][
-        "processor"
-    ]
-
-
-def colpali_embed_image(
-    img_bytes: bytes, model: str = COLPALI_MODEL_NAME
-) -> list[float]:
-    from PIL import Image
-    import torch
-    import io
-
-    colpali_model, processor = get_colpali_model(model)
-    pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
-    inputs = processor.process_images([pil_image])
-    with torch.no_grad():
-        embeddings = colpali_model(**inputs)
-    pooled_embedding = embeddings.mean(dim=-1)
-    result = pooled_embedding[0].cpu().numpy() # [1031]
-    return result.tolist()
-
-
-def colpali_embed_query(query: str, model: str = COLPALI_MODEL_NAME) -> list[float]:
-    import torch
-    import numpy as np
-
-    colpali_model, processor = get_colpali_model(model)
-    inputs = processor.process_queries([query])
-    with torch.no_grad():
-        embeddings = colpali_model(**inputs)
-    pooled_embedding = embeddings.mean(dim=-1)
-    query_tokens = pooled_embedding[0].cpu().numpy() # [15]
-    target_length = COLPALI_MODEL_DIMENSION
-    result = np.zeros(target_length, dtype=np.float32)
-    result[: min(len(query_tokens), target_length)] = query_tokens[:target_length]
-    return result.tolist()
-
-
-# --- End ColPali embedding functions ---
-
-
-def embed_query(text: str) -> list[float]:
+def embed_query(text: str) -> list[list[float]]:
     """
-    Embed the caption using ColPali model.
+    Embed the caption using ColPali model, returning multi-vector format.
     """
-    return colpali_embed_query(text, model=COLPALI_MODEL_NAME)
+    return cocoindex.functions.colpali_embed_query(text, model=COLPALI_MODEL_NAME)
 
 
 @cocoindex.op.function(cache=True, behavior_version=1, gpu=True)
 def embed_image(
     img_bytes: bytes,
-) -> cocoindex.Vector[cocoindex.Float32, Literal[COLPALI_MODEL_DIMENSION]]:
+) -> cocoindex.Vector[cocoindex.Vector[cocoindex.Float32, typing.Literal[HIDDEN_DIM]]]:
     """
-    Convert image to embedding using ColPali model.
+    Convert image to embedding using ColPali model, returning multi-vector format.
+    Returns variable number of patches, each with model-specific dimensional embeddings.
     """
-    return colpali_embed_image(img_bytes, model=COLPALI_MODEL_NAME)
+    return cocoindex.functions.colpali_embed_image(img_bytes, model=COLPALI_MODEL_NAME)
 
 
 @cocoindex.flow_def(name="ImageObjectEmbeddingColpali")
@@ -189,16 +141,20 @@ def search(
     q: str = Query(..., description="Search query"),
     limit: int = Query(5, description="Number of results"),
 ) -> Any:
-    # Get the embedding for the query
+    # Get the multi-vector embedding for the query
     query_embedding = embed_query(q)
+    print(f"🔍 Query multi-vector shape: {len(query_embedding)} tokens x {len(query_embedding[0]) if query_embedding else 0} dims")
 
-    # Search in Qdrant
-    search_results = app.state.qdrant_client.search(
+    # Search in Qdrant with multi-vector MaxSim scoring using query_points API
+    search_results = app.state.qdrant_client.query_points(
         collection_name=QDRANT_COLLECTION,
-        query_vector=("embedding", query_embedding),
+        query=query_embedding, # Multi-vector format: list[list[float]]
+        using="embedding", # Specify the vector field name
        limit=limit,
         with_payload=True,
     )
+
+    print(f"📈 Found {len(search_results.points)} results with MaxSim scoring")
 
     return {
         "results": [
@@ -207,6 +163,6 @@ def search(
                 "score": result.score,
                 "caption": result.payload.get("caption"),
             }
-            for result in search_results
+            for result in search_results.points
        ]
    }
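The `query_points` call above assumes the `embedding` field of the `ImageSearchColpali` collection was created with Qdrant's multi-vector configuration and MaxSim comparator. Collection setup is outside this diff, so the following is a minimal sketch of such a setup (the URL and per-vector size are illustrative; the README cites 128 dimensions for the default model):

```python
from qdrant_client import QdrantClient, models

# Illustrative connection; the example app reads its Qdrant URL from the environment.
client = QdrantClient(url="http://localhost:6333")

client.create_collection(
    collection_name="ImageSearchColpali",
    vectors_config={
        # Named vector, matching `using="embedding"` in the search endpoint above.
        "embedding": models.VectorParams(
            size=128,  # per-token/patch dimension (128 for vidore/colpali-v1.2 per the README)
            distance=models.Distance.COSINE,
            # Store a list of vectors per point and score them with MaxSim.
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM
            ),
        ),
    },
)
```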

examples/image_search_colpali/pyproject.toml

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@ dependencies = [
     "torch>=2.0.0",
     "qdrant-client>=1.14.2",
     "uvicorn>=0.34.3",
-    "colpali-engine>=0.1.0",
     "Pillow>=10.0.0",
     "numpy>=1.24.0",
 ]

python/cocoindex/functions.py

Lines changed: 129 additions & 0 deletions
@@ -99,3 +99,132 @@ def __call__(self, text: str) -> NDArray[np.float32]:
         assert self._model is not None
         result: NDArray[np.float32] = self._model.encode(text, convert_to_numpy=True)
         return result
+
+
+# ColPali model cache for ColPali embedding functions
+_colpali_model_cache = {}
+
+
+def get_colpali_model(model: str):
+    """Get or load ColPali model and processor."""
+    global _colpali_model_cache
+    if model not in _colpali_model_cache:
+        try:
+            from colpali_engine.models import ColPali, ColPaliProcessor
+        except ImportError as e:
+            raise ImportError(
+                "ColPali is not available. Make sure cocoindex is installed with ColPali support."
+            ) from e
+
+        model_instance = ColPali.from_pretrained(model)
+        processor_instance = ColPaliProcessor.from_pretrained(model)
+
+        # Try to get dimension from FastEmbed API first
+        output_dim = None
+        try:
+            from fastembed import LateInteractionMultimodalEmbedding
+
+            # Use the standard FastEmbed ColPali model for dimension detection
+            # All ColPali variants should have the same embedding dimension
+            standard_colpali_model = "Qdrant/colpali-v1.3-fp16"
+
+            # Try to find the model in FastEmbed's supported models
+            supported_models = LateInteractionMultimodalEmbedding.list_supported_models()
+            for supported_model in supported_models:
+                if supported_model["model"] == standard_colpali_model:
+                    output_dim = supported_model["dim"]
+                    break
+
+        except Exception:
+            # FastEmbed API failed, will fall back to model config
+            pass
+
+        # Fallback to model config if FastEmbed API failed
+        if output_dim is None:
+            if hasattr(model_instance, 'config'):
+                # Try different config attributes that might contain the hidden dimension
+                if hasattr(model_instance.config, 'hidden_size'):
+                    output_dim = model_instance.config.hidden_size
+                elif hasattr(model_instance.config, 'text_config') and hasattr(model_instance.config.text_config, 'hidden_size'):
+                    output_dim = model_instance.config.text_config.hidden_size
+                elif hasattr(model_instance.config, 'vision_config') and hasattr(model_instance.config.vision_config, 'hidden_size'):
+                    output_dim = model_instance.config.vision_config.hidden_size
+                else:
+                    raise ValueError(f"Could not find hidden_size in model config for {model}. Config attributes: {dir(model_instance.config)}")
+            else:
+                raise ValueError(f"Model {model} has no config attribute. Model attributes: {dir(model_instance)}")
+
+        _colpali_model_cache[model] = {
+            "model": model_instance,
+            "processor": processor_instance,
+            "dimension": output_dim,
+        }
+    return _colpali_model_cache[model]["model"], _colpali_model_cache[model]["processor"], _colpali_model_cache[model]["dimension"]
+
+
+def get_colpali_dimension(model: str) -> int:
+    """Get the output dimension for a ColPali model."""
+    _, _, dimension = get_colpali_model(model)
+    return dimension
+
+
+def colpali_embed_image(img_bytes: bytes, model: str) -> list[list[float]]:
+    """Embed image using ColPali model, returning multi-vector format."""
+    try:
+        from PIL import Image
+        import torch
+        import io
+    except ImportError as e:
+        raise ImportError(
+            "Required dependencies (PIL, torch) are missing for ColPali image embedding."
+        ) from e
+
+    colpali_model, processor, expected_dim = get_colpali_model(model)
+    pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
+    inputs = processor.process_images([pil_image])
+    with torch.no_grad():
+        embeddings = colpali_model(**inputs)
+
+    # Return multi-vector format: [patches, hidden_dim]
+    if len(embeddings.shape) != 3:
+        raise ValueError(f"Expected 3D tensor [batch, patches, hidden_dim], got shape {embeddings.shape}")
+
+    # Keep patch-level embeddings: [batch, patches, hidden_dim] -> [patches, hidden_dim]
+    patch_embeddings = embeddings[0] # Remove batch dimension
+
+    # Convert to list of lists: [[patch1_embedding], [patch2_embedding], ...]
+    result = []
+    for patch in patch_embeddings:
+        result.append(patch.cpu().numpy().tolist())
+
+    return result
+
+
+def colpali_embed_query(query: str, model: str) -> list[list[float]]:
+    """Embed query using ColPali model, returning multi-vector format."""
+    try:
+        import torch
+        import numpy as np
+    except ImportError as e:
+        raise ImportError(
+            "Required dependencies (torch, numpy) are missing for ColPali query embedding."
+        ) from e
+
+    colpali_model, processor, target_dimension = get_colpali_model(model)
+    inputs = processor.process_queries([query])
+    with torch.no_grad():
+        embeddings = colpali_model(**inputs)
+
+    # Return multi-vector format: [tokens, hidden_dim]
+    if len(embeddings.shape) != 3:
+        raise ValueError(f"Expected 3D tensor [batch, tokens, hidden_dim], got shape {embeddings.shape}")
+
+    # Keep token-level embeddings: [batch, tokens, hidden_dim] -> [tokens, hidden_dim]
+    token_embeddings = embeddings[0] # Remove batch dimension
+
+    # Convert to list of lists: [[token1_embedding], [token2_embedding], ...]
+    result = []
+    for token in token_embeddings:
+        result.append(token.cpu().numpy().tolist())
+
+    return result
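A brief usage sketch of the helpers added above, mirroring the calls main.py now makes (the image path and query text are illustrative; the model is downloaded and cached on first use):

```python
import cocoindex.functions as cf

MODEL = "vidore/colpali-v1.2"

# Per-token/patch embedding dimension detected for the chosen model.
dim = cf.get_colpali_dimension(MODEL)

# Multi-vector image embedding: one vector per image patch.
with open("example.jpg", "rb") as f:  # illustrative path
    image_vectors = cf.colpali_embed_image(f.read(), model=MODEL)

# Multi-vector query embedding: one vector per query token.
query_vectors = cf.colpali_embed_query("a cat on a sofa", model=MODEL)

print(f"{len(image_vectors)} patches x {len(image_vectors[0])} dims (dim={dim})")
print(f"{len(query_vectors)} tokens x {len(query_vectors[0])} dims")
```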
