Skip to content

Commit 7492681

Browse files
akashAD98SimFGleio10
authored
support for lancedb as vectordb (#644)
* Remove the Weaviate unit test Signed-off-by: SimFG <bang.fu@zilliz.com> Signed-off-by: akashAD98 <aksdesai1998@gmail.com> * fix: avoid loading redis if not needed Signed-off-by: akashAD98 <aksdesai1998@gmail.com> * Update the version to `0.1.43` Signed-off-by: SimFG <bang.fu@zilliz.com> Signed-off-by: akashAD98 <aksdesai1998@gmail.com> * add the note for the gptcache api Signed-off-by: akashAD <aksdesai1998@gmail.com> Signed-off-by: akashAD98 <aksdesai1998@gmail.com> * Fix the nil memory eviction when using the init_similar_cache method Signed-off-by: SimFG <bang.fu@zilliz.com> Signed-off-by: akashAD <aksdesai1998@gmail.com> Signed-off-by: akashAD98 <aksdesai1998@gmail.com> * Update the version to 0.1.44 Signed-off-by: SimFG <bang.fu@zilliz.com> Signed-off-by: akashAD <aksdesai1998@gmail.com> Signed-off-by: akashAD98 <aksdesai1998@gmail.com> * added support for lancedb as vectorstore Signed-off-by: akashAD <aksdesai1998@gmail.com> Signed-off-by: akashAD98 <aksdesai1998@gmail.com> * Fix pylint issues and improve codes structure Signed-off-by: akashAD <aksdesai1998@gmail.com> Signed-off-by: akashAD98 <aksdesai1998@gmail.com> * refactor & pylint fix code Signed-off-by: akashAD98 <aksdesai1998@gmail.com> * pylint issue fixing Signed-off-by: akashAD98 <aksdesai1998@gmail.com> --------- Signed-off-by: SimFG <bang.fu@zilliz.com> Signed-off-by: akashAD98 <aksdesai1998@gmail.com> Signed-off-by: akashAD <aksdesai1998@gmail.com> Co-authored-by: SimFG <bang.fu@zilliz.com> Co-authored-by: leio10 <leo.diez@factorial.co>
1 parent 17646e1 commit 7492681

File tree

8 files changed

+136
-0
lines changed

8 files changed

+136
-0
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ Slash Your LLM API Costs by 10x 💰, Boost Speed by 100x ⚡
1414

1515
📔 This project is undergoing swift development, and as such, the API may be subject to change at any time. For the most up-to-date information, please refer to the latest [documentation]( https://gptcache.readthedocs.io/en/latest/) and [release note](https://github.com/zilliztech/GPTCache/blob/main/docs/release_note.md).
1616

17+
**NOTE:** As the number of large models is growing explosively and their API shapes are constantly evolving, we no longer add support for new APIs or models. We encourage using the get and set API in gptcache; here is the demo code: https://github.com/zilliztech/GPTCache/blob/main/examples/adapter/api.py
18+
1719
## Quick Install
1820

1921
`pip install gptcache`

docs/configure_it.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ For the similar cache of text, only cache store and vector store are needed. If
224224
- docarray
225225
- usearch
226226
- redis
227+
- lancedb
227228

228229
### object store
229230

examples/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ Support vector database
274274
- Zilliz Cloud
275275
- FAISS
276276
- ChromaDB
277+
- LanceDB
277278

278279
> [Example code](https://github.com/zilliztech/GPTCache/blob/main/examples/data_manager/vector_store.py)
279280

examples/data_manager/vector_store.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def run():
2020
'docarray',
2121
'redis',
2222
'weaviate',
23+
'lancedb',
2324
]
2425
for vector_store in vector_stores:
2526
cache_base = CacheBase('sqlite')
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
from typing import List, Optional

import numpy as np

from gptcache.manager.vector_data.base import VectorBase, VectorData
from gptcache.utils import import_lancedb, import_torch

# Run the library checks BEFORE importing the optional packages below;
# otherwise a missing `lancedb` raises ImportError before the check helper
# ever gets a chance to run.
import_torch()
import_lancedb()

# NOTE(review): pyarrow is presumably pulled in as a lancedb dependency, so it
# is imported after the lancedb check as well -- confirm.
import lancedb  # pylint: disable=C0413
import pyarrow as pa  # pylint: disable=C0413
11+
12+
13+
class LanceDB(VectorBase):
    """Vector store: LanceDB

    :param persist_directory: The directory to persist, defaults to '/tmp/lancedb'.
        ``None`` is treated the same as the default.
    :type persist_directory: str
    :param table_name: The name of the table in LanceDB, defaults to 'gptcache'.
    :type table_name: str
    :param top_k: The number of the vectors results to return, defaults to 1.
    :type top_k: int
    """

    def __init__(
        self,
        persist_directory: Optional[str] = "/tmp/lancedb",
        table_name: str = "gptcache",
        top_k: int = 1,
    ):
        # Callers may forward an explicit ``None`` (the parameter is Optional);
        # fall back to the documented default instead of connecting to ``None``.
        if persist_directory is None:
            persist_directory = "/tmp/lancedb"
        self._persist_directory = persist_directory
        self._table_name = table_name
        self._top_k = top_k

        # Initialize LanceDB database
        self._db = lancedb.connect(self._persist_directory)

        # Open the table if it already exists; otherwise defer creation to the
        # first insertion, because the vector dimension is not known yet.
        if self._table_name in self._db.table_names():
            self._table = self._db.open_table(self._table_name)
        else:
            self._table = None

    def mul_add(self, datas: List[VectorData]):
        """Add multiple vectors to the LanceDB table.

        The table is created on the first non-empty insertion, with a
        fixed-size float32 vector column whose length is taken from the first
        vector. An empty ``datas`` is a no-op.

        :param datas: the vectors to store; each ``id`` is stored as a string.
        :type datas: List[VectorData]
        """
        if not datas:
            return

        rows = [{"id": str(data.id), "vector": data.data.tolist()} for data in datas]

        # Create table with the inferred schema if it doesn't exist
        if self._table is None:
            vector_dim = len(rows[0]["vector"])
            schema = pa.schema([
                pa.field("id", pa.string()),
                pa.field("vector", pa.list_(pa.float32(), list_size=vector_dim)),
            ])
            self._table = self._db.create_table(self._table_name, schema=schema)

        # Pass a concrete list of row dicts rather than a one-shot generator.
        self._table.add(rows)

    def search(self, data: np.ndarray, top_k: int = -1):
        """Search for the most similar vectors in the LanceDB table.

        :param data: the query vector.
        :type data: np.ndarray
        :param top_k: number of results to return; ``-1`` uses the store's ``top_k``.
        :type top_k: int
        :return: a list of ``(distance, id)`` tuples, empty when nothing is stored.
        """
        # The table does not exist until the first insertion.
        if self._table is None or len(self._table) == 0:
            return []

        if top_k == -1:
            top_k = self._top_k

        results = self._table.search(data.tolist()).limit(top_k).to_list()
        return [(result["_distance"], int(result["id"])) for result in results]

    def delete(self, ids: List[int]):
        """Delete vectors from the LanceDB table based on IDs.

        :param ids: the ids of the vectors to remove; a no-op when empty or
            when no table has been created yet.
        :type ids: List[int]
        """
        ids = list(ids)
        if self._table is None or not ids:
            return

        # Ids are stored as strings; escape quotes so the predicate stays
        # well-formed, and remove everything with a single IN filter.
        quoted_ids = ", ".join(
            "'{}'".format(str(vector_id).replace("'", "''")) for vector_id in ids
        )
        self._table.delete(f"id IN ({quoted_ids})")

    def rebuild(self, ids: Optional[List[int]] = None):
        """Rebuild the index, if applicable (currently a no-op that returns True)."""
        return True

    def count(self):
        """Return the total number of vectors in the table (0 if no table exists yet)."""
        if self._table is None:
            return 0
        return len(self._table)

gptcache/manager/vector_data/manager.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class VectorBase:
4242
`Chromadb` (with `top_k`, `client_settings`, `persist_directory`, `collection_name` params),
4343
`Hnswlib` (with `index_file_path`, `dimension`, `top_k`, `max_elements` params).
4444
`pgvector` (with `url`, `collection_name`, `index_params`, `top_k`, `dimension` params).
45+
`lancedb` (with `persist_directory`, `table_name`, `top_k` params).
4546
4647
:param name: the name of the vectorbase, it is support 'milvus', 'faiss', 'chromadb', 'hnswlib' now.
4748
:type name: str
@@ -91,6 +92,14 @@ class VectorBase:
9192
:param persist_directory: the directory to persist, defaults to '.chromadb/' in the current directory.
9293
:type persist_directory: str
9394
95+
:param client_settings: the setting for LanceDB.
96+
:param persist_directory: The directory to persist, defaults to '/tmp/lancedb'.
97+
:type persist_directory: str
98+
:param table_name: The name of the table in LanceDB, defaults to 'gptcache'.
99+
:type table_name: str
100+
:param top_k: The number of the vectors results to return, defaults to 1.
101+
:type top_k: int
102+
94103
:param index_path: the path to hnswlib index, defaults to 'hnswlib_index.bin'.
95104
:type index_path: str
96105
:param max_elements: max_elements of hnswlib, defaults 100000.
@@ -293,6 +302,20 @@ def get(name, **kwargs):
293302
class_schema=class_schema,
294303
top_k=top_k,
295304
)
305+
306+
elif name == "lancedb":
307+
from gptcache.manager.vector_data.lancedb import LanceDB
308+
309+
persist_directory = kwargs.get("persist_directory", None)
310+
table_name = kwargs.get("table_name", COLLECTION_NAME)
311+
top_k: int = kwargs.get("top_k", TOP_K)
312+
313+
vector_base = LanceDB(
314+
persist_directory=persist_directory,
315+
table_name=table_name,
316+
top_k=top_k,
317+
)
318+
296319
else:
297320
raise NotFoundError("vector store", name)
298321
return vector_base

gptcache/utils/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
"import_redis",
4444
"import_qdrant",
4545
"import_weaviate",
46+
"import_lancedb",
4647
]
4748

4849
import importlib.util
@@ -152,6 +153,8 @@ def import_duckdb():
152153
_check_library("duckdb", package="duckdb")
153154
_check_library("duckdb-engine", package="duckdb-engine")
154155

156+
def import_lancedb():
    """Run the library check for the ``lancedb`` package."""
    library_name = "lancedb"
    _check_library(library_name, package=library_name)
155158

156159
def import_sql_client(db_name):
157160
if db_name == "postgresql":
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import unittest
2+
import numpy as np
3+
from gptcache.manager import VectorBase
4+
from gptcache.manager.vector_data.base import VectorData
5+
6+
class TestLanceDB(unittest.TestCase):
    """Round-trip test for the LanceDB vector store: add, search, delete, count."""

    def test_normal(self):
        # Local import keeps this block self-contained.
        import tempfile

        # Use a fresh directory per run: LanceDB persists the table on disk, so
        # a fixed path would carry rows over from earlier runs and break the
        # count assertion below.
        with tempfile.TemporaryDirectory() as persist_directory:
            db = VectorBase("lancedb", persist_directory=persist_directory, top_k=3)

            # Add 100 vectors to the LanceDB
            db.mul_add([VectorData(id=i, data=np.random.sample(10)) for i in range(100)])

            # Perform a search with a random query vector
            search_res = db.search(np.random.sample(10))

            # Check that the search returns 3 results
            self.assertEqual(len(search_res), 3)

            # Delete vectors with specific IDs
            db.delete([1, 3, 5, 7])

            # Check that the count of vectors in the table is now 96
            self.assertEqual(db.count(), 96)

0 commit comments

Comments
 (0)