diff --git a/README.md b/README.md index 24618a3..6d33e5a 100644 --- a/README.md +++ b/README.md @@ -47,26 +47,32 @@ There are a couple of ways of doing upserts: ```python # as tuples, either of the form: +# - (id, vector, metadata, data) # - (id, vector, metadata) # - (id, vector) index.upsert( vectors=[ - ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}), - ("id2", [0.3, 0.4]), + ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}, "data-value"), + ("id2", [0.2, 0.2], {"metadata_field": "metadata_value"}), + ("id3", [0.3, 0.4]), ] ) ``` ```python # as dicts, either of the form: +# - {"id": id, "vector": vector, "metadata": metadata, "data": data) # - {"id": id, "vector": vector, "metadata": metadata) +# - {"id": id, "vector": vector, "data": data) # - {"id": id, "vector": vector} index.upsert( vectors=[ - {"id": "id3", "vector": [0.1, 0.2], "metadata": {"field": "value"}}, - {"id": "id4", "vector": [0.5, 0.6]}, + {"id": "id4", "vector": [0.1, 0.2], "metadata": {"field": "value"}, "data": "value"}, + {"id": "id5", "vector": [0.1, 0.2], "metadata": {"field": "value"}}, + {"id": "id6", "vector": [0.1, 0.2], "data": "value"}, + {"id": "id7", "vector": [0.5, 0.6]}, ] ) ``` @@ -79,12 +85,15 @@ from upstash_vector import Vector index.upsert( vectors=[ Vector(id="id5", vector=[1, 2], metadata={"field": "value"}), - Vector(id="id6", vector=[6, 7]), + Vector(id="id6", vector=[1, 2], data="value"), + Vector(id="id7", vector=[6, 7]), ] ) ``` If the index is created with an embedding model, raw string data can be upserted. +In this case, the `data` field of the vector will also be set to the `data` passed +below, so that it can be accessed later. ```python from upstash_vector import Data @@ -121,6 +130,7 @@ res = index.query( top_k=5, include_vectors=False, include_metadata=True, + include_data=True, filter="metadata_f = 'metadata_v'" ) @@ -131,6 +141,7 @@ for r in res: r.score, # The similarity score of this vector to the query vector. Higher is more similar. 
r.vector, # The value of the vector, if requested. r.metadata, # The metadata of the vector, if requested and present. + r.data, # The data of the vector, if requested and present. ) ``` @@ -142,6 +153,7 @@ res = index.query( top_k=5, include_vectors=False, include_metadata=True, + include_data=True, ) ``` @@ -171,6 +183,7 @@ res = index.fetch( ids=["id3", "id4"], include_vectors=False, include_metadata=True, + include_data=True, ) # List of fetch results, one for each id passed @@ -181,7 +194,8 @@ for r in res: print( r.id, # The id used while upserting the vector r.vector, # The value of the vector, if requested. - r.medata, # The metadata of the vector, if requested and present. + r.metadata, # The metadata of the vector, if requested and present. + r.data, # The data of the vector, if requested and present. ) ``` @@ -192,6 +206,7 @@ res = index.fetch( "id1", include_vectors=True, include_metadata=True, + include_data=False, ) r = res[0] @@ -199,7 +214,8 @@ if r: # Can be None, if there is no such vector with the given id print( r.id, # The id used while upserting the vector r.vector, # The value of the vector, if requested. - r.medata, # The metadata of the vector, if requested and present. + r.metadata, # The metadata of the vector, if requested and present. + r.data, # The data of the vector, if requested and present. ) ``` @@ -225,6 +241,7 @@ res = index.range( limit=100, include_vectors=False, include_metadata=True, + include_data=True, ) while res.next_cursor != "": @@ -233,6 +250,7 @@ while res.next_cursor != "": limit=100, include_vectors=False, include_metadata=True, + include_data=True, ) for v in res.vectors: @@ -240,6 +258,7 @@ while res.next_cursor != "": v.id, # The id used while upserting the vector v.vector, # The value of the vector, if requested. v.metadata, # The metadata of the vector, if requested and present. + v.data, # The data of the vector, if requested and present. 
) ``` diff --git a/tests/conftest.py b/tests/conftest.py index a7ace3b..4e13f68 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,8 @@ +from os import environ + import pytest import pytest_asyncio -from os import environ - from tests import NAMESPACES from upstash_vector import Index, AsyncIndex diff --git a/tests/core/test_fetch.py b/tests/core/test_fetch.py index 321b20c..1e7f736 100644 --- a/tests/core/test_fetch.py +++ b/tests/core/test_fetch.py @@ -136,6 +136,56 @@ def test_fetch_single(index: Index, ns: str): assert res[0].vector == v1_values +@pytest.mark.parametrize("ns", NAMESPACES) +def test_fetch_with_data(index: Index, ns: str): + v1_id = "v1-id1" + v1_metadata = {"metadata_field": "metadata_value"} + v1_data = "data1" + v1_values = [0.1, 0.2] + + v2_id = "v1-id2" + v2_values = [0.3, 0.4] + + v3_id = "v1-id3" + v3_values = [0.5, 0.6] + v3_data = "data3" + + index.upsert( + vectors=[ + (v1_id, v1_values, v1_metadata, v1_data), + (v2_id, v2_values), + (v3_id, v3_values, None, v3_data), + ], + namespace=ns, + ) + + res = index.fetch( + ids=[v1_id, v2_id, v3_id], + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert res[0] is not None + assert res[0].id == v1_id + assert res[0].metadata == v1_metadata + assert res[0].vector == v1_values + assert res[0].data == v1_data + + assert res[1] is not None + assert res[1].id == v2_id + assert res[1].metadata is None + assert res[1].vector == v2_values + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values + assert res[2].data == v3_data + + @pytest.mark.asyncio @pytest.mark.parametrize("ns", NAMESPACES) async def test_fetch_with_vectors_with_metadata_async(async_index: AsyncIndex, ns: str): @@ -277,3 +327,54 @@ async def test_fetch_single_async(async_index: AsyncIndex, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert 
res[0].vector == v1_values + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_fetch_with_data_async(async_index: AsyncIndex, ns: str): + v1_id = "v1-id1" + v1_metadata = {"metadata_field": "metadata_value"} + v1_data = "data1" + v1_values = [0.1, 0.2] + + v2_id = "v1-id2" + v2_values = [0.3, 0.4] + + v3_id = "v1-id3" + v3_values = [0.5, 0.6] + v3_data = "data3" + + await async_index.upsert( + vectors=[ + (v1_id, v1_values, v1_metadata, v1_data), + (v2_id, v2_values), + (v3_id, v3_values, None, v3_data), + ], + namespace=ns, + ) + + res = await async_index.fetch( + ids=[v1_id, v2_id, v3_id], + include_vectors=True, + include_metadata=True, + include_data=True, + namespace=ns, + ) + + assert res[0] is not None + assert res[0].id == v1_id + assert res[0].metadata == v1_metadata + assert res[0].vector == v1_values + assert res[0].data == v1_data + + assert res[1] is not None + assert res[1].id == v2_id + assert res[1].metadata is None + assert res[1].vector == v2_values + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values + assert res[2].data == v3_data diff --git a/tests/core/test_query.py b/tests/core/test_query.py index d8e20c1..6128617 100644 --- a/tests/core/test_query.py +++ b/tests/core/test_query.py @@ -1,11 +1,10 @@ +import numpy as np +import pandas as pd import pytest -from tests import assert_eventually, assert_eventually_async, NAMESPACES +from tests import assert_eventually, assert_eventually_async, NAMESPACES from upstash_vector import Index, AsyncIndex -import numpy as np -import pandas as pd - @pytest.mark.parametrize("ns", NAMESPACES) def test_query_with_vectors_with_metadata(index: Index, ns: str): @@ -315,6 +314,112 @@ def assertion(): assert_eventually(assertion) +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_many(index: Index, ns: str): + index.upsert( + vectors=[ + ("id0", [0.1, 0.1], {"0": 0}), + 
("id1", [1, 1], {"1": 1}), + ("id2", [2, 2], {"2": 2}), + ], + namespace=ns, + ) + + def assertion(): + res = index.query_many( + queries=[ + { + "vector": [0.1, 0.1], + "top_k": 1, + }, + ], + namespace=ns, + ) + + assert len(res) == 1 + assert len(res[0]) == 1 + assert res[0][0].id == "id0" + + res = index.query_many( + queries=[ + { + "vector": [1, 1], + }, + { + "vector": [2, 2], + "top_k": 1, + "include_vectors": True, + }, + ], + namespace=ns, + ) + + assert len(res) == 2 + assert len(res[0]) == 3 + + assert len(res[1]) == 1 + assert res[1][0].vector is not None + + assert_eventually(assertion) + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_query_with_data_with_vector_with_metadata(index: Index, ns: str): + v1_id = "id1" + v1_metadata = {"metadata_field": "metadata_value"} + v1_values = [0.1, 0.2] + + v2_id = "id2" + v2_values = [0.3, 0.4] + + v3_id = "id3" + v3_values = [0.5, 0.6] + v3_data = "data" + + index.upsert( + vectors=[ + (v1_id, v1_values, v1_metadata), + (v2_id, v2_values), + (v3_id, v3_values, None, v3_data), + ], + namespace=ns, + ) + + def assertion(): + query_res = index.query( + v3_values, + top_k=1, + include_metadata=True, + include_vectors=True, + include_data=True, + namespace=ns, + ) + assert len(query_res) == 1 + + assert query_res[0].id == v3_id + assert query_res[0].metadata is None + assert query_res[0].score == 1 + assert query_res[0].vector == v3_values + assert query_res[0].data == v3_data + + query_res = index.query( + v1_values, + top_k=2, + include_metadata=True, + include_vectors=True, + namespace=ns, + ) + assert len(query_res) == 2 + + assert query_res[1].id == v2_id + assert query_res[1].metadata is None + assert query_res[1].score < 1 + assert query_res[1].vector == v2_values + assert query_res[1].data is None + + assert_eventually(assertion) + + @pytest.mark.asyncio @pytest.mark.parametrize("ns", NAMESPACES) async def test_query_with_vectors_with_metadata_async(async_index: AsyncIndex, ns: str): @@ -810,3 
+915,113 @@ async def assertion(): assert len(query_res) == 2 await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_many_async(async_index: AsyncIndex, ns: str): + await async_index.upsert( + vectors=[ + ("id0", [0.1, 0.1], {"0": 0}), + ("id1", [1, 1], {"1": 1}), + ("id2", [2, 2], {"2": 2}), + ], + namespace=ns, + ) + + async def assertion(): + res = await async_index.query_many( + queries=[ + { + "vector": [0.1, 0.1], + "top_k": 1, + }, + ], + namespace=ns, + ) + + assert len(res) == 1 + assert len(res[0]) == 1 + assert res[0][0].id == "id0" + + res = await async_index.query_many( + queries=[ + { + "vector": [1, 1], + }, + { + "vector": [2, 2], + "top_k": 1, + "include_vectors": True, + }, + ], + namespace=ns, + ) + + assert len(res) == 2 + assert len(res[0]) == 3 + + assert len(res[1]) == 1 + assert res[1][0].vector is not None + + await assert_eventually_async(assertion) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_query_with_data_with_vector_with_metadata_async( + async_index: AsyncIndex, ns: str +): + v1_id = "id1" + v1_metadata = {"metadata_field": "metadata_value"} + v1_values = [0.1, 0.2] + + v2_id = "id2" + v2_values = [0.3, 0.4] + + v3_id = "id3" + v3_values = [0.5, 0.6] + v3_data = "data" + + await async_index.upsert( + vectors=[ + (v1_id, v1_values, v1_metadata), + (v2_id, v2_values), + (v3_id, v3_values, None, v3_data), + ], + namespace=ns, + ) + + async def assertion(): + query_res = await async_index.query( + v3_values, + top_k=1, + include_metadata=True, + include_vectors=True, + include_data=True, + namespace=ns, + ) + assert len(query_res) == 1 + + assert query_res[0].id == v3_id + assert query_res[0].metadata is None + assert query_res[0].score == 1 + assert query_res[0].vector == v3_values + assert query_res[0].data == v3_data + + query_res = await async_index.query( + v1_values, + top_k=2, + include_metadata=True, + 
include_vectors=True, + namespace=ns, + ) + assert len(query_res) == 2 + + assert query_res[1].id == v2_id + assert query_res[1].metadata is None + assert query_res[1].score < 1 + assert query_res[1].vector == v2_values + assert query_res[1].data is None + + await assert_eventually_async(assertion) diff --git a/tests/core/test_range.py b/tests/core/test_range.py index c29e36c..b8bc2de 100644 --- a/tests/core/test_range.py +++ b/tests/core/test_range.py @@ -1,6 +1,7 @@ +import random + import pytest from pytest import raises -import random from tests import NAMESPACES from upstash_vector import Index, AsyncIndex @@ -10,7 +11,12 @@ @pytest.mark.parametrize("ns", NAMESPACES) def test_range(index: Index, ns: str): vectors = [ - {"id": f"id-{i}", "vector": [random.random() for _ in range(2)]} + { + "id": f"id-{i}", + "vector": [random.random() for _ in range(2)], + "metadata": {"meta": i}, + "data": f"data-{i}", + } for i in range(20) ] @@ -20,11 +26,18 @@ def test_range(index: Index, ns: str): cursor="", limit=4, include_vectors=True, + include_metadata=True, + include_data=True, namespace=ns, ) assert len(res.vectors) == 4 assert res.next_cursor != "" + for i in range(4): + assert res.vectors[i].id == f"id-{i}" + assert res.vectors[i].metadata == {"meta": i} + assert res.vectors[i].data == f"data-{i}" + while res.next_cursor != "": res = index.range( cursor=res.next_cursor, @@ -47,7 +60,12 @@ def test_range(index: Index, ns: str): @pytest.mark.parametrize("ns", NAMESPACES) async def test_range_async(async_index: AsyncIndex, ns: str): vectors = [ - {"id": f"id-{i}", "vector": [random.random() for _ in range(2)]} + { + "id": f"id-{i}", + "vector": [random.random() for _ in range(2)], + "metadata": {"meta": i}, + "data": f"data-{i}", + } for i in range(20) ] @@ -57,11 +75,18 @@ async def test_range_async(async_index: AsyncIndex, ns: str): cursor="", limit=4, include_vectors=True, + include_metadata=True, + include_data=True, namespace=ns, ) assert len(res.vectors) == 4 
assert res.next_cursor != "" + for i in range(4): + assert res.vectors[i].id == f"id-{i}" + assert res.vectors[i].metadata == {"meta": i} + assert res.vectors[i].data == f"data-{i}" + while res.next_cursor != "": res = await async_index.range( cursor=res.next_cursor, diff --git a/tests/core/test_update.py b/tests/core/test_update.py index 2a95a27..2da0cae 100644 --- a/tests/core/test_update.py +++ b/tests/core/test_update.py @@ -2,7 +2,7 @@ from tests import NAMESPACES from upstash_vector import Index, AsyncIndex -from upstash_vector.errors import ClientError +from upstash_vector.types import MetadataUpdateMode @pytest.mark.parametrize("ns", NAMESPACES) @@ -33,19 +33,25 @@ def test_update_data(embedding_index: Index, ns: str): namespace=ns, ) - res = embedding_index.fetch("id-0", include_vectors=True, namespace=ns) + res = embedding_index.fetch( + "id-0", include_vectors=True, include_data=True, namespace=ns + ) assert len(res) == 1 assert res[0] is not None old_vector = res[0].vector + old_data = res[0].data updated = embedding_index.update("id-0", data="bye", namespace=ns) assert updated is True - res = embedding_index.fetch("id-0", include_vectors=True, namespace=ns) + res = embedding_index.fetch( + "id-0", include_vectors=True, include_data=True, namespace=ns + ) assert len(res) == 1 assert res[0] is not None assert res[0].vector != old_vector + assert res[0].data != old_data @pytest.mark.parametrize("ns", NAMESPACES) @@ -69,17 +75,59 @@ def test_update_metadata(index: Index, ns: str): assert res[0].metadata == {"new_field": "new_value"} +@pytest.mark.parametrize("ns", NAMESPACES) +def test_patch_metadata(index: Index, ns: str): + index.upsert( + [("id-0", [0.1, 0.2], {"field": "value", "field2": "value2"})], + namespace=ns, + ) + + res = index.fetch("id-0", include_metadata=True, namespace=ns) + assert len(res) == 1 + assert res[0] is not None + assert res[0].metadata == {"field": "value", "field2": "value2"} + + updated = index.update( + "id-0", + 
metadata={"new_field": "new_value", "field2": None}, + namespace=ns, + metadata_update_mode=MetadataUpdateMode.PATCH, + ) + assert updated is True + + res = index.fetch("id-0", include_metadata=True, namespace=ns) + assert len(res) == 1 + assert res[0] is not None + assert res[0].metadata == {"field": "value", "new_field": "new_value"} + + +@pytest.mark.parametrize("ns", NAMESPACES) +def test_update_vector_data(index: Index, ns: str): + index.upsert( + [("id-0", [0.1, 0.2], None, "data")], + namespace=ns, + ) + + res = index.fetch("id-0", include_data=True, namespace=ns) + assert len(res) == 1 + assert res[0] is not None + assert res[0].data == "data" + + updated = index.update("id-0", data="new-data", namespace=ns) + assert updated is True + + res = index.fetch("id-0", include_data=True, namespace=ns) + assert len(res) == 1 + assert res[0] is not None + assert res[0].data == "new-data" + + @pytest.mark.parametrize("ns", NAMESPACES) def test_update_non_existing_id(index: Index, ns: str): updated = index.update("id-999", vector=[0.4, 0.5], namespace=ns) assert updated is False -def test_update_too_many_params(index: Index): - with pytest.raises(ClientError): - index.update("id-0", vector=[0.2, 0.3], metadata={"new_field": "new_value"}) - - @pytest.mark.asyncio @pytest.mark.parametrize("ns", NAMESPACES) async def test_update_vector_async(async_index: AsyncIndex, ns: str): @@ -110,19 +158,25 @@ async def test_update_data_async(async_embedding_index: AsyncIndex, ns: str): namespace=ns, ) - res = await async_embedding_index.fetch("id-0", include_vectors=True, namespace=ns) + res = await async_embedding_index.fetch( + "id-0", include_vectors=True, include_data=True, namespace=ns + ) assert len(res) == 1 assert res[0] is not None old_vector = res[0].vector + old_data = res[0].data updated = await async_embedding_index.update("id-0", data="bye", namespace=ns) assert updated is True - res = await async_embedding_index.fetch("id-0", include_vectors=True, namespace=ns) + res 
= await async_embedding_index.fetch( + "id-0", include_vectors=True, include_data=True, namespace=ns + ) assert len(res) == 1 assert res[0] is not None assert res[0].vector != old_vector + assert res[0].data != old_data @pytest.mark.asyncio @@ -151,14 +205,55 @@ async def test_update_metadata_async(async_index: AsyncIndex, ns: str): @pytest.mark.asyncio @pytest.mark.parametrize("ns", NAMESPACES) -async def test_update_non_existing_id_async(async_index: AsyncIndex, ns: str): - updated = await async_index.update("id-999", vector=[0.4, 0.5], namespace=ns) - assert updated is False +async def test_patch_metadata_async(async_index: AsyncIndex, ns: str): + await async_index.upsert( + [("id-0", [0.1, 0.2], {"field": "value", "field2": "value2"})], + namespace=ns, + ) + + res = await async_index.fetch("id-0", include_metadata=True, namespace=ns) + assert len(res) == 1 + assert res[0] is not None + assert res[0].metadata == {"field": "value", "field2": "value2"} + + updated = await async_index.update( + "id-0", + metadata={"new_field": "new_value", "field2": None}, + namespace=ns, + metadata_update_mode=MetadataUpdateMode.PATCH, + ) + assert updated is True + + res = await async_index.fetch("id-0", include_metadata=True, namespace=ns) + assert len(res) == 1 + assert res[0] is not None + assert res[0].metadata == {"field": "value", "new_field": "new_value"} @pytest.mark.asyncio -async def test_update_too_many_params_async(async_index: AsyncIndex): - with pytest.raises(ClientError): - await async_index.update( - "id-0", vector=[0.2, 0.3], metadata={"new_field": "new_value"} - ) +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_update_vector_data_async(async_index: AsyncIndex, ns: str): + await async_index.upsert( + [("id-0", [0.1, 0.2], None, "data")], + namespace=ns, + ) + + res = await async_index.fetch("id-0", include_data=True, namespace=ns) + assert len(res) == 1 + assert res[0] is not None + assert res[0].data == "data" + + updated = await 
async_index.update("id-0", data="new-data", namespace=ns) + assert updated is True + + res = await async_index.fetch("id-0", include_data=True, namespace=ns) + assert len(res) == 1 + assert res[0] is not None + assert res[0].data == "new-data" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("ns", NAMESPACES) +async def test_update_non_existing_id_async(async_index: AsyncIndex, ns: str): + updated = await async_index.update("id-999", vector=[0.4, 0.5], namespace=ns) + assert updated is False diff --git a/tests/core/test_upsert.py b/tests/core/test_upsert.py index b2cf261..833eaab 100644 --- a/tests/core/test_upsert.py +++ b/tests/core/test_upsert.py @@ -19,18 +19,24 @@ def test_upsert_tuple(index: Index, ns: str): v2_id = "id2" v2_values = [0.3, 0.4] + v3_id = "id3" + v3_values = [0.5, 0.6] + v3_data = "data-value" + index.upsert( vectors=[ (v1_id, v1_values, v1_metadata), (v2_id, v2_values), + (v3_id, v3_values, None, v3_data), ], namespace=ns, ) res = index.fetch( - ids=[v1_id, v2_id], + ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -38,11 +44,19 @@ def test_upsert_tuple(index: Index, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert res[0].vector == v1_values + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values + assert res[2].data == v3_data @pytest.mark.asyncio @@ -55,18 +69,24 @@ async def test_upsert_tuple_async(async_index: AsyncIndex, ns: str): v2_id = "id2" v2_values = [0.3, 0.4] + v3_id = "id3" + v3_values = [0.5, 0.6] + v3_data = "data-value" + await async_index.upsert( vectors=[ (v1_id, v1_values, v1_metadata), (v2_id, v2_values), + (v3_id, v3_values, None, v3_data), ], namespace=ns, ) res = await 
async_index.fetch( - ids=[v1_id, v2_id], + ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -74,11 +94,19 @@ async def test_upsert_tuple_async(async_index: AsyncIndex, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert res[0].vector == v1_values + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values + assert res[2].data == v3_data @pytest.mark.parametrize("ns", NAMESPACES) @@ -90,18 +118,24 @@ def test_upsert_dict(index: Index, ns: str): v2_id = "dict_id2" v2_values = [0.3, 0.4] + v3_id = "id3" + v3_values = [0.5, 0.6] + v3_data = "data-value" + index.upsert( vectors=[ {"id": v1_id, "vector": v1_values, "metadata": v1_metadata}, {"id": v2_id, "vector": v2_values}, + {"id": v3_id, "vector": v3_values, "data": v3_data}, ], namespace=ns, ) res = index.fetch( - ids=[v1_id, v2_id], + ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -109,11 +143,19 @@ def test_upsert_dict(index: Index, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert res[0].vector == v1_values + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values + assert res[2].data == v3_data @pytest.mark.asyncio @@ -126,18 +168,24 @@ async def test_upsert_dict_async(async_index: AsyncIndex, ns: str): v2_id = "dict_id2" v2_values = [0.3, 0.4] + v3_id = "id3" + v3_values = [0.5, 0.6] + v3_data = "data-value" + await async_index.upsert( vectors=[ 
{"id": v1_id, "vector": v1_values, "metadata": v1_metadata}, {"id": v2_id, "vector": v2_values}, + {"id": v3_id, "vector": v3_values, "data": v3_data}, ], namespace=ns, ) res = await async_index.fetch( - ids=[v1_id, v2_id], + ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -145,11 +193,19 @@ async def test_upsert_dict_async(async_index: AsyncIndex, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert res[0].vector == v1_values + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values + assert res[2].data == v3_data @pytest.mark.parametrize("ns", NAMESPACES) @@ -161,18 +217,24 @@ def test_upsert_vector(index: Index, ns: str): v2_id = "vector_id2" v2_values = [0.3, 0.4] + v3_id = "id3" + v3_values = [0.5, 0.6] + v3_data = "data-value" + index.upsert( vectors=[ Vector(id=v1_id, vector=v1_values, metadata=v1_metadata), Vector(id=v2_id, vector=v2_values), + Vector(id=v3_id, vector=v3_values, data=v3_data), ], namespace=ns, ) res = index.fetch( - ids=[v1_id, v2_id], + ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -180,11 +242,19 @@ def test_upsert_vector(index: Index, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert res[0].vector == v1_values + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values + assert res[2].data == v3_data @pytest.mark.asyncio @@ -197,18 +267,24 @@ async def 
test_upsert_vector_async(async_index: AsyncIndex, ns: str): v2_id = "vector_id2" v2_values = [0.3, 0.4] + v3_id = "id3" + v3_values = [0.5, 0.6] + v3_data = "data-value" + await async_index.upsert( vectors=[ Vector(id=v1_id, vector=v1_values, metadata=v1_metadata), Vector(id=v2_id, vector=v2_values), + Vector(id=v3_id, vector=v3_values, data=v3_data), ], namespace=ns, ) res = await async_index.fetch( - ids=[v1_id, v2_id], + ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -216,11 +292,19 @@ async def test_upsert_vector_async(async_index: AsyncIndex, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert res[0].vector == v1_values + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values + assert res[2].data == v3_data @pytest.mark.parametrize("ns", NAMESPACES) @@ -232,18 +316,24 @@ def test_upsert_tuple_with_numpy(index: Index, ns: str): v2_id = "id2" v2_values = np.array([0.3, 0.4]) + v3_id = "id3" + v3_values = np.array([0.5, 0.6]) + v3_data = "data-value" + index.upsert( vectors=[ (v1_id, v1_values, v1_metadata), (v2_id, v2_values), + (v3_id, v3_values, None, v3_data), ], namespace=ns, ) res = index.fetch( - ids=[v1_id, v2_id], + ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -251,11 +341,19 @@ def test_upsert_tuple_with_numpy(index: Index, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert res[0].vector == v1_values.tolist() + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values.tolist() + assert res[1].data is None + + assert res[2] is not None + 
assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values.tolist() + assert res[2].data == v3_data @pytest.mark.parametrize("ns", NAMESPACES) @@ -267,18 +365,24 @@ def test_upsert_dict_with_numpy(index: Index, ns: str): v2_id = "dict_id2" v2_values = np.array([0.3, 0.4]) + v3_id = "dict_id3" + v3_values = np.array([0.5, 0.6]) + v3_data = "data-value" + index.upsert( vectors=[ {"id": v1_id, "vector": v1_values, "metadata": v1_metadata}, {"id": v2_id, "vector": v2_values}, + {"id": v3_id, "vector": v3_values, "data": v3_data}, ], namespace=ns, ) res = index.fetch( - ids=[v1_id, v2_id], + ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -286,11 +390,19 @@ def test_upsert_dict_with_numpy(index: Index, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert res[0].vector == v1_values.tolist() + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values.tolist() + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values.tolist() + assert res[2].data == v3_data @pytest.mark.parametrize("ns", NAMESPACES) @@ -302,18 +414,24 @@ def test_upsert_vector_with_numpy(index: Index, ns: str): v2_id = "vector_id2" v2_values = np.array([0.3, 0.4]) + v3_id = "vector_id3" + v3_values = np.array([0.5, 0.6]) + v3_data = "data-value" + index.upsert( vectors=[ Vector(id=v1_id, vector=v1_values, metadata=v1_metadata), Vector(id=v2_id, vector=v2_values), + Vector(id=v3_id, vector=v3_values, data=v3_data), ], namespace=ns, ) res = index.fetch( - ids=[v1_id, v2_id], + ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -321,11 +439,19 @@ def test_upsert_vector_with_numpy(index: Index, ns: str): assert res[0].id == v1_id assert 
res[0].metadata == v1_metadata assert res[0].vector == v1_values.tolist() + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values.tolist() + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values.tolist() + assert res[2].data == v3_data @pytest.mark.parametrize("ns", NAMESPACES) @@ -337,20 +463,24 @@ def test_upsert_tuple_with_pandas(index: Index, ns: str): v2_id = "id2" v2_values = pd.array([0.3, 0.4]) - assert v2_values == [0.3, 0.4] + v3_id = "id3" + v3_values = pd.array([0.5, 0.6]) + v3_data = "data-value" index.upsert( vectors=[ (v1_id, v1_values, v1_metadata), (v2_id, v2_values), + (v3_id, v3_values, None, v3_data), ], namespace=ns, ) res = index.fetch( - ids=[v1_id, v2_id], + ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -358,11 +488,19 @@ def test_upsert_tuple_with_pandas(index: Index, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert res[0].vector == v1_values + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values + assert res[2].data == v3_data @pytest.mark.parametrize("ns", NAMESPACES) @@ -374,18 +512,24 @@ def test_upsert_dict_with_pandas(index: Index, ns: str): v2_id = "dict_id2" v2_values = pd.array([0.3, 0.4]) + v3_id = "dict_id3" + v3_values = pd.array([0.5, 0.6]) + v3_data = "data-value" + index.upsert( vectors=[ {"id": v1_id, "vector": v1_values, "metadata": v1_metadata}, {"id": v2_id, "vector": v2_values}, + {"id": v3_id, "vector": v3_values, "data": v3_data}, ], namespace=ns, ) res = index.fetch( - ids=[v1_id, v2_id], + 
ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -393,11 +537,19 @@ def test_upsert_dict_with_pandas(index: Index, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert res[0].vector == v1_values + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values + assert res[2].data == v3_data @pytest.mark.parametrize("ns", NAMESPACES) @@ -409,18 +561,24 @@ def test_upsert_vector_with_pandas(index: Index, ns: str): v2_id = "vector_id2" v2_values = pd.array([0.3, 0.4]) + v3_id = "vector_id3" + v3_values = pd.array([0.5, 0.6]) + v3_data = "data-value" + index.upsert( vectors=[ Vector(id=v1_id, vector=v1_values, metadata=v1_metadata), Vector(id=v2_id, vector=v2_values), + Vector(id=v3_id, vector=v3_values, data=v3_data), ], namespace=ns, ) res = index.fetch( - ids=[v1_id, v2_id], + ids=[v1_id, v2_id, v3_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) @@ -428,11 +586,19 @@ def test_upsert_vector_with_pandas(index: Index, ns: str): assert res[0].id == v1_id assert res[0].metadata == v1_metadata assert res[0].vector == v1_values + assert res[0].data is None assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None assert res[1].vector == v2_values + assert res[1].data is None + + assert res[2] is not None + assert res[2].id == v3_id + assert res[2].metadata is None + assert res[2].vector == v3_values + assert res[2].data == v3_data @pytest.mark.parametrize("ns", NAMESPACES) @@ -456,16 +622,19 @@ def test_upsert_data(embedding_index: Index, ns: str): ids=[v1_id, v2_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) assert res[0] is not None assert 
res[0].id == v1_id assert res[0].metadata == v1_metadata + assert res[0].data == v1_data assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None + assert res[1].data == v2_data @pytest.mark.asyncio @@ -490,16 +659,19 @@ async def test_upsert_data_async(async_embedding_index: AsyncIndex, ns: str): ids=[v1_id, v2_id], include_vectors=True, include_metadata=True, + include_data=True, namespace=ns, ) assert res[0] is not None assert res[0].id == v1_id assert res[0].metadata == v1_metadata + assert res[0].data == v1_data assert res[1] is not None assert res[1].id == v2_id assert res[1].metadata is None + assert res[1].data == v2_data @pytest.mark.parametrize("ns", NAMESPACES) diff --git a/upstash_vector/core/index_operations.py b/upstash_vector/core/index_operations.py index ab2a420..3f71d21 100644 --- a/upstash_vector/core/index_operations.py +++ b/upstash_vector/core/index_operations.py @@ -2,10 +2,13 @@ # Upsert and query functions and signatures from typing import Sequence, Union, List, Dict, Optional, Any + from upstash_vector.errors import ClientError from upstash_vector.types import ( Data, DeleteResult, + MetadataUpdateMode, + QueryRequest, RangeResult, InfoResult, SupportsToList, @@ -13,8 +16,12 @@ QueryResult, Vector, ) - -from upstash_vector.utils import convert_to_list, convert_to_vectors, convert_to_payload +from upstash_vector.utils import ( + convert_query_requests_to_payload, + convert_to_list, + convert_to_vectors, + convert_to_payload, +) DEFAULT_NAMESPACE = "" @@ -62,8 +69,9 @@ def upsert( ```python res = index.upsert( vectors=[ - ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}), - ("id2", [0.3,0.4]) + ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}, "data-value"), + ("id2", [0.2, 0.2], {"metadata_field": "metadata_value"}), + ("id3", [0.3,0.4]), ] ) ``` @@ -71,8 +79,9 @@ def upsert( ```python res = index.upsert( vectors=[ - {"id": "id3", "vector": [0.1, 0.2], "metadata": {"field": "value"}}, - {"id": 
"id4", "vector": [0.5, 0.6]}, + {"id": "id4", "vector": [0.1, 0.2], "metadata": {"field": "value"}, "data": "data-value"}, + {"id": "id5", "vector": [0.2, 0.2], "metadata": {"field": "value"}}, + {"id": "id6", "vector": [0.5, 0.6]}, ] ) ``` @@ -81,8 +90,9 @@ def upsert( from upstash_vector import Vector res = index.upsert( vectors=[ - Vector(id="id5", vector=[1, 2], metadata={"field": "value"}), - Vector(id="id6", vector=[6, 7]), + Vector(id="id7", vector=[0.1, 0.2], metadata={"field": "value"}, data="data-value"), + Vector(id="id8", vector=[0.1, 0.2], metadata={"field": "value"}), + Vector(id="id9", vector=[0.6, 0.7]), ] ) ``` @@ -105,7 +115,7 @@ def upsert( res = index.upsert( vectors=[ ("id1", [0.1, 0.2]), - ("id2", [0.3,0.4]), + ("id2", [0.3, 0.4]), ], namespace="ns", ) @@ -127,6 +137,7 @@ def query( filter: str = "", data: Optional[str] = None, namespace: str = DEFAULT_NAMESPACE, + include_data: bool = False, ) -> List[QueryResult]: """ Query `top_k` many similar vectors. @@ -140,6 +151,7 @@ def query( :param filter: Filter expression to narrow down the query results. :param data: Data to query for (after embedding it to a vector) :param namespace: The namespace to use. When not specified, the default namespace is used. + :param include_data: Whether the resulting `top_k` vectors will have their unstructured data or not. Example usage: @@ -165,6 +177,7 @@ def query( "topK": top_k, "includeVectors": include_vectors, "includeMetadata": include_metadata, + "includeData": include_data, "filter": filter, } @@ -189,6 +202,64 @@ def query( ) ] + def query_many( + self, + *, + queries: List[QueryRequest], + namespace: str = DEFAULT_NAMESPACE, + ) -> List[List[QueryResult]]: + """ + Makes a batch query request. + + The batch should only contain elements whose `data` + or `vector` fields set. 
+ + Example usage: + + ```python + res = index.query_many( + queries=[ + { + "vector": [0.5, 0.4], + "top_k": 2, + }, + { + "vector": [0.3, 0.2], + }, + ] + ) + ``` + + ```python + res = index.query_many( + queries=[ + { + "data": "hello", + "top_k": 2, + }, + { + "data": "world", + }, + ] + ) + ``` + """ + if len(queries) == 1: + # handle this case separately, as the server returns a single + # response when the length of the array is 1. + query = queries[0] + single_result = self.query(**query, namespace=namespace) + return [single_result] + + has_vector_query, payload = convert_query_requests_to_payload(queries) + path = QUERY_PATH if has_vector_query else QUERY_DATA_PATH + result = self._execute_request(payload=payload, path=_path_for(namespace, path)) + + return [ + [QueryResult._from_json(obj) for obj in query_result] + for query_result in result + ] + def delete( self, ids: Union[str, List[str]], @@ -246,6 +317,7 @@ def range( include_vectors: bool = False, include_metadata: bool = False, namespace: str = DEFAULT_NAMESPACE, + include_data: bool = False, ) -> RangeResult: """ Scans the vectors starting from `cursor`, returns at most `limit` many vectors. @@ -255,6 +327,7 @@ def range( :param include_vectors: Whether the resulting `top_k` vectors will have their vector values or not. :param include_metadata: Whether the resulting `top_k` vectors will have their metadata or not. :param namespace: The namespace to use. When not specified, the default namespace is used. + :param include_data: Whether the resulting `top_k` vectors will have their unstructured data or not. 
Example usage: @@ -270,6 +343,7 @@ def range( "limit": limit, "includeVectors": include_vectors, "includeMetadata": include_metadata, + "includeData": include_data, } return RangeResult._from_json( self._execute_request( @@ -283,6 +357,7 @@ def fetch( include_vectors: bool = False, include_metadata: bool = False, namespace: str = DEFAULT_NAMESPACE, + include_data: bool = False, ) -> List[Optional[FetchResult]]: """ Fetches details of a set of vectors. @@ -291,6 +366,7 @@ def fetch( :param include_vectors: Whether the resulting vectors will have their vector values or not. :param include_metadata: Whether the resulting vectors will have their metadata or not. :param namespace: The namespace to use. When not specified, the default namespace is used. + :param include_data: Whether the resulting `top_k` vectors will have their unstructured data or not. Example usage: @@ -305,6 +381,7 @@ def fetch( "ids": ids, "includeVectors": include_vectors, "includeMetadata": include_metadata, + "includeData": include_data, } return [ FetchResult._from_json(vector) if vector else None @@ -320,20 +397,19 @@ def update( data: Optional[str] = None, metadata: Optional[Dict] = None, namespace: str = DEFAULT_NAMESPACE, + metadata_update_mode: MetadataUpdateMode = MetadataUpdateMode.OVERWRITE, ) -> bool: """ Updates a vector value, data, or metadata for the given id. - Only and only one of the vector, data, or metadata parameters can be set. - - To update both vector and metadata, or data and metadata, use the - upsert method. - :param id: The vector id to update. :param vector: The vector value to update to. :param data: The raw text data to embed into a vector and update to. :param metadata: The metadata to update to. :param namespace: The namespace to use. When not specified, the default namespace is used. + :param metadata_update_mode: Whether to overwrite the whole + it, or patch the metadata (insert new fields or update + according to the `RFC 7396 JSON Merge Patch` algorithm. 
Example usage: @@ -343,6 +419,7 @@ def update( """ payload: Dict[str, Any] = { "id": id, + "metadataUpdateMode": metadata_update_mode.value, } if vector is not None: @@ -354,11 +431,6 @@ def update( if metadata is not None: payload["metadata"] = metadata - if len(payload) != 2: - raise ClientError( - "Only and only one of the vector, data, or metadata parameters set" - ) - result = self._execute_request( payload=payload, path=_path_for(namespace, UPDATE_PATH) ) @@ -418,8 +490,9 @@ async def upsert( ```python res = await index.upsert( vectors=[ - ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}), - ("id2", [0.3,0.4]) + ("id1", [0.1, 0.2], {"metadata_field": "metadata_value"}, "data-value"), + ("id2", [0.2, 0.2], {"metadata_field": "metadata_value"}), + ("id3", [0.3,0.4]), ] ) ``` @@ -427,8 +500,9 @@ async def upsert( ```python res = await index.upsert( vectors=[ - {"id": "id3", "vector": [0.1, 0.2], "metadata": {"field": "value"}}, - {"id": "id4", "vector": [0.5, 0.6]}, + {"id": "id4", "vector": [0.1, 0.2], "metadata": {"field": "value"}, "data": "data-value"}, + {"id": "id5", "vector": [0.2, 0.2], "metadata": {"field": "value"}}, + {"id": "id6", "vector": [0.5, 0.6]}, ] ) ``` @@ -437,8 +511,9 @@ async def upsert( from upstash_vector import Vector res = await index.upsert( vectors=[ - Vector(id="id5", vector=[1, 2], metadata={"field": "value"}), - Vector(id="id6", vector=[6, 7]), + Vector(id="id7", vector=[0.1, 0.2], metadata={"field": "value"}, data="data-value"), + Vector(id="id8", vector=[0.1, 0.2], metadata={"field": "value"}), + Vector(id="id9", vector=[0.6, 0.7]), ] ) ``` @@ -460,7 +535,7 @@ async def upsert( res = index.upsert( vectors=[ ("id1", [0.1, 0.2]), - ("id2", [0.3,0.4]), + ("id2", [0.3, 0.4]), ], namespace="ns", ) @@ -483,6 +558,7 @@ async def query( filter: str = "", data: Optional[str] = None, namespace: str = DEFAULT_NAMESPACE, + include_data: bool = False, ) -> List[QueryResult]: """ Query `top_k` many similar vectors. 
@@ -496,6 +572,7 @@ async def query( :param filter: Filter expression to narrow down the query results. :param data: Data to query for (after embedding it to a vector) :param namespace: The namespace to use. When not specified, the default namespace is used. + :param include_data: Whether the resulting `top_k` vectors will have their unstructured data or not. Example usage: @@ -521,6 +598,7 @@ async def query( "topK": top_k, "includeVectors": include_vectors, "includeMetadata": include_metadata, + "includeData": include_data, "filter": filter, } @@ -545,6 +623,66 @@ async def query( ) ] + async def query_many( + self, + *, + queries: List[QueryRequest], + namespace: str = DEFAULT_NAMESPACE, + ) -> List[List[QueryResult]]: + """ + Makes a batch query request. + + The batch should only contain elements whose `data` + or `vector` fields set. + + Example usage: + + ```python + res = await index.query_many( + queries=[ + { + "vector": [0.5, 0.4], + "top_k": 2, + }, + { + "vector": [0.3, 0.2], + }, + ] + ) + ``` + + ```python + res = await index.query_many( + queries=[ + { + "data": "hello", + "top_k": 2, + }, + { + "data": "world", + }, + ] + ) + ``` + """ + if len(queries) == 1: + # handle this case separately, as the server returns a single + # response when the length of the array is 1. 
+ query = queries[0] + single_result = await self.query(**query, namespace=namespace) + return [single_result] + + has_vector_query, payload = convert_query_requests_to_payload(queries) + path = QUERY_PATH if has_vector_query else QUERY_DATA_PATH + result = await self._execute_request_async( + payload=payload, path=_path_for(namespace, path) + ) + + return [ + [QueryResult._from_json(obj) for obj in query_result] + for query_result in result + ] + async def delete( self, ids: Union[str, List[str]], @@ -604,6 +742,7 @@ async def range( include_vectors: bool = False, include_metadata: bool = False, namespace: str = DEFAULT_NAMESPACE, + include_data: bool = False, ) -> RangeResult: """ Scans the vectors asynchronously starting from `cursor`, returns at most `limit` many vectors. @@ -613,6 +752,7 @@ async def range( :param include_vectors: Whether the resulting `top_k` vectors will have their vector values or not. :param include_metadata: Whether the resulting `top_k` vectors will have their metadata or not. :param namespace: The namespace to use. When not specified, the default namespace is used. + :param include_data: Whether the resulting `top_k` vectors will have their unstructured data or not. Example usage: @@ -628,6 +768,7 @@ async def range( "limit": limit, "includeVectors": include_vectors, "includeMetadata": include_metadata, + "includeData": include_data, } return RangeResult._from_json( await self._execute_request_async( @@ -641,6 +782,7 @@ async def fetch( include_vectors: bool = False, include_metadata: bool = False, namespace: str = DEFAULT_NAMESPACE, + include_data: bool = False, ) -> List[Optional[FetchResult]]: """ Fetches details of a set of vectors asynchronously. @@ -649,6 +791,7 @@ async def fetch( :param include_vectors: Whether the resulting vectors will have their vector values or not. :param include_metadata: Whether the resulting vectors will have their metadata or not. :param namespace: The namespace to use. 
When not specified, the default namespace is used. + :param include_data: Whether the resulting `top_k` vectors will have their unstructured data or not. Example usage: @@ -663,6 +806,7 @@ async def fetch( "ids": ids, "includeVectors": include_vectors, "includeMetadata": include_metadata, + "includeData": include_data, } return [ FetchResult._from_json(vector) if vector else None @@ -678,20 +822,19 @@ async def update( data: Optional[str] = None, metadata: Optional[Dict] = None, namespace: str = DEFAULT_NAMESPACE, + metadata_update_mode: MetadataUpdateMode = MetadataUpdateMode.OVERWRITE, ) -> bool: """ Updates a vector value, data, or metadata for the given id. - Only and only one of the vector, data, or metadata parameters can be set. - - To update both vector and metadata, or data and metadata, use the - upsert method. - :param id: The vector id to update. :param vector: The vector value to update to. :param data: The raw text data to embed into a vector and update to. :param metadata: The metadata to update to. :param namespace: The namespace to use. When not specified, the default namespace is used. + :param metadata_update_mode: Whether to overwrite the whole + it, or patch the metadata (insert new fields or update + according to the `RFC 7396 JSON Merge Patch` algorithm. 
Example usage: @@ -701,6 +844,7 @@ async def update( """ payload: Dict[str, Any] = { "id": id, + "metadataUpdateMode": metadata_update_mode.value, } if vector is not None: @@ -712,11 +856,6 @@ async def update( if metadata is not None: payload["metadata"] = metadata - if len(payload) != 2: - raise ClientError( - "Only and only one of the vector, data, or metadata parameters set" - ) - result = await self._execute_request_async( payload=payload, path=_path_for(namespace, UPDATE_PATH) ) diff --git a/upstash_vector/types.py b/upstash_vector/types.py index 34dde28..a022362 100644 --- a/upstash_vector/types.py +++ b/upstash_vector/types.py @@ -1,5 +1,6 @@ +import enum from dataclasses import dataclass -from typing import Optional, List, Dict, Union, Protocol +from typing import Optional, List, Dict, TypedDict, Union, Protocol class SupportsToList(Protocol): @@ -12,6 +13,7 @@ class Vector: id: Union[int, str] vector: Union[List[float], SupportsToList] metadata: Optional[Dict] = None + data: Optional[str] = None @dataclass @@ -26,6 +28,7 @@ class FetchResult: id: str vector: Optional[List[float]] = None metadata: Optional[Dict] = None + data: Optional[str] = None @classmethod def _from_json(cls, obj: dict) -> "FetchResult": @@ -33,6 +36,7 @@ def _from_json(cls, obj: dict) -> "FetchResult": id=obj["id"], vector=obj.get("vector"), metadata=obj.get("metadata"), + data=obj.get("data"), ) @@ -42,6 +46,7 @@ class QueryResult: score: float vector: Optional[List[float]] = None metadata: Optional[Dict] = None + data: Optional[str] = None @classmethod def _from_json(cls, obj: dict) -> "QueryResult": @@ -50,6 +55,7 @@ def _from_json(cls, obj: dict) -> "QueryResult": score=obj["score"], vector=obj.get("vector"), metadata=obj.get("metadata"), + data=obj.get("data"), ) @@ -110,3 +116,68 @@ def _from_json(cls, obj: dict) -> "InfoResult": for ns, ns_info in obj["namespaces"].items() }, ) + + +class MetadataUpdateMode(enum.Enum): + """ + Whether to overwrite the whole metadata while 
updating + it, or patch the metadata (insert new fields or update or delete existing fields) + according to the `RFC 7396 JSON Merge Patch` algorithm. + """ + + OVERWRITE = "OVERWRITE" + """Overwrite the metadata, and set it to a new value.""" + + PATCH = "PATCH" + """Patch the metadata according to Merge Patch algorithm.""" + + +class QueryRequest(TypedDict, total=False): + vector: Union[List[float], SupportsToList] + """ + The vector value to query. + + Only and only one of `vector` or `data` fields must be provided. + """ + + data: str + """ + Data to query for (after embedding it to a vector). + + Only and only one of `vector` or `data` fields must be provided. + """ + + top_k: int + """ + How many vectors will be returned as the query result. + + When not specified, defaults to `10`. + """ + + include_vectors: bool + """ + Whether the resulting `top_k` vectors will have their vector values or not. + + When not specified, defaults to `False`. + """ + + include_metadata: bool + """ + Whether the resulting `top_k` vectors will have their metadata or not. + + When not specified, defaults to `False`. + """ + + include_data: bool + """ + Whether the resulting `top_k` vectors will have their unstructured data or not. + + When not specified, defaults to `False`. + """ + + filter: str + """ + Filter expression to narrow down the query results. + + When not specified, defaults to `""`(no filter). 
+ """ diff --git a/upstash_vector/utils.py b/upstash_vector/utils.py index 9b0b22f..ce718fc 100644 --- a/upstash_vector/utils.py +++ b/upstash_vector/utils.py @@ -1,7 +1,8 @@ -from upstash_vector.types import Data, Vector -from upstash_vector.errors import ClientError from typing import List, Union, Dict, Any, Optional, Tuple +from upstash_vector.errors import ClientError +from upstash_vector.types import Data, QueryRequest, Vector + def convert_to_list(obj): if isinstance(obj, list): @@ -18,11 +19,12 @@ def _get_payload_element( id: Union[int, str], payload: Union[str, List[float]], metadata: Optional[Dict[str, Any]] = None, + data: Optional[str] = None, ) -> Union[Vector, Data]: if isinstance(payload, str): return Data(id=id, data=payload, metadata=metadata) - return Vector(id=id, vector=convert_to_list(payload), metadata=metadata) + return Vector(id=id, vector=convert_to_list(payload), metadata=metadata, data=data) def _get_payload_element_from_dict( @@ -36,13 +38,11 @@ def _get_payload_element_from_dict( "Vector dict must have one of `vector` or `data` fields defined." ) - if vector is not None and data is not None: - raise ClientError("only one of `data` or `vector` field can be given.") - - if data is None: - return Vector(id=id, vector=convert_to_list(vector), metadata=metadata) + if vector is None: + # data cannot be none at this point + return Data(id=id, data=data, metadata=metadata) # type:ignore[arg-type] - return Data(id=id, data=data, metadata=metadata) + return Vector(id=id, vector=convert_to_list(vector), metadata=metadata, data=data) def _tuple_or_dict_to_vectors(vector) -> Union[Vector, Data]: @@ -76,20 +76,81 @@ def convert_to_payload( Returns the payload and whether it is Vector or Data. 
""" - is_vector = isinstance(vectors[0], Vector) - try: + expecting_vectors = isinstance(vectors[0], Vector) + payload = [] + for vector in vectors: + is_vector = isinstance(vector, Vector) + if expecting_vectors != is_vector: + raise ClientError( + "All items should either have the `data` or the `vector` field." + " Received items from both kinds. Please send them separately." + ) + if is_vector: - return [ - {"id": vector.id, "vector": vector.vector, "metadata": vector.metadata} # type: ignore[union-attr] - for vector in vectors - ], is_vector + payload.append( + { + "id": vector.id, + "vector": vector.vector, # type: ignore[union-attr] + "metadata": vector.metadata, + "data": vector.data, + } + ) else: - return [ - {"id": vector.id, "data": vector.data, "metadata": vector.metadata} # type: ignore[union-attr] - for vector in vectors - ], is_vector - except AttributeError: - raise ClientError( - "All items should either have the `data` or the `vector` field." - " Received items from both kinds. Please send them separately." 
- ) + payload.append( + { + "id": vector.id, + "data": vector.data, + "metadata": vector.metadata, + } + ) + + return payload, expecting_vectors + + +def convert_query_requests_to_payload( + queries: List[QueryRequest], +) -> Tuple[bool, List[Dict[str, Any]]]: + has_vector_query = False + has_data_query = False + + payloads = [] + + for query in queries: + payload = { + "topK": query.get("top_k", 10), + "includeVectors": query.get("include_vectors", False), + "includeMetadata": query.get("include_metadata", False), + "includeData": query.get("include_data", False), + "filter": query.get("filter", ""), + } + + vector = query.get("vector") + data = query.get("data") + + if data is None and vector is None: + raise ClientError("either `data` or `vector` values must be given") + if data is not None and vector is not None: + raise ClientError( + "`data` and `vector` values cannot be given at the same time" + ) + + if data is not None: + if has_vector_query: + raise ClientError( + "`data` and `vector` queries cannot be mixed in the same batch." + ) + + has_data_query = True + payload["data"] = data + else: + if has_data_query: + raise ClientError( + "`data` and `vector` queries cannot be mixed in the same batch." + ) + + has_vector_query = True + payload["vector"] = convert_to_list(vector) + + payloads.append(payload) + + return has_vector_query, payloads