From 7fc4c76c778163c21d396f99dcc710d99942895f Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Thu, 13 Apr 2023 16:59:43 -0400 Subject: Improving Vector Similarity Search Example (#2661) * update vss docs * add embeddings creation and storage examples * update based on feedback * fix version and link * include more realistic search examples and clean up indices * completely remove initial cap reference --------- Co-authored-by: Chayim --- .../search_vector_similarity_examples.ipynb | 610 ++++++++++++++++++++- 1 file changed, 586 insertions(+), 24 deletions(-) diff --git a/docs/examples/search_vector_similarity_examples.ipynb b/docs/examples/search_vector_similarity_examples.ipynb index 2b02610..bd1df3c 100644 --- a/docs/examples/search_vector_similarity_examples.ipynb +++ b/docs/examples/search_vector_similarity_examples.ipynb @@ -1,81 +1,643 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Vector Similarity\n", - "## Adding Vector Fields" + "**Vectors** (also called \"Embeddings\"), represent an AI model's impression (or understanding) of a piece of unstructured data like text, images, audio, videos, etc. Vector Similarity Search (VSS) is the process of finding vectors in the vector database that are similar to a given query vector. Popular VSS uses include recommendation systems, image and video search, document retrieval, and question answering.\n", + "\n", + "## Index Creation\n", + "Before doing vector search, first define the schema and create an index." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, + "outputs": [], + "source": [ + "import redis\n", + "from redis.commands.search.field import TagField, VectorField\n", + "from redis.commands.search.indexDefinition import IndexDefinition, IndexType\n", + "from redis.commands.search.query import Query\n", + "\n", + "r = redis.Redis(host=\"localhost\", port=6379)\n", + "\n", + "INDEX_NAME = \"index\" # Vector Index Name\n", + "DOC_PREFIX = \"doc:\" # RediSearch Key Prefix for the Index\n", + "\n", + "def create_index(vector_dimensions: int):\n", + " try:\n", + " # check to see if index exists\n", + " r.ft(INDEX_NAME).info()\n", + " print(\"Index already exists!\")\n", + " except:\n", + " # schema\n", + " schema = (\n", + " TagField(\"tag\"), # Tag Field Name\n", + " VectorField(\"vector\", # Vector Field Name\n", + " \"FLAT\", { # Vector Index Type: FLAT or HNSW\n", + " \"TYPE\": \"FLOAT32\", # FLOAT32 or FLOAT64\n", + " \"DIM\": vector_dimensions, # Number of Vector Dimensions\n", + " \"DISTANCE_METRIC\": \"COSINE\", # Vector Search Distance Metric\n", + " }\n", + " ),\n", + " )\n", + "\n", + " # index Definition\n", + " definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.HASH)\n", + "\n", + " # create Index\n", + " r.ft(INDEX_NAME).create_index(fields=schema, definition=definition)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll start by working with vectors that have 1536 dimensions." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# define vector dimensions\n", + "VECTOR_DIMENSIONS = 1536\n", + "\n", + "# create the index\n", + "create_index(vector_dimensions=VECTOR_DIMENSIONS)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding Vectors to Redis\n", + "\n", + "Next, we add vectors (dummy data) to Redis using `hset`. 
The search index listens to keyspace notifications and will include any written HASH objects prefixed by `DOC_PREFIX`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install numpy" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# instantiate a redis pipeline\n", + "pipe = r.pipeline()\n", + "\n", + "# define some dummy data\n", + "objects = [\n", + " {\"name\": \"a\", \"tag\": \"foo\"},\n", + " {\"name\": \"b\", \"tag\": \"foo\"},\n", + " {\"name\": \"c\", \"tag\": \"bar\"},\n", + "]\n", + "\n", + "# write data\n", + "for obj in objects:\n", + " # define key\n", + " key = f\"doc:{obj['name']}\"\n", + " # create a random \"dummy\" vector\n", + " obj[\"vector\"] = np.random.rand(VECTOR_DIMENSIONS).astype(np.float32).tobytes()\n", + " # HSET\n", + " pipe.hset(key, mapping=obj)\n", + "\n", + "res = pipe.execute()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Searching\n", + "You can use VSS queries with the `.ft(...).search(...)` query command. To use a VSS query, you must specify the option `.dialect(2)`.\n", + "\n", + "There are two supported types of vector queries in Redis: `KNN` and `Range`. `Hybrid` queries can work in both settings and combine elements of traditional search and VSS." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### KNN Queries\n", + "KNN queries are for finding the topK most similar vectors given a query vector." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { "text/plain": [ - "b'OK'" + "[Document {'id': 'doc:b', 'payload': None, 'score': '0.2376562953'},\n", + " Document {'id': 'doc:c', 'payload': None, 'score': '0.240063905716'}]" ] }, - "execution_count": 1, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import redis\n", - "from redis.commands.search.field import VectorField\n", - "from redis.commands.search.query import Query\n", + "query = (\n", + " Query(\"*=>[KNN 2 @vector $vec as score]\")\n", + " .sort_by(\"score\")\n", + " .return_fields(\"id\", \"score\")\n", + " .paging(0, 2)\n", + " .dialect(2)\n", + ")\n", "\n", - "r = redis.Redis(host='localhost', port=36379)\n", + "query_params = {\n", + " \"vec\": np.random.rand(VECTOR_DIMENSIONS).astype(np.float32).tobytes()\n", + "}\n", + "r.ft(INDEX_NAME).search(query, query_params).docs" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Range Queries\n", + "Range queries provide a way to filter results by the distance between a vector field in Redis and a query vector based on some pre-defined threshold (radius)." 
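With the `COSINE` distance metric chosen at index creation, the `score` returned for each document is a distance rather than a similarity: roughly `1 - cosine_similarity`, so lower values mean closer matches, and the `$radius` in a range query is compared against this same distance. As a rough local sketch (reusing the `VECTOR_DIMENSIONS` value from above; the exact numbers Redis returns may differ slightly due to float32 precision), you can compute the equivalent quantity with NumPy to help pick a sensible radius:

```python
import numpy as np

VECTOR_DIMENSIONS = 1536  # same value used to create the index above

def cosine_distance(a: np.ndarray, b: np.ndarray) -> float:
    # Same notion of distance as the index's COSINE metric:
    # 1 - cosine similarity, so 0.0 = identical direction, ~1.0 = unrelated.
    return 1.0 - float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# two random float32 vectors, like the dummy data written above
a = np.random.rand(VECTOR_DIMENSIONS).astype(np.float32)
b = np.random.rand(VECTOR_DIMENSIONS).astype(np.float32)

print(cosine_distance(a, b))  # values like this help calibrate the $radius threshold
```

For random positive vectors like these, the distance tends to sit around 0.25, which lines up with the scores shown in the KNN results above.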
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document {'id': 'doc:a', 'payload': None, 'score': '0.243115246296'},\n", + " Document {'id': 'doc:c', 'payload': None, 'score': '0.24981123209'},\n", + " Document {'id': 'doc:b', 'payload': None, 'score': '0.251443207264'}]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = (\n", + " Query(\"@vector:[VECTOR_RANGE $radius $vec]=>{$YIELD_DISTANCE_AS: score}\")\n", + " .sort_by(\"score\")\n", + " .return_fields(\"id\", \"score\")\n", + " .paging(0, 3)\n", + " .dialect(2)\n", + ")\n", "\n", - "schema = (VectorField(\"v\", \"HNSW\", {\"TYPE\": \"FLOAT32\", \"DIM\": 2, \"DISTANCE_METRIC\": \"L2\"}),)\n", - "r.ft().create_index(schema)" + "# Find all vectors within 0.8 of the query vector\n", + "query_params = {\n", + " \"radius\": 0.8,\n", + " \"vec\": np.random.rand(VECTOR_DIMENSIONS).astype(np.float32).tobytes()\n", + "}\n", + "r.ft(INDEX_NAME).search(query, query_params).docs" ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Searching" + "See additional Range Query examples in [this Jupyter notebook](https://github.com/RediSearch/RediSearch/blob/master/docs/docs/vecsim-range_queries_examples.ipynb)." ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "### Querying vector fields" + "### Hybrid Queries\n", + "Hybrid queries contain both traditional filters (numeric, tags, text) and VSS in one single Redis command." ] }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "pycharm": { - "name": "#%%\n" + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document {'id': 'doc:b', 'payload': None, 'score': '0.24422544241', 'tag': 'foo'},\n", + " Document {'id': 'doc:a', 'payload': None, 'score': '0.259926855564', 'tag': 'foo'}]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" } - }, + ], + "source": [ + "query = (\n", + " Query(\"(@tag:{ foo })=>[KNN 2 @vector $vec as score]\")\n", + " .sort_by(\"score\")\n", + " .return_fields(\"id\", \"tag\", \"score\")\n", + " .paging(0, 2)\n", + " .dialect(2)\n", + ")\n", + "\n", + "query_params = {\n", + " \"vec\": np.random.rand(VECTOR_DIMENSIONS).astype(np.float32).tobytes()\n", + "}\n", + "r.ft(INDEX_NAME).search(query, query_params).docs" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See additional Hybrid Query examples in [this Jupyter notebook](https://github.com/RediSearch/RediSearch/blob/master/docs/docs/vecsim-hybrid_queries_examples.ipynb)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Vector Creation and Storage Examples\n", + "The above examples use dummy data as vectors. However, in reality, most use cases leverage production-grade AI models for creating embeddings. Below we will take some sample text data, pass it to the OpenAI and Cohere API's respectively, and then write them to Redis." 
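One practical detail when swapping in real models: the embedding size must match the `DIM` the index was created with (1536 for OpenAI `text-embedding-ada-002` below, 1024 for the Cohere model), otherwise Redis will typically fail to index those hashes. A small guard like the following (a sketch only; the helper name is ours) can catch that early:

```python
import numpy as np

def check_vector_dims(embeddings: np.ndarray, expected: int) -> None:
    # Vectors are written as raw float32 bytes, so a size mismatch with the
    # index DIM shows up as documents that quietly fail to be indexed.
    actual = embeddings.shape[-1]
    if actual != expected:
        raise ValueError(f"Embeddings have {actual} dimensions; index expects {expected}")

# e.g. check_vector_dims(embeddings, VECTOR_DIMENSIONS) before the hset calls below
```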
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "texts = [\n", + " \"Today is a really great day!\",\n", + " \"The dog next door barks really loudly.\",\n", + " \"My cat escaped and got out before I could close the door.\",\n", + " \"It's supposed to rain and thunder tomorrow.\"\n", + "]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### OpenAI Embeddings\n", + "Before working with OpenAI Embeddings, we clean up our existing search index and create a new one." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# delete index\n", + "r.ft(INDEX_NAME).dropindex(delete_documents=True)\n", + "\n", + "# make a new one\n", + "create_index(vector_dimensions=VECTOR_DIMENSIONS)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install openai" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "import openai\n", + "\n", + "# set your OpenAI API key - get one at https://platform.openai.com\n", + "openai.api_key = \"YOUR OPENAI API KEY\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Create Embeddings with OpenAI text-embedding-ada-002\n", + "# https://openai.com/blog/new-and-improved-embedding-model\n", + "response = openai.Embedding.create(input=texts, engine=\"text-embedding-ada-002\")\n", + "embeddings = np.array([r[\"embedding\"] for r in response[\"data\"]], dtype=np.float32)\n", + "\n", + "# Write to Redis\n", + "pipe = r.pipeline()\n", + "for i, embedding in enumerate(embeddings):\n", + " pipe.hset(f\"doc:{i}\", mapping = {\n", + " \"vector\": embedding.tobytes(),\n", + " \"content\": texts[i],\n", + " \"tag\": \"openai\"\n", + " })\n", + "res = pipe.execute()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0.00509819, 0.0010873 , -0.00228475, ..., -0.00457579,\n", + " 0.01329307, -0.03167175],\n", + " [-0.00357223, -0.00550784, -0.01314328, ..., -0.02915693,\n", + " 0.01470436, -0.01367203],\n", + " [-0.01284631, 0.0034875 , -0.01719686, ..., -0.01537451,\n", + " 0.01953256, -0.05048691],\n", + " [-0.01145045, -0.00785481, 0.00206323, ..., -0.02070181,\n", + " -0.01629098, -0.00300795]], dtype=float32)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeddings" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Search with OpenAI Embeddings\n", + "\n", + "Now that we've created embeddings with OpenAI, we can also perform a search to find relevant documents to some input text.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0.00062901, -0.0070723 , -0.00148926, ..., -0.01904645,\n", + " -0.00436092, -0.01117944], dtype=float32)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = \"animals\"\n", + "\n", + "# create query embedding\n", + "response = openai.Embedding.create(input=[text], engine=\"text-embedding-ada-002\")\n", + "query_embedding = np.array([r[\"embedding\"] for r in response[\"data\"]], dtype=np.float32)[0]\n", + "\n", + "query_embedding" + ] + }, + { 
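The next cell runs the KNN search with this query embedding. The embed-then-search pattern can also be wrapped in a small helper — a sketch only (the function name and defaults are ours; it assumes the `openai`, `np`, `Query`, `r`, and `INDEX_NAME` objects defined earlier in this notebook):

```python
def search_openai(text: str, tag: str = "openai", k: int = 2):
    # 1) embed the query text with the same model used for the stored documents
    response = openai.Embedding.create(input=[text], engine="text-embedding-ada-002")
    vec = np.array(response["data"][0]["embedding"], dtype=np.float32)

    # 2) hybrid KNN query: filter by tag, then rank by vector distance
    query = (
        Query(f"(@tag:{{ {tag} }})=>[KNN {k} @vector $vec as score]")
        .sort_by("score")
        .return_fields("content", "tag", "score")
        .paging(0, k)
        .dialect(2)
    )
    return r.ft(INDEX_NAME).search(query, {"vec": vec.tobytes()}).docs

# e.g. search_openai("weather tomorrow") would likely surface the rain/thunder sentence
```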
+ "cell_type": "code", + "execution_count": 14, + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Result{2 total, docs: [Document {'id': 'a', 'payload': None, '__v_score': '0'}, Document {'id': 'b', 'payload': None, '__v_score': '3.09485009821e+26'}]}" + "[Document {'id': 'doc:1', 'payload': None, 'score': '0.214349985123', 'content': 'The dog next door barks really loudly.', 'tag': 'openai'},\n", + " Document {'id': 'doc:2', 'payload': None, 'score': '0.237052619457', 'content': 'My cat escaped and got out before I could close the door.', 'tag': 'openai'}]" ] }, - "execution_count": 2, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "r.hset(\"a\", \"v\", \"aaaaaaaa\")\n", - "r.hset(\"b\", \"v\", \"aaaabaaa\")\n", - "r.hset(\"c\", \"v\", \"aaaaabaa\")\n", + "# query for similar documents that have the openai tag\n", + "query = (\n", + " Query(\"(@tag:{ openai })=>[KNN 2 @vector $vec as score]\")\n", + " .sort_by(\"score\")\n", + " .return_fields(\"content\", \"tag\", \"score\")\n", + " .paging(0, 2)\n", + " .dialect(2)\n", + ")\n", + "\n", + "query_params = {\"vec\": query_embedding.tobytes()}\n", + "r.ft(INDEX_NAME).search(query, query_params).docs\n", "\n", - "q = Query(\"*=>[KNN 2 @v $vec]\").return_field(\"__v_score\").dialect(2)\n", - "r.ft().search(q, query_params={\"vec\": \"aaaaaaaa\"})" + "# the two pieces of content related to animals are returned" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cohere Embeddings\n", + "Before working with Cohere Embeddings, we clean up our existing search index and create a new one." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# delete index\n", + "r.ft(INDEX_NAME).dropindex(delete_documents=True)\n", + "\n", + "# make a new one for cohere embeddings (1024 dimensions)\n", + "VECTOR_DIMENSIONS = 1024\n", + "create_index(vector_dimensions=VECTOR_DIMENSIONS)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install cohere" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "import cohere\n", + "\n", + "co = cohere.Client(\"YOUR COHERE API KEY\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Create Embeddings with Cohere\n", + "# https://docs.cohere.ai/docs/embeddings\n", + "response = co.embed(texts=texts, model=\"small\")\n", + "embeddings = np.array(response.embeddings, dtype=np.float32)\n", + "\n", + "# Write to Redis\n", + "for i, embedding in enumerate(embeddings):\n", + " r.hset(f\"doc:{i}\", mapping = {\n", + " \"vector\": embedding.tobytes(),\n", + " \"content\": texts[i],\n", + " \"tag\": \"cohere\"\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.3034668 , -0.71533203, -0.2836914 , ..., 0.81152344,\n", + " 1.0253906 , -0.8095703 ],\n", + " [-0.02560425, -1.4912109 , 0.24267578, ..., -0.89746094,\n", + " 0.15625 , -3.203125 ],\n", + " [ 0.10125732, 0.7246094 , -0.29516602, ..., -1.9638672 ,\n", + " 1.6630859 , -0.23291016],\n", + " [-2.09375 , 0.8588867 , -0.23352051, ..., -0.01541138,\n", + " 0.17053223, -3.4042969 ]], dtype=float32)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeddings" + ] + 
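Before querying, it can be worth confirming that the new 1024-dimensional vectors actually made it into the rebuilt index. `FT.INFO` reports the number of indexed documents and any indexing failures; a quick check (key names in the returned dict, such as `num_docs` and `hash_indexing_failures`, can vary slightly by RediSearch version):

```python
# the Cohere "small" model returns 1024-dimensional vectors, matching the new index DIM
assert embeddings.shape == (len(texts), VECTOR_DIMENSIONS)

# FT.INFO: how many documents were indexed, and whether any hashes failed indexing
info = r.ft(INDEX_NAME).info()
print(info["num_docs"], info.get("hash_indexing_failures"))
```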
}, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Search with Cohere Embeddings\n", + "\n", + "Now that we've created embeddings with Cohere, we can also perform a search to find relevant documents to some input text." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.49682617, 1.7070312 , 0.3466797 , ..., 0.58984375,\n", + " 0.1060791 , -2.9023438 ], dtype=float32)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = \"animals\"\n", + "\n", + "# create query embedding\n", + "response = co.embed(texts=[text], model=\"small\")\n", + "query_embedding = np.array(response.embeddings[0], dtype=np.float32)\n", + "\n", + "query_embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document {'id': 'doc:1', 'payload': None, 'score': '0.658673524857', 'content': 'The dog next door barks really loudly.', 'tag': 'cohere'},\n", + " Document {'id': 'doc:2', 'payload': None, 'score': '0.662699103355', 'content': 'My cat escaped and got out before I could close the door.', 'tag': 'cohere'}]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# query for similar documents that have the cohere tag\n", + "query = (\n", + " Query(\"(@tag:{ cohere })=>[KNN 2 @vector $vec as score]\")\n", + " .sort_by(\"score\")\n", + " .return_fields(\"content\", \"tag\", \"score\")\n", + " .paging(0, 2)\n", + " .dialect(2)\n", + ")\n", + "\n", + "query_params = {\"vec\": query_embedding.tobytes()}\n", + "r.ft(INDEX_NAME).search(query, query_params).docs\n", + "\n", + "# the two pieces of content related to animals are returned" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find more example apps, tutorials, and projects using Redis Vector Similarity Search [in this GitHub organization](https://github.com/RedisVentures)." ] } ], @@ -98,7 +660,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.9.12" }, "orig_nbformat": 4 }, -- cgit v1.2.1