MatchingEngine
本文档展示了如何使用 GCP Vertex AI 的 MatchingEngine 向量数据库的相关功能。
Vertex AI Matching Engine 提供业界领先的大规模、低延迟向量数据库。这类向量数据库通常被称为向量相似度匹配服务或近似最近邻(ANN)服务。
注意:此模块要求事先已创建好端点并部署好索引,因为创建过程需要将近一小时。要了解如何创建索引,请参考下文"创建索引并将其部署到端点"部分。
从文本创建VectorStore
from langchain.vectorstores import MatchingEngine

# Sample texts to embed and store in the index.
texts = [
    "The cat sat on",
    "the mat.",
    "I like to",
    "eat pizza for",
    "dinner.",
    "The sun sets",
    "in the west.",
]

# Connect to an already-created index and endpoint.
# `from_components` takes connection settings only: it has no `texts`
# parameter, and the bucket argument is named `gcs_bucket_name`.
# Texts are inserted afterwards with `add_texts`.
vector_store = MatchingEngine.from_components(
    project_id="<my_project_id>",
    region="<my_region>",
    gcs_bucket_name="<my_gcs_bucket>",
    index_id="<my_matching_engine_index_id>",
    endpoint_id="<my_matching_engine_endpoint_id>",
)

# Embed the texts and add them to the index.
vector_store.add_texts(texts=texts)

# Return the 2 stored texts most similar to the query.
vector_store.similarity_search("lunch", k=2)
创建索引并将其部署到端点
导入、常量和配置
# Installing dependencies.
!pip install tensorflow \
google-cloud-aiplatform \
tensorflow-hub \
tensorflow-text
import os
import json
from google.cloud import aiplatform
import tensorflow_hub as hub
import tensorflow_text
# Placeholders: replace with your own project, region, VPC network and bucket.
PROJECT_ID = "<my_project_id>"
REGION = "<my_region>"
VPC_NETWORK = "<my_vpc_network_name>"
PEERING_RANGE_NAME = "ann-langchain-me-range" # Name used when creating the VPC peering.
BUCKET_URI = "gs://<bucket_uri>"
# Number of dimensions of the TensorFlow Universal Sentence Encoder.
# If another embedder is used, the number of dimensions may need to change.
DIMENSIONS = 512
DISPLAY_NAME = "index-test-name"
EMBEDDING_DIR = f"{BUCKET_URI}/banana"
DEPLOYED_INDEX_ID = "endpoint-test-name"
# Look up the project number with gcloud (IPython shell capture), then keep
# the first element of the captured output.
PROJECT_NUMBER = !gcloud projects list --filter="PROJECT_ID:'{PROJECT_ID}'" --format='value(PROJECT_NUMBER)'
PROJECT_NUMBER = PROJECT_NUMBER[0]
# Full resource path of the VPC network, built from the project number.
VPC_NETWORK_FULL = f"projects/{PROJECT_NUMBER}/global/networks/{VPC_NETWORK}"
# Change this setting if the VPC needs to be created.
CREATE_VPC = False
# 设置项目ID
! gcloud config set project {PROJECT_ID}
# 如果不需要创建VPC,请删除if条件后运行封装的代码。
if CREATE_VPC:
# 创建VPC网络
! gcloud compute networks create {VPC_NETWORK} --bgp-routing-mode=regional --subnet-mode=auto --project={PROJECT_ID}
# 添加必要的防火墙规则
! gcloud compute firewall-rules create {VPC_NETWORK}-allow-icmp --network {VPC_NETWORK} --priority 65534 --project {PROJECT_ID} --allow icmp
! gcloud compute firewall-rules create {VPC_NETWORK}-allow-internal --network {VPC_NETWORK} --priority 65534 --project {PROJECT_ID} --allow all --source-ranges 10.128.0.0/9
! gcloud compute firewall-rules create {VPC_NETWORK}-allow-rdp --network {VPC_NETWORK} --priority 65534 --project {PROJECT_ID} --allow tcp:3389
! gcloud compute firewall-rules create {VPC_NETWORK}-allow-ssh --network {VPC_NETWORK} --priority 65534 --project {PROJECT_ID} --allow tcp:22
# 保留IP范围
! gcloud compute addresses create {PEERING_RANGE_NAME} --global --prefix-length=16 --network={VPC_NETWORK} --purpose=VPC_PEERING --project={PROJECT_ID} --description="peering range"
# 设置与服务网络的对等连接
# 您的帐户必须具有"Compute Network Admin"角色才能运行以下命令。
! gcloud services vpc-peerings connect --service=servicenetworking.googleapis.com --network={VPC_NETWORK} --ranges={PEERING_RANGE_NAME} --project={PROJECT_ID}
# 创建存储桶。
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI
使用TensorFlow Universal Sentence Encoder作为嵌入器
# Load the multilingual Universal Sentence Encoder from TensorFlow Hub.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
model = hub.load(module_url)

# Embed a single test word; the result is used as the seed vector below.
embeddings = model(["banana"])
插入一个测试嵌入向量
initial_config = {"id": "banana_id", "embedding": [float(x) for x in list(embeddings.numpy()[0])]}
with open("data.json", "w") as f:
json.dump(initial_config, f)
!gsutil cp data.json {EMBEDDING_DIR}/file.json
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)
创建索引
# Create a Tree-AH index from the embeddings stored under EMBEDDING_DIR.
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    contents_delta_uri=EMBEDDING_DIR,
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
)
创建端点
# Create an index endpoint attached to the VPC network.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    network=VPC_NETWORK_FULL,
    display_name=f"{DISPLAY_NAME}-endpoint",
)
部署索引
# Deploy the index to the endpoint under DEPLOYED_INDEX_ID.
my_index_endpoint = my_index_endpoint.deploy_index(
    deployed_index_id=DEPLOYED_INDEX_ID,
    index=my_index,
)

# Display the indexes currently deployed on the endpoint.
my_index_endpoint.deployed_indexes