三種檢索策略實戰:基于Ollama+BGE-M3+Elasticsearch的關鍵詞檢索、向量檢索與混合檢索完整指南
一、ollama部署向量模型bge-m3
ollama pull bge-m3
(注意:因為 BGE-M3 是一個嵌入模型,不能像生成式模型那樣使用 `ollama run` 命令運行;通過 `ollama pull` 拉取之後,模型會在調用時自動加載!)
curl http://localhost:11434/api/embed -d '{
"model": "bge-m3",
"input": ["你是誰?"]
}'
input 參數可以是 list,也可以是單個字符串。
圖片
調用ollama的embedding模型的python代碼示例。
import requests
import json
# Endpoint of the local Ollama embedding API.
url = "http://localhost:11434/api/embed"
# Request body: the embedding model tag plus the list of texts to embed.
request_fields = {"model": "bge-m3:latest", "input": ["你是誰?"]}
payload = json.dumps(request_fields)
headers = {'Content-Type': 'application/json'}
# Send the POST request, then decode the JSON reply.
http_reply = requests.post(url, headers=headers, data=payload)
response = http_reply.json()
print(response)
二、es操作示例
es的索引創建
創建一個名為 embedding_demo_v2 的索引,下面是它的結構:
{
"mappings": {
"properties": {
"title": {
"type": "text","analyzer":"ik_smart"
},
"content": {
"type": "text","analyzer":"ik_smart"
},
"embedding": {
"type": "dense_vector",
"dims": 1024,
"index": true,
"similarity": "cosine"
}
}
}
}
圖片
python代碼示例:
# 本地安裝elasticsearch時,應指定與服務端相同的大版本號:
from operator import index
from elasticsearch import Elasticsearch
import requests
import json
# Connect to Elasticsearch; replace the address (and credentials) with your own.
# es = Elasticsearch('http://localhost:9200', basic_auth=('elastic', 'Elastic_j625sz'))
# es = Elasticsearch('http://192.168.100.6:9200',request_timeout=60) # connection without a password
es = Elasticsearch('http://192.168.14.83:9200',request_timeout=60)  # connection without a password
# Report whether the cluster answers a ping.
if es.ping():
    print('es連接成功')
else:
    print('es連接失敗')
# 創建索引并且定義結構
def create_index(es, index_name="embedding_demo_v2"):
    """Create ``index_name`` with the demo mapping unless it already exists.

    Args:
        es: Elasticsearch client exposing ``indices.exists`` / ``indices.create``.
        index_name: Name of the index to create.
    """
    # Creating a bare index first would turn the mapped create in
    # define_mapping() into an ignored HTTP 400, leaving the index without
    # its mapping, so the index is created exactly once, with the mapping.
    if es.indices.exists(index=index_name):
        return
    define_mapping(es, index_name=index_name)


# Create the index together with its mapping
def define_mapping(es, index_name="embedding_demo_v2"):
    """Create ``index_name`` with the title/content/embedding mapping.

    Args:
        es: Elasticsearch client exposing ``indices.create``.
        index_name: Name of the index to create.
    """
    mapping_config = {
        "mappings": {
            "properties": {
                # NOTE(review): "ik_smart" presumably needs the IK analysis
                # plugin installed on the cluster - confirm.
                "title": {
                    "type": "text", "analyzer": "ik_smart"
                },
                "content": {
                    "type": "text", "analyzer": "ik_smart"
                },
                # NOTE(review): dims presumably matches the bge-m3 output
                # size - confirm against the embedding model.
                "embedding": {
                    "type": "dense_vector",
                    "dims": 1024,
                    "index": True,
                    "similarity": "cosine"
                }
            }
        }
    }
    # ignore=400 makes an "index already exists" reply non-fatal.
    es.indices.create(index=index_name, body=mapping_config, ignore=400)
es的添加示例數據
隨機造了幾條數據:
["張無忌","謝遜","趙敏","小昭","張翠山"]
# 寫入測試文本數據
def append_demo_data(index_name="embedding_demo_v2"):
    """Embed five sample names and index each one as a document.

    Uses the module-level ``es`` client, ``embedding_function`` and
    ``insert_document``.

    Args:
        index_name: Index that receives the sample documents.
    """
    for single_text in ["張無忌", "謝遜", "趙敏", "小昭", "張翠山"]:
        print("當前文本:{}".format(single_text))
        curr_vector = embedding_function(text=single_text)
        print("--------")
        # Document layout follows the index mapping: title, content, embedding.
        curr_data = {
            "title": "倚天屠龍記",
            "content": single_text,
            "embedding": curr_vector
        }
        # doc_id=None is passed through to es.index() unchanged.
        insert_document(es, index_name=index_name, doc_id=None, document=curr_data)
# 新增
def insert_document(es, index_name="example_index", doc_id=None, document=None):
    """Index ``document`` into ``index_name`` and print the client response.

    Args:
        es: Elasticsearch client exposing ``index``.
        index_name: Target index.
        doc_id: Document id passed to ``es.index`` as ``id``.
        document: The document body to store.

    Returns:
        The response returned by ``es.index``.
    """
    response = es.index(index=index_name, id=doc_id, document=document)
    print(response)
    return response
# 向量化模型
def embedding_function(text="你是誰"):
    """Embed ``text`` with the bge-m3 model served by the local Ollama API.

    Args:
        text: The string to embed.

    Returns:
        The first entry of the ``embeddings`` list in the Ollama reply.

    Raises:
        requests.HTTPError: If Ollama answers with an HTTP error status.
    """
    url = "http://localhost:11434/api/embed"
    payload = json.dumps({
        "model": "bge-m3:latest",
        "input": [text]
    })
    headers = {
        'Content-Type': 'application/json'
    }
    # A timeout keeps the script from hanging forever if Ollama is down.
    reply = requests.request("POST", url, headers=headers, data=payload, timeout=60)
    # Fail with the HTTP error itself rather than a KeyError on "embeddings".
    reply.raise_for_status()
    # One input text was sent, so the single resulting vector is at index 0.
    vector = reply.json()["embeddings"][0]
    print("向量維度:{}".format(len(vector)))
    return vector
es的關鍵詞檢索
{
"query": {
"match": {
"content": {
"query": "張無忌和張翠山是什么關系"
}
}
},
"size": 10,
"fields": [
"title",
"content"
]
}
圖片
python代碼示例:
# 關鍵詞檢索
def query_keyword_function(index_name="embedding_demo_v2", question="張無忌和張翠山是什么關系"):
    """Run a keyword (``match``) search on ``content`` and print each hit.

    Uses the module-level ``es`` client through ``search_documents``.

    Args:
        index_name: Index to search.
        question: Query text matched against the ``content`` field.
    """
    print("開始關鍵詞檢索!")
    # Keyword-only query: full-text match on the content field.
    query_keyword_dict = {
        "query": {
            "match": {
                "content": {
                    "query": question
                }
            }
        },
        "size": 10,
        "fields": ["title", "content"]
    }
    print("question:{}".format(question))
    result = search_documents(es, index_name=index_name, query=query_keyword_dict)
    # Print the score and the requested fields of every hit.
    for single_hit in result["hits"]["hits"]:
        print("_score:{0},fields:{1}".format(single_hit["_score"], single_hit["fields"]))
# 查詢
def search_documents(es, index_name="embedding_demo_v2", query=None):
"""在指定索引中搜索文檔"""
return es.search(index=index_name, body=query)結果:
圖片
es的向量搜索
"query_vector": [ -0.015572428,-0.072642185,-0.0066854693] ,實際傳入你的向量,此處只是方便演示;
{
"knn": {
"field": "embedding",
"query_vector": [ -0.015572428,-0.072642185,-0.0066854693],
"k": 10,
"num_candidates": 100
},
"fields": [ "title", "content" ]
}
圖片
python代碼示例:
# 向量檢索
def query_embedding_function(index_name="embedding_demo_v2", question="張無忌和張翠山是什么關系"):
    """Run a kNN vector search on ``embedding`` and print each hit.

    Uses the module-level ``es`` client through ``search_documents`` and
    embeds the question with ``embedding_function``.

    Args:
        index_name: Index to search.
        question: Text that is embedded and used as the query vector.
    """
    print("開始向量檢索!")
    question_embedding = embedding_function(text=question)
    # Vector-only query: k nearest neighbours on the embedding field.
    query_embedding_dict = {
        "knn": {
            "field": "embedding",
            "query_vector": question_embedding,
            "k": 10,
            "num_candidates": 100
        },
        "fields": ["title", "content"]
    }
    print("question:{}".format(question))
    result = search_documents(es, index_name=index_name, query=query_embedding_dict)
    # Print the score and the requested fields of every hit.
    for single_hit in result["hits"]["hits"]:
        print("_score:{0},fields:{1}".format(single_hit["_score"], single_hit["fields"]))
# 查詢
def search_documents(es, index_name="embedding_demo_v2", query=None):
"""在指定索引中搜索文檔"""
return es.search(index=index_name, body=query)結果:
圖片
es的關鍵詞和向量的混合檢索,利用 boost 的組合方法,為兩種檢索分配不同的權重
{
"query": {
"match": {
"content": {
"query": "張無忌和張翠山是什么關系",
"boost": 0.1
}
}
},
"knn": {
"field": "embedding",
"query_vector":[ -0.015572428,-0.072642185,-0.0066854693],
"k": 10,
"num_candidates": 100,
"boost": 0.9
},
"size": 10
}
圖片
最終的得分計算:
score = 0.1 * match_score + 0.9 * knn_score
python代碼示例:
# 向量+es關鍵詞-混合檢索-利用boots方法
def query_simple_hybrid_retrieval_function(index_name="embedding_demo_v2", question="張無忌和張翠山是什么關系"):
    """Run a keyword + kNN hybrid search weighted by ``boost`` and print hits.

    Uses the module-level ``es`` client through ``search_documents`` and
    embeds the question with ``embedding_function``.

    Args:
        index_name: Index to search.
        question: Text used both for the ``match`` clause and, once
            embedded, as the kNN query vector.
    """
    print("開始混合檢索!利用boots")
    question_embedding = embedding_function(text=question)
    # Hybrid query: the match clause is boosted by 0.1 and the kNN clause by
    # 0.9. The match text comes from `question` (it used to be hard-coded, so
    # the keyword half ignored the caller's question).
    query_embedding_dict = {
        "query": {
            "match": {
                "content": {
                    "query": question,
                    "boost": 0.1
                }
            }
        },
        "knn": {
            "field": "embedding",
            "query_vector": question_embedding,
            "k": 10,
            "num_candidates": 100,
            "boost": 0.9
        },
        "size": 10,
        "fields": ["title", "content"]
    }
    print("question:{}".format(question))
    result = search_documents(es, index_name=index_name, query=query_embedding_dict)
    # Print the combined score and the requested fields of every hit.
    for single_hit in result["hits"]["hits"]:
        print("_score:{0},fields:{1}".format(single_hit["_score"], single_hit["fields"]))
# 查詢
def search_documents(es, index_name="embedding_demo_v2", query=None):
"""在指定索引中搜索文檔"""
return es.search(index=index_name, body=query)結果:
圖片
三、完整的python示例代碼
# 本地安裝elasticsearch時,應指定與服務端相同的大版本號:
from operator import index
from elasticsearch import Elasticsearch
import requests
import json
# Connect to Elasticsearch; replace the address (and credentials) with your own.
# es = Elasticsearch('http://localhost:9200', basic_auth=('elastic', 'Elastic_j625sz'))
es = Elasticsearch('http://localhost:9200',request_timeout=60)  # connection without a password
# Report whether the cluster answers a ping.
if es.ping():
    print('es連接成功')
else:
    print('es連接失敗')
# 創建索引并且定義結構
def create_index(es, index_name="embedding_demo_v2"):
    """Create ``index_name`` with the demo mapping unless it already exists.

    Args:
        es: Elasticsearch client exposing ``indices.exists`` / ``indices.create``.
        index_name: Name of the index to create.
    """
    # Creating a bare index first would turn the mapped create in
    # define_mapping() into an ignored HTTP 400, leaving the index without
    # its mapping, so the index is created exactly once, with the mapping.
    if es.indices.exists(index=index_name):
        return
    define_mapping(es, index_name=index_name)


# Create the index together with its mapping
def define_mapping(es, index_name="embedding_demo_v2"):
    """Create ``index_name`` with the title/content/embedding mapping.

    Args:
        es: Elasticsearch client exposing ``indices.create``.
        index_name: Name of the index to create.
    """
    mapping_config = {
        "mappings": {
            "properties": {
                # NOTE(review): "ik_smart" presumably needs the IK analysis
                # plugin installed on the cluster - confirm.
                "title": {
                    "type": "text", "analyzer": "ik_smart"
                },
                "content": {
                    "type": "text", "analyzer": "ik_smart"
                },
                # NOTE(review): dims presumably matches the bge-m3 output
                # size - confirm against the embedding model.
                "embedding": {
                    "type": "dense_vector",
                    "dims": 1024,
                    "index": True,
                    "similarity": "cosine"
                }
            }
        }
    }
    # ignore=400 makes an "index already exists" reply non-fatal.
    es.indices.create(index=index_name, body=mapping_config, ignore=400)
# 新增
def insert_document(es, index_name="embedding_demo_v2", doc_id=None, document=None):
    """Index ``document`` into ``index_name`` and print the client response.

    Args:
        es: Elasticsearch client exposing ``index``.
        index_name: Target index.
        doc_id: Document id passed to ``es.index`` as ``id``.
        document: The document body to store.

    Returns:
        The response returned by ``es.index``.
    """
    response = es.index(index=index_name, id=doc_id, document=document)
    print(response)
    return response
# 刪除
def delete_document(es, index_name="embedding_demo_v2", doc_id=None):
    """Delete the document with id ``doc_id`` from ``index_name``.

    Args:
        es: Elasticsearch client exposing ``delete``.
        index_name: Index holding the document.
        doc_id: Id of the document to delete.

    Returns:
        The response returned by ``es.delete``.
    """
    return es.delete(index=index_name, id=doc_id)
# 改
def update_document(es, index_name="embedding_demo_v2", doc_id=None, updated_doc=None):
    """Apply ``updated_doc`` to document ``doc_id`` and print the response.

    Args:
        es: Elasticsearch client exposing ``update``.
        index_name: Index holding the document.
        doc_id: Id of the document to update.
        updated_doc: Fields sent under the ``doc`` key of the update body.

    Returns:
        The response returned by ``es.update``.
    """
    result = es.update(index=index_name, id=doc_id, body={"doc": updated_doc})
    print(result)
    return result
# 查詢
def search_documents(es, index_name="embedding_demo_v2", query=None):
    """Run ``query`` against ``index_name`` and return the client response.

    Args:
        es: Elasticsearch client exposing ``search``.
        index_name: Index to search.
        query: Search request body passed to ``es.search`` as ``body``.

    Returns:
        Whatever ``es.search`` returns.
    """
    return es.search(index=index_name, body=query)
# 向量化模型
def embedding_function(text="你是誰"):
    """Embed ``text`` with the bge-m3 model served by the local Ollama API.

    Args:
        text: The string to embed.

    Returns:
        The first entry of the ``embeddings`` list in the Ollama reply.

    Raises:
        requests.HTTPError: If Ollama answers with an HTTP error status.
    """
    url = "http://localhost:11434/api/embed"
    payload = json.dumps({
        "model": "bge-m3:latest",
        "input": [text]
    })
    headers = {
        'Content-Type': 'application/json'
    }
    # A timeout keeps the script from hanging forever if Ollama is down.
    reply = requests.request("POST", url, headers=headers, data=payload, timeout=60)
    # Fail with the HTTP error itself rather than a KeyError on "embeddings".
    reply.raise_for_status()
    # One input text was sent, so the single resulting vector is at index 0.
    vector = reply.json()["embeddings"][0]
    print("向量維度:{}".format(len(vector)))
    return vector
# 寫入測試文本數據
def append_demo_data(index_name="embedding_demo_v2"):
    """Embed five sample names and index each one as a document.

    Uses the module-level ``es`` client, ``embedding_function`` and
    ``insert_document``.

    Args:
        index_name: Index that receives the sample documents.
    """
    for single_text in ["張無忌", "謝遜", "趙敏", "小昭", "張翠山"]:
        print("當前文本:{}".format(single_text))
        curr_vector = embedding_function(text=single_text)
        print("--------")
        # Document layout follows the index mapping: title, content, embedding.
        curr_data = {
            "title": "倚天屠龍記",
            "content": single_text,
            "embedding": curr_vector
        }
        # doc_id=None is passed through to es.index() unchanged.
        insert_document(es, index_name=index_name, doc_id=None, document=curr_data)
# 向量檢索
def query_embedding_function(index_name="embedding_demo_v2", question="張無忌和張翠山是什么關系"):
    """Run a kNN vector search on ``embedding`` and print each hit.

    Uses the module-level ``es`` client through ``search_documents`` and
    embeds the question with ``embedding_function``.

    Args:
        index_name: Index to search.
        question: Text that is embedded and used as the query vector.
    """
    print("開始向量檢索!")
    question_embedding = embedding_function(text=question)
    # Vector-only query: k nearest neighbours on the embedding field.
    query_embedding_dict = {
        "knn": {
            "field": "embedding",
            "query_vector": question_embedding,
            "k": 10,
            "num_candidates": 100
        },
        "fields": ["title", "content"]
    }
    print("question:{}".format(question))
    result = search_documents(es, index_name=index_name, query=query_embedding_dict)
    # Print the score and the requested fields of every hit.
    for single_hit in result["hits"]["hits"]:
        print("_score:{0},fields:{1}".format(single_hit["_score"], single_hit["fields"]))
# 關鍵詞檢索
def query_keyword_function(index_name="embedding_demo_v2", question="張無忌和張翠山是什么關系"):
    """Run a keyword (``match``) search on ``content`` and print each hit.

    Uses the module-level ``es`` client through ``search_documents``.

    Args:
        index_name: Index to search.
        question: Query text matched against the ``content`` field.
    """
    print("開始關鍵詞檢索!")
    # Keyword-only query: full-text match on the content field.
    query_keyword_dict = {
        "query": {
            "match": {
                "content": {
                    "query": question
                }
            }
        },
        "size": 10,
        "fields": ["title", "content"]
    }
    print("question:{}".format(question))
    result = search_documents(es, index_name=index_name, query=query_keyword_dict)
    # Print the score and the requested fields of every hit.
    for single_hit in result["hits"]["hits"]:
        print("_score:{0},fields:{1}".format(single_hit["_score"], single_hit["fields"]))
# 向量+es關鍵詞-混合檢索-利用boots方法
def query_simple_hybrid_retrieval_function(index_name="embedding_demo_v2", question="張無忌和張翠山是什么關系"):
    """Run a keyword + kNN hybrid search weighted by ``boost`` and print hits.

    Uses the module-level ``es`` client through ``search_documents`` and
    embeds the question with ``embedding_function``.

    Args:
        index_name: Index to search.
        question: Text used both for the ``match`` clause and, once
            embedded, as the kNN query vector.
    """
    print("開始混合檢索!利用boots")
    question_embedding = embedding_function(text=question)
    # Hybrid query: the match clause is boosted by 0.1 and the kNN clause by
    # 0.9. The match text comes from `question` (it used to be hard-coded, so
    # the keyword half ignored the caller's question).
    query_embedding_dict = {
        "query": {
            "match": {
                "content": {
                    "query": question,
                    "boost": 0.1
                }
            }
        },
        "knn": {
            "field": "embedding",
            "query_vector": question_embedding,
            "k": 10,
            "num_candidates": 100,
            "boost": 0.9
        },
        "size": 10,
        "fields": ["title", "content"]
    }
    print("question:{}".format(question))
    result = search_documents(es, index_name=index_name, query=query_embedding_dict)
    # Print the combined score and the requested fields of every hit.
    for single_hit in result["hits"]["hits"]:
        print("_score:{0},fields:{1}".format(single_hit["_score"], single_hit["fields"]))
if __name__ == "__main__":
    index_name = "embedding_demo_v2"
    demo_question = "張無忌和張翠山是什么關系"
    # Create the index
    create_index(es, index_name=index_name)
    # Write the sample data
    append_demo_data(index_name=index_name)
    # Keyword search (the question is the second positional parameter)
    query_keyword_function(index_name, demo_question)
    # Vector search
    query_embedding_function(index_name, demo_question)
    # Simple hybrid search
    query_simple_hybrid_retrieval_function(index_name, demo_question)




























