Trong thế giới AI và retrieval systems, việc tìm kiếm thông tin chính xác là yếu tố sống còn. Hybrid Search - kết hợp semantic vector search với traditional keyword search - đã trở thành standard approach cho production systems đòi hỏi độ chính xác cao.
Tại Sao Cần Hybrid Search?
Vector search vượt trội trong việc hiểu ngữ nghĩa - tìm "con mèo đen nhỏ" sẽ ra cả "kitten đen" hay "puss nhỏ". Nhưng nó có điểm yếu chí mạng: không xử lý được exact keywords, product codes, hay domain-specific terminology.
Keyword search (BM25, TF-IDF) lại hoàn hảo cho exact matching nhưng thất bại với synonyms và paraphrases. Hybrid search kết hợp cả hai để đạt recall gần 100%.
Kiến Trúc Hybrid Search System
1. High-Level Architecture
┌─────────────────────────────────────────────────────────────────┐
│ HYBRID SEARCH PIPELINE │
├─────────────────────────────────────────────────────────────────┤
│ │
│ Query: "cách cài đặt PostgreSQL trên Ubuntu 22.04" │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ QUERY PREPROCESSING │ │
│ │ • Tokenization (jieba, spaCy, custom) │ │
│ │ • Language Detection │ │
│ │ • Query Expansion / Synonym Mapping │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │ │
│ ┌────────────┴────────────┐ │
│ ▼ ▼ │
│ ┌──────────────────┐ ┌──────────────────────┐ │
│ │ VECTOR SEARCH │ │ KEYWORD SEARCH │ │
│ │ (Embeddings) │ │ (BM25/Elastic) │ │
│ │ │ │ │ │
│ │ • Semantic sim │ │ • Exact matching │ │
│ │ • ANN index │ │ • Term frequency │ │
│ │ • Top-K recall │ │ • Document freq │ │
│ └──────────────────┘ └──────────────────────┘ │
│ │ │ │
│ └────────────┬────────────┘ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ SCORE NORMALIZATION & FUSION │ │
│ │ • Min-Max / Z-Score normalization │ │
│ │ • RRF (Reciprocal Rank Fusion) ✓ Recommended │ │
│ │ • Weighted Linear Combination │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ Final Ranked Results │
└─────────────────────────────────────────────────────────────────┘
2. Score Fusion: RRF vs Weighted Sum
Chìa khóa của hybrid search nằm ở cách kết hợp hai score streams. RRF (Reciprocal Rank Fusion) là approach được khuyến nghị nhờ tính robust: nó chỉ dùng thứ hạng (rank) của mỗi document nên không cần chuẩn hóa score và hầu như không cần tuning weight.
# RRF Formula: rrf_score(d) = Σ_i 1 / (k + rank_i(d))
# k = constant (thường 60), rank_i(d) = thứ hạng của document d trong danh sách kết quả thứ i
def reciprocal_rank_fusion(results_list: list[list], k: int = 60) -> list[tuple]:
    """Fuse several ranked result lists with Reciprocal Rank Fusion (RRF).

    Every document receives ``1 / (k + rank)`` from each list it appears in
    (ranks start at 1) and the contributions are summed. Only ranks are used,
    so the input lists need no score normalization.

    Args:
        results_list: Ranked lists, best hit first. Every entry must be a
            mapping that has an ``'id'`` key.
        k: Constant added to each rank before taking the reciprocal.

    Returns:
        ``(doc_id, fused_score)`` pairs sorted by fused score, highest first.
    """
    # A plain dict with .get() keeps the function free of any import.
    scores = {}
    for results in results_list:
        for rank, doc in enumerate(results, start=1):
            doc_id = doc['id']
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    # sorted() is stable, so tied documents keep first-seen order.
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)
def weighted_linear_fusion(
    vector_scores: dict,  # {doc_id: similarity_score}
    keyword_scores: dict,  # {doc_id: bm25_score}
    vector_weight: float = 0.5,
    normalize: bool = True
) -> dict:
    """Blend vector and keyword scores into one weighted score per document.

    Args:
        vector_scores: Mapping of doc_id to similarity score.
        keyword_scores: Mapping of doc_id to BM25 score.
        vector_weight: Share given to the vector score; the keyword score
            gets ``1 - vector_weight``.
        normalize: When true, both mappings are passed through
            ``min_max_normalize`` before blending.

    Returns:
        A dict of doc_id to fused score, inserted in descending score order.
        A document missing from one mapping contributes 0 for that side.
    """
    if normalize:
        vector_scores = min_max_normalize(vector_scores)
        keyword_scores = min_max_normalize(keyword_scores)

    candidates = set(vector_scores.keys()) | set(keyword_scores.keys())
    combined = {
        doc_id: vector_weight * vector_scores.get(doc_id, 0)
        + (1 - vector_weight) * keyword_scores.get(doc_id, 0)
        for doc_id in candidates
    }
    ordered = sorted(combined.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ordered)
def min_max_normalize(scores: dict) -> dict:
    """Rescale the values of ``scores`` linearly so they span [0, 1].

    An empty (falsy) input is returned unchanged. When every value is equal
    there is no range to scale by, so each key maps to 0.5 instead.
    """
    if not scores:
        return scores

    values = scores.values()
    lowest = min(values)
    spread = max(values) - lowest
    if spread == 0:
        return dict.fromkeys(scores, 0.5)
    return {key: (value - lowest) / spread for key, value in scores.items()}
Implementation Với HolySheep AI
HolySheep AI cung cấp embedding models với độ trễ dưới 50ms, tiết kiệm 85%+ chi phí so với OpenAI (tỷ giá ¥1=$1). Đăng ký tại đây để nhận tín dụng miễn phí.
3. Production-Ready Hybrid Search Implementation
import requests
import numpy as np
from dataclasses import dataclass
from typing import Optional
import hashlib
import time
@dataclass
class HolySheepConfig:
    """Connection and model settings for the HolySheep AI embeddings API.

    Attributes:
        api_key: Secret key; the search engine sends it as a Bearer token.
        base_url: API root; the engine appends ``/embeddings`` to it.
        embedding_model: Model name sent with every embeddings request.
        embedding_dim: Presumably the expected vector length; no code in
            view reads it (TODO confirm).
        embedding_batch_size: Presumably the intended batch size; no code in
            view reads it (TODO confirm).
    """

    api_key: str
    base_url: str = "https://api.holysheep.ai/v1"
    embedding_model: str = "text-embedding-3-small"
    embedding_dim: int = 1536
    embedding_batch_size: int = 100
class HybridSearchEngine:
    """In-memory hybrid search over a document set.

    Combines two retrievers and fuses their rankings:

    * vector search: cosine similarity between the query embedding and the
      stored document embeddings (embeddings come from the configured API);
    * keyword search: BM25 via the optional ``rank_bm25`` package.

    Fusion is either Reciprocal Rank Fusion or a weighted linear blend.
    """

    # Seconds to wait for the embeddings endpoint before requests gives up;
    # without a timeout a stalled connection would block the caller forever.
    REQUEST_TIMEOUT_S = 30

    def __init__(self, config: "HolySheepConfig"):
        """Store ``config`` and start with empty indexes."""
        self.config = config
        self.vector_index = {}  # doc_id -> embedding
        self.documents = {}  # doc_id -> document metadata
        self.bm25_index = None  # built by index_documents()

    def _request_embeddings(self, payload_input) -> list[list[float]]:
        """POST ``payload_input`` to the embeddings endpoint and return the vectors.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
        """
        response = requests.post(
            f"{self.config.base_url}/embeddings",
            headers={
                "Authorization": f"Bearer {self.config.api_key}",
                "Content-Type": "application/json"
            },
            json={
                "model": self.config.embedding_model,
                "input": payload_input
            },
            timeout=self.REQUEST_TIMEOUT_S,
        )
        response.raise_for_status()
        return [item["embedding"] for item in response.json()["data"]]

    def generate_embedding(self, text: str) -> list[float]:
        """Return the embedding of a single ``text``."""
        return self._request_embeddings(text)[0]

    def generate_embeddings_batch(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding per entry of ``texts`` using a single request."""
        return self._request_embeddings(texts)

    def index_documents(self, documents: list[dict], batch_size: int = 100):
        """(Re)build the vector and BM25 indexes from ``documents``.

        Any previously indexed content is replaced. When the same id occurs
        more than once, the last document with that id wins.

        Args:
            documents: Dicts that each carry an ``"id"`` and a ``"content"``
                (the text that gets embedded and tokenized).
            batch_size: Number of texts sent per embeddings request.
        """
        docs_by_id = {doc["id"]: doc for doc in documents}
        unique_docs = list(docs_by_id.values())

        # Build into a fresh dict so stale entries from an earlier call do not
        # survive, and so a failed request leaves the current index untouched.
        vector_index = {}
        for start in range(0, len(unique_docs), batch_size):
            batch = unique_docs[start:start + batch_size]
            embeddings = self.generate_embeddings_batch(
                [doc["content"] for doc in batch]
            )
            for position, doc in enumerate(batch):
                vector_index[doc["id"]] = embeddings[position]

        self.documents = docs_by_id
        self.vector_index = vector_index
        # Same order as self.documents, which keyword_search relies on to map
        # BM25 score positions back to document ids.
        self._init_bm25([doc["content"] for doc in unique_docs])

    def _init_bm25(self, texts: list[str]):
        """Build the BM25 index, or disable keyword search if that is impossible.

        Keyword search is disabled (``bm25_index`` set to None) when there is
        nothing to index or when ``rank_bm25`` is not installed.
        """
        if not texts:
            self.bm25_index = None
            return
        try:
            from rank_bm25 import BM25Okapi
        except ImportError:
            print("Warning: pip install rank-bm25")
            self.bm25_index = None
            return
        self.bm25_index = BM25Okapi([text.lower().split() for text in texts])

    def vector_search(self, query: str, top_k: int = 10) -> list[tuple]:
        """Return up to ``top_k`` ``(doc_id, cosine_similarity)`` pairs, best first."""
        if not self.vector_index:
            # Nothing to compare against; skip the embeddings request.
            return []
        query_embedding = self.generate_embedding(query)
        similarities = [
            (doc_id, self._cosine_similarity(query_embedding, doc_embedding))
            for doc_id, doc_embedding in self.vector_index.items()
        ]
        similarities.sort(key=lambda pair: pair[1], reverse=True)
        return similarities[:top_k]

    def keyword_search(self, query: str, top_k: int = 10) -> list[tuple]:
        """Return up to ``top_k`` ``(doc_id, bm25_score)`` pairs, best first.

        Returns an empty list when no BM25 index is available.
        """
        if self.bm25_index is None:
            return []
        scores = self.bm25_index.get_scores(query.lower().split())
        # Score positions follow the corpus order, which index_documents keeps
        # identical to the key order of self.documents.
        # NOTE(review): documents that do not match the query presumably score
        # 0 yet still occupy a rank here, which RRF then rewards; consider
        # filtering them out if that hurts ranking quality.
        scored = [
            (doc_id, float(score))
            for doc_id, score in zip(self.documents, scores)
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:top_k]

    def hybrid_search(
        self,
        query: str,
        top_k: int = 10,
        vector_weight: float = 0.5,
        use_rrf: bool = True
    ) -> list[dict]:
        """Run vector and keyword search and return the fused top hits.

        Args:
            query: Search query.
            top_k: Number of results to return.
            vector_weight: Weight of the vector score (0.0 - 1.0); only used
                when ``use_rrf`` is False.
            use_rrf: True for Reciprocal Rank Fusion, False for the weighted sum.

        Returns:
            Copies of the matching documents, best first, each with an added
            ``"hybrid_score"`` key.
        """
        # The two searches run one after the other; each over-fetches
        # (2 * top_k) so fusion has candidates beyond the final cut.
        vector_results = self.vector_search(query, top_k * 2)
        keyword_results = self.keyword_search(query, top_k * 2)

        if use_rrf:
            fused_scores = self._rrf_fusion(
                [vector_results, keyword_results],
                k=60
            )
        else:
            fused_scores = self._weighted_fusion(
                dict(vector_results),
                dict(keyword_results),
                vector_weight
            )

        top_results = []
        for doc_id, score in fused_scores[:top_k]:
            # Copy so the stored document is not mutated by the score field.
            result = self.documents[doc_id].copy()
            result["hybrid_score"] = score
            top_results.append(result)
        return top_results

    def _rrf_fusion(self, results_list: list[list], k: int = 60) -> list[tuple]:
        """Fuse ranked ``(doc_id, score)`` lists by summing ``1 / (k + rank)``.

        Returns ``(doc_id, fused_score)`` pairs, highest fused score first.
        """
        scores = {}
        for results in results_list:
            for rank, (doc_id, _) in enumerate(results, start=1):
                scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
        return sorted(scores.items(), key=lambda item: item[1], reverse=True)

    def _weighted_fusion(
        self,
        vector_scores: dict,
        keyword_scores: dict,
        vector_weight: float
    ) -> list[tuple]:
        """Blend both score maps after scaling each by its maximum.

        Returns ``(doc_id, fused_score)`` pairs, highest fused score first.
        A document missing from one map contributes 0 for that side.
        """
        def scale_of(scores: dict) -> float:
            # Divide only by a positive maximum: 0 would raise
            # ZeroDivisionError and a negative one would invert the order.
            peak = max(scores.values()) if scores else 1
            return peak if peak > 0 else 1

        v_max = scale_of(vector_scores)
        k_max = scale_of(keyword_scores)
        fused = {}
        for doc_id in set(vector_scores.keys()) | set(keyword_scores.keys()):
            v = vector_scores.get(doc_id, 0) / v_max
            k = keyword_scores.get(doc_id, 0) / k_max
            fused[doc_id] = vector_weight * v + (1 - vector_weight) * k
        return sorted(fused.items(), key=lambda item: item[1], reverse=True)

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Return the cosine similarity of ``a`` and ``b``.

        A small epsilon in the denominator avoids division by zero for
        all-zero vectors, so identical vectors score marginally below 1.
        """
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = sum(x * x for x in a) ** 0.5
        norm_b = sum(x * x for x in b) ** 0.5
        return dot / (norm_a * norm_b + 1e-8)
# ============ USAGE EXAMPLE ============
if __name__ == "__main__":
    # Demo: index a tiny Vietnamese tech-documentation corpus, then run one
    # RRF-fused query and print the top hits.
    config = HolySheepConfig(api_key="YOUR_HOLYSHEEP_API_KEY")  # replace with a real API key
    engine = HybridSearchEngine(config)

    rows = (
        ("1", "Cách cài đặt PostgreSQL 15 trên Ubuntu 22.04", "database"),
        ("2", "Hướng dẫn tối ưu PostgreSQL performance", "database"),
        ("3", "MySQL vs PostgreSQL comparison 2024", "database"),
        ("4", "Docker container tutorial cho beginners", "devops"),
        ("5", "Kubernetes deployment guide", "devops"),
    )
    documents = [
        {"id": doc_id, "content": content, "category": category}
        for doc_id, content, category in rows
    ]

    print("Indexing documents...")
    engine.index_documents(documents)

    print("\n=== Hybrid Search Results ===")
    results = engine.hybrid_search(
        "cài đặt database PostgreSQL Ubuntu", top_k=3, use_rrf=True
    )
    for r in results:
        print(f"ID: {r['id']}, Score: {r['hybrid_score']:.4f}")
        print(f"Content: {r['content']}")
        print("---")
Performance Benchmark
Đo benchmark trên 10,000 documents với query pool 100 queries:
import time
import statistics
from concurrent.futures import ThreadPoolExecutor, as_completed
class HybridSearchBenchmark:
"""Benchmark utility cho hybrid search system"""
def __init__(self, engine: HybridSearchEngine):
self.engine = engine
self.results = {}
def benchmark_latency(
self,
queries: list[str],
runs: int = 5,
warmup: int = 2
) -> dict:
"""Benchmark end-to-end latency"""
# Warmup runs
for _ in range(warmup):
for q in queries[:5]:
self.engine.hybrid_search(q)
latencies = []
for _ in range(runs):
for query in queries:
start = time.perf_counter()
self.engine.hybrid_search(query, top_k=10)
latency = (time.perf_counter() - start) * 1000 # ms
latencies.append(latency)
return {
"mean_ms": statistics.mean(latencies),
"median_ms": statistics.median(latencies),
"p95_ms": sorted(latencies)[
Tài nguyên liên quan
Bài viết liên quan