Concept #29 · Medium · python-for-gen-ai

How would you implement a vector search with similarity scoring?

#gen-ai #vector-db #embeddings

Answer

Implementing Vector Search with Similarity Scoring

Vector search finds the most semantically similar documents to a query by computing distances between embedding vectors.

Core Similarity Metrics

Cosine similarity — the cosine of the angle between two vectors (range: -1 to 1). Best for text:

$$\cos(\theta) = \frac{A \cdot B}{\|A\| \, \|B\|}$$

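Dot product — equals ‖A‖ ‖B‖ cos(θ), i.e. cosine similarity scaled by both vector magnitudes. Faster to compute, but it only equals cosine similarity when the vectors are normalised to unit length.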

Euclidean distance (L2) — straight-line distance in vector space. Less common for text.

Implementation from Scratch

python
import numpy as np
from typing import NamedTuple

class SearchResult(NamedTuple):
    '''One ranked hit returned by a search: which document matched and how well.'''
    id: str  # identifier of the matched document
    text: str  # text of the matched document
    score: float  # similarity score assigned to the match

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    '''Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        A plain Python float in [-1, 1] (up to rounding). If either vector
        has zero length the angle is undefined; 0.0 is returned in that case
        instead of NaN.
    '''
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-length vector has no direction: avoid 0/0 -> NaN (and the
    # accompanying RuntimeWarning) by reporting "no similarity".
    if norm_product == 0:
        return 0.0
    # float() so the declared return type holds (np.dot yields np.float64).
    return float(np.dot(a, b) / norm_product)

def vector_search(
    query_embedding: np.ndarray,
    document_embeddings: np.ndarray,
    documents: list[dict],
    top_k: int = 5,
    min_score: float = 0.7,
) -> list[SearchResult]:
    '''Search documents by cosine similarity to query embedding.

    Args:
        query_embedding: 1-D query vector of length D.
        document_embeddings: 2-D array with one row of length D per document,
            in the same order as ``documents``.
        documents: Dicts providing the "id" and "text" of each document.
        top_k: Maximum number of results to return; values <= 0 yield [].
        min_score: Minimum cosine similarity a document needs to be returned.

    Returns:
        Up to ``top_k`` results with score >= ``min_score``, best first.
        Zero-length embeddings have no defined cosine similarity: such
        documents are never returned, and a zero-length query returns [].
    '''
    if top_k <= 0 or not documents:
        return []

    query_length = np.linalg.norm(query_embedding)
    if not query_length > 0:  # zero-length (or NaN) query: nothing is comparable
        return []

    # Compute all similarities at once (vectorised, no Python loop over rows).
    doc_lengths = np.linalg.norm(document_embeddings, axis=1, keepdims=True)
    has_direction = doc_lengths[:, 0] > 0
    # Divide zero-length rows by 1 instead of 0 so they cannot produce NaN
    # scores; NaN would sort to the front of the ranking and crowd real
    # matches out of the top-k window.
    safe_lengths = np.where(doc_lengths > 0, doc_lengths, 1.0)
    scores = (document_embeddings / safe_lengths) @ (query_embedding / query_length)

    # Apply the threshold first, then rank only the survivors.
    eligible = np.flatnonzero(has_direction & (scores >= min_score))
    best_first = eligible[np.argsort(scores[eligible])[::-1][:top_k]]

    return [
        SearchResult(
            id=documents[idx]["id"],
            text=documents[idx]["text"],
            score=float(scores[idx]),
        )
        for idx in best_first
    ]

Production Implementation with FAISS

python
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

class VectorStore:
    '''In-memory semantic index: sentence-transformer embeddings stored in FAISS.

    Documents are dicts with a "text" key and, optionally, an "id" key.
    Embeddings are L2-normalised before indexing, so the inner-product scores
    FAISS returns are cosine similarities.
    '''

    def __init__(self, embedding_dim: int = 384):
        '''Load the embedding model and create an empty index.

        Args:
            embedding_dim: Dimensionality of the index. It must equal the size
                of the vectors the model produces (the default is intended to
                match "all-MiniLM-L6-v2"; adjust it if the model is changed).
        '''
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.embedding_dim = embedding_dim

        # IndexFlatIP = exact Inner Product search (cosine similarity on normalised vectors)
        self.index = faiss.IndexFlatIP(embedding_dim)

        # IndexIVFFlat = approximate search (faster for 100K+ documents).
        # nlist = number of Voronoi cells. Pass METRIC_INNER_PRODUCT explicitly
        # (the default metric is L2) and train the index before adding vectors:
        # quantizer = faiss.IndexFlatIP(embedding_dim)
        # self.index = faiss.IndexIVFFlat(
        #     quantizer, embedding_dim, 100, faiss.METRIC_INNER_PRODUCT
        # )
        # self.index.train(training_embeddings)

        # Kept in insertion order so FAISS row numbers index straight into it.
        self.documents: list[dict] = []

    def add_documents(self, documents: list[dict]) -> None:
        '''Embed the "text" of each document and append it to the index.

        An empty list is a no-op, so the encoder and the index are never
        handed an empty batch.
        '''
        if not documents:
            return

        texts = [doc["text"] for doc in documents]
        embeddings = self.model.encode(texts, normalize_embeddings=True)
        # FAISS expects a C-contiguous float32 matrix.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        self.index.add(embeddings)
        self.documents.extend(documents)
        print(f"Index now contains {self.index.ntotal} vectors")

    def search(self, query: str, top_k: int = 5) -> list[SearchResult]:
        '''Return up to ``top_k`` indexed documents most similar to ``query``.

        Results are in the order FAISS returns them (highest inner product
        first). Returns [] when the index is empty or ``top_k`` <= 0.
        '''
        # Never ask FAISS for more neighbours than exist, nor for k <= 0.
        k = min(top_k, self.index.ntotal)
        if k <= 0:
            return []

        query_embedding = self.model.encode(
            [query], normalize_embeddings=True
        ).astype(np.float32)

        scores, indices = self.index.search(query_embedding, k)

        return [
            SearchResult(
                id=self.documents[idx].get("id", str(idx)),
                text=self.documents[idx]["text"],
                score=float(score),
            )
            for score, idx in zip(scores[0], indices[0])
            if idx != -1  # -1 means no result found
        ]

# Usage: index three sample documents, then print the two closest matches.
sample_documents = [
    {"id": "1", "text": "RAG retrieves documents to ground LLM answers"},
    {"id": "2", "text": "Fine-tuning updates model weights on domain data"},
    {"id": "3", "text": "The weather in London is often rainy"},
]

store = VectorStore()
store.add_documents(sample_documents)

query = "How does retrieval-augmented generation work?"
results = store.search(query, top_k=2)
for hit in results:
    print(f"Score: {hit.score:.4f} | {hit.text}")
# Illustrative output:
# Score: 0.8912 | RAG retrieves documents to ground LLM answers
# Score: 0.4231 | Fine-tuning updates model weights on domain data

Hybrid Search (Keyword + Semantic)

python
from rank_bm25 import BM25Okapi

class HybridSearcher:
    '''Rank documents by a weighted blend of BM25 keyword and semantic scores.

    Documents are dicts with a "text" key and, optionally, an "id" key. Ids
    should be unique: semantic scores are matched back to documents by id.
    '''

    def __init__(self, documents: list[dict], alpha: float = 0.5):
        '''alpha=0: pure BM25, alpha=1: pure semantic'''
        self.alpha = alpha
        texts = [doc["text"] for doc in documents]

        # BM25 index over lower-cased, whitespace-split tokens
        tokenized = [t.lower().split() for t in texts]
        self.bm25 = BM25Okapi(tokenized)

        # Semantic index
        self.vector_store = VectorStore()
        self.vector_store.add_documents(documents)

        # Search hits carry the document's "id" (or its position as a string
        # when the key is missing), not its position. Map that id back to the
        # position so scores land in the right slot whatever the ids look like.
        self._position_by_id = {
            doc.get("id", str(position)): position
            for position, doc in enumerate(documents)
        }

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        '''Return the ``top_k`` best documents for ``query``.

        Returns:
            Dicts of the form {"idx": position in the original document list,
            "score": blended score}, best first. ``top_k`` <= 0 yields [].
        '''
        # BM25 scores (keyword relevance), scaled by the best score; the
        # epsilon keeps the division defined when nothing matches.
        bm25_scores = self.bm25.get_scores(query.lower().split())
        bm25_norm = bm25_scores / (bm25_scores.max() + 1e-8)

        # Semantic scores, aligned with bm25_scores by document position.
        # Documents absent from the semantic results keep a score of 0.
        semantic_scores = np.zeros(len(bm25_scores))
        for hit in self.vector_store.search(query, top_k=len(bm25_scores)):
            semantic_scores[self._position_by_id[hit.id]] = hit.score

        # Weighted combination (linear blend of the two score arrays)
        combined = (1 - self.alpha) * bm25_norm + self.alpha * semantic_scores
        top_indices = np.argsort(combined)[::-1][:max(top_k, 0)]
        return [{"idx": int(i), "score": float(combined[i])} for i in top_indices]

Score interpretation: As a rough guide, a cosine similarity > 0.8 = very similar, 0.5–0.8 = related, < 0.5 = probably not relevant. These cut-offs vary between embedding models, so set your `min_score` threshold based on your precision/recall trade-off.