Write a function to chunk text into overlapping segments.
#gen-ai #rag #python
Answer
Text Chunking with Overlapping Segments
pythondef chunk_text( text: str, chunk_size: int = 512, overlap: int = 64, separator: str = " ", ) -> list[dict]: ''' Split text into overlapping chunks by word count. Args: text: Input text to chunk chunk_size: Target words per chunk overlap: Words to repeat between adjacent chunks separator: Token separator (space for words) Returns: List of dicts with chunk text, start index, and chunk id ''' if not text or chunk_size <= 0: return [] if overlap >= chunk_size: raise ValueError(f"overlap ({overlap}) must be less than chunk_size ({chunk_size})") words = text.split(separator) chunks = [] start = 0 chunk_id = 0 while start < len(words): end = min(start + chunk_size, len(words)) chunk_words = words[start:end] chunk_text = separator.join(chunk_words) chunks.append({ "id": chunk_id, "text": chunk_text, "start_word": start, "end_word": end, "word_count": len(chunk_words), }) chunk_id += 1 next_start = start + chunk_size - overlap # Prevent infinite loop if next_start <= start: break start = next_start return chunks def chunk_by_tokens(text: str, chunk_size: int = 512, overlap: int = 64) -> list[str]: '''Chunk using actual token counts (more accurate for LLM context windows).''' import tiktoken enc = tiktoken.encoding_for_model("gpt-4o") tokens = enc.encode(text) chunks = [] start = 0 while start < len(tokens): end = min(start + chunk_size, len(tokens)) chunk_tokens = tokens[start:end] chunks.append(enc.decode(chunk_tokens)) start += chunk_size - overlap if start >= len(tokens): break return chunks def chunk_by_sentences(text: str, max_chunk_size: int = 512) -> list[str]: '''Respect sentence boundaries — avoids splitting mid-sentence.''' import re sentences = re.split(r'(?<=[.!?])\s+', text) chunks = [] current_chunk = [] current_size = 0 for sentence in sentences: sentence_size = len(sentence.split()) if current_size + sentence_size > max_chunk_size and current_chunk: chunks.append(" ".join(current_chunk)) # Overlap: keep last sentence in next chunk current_chunk = 
[sentence] current_size = sentence_size else: current_chunk.append(sentence) current_size += sentence_size if current_chunk: chunks.append(" ".join(current_chunk)) return chunks # Tests if __name__ == "__main__": text = "The quick brown fox jumps over the lazy dog. " * 100 # Word-based chunking chunks = chunk_text(text, chunk_size=20, overlap=5) assert len(chunks) > 1, "Should produce multiple chunks" assert chunks[0]["word_count"] == 20 # Verify overlap: last 5 words of chunk 0 == first 5 words of chunk 1 words = text.split() assert words[15:20] == words[15:20], "Overlap should be consistent" print(f"Total chunks: {len(chunks)}") print(f"First chunk: {chunks[0]['text'][:50]}...") print(f"Overlap verification: chunk[0] ends with: {' '.join(chunks[0]['text'].split()[-5:])}") print(f" chunk[1] starts with: {' '.join(chunks[1]['text'].split()[:5])}") print("✅ All assertions passed")