Concept #40 · Hard · practical-coding

Write a function to chunk text into overlapping segments.

#gen-ai #rag #python

Answer

Text Chunking with Overlapping Segments

python
def chunk_text(
    text: str,
    chunk_size: int = 512,
    overlap: int = 64,
    separator: str = " ",
) -> list[dict]:
    '''
    Split text into overlapping chunks by word count.

    The text is split on ``separator``; empty tokens produced by repeated,
    leading, or trailing separators are discarded so they are not counted as
    words. As a consequence, runs of separators are collapsed to a single
    separator in the returned chunk text.

    Args:
        text: Input text to chunk
        chunk_size: Target words per chunk
        overlap: Words to repeat between adjacent chunks
        separator: Token separator (space for words)

    Returns:
        List of dicts, one per chunk, with keys:
            "id": zero-based chunk number
            "text": the chunk's words joined with ``separator``
            "start_word": index of the chunk's first word (inclusive)
            "end_word": index one past the chunk's last word (exclusive)
            "word_count": number of words in the chunk
        Word indices refer to the list of non-empty tokens. An empty list is
        returned when ``text`` is empty, contains no words, or
        ``chunk_size`` is not positive.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``chunk_size`` (also raised by ``str.split`` for an empty
            ``separator``).
    '''
    if not text or chunk_size <= 0:
        return []
    if overlap < 0:
        raise ValueError(f"overlap ({overlap}) must be non-negative")
    if overlap >= chunk_size:
        raise ValueError(f"overlap ({overlap}) must be less than chunk_size ({chunk_size})")

    # "a  b ".split(" ") yields ['a', '', 'b', '']; the empty strings are not
    # words and would otherwise inflate word_count and the indices.
    words = [word for word in text.split(separator) if word]
    total = len(words)
    step = chunk_size - overlap  # strictly positive thanks to the checks above
    chunks = []

    for chunk_id, start in enumerate(range(0, total, step)):
        end = min(start + chunk_size, total)
        chunk_words = words[start:end]
        chunks.append({
            "id": chunk_id,
            "text": separator.join(chunk_words),
            "start_word": start,
            "end_word": end,
            "word_count": len(chunk_words),
        })
        # Once a chunk reaches the last word, any further window would hold
        # only words already covered by this chunk, so stop here.
        if end == total:
            break

    return chunks


def chunk_by_tokens(
    text: str,
    chunk_size: int = 512,
    overlap: int = 64,
    model: str = "gpt-4o",
) -> list[str]:
    '''
    Chunk using actual token counts (more accurate for LLM context windows).

    Args:
        text: Input text to chunk
        chunk_size: Maximum tokens per chunk
        overlap: Tokens to repeat between adjacent chunks
        model: Model name passed to ``tiktoken.encoding_for_model`` to pick
            the tokenizer

    Returns:
        List of decoded chunk strings. Empty when ``text`` is empty or
        ``chunk_size`` is not positive.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than
            ``chunk_size``. Without this check the window would never
            advance and the loop would not terminate.
    '''
    # Validate before importing tiktoken so bad arguments fail fast and the
    # trivial cases do not need the optional dependency.
    if not text or chunk_size <= 0:
        return []
    if overlap < 0:
        raise ValueError(f"overlap ({overlap}) must be non-negative")
    if overlap >= chunk_size:
        raise ValueError(f"overlap ({overlap}) must be less than chunk_size ({chunk_size})")

    import tiktoken

    enc = tiktoken.encoding_for_model(model)
    tokens = enc.encode(text)
    total = len(tokens)
    step = chunk_size - overlap  # strictly positive thanks to the checks above
    chunks = []

    for start in range(0, total, step):
        end = min(start + chunk_size, total)
        # NOTE(review): a window boundary can fall inside a multi-token
        # character, in which case the decoded edge may not round-trip
        # exactly - confirm whether that matters for the caller.
        chunks.append(enc.decode(tokens[start:end]))
        # Stop once the last token is covered; a further window would hold
        # only tokens already present in this chunk.
        if end == total:
            break

    return chunks


def chunk_by_sentences(
    text: str,
    max_chunk_size: int = 512,
    overlap_sentences: int = 0,
) -> list[str]:
    '''
    Respect sentence boundaries — avoids splitting mid-sentence.

    Sentences are detected by splitting on whitespace that follows ``.``,
    ``!`` or ``?`` (the punctuation stays attached to its sentence). Sentences
    are packed greedily into chunks whose size is measured in
    whitespace-separated words.

    Args:
        text: Input text to chunk
        max_chunk_size: Maximum words per chunk. A single sentence longer
            than this is still emitted whole, as its own oversized chunk.
        overlap_sentences: Number of trailing sentences of a finished chunk
            to repeat at the start of the next one. The default of 0 means
            no overlap. Carried sentences are dropped, oldest first, when
            they would not fit next to the incoming sentence.

    Returns:
        List of chunk strings with sentences joined by single spaces. Empty
        when ``text`` contains no sentences.

    Raises:
        ValueError: If ``overlap_sentences`` is negative.
    '''
    import re

    if overlap_sentences < 0:
        raise ValueError(f"overlap_sentences ({overlap_sentences}) must be non-negative")

    # Strip first so trailing whitespace does not create an empty final
    # "sentence"; the filter covers the empty-text case, where re.split
    # returns [''].
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

    chunks = []
    current = []  # (sentence, word_count) pairs making up the open chunk
    current_size = 0

    for sentence in sentences:
        sentence_size = len(sentence.split())
        if current and current_size + sentence_size > max_chunk_size:
            chunks.append(" ".join(s for s, _ in current))
            carried = current[-overlap_sentences:] if overlap_sentences else []
            # Overlap must never push the new chunk over the limit: shed the
            # oldest carried sentences until the incoming one fits.
            while carried and sum(n for _, n in carried) + sentence_size > max_chunk_size:
                carried.pop(0)
            current = carried
            current_size = sum(n for _, n in carried)
        # Every chunk gets at least one new sentence, so no chunk consists
        # solely of repeated material.
        current.append((sentence, sentence_size))
        current_size += sentence_size

    if current:
        chunks.append(" ".join(s for s, _ in current))

    return chunks


# Tests: a small self-check that runs only when the file is executed directly.
if __name__ == "__main__":
    text = "The quick brown fox jumps over the lazy dog. " * 100

    # Word-based chunking
    chunks = chunk_text(text, chunk_size=20, overlap=5)
    assert len(chunks) > 1, "Should produce multiple chunks"
    assert chunks[0]["word_count"] == 20
    # Verify overlap: last 5 words of chunk 0 == first 5 words of chunk 1.
    # Compare the actual chunk contents; comparing a slice of the input with
    # itself would pass no matter what chunk_text returned.
    tail_of_first = chunks[0]["text"].split()[-5:]
    head_of_second = chunks[1]["text"].split()[:5]
    assert tail_of_first == head_of_second, "Overlap should be consistent"
    # The second chunk must start chunk_size - overlap words after the first.
    assert chunks[1]["start_word"] - chunks[0]["start_word"] == 15

    print(f"Total chunks: {len(chunks)}")
    print(f"First chunk: {chunks[0]['text'][:50]}...")
    print(f"Overlap verification: chunk[0] ends with: {' '.join(tail_of_first)}")
    print(f"  chunk[1] starts with: {' '.join(head_of_second)}")
    print("✅ All assertions passed")