Concept #20 · Medium · python-for-gen-ai

What are context managers? How would you use them for LLM resource management?

#gen-ai #python

Answer

Context Managers in Python

A context manager defines setup and teardown logic around a block of code using the `with` statement. It ensures resources are properly cleaned up — even if an exception occurs.

The Protocol

Context managers implement the `__enter__` and `__exit__` methods (or are written as generator functions decorated with `@contextmanager` from `contextlib`).

python
class DatabaseConnection:
    """Context manager that holds a database connection open for a ``with`` block.

    The connection is obtained from ``connect_to_db()`` on entry, handed to the
    block as the ``as`` target, and closed again on exit.
    """

    def __enter__(self):
        """Open the connection, keep it on ``self.conn`` and return it."""
        connection = connect_to_db()
        self.conn = connection
        return connection

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the connection without swallowing an exception from the block."""
        # Reached on normal exit and when the block raised alike.
        self.conn.close()
        # A falsy return value tells Python to re-raise any in-flight exception.
        return False

# Usage: __enter__ runs on entry and its return value is bound to `conn`.
with DatabaseConnection() as conn:
    conn.execute("SELECT * FROM documents")
# __exit__ has run by this point, so conn.close() was called automatically

Using `@contextmanager` (Simpler Syntax)

python
from contextlib import contextmanager

@contextmanager
def timer(label: str):
    """Print how long the enclosed ``with`` block took to run.

    Args:
        label: Text printed in front of the measured duration.

    Yields:
        None. The manager exists only for its timing side effect.
    """
    from time import perf_counter

    began_at = perf_counter()
    try:
        # Control passes to the body of the ``with`` statement here.
        yield
    finally:
        # Report even when the body raised; the exception still propagates.
        elapsed = perf_counter() - began_at
        print(f"{label}: {elapsed:.3f}s")

# Example: time an embedding call. `model` and `texts` are assumed to be
# defined by the surrounding application (they are not created in this snippet).
with timer("embedding generation"):
    embeddings = model.encode(texts)

LLM Resource Management Use Cases

1. Managing OpenAI client lifecycle

python
from contextlib import contextmanager
from openai import OpenAI

@contextmanager
def openai_session(model: str = "gpt-4o"):
    """Yield an OpenAI client and close it when the ``with`` block exits.

    Args:
        model: Accepted for backward compatibility. It is not used when
            building the client, so callers still pass ``model=`` to each
            request themselves.

    Yields:
        A freshly constructed ``OpenAI`` client.
    """
    client = OpenAI()
    try:
        yield client
    finally:
        # Release the client's underlying HTTP connections deterministically
        # instead of leaving them open until garbage collection.
        client.close()

# The request names its own model: openai_session's `model` argument is not
# forwarded to the client, so it must be passed to create() explicitly.
with openai_session() as client:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "Hello"}]
    )

2. Temporary model loading (GPU memory management)

python
from contextlib import contextmanager
import torch

@contextmanager
def load_model_temporarily(model_name: str):
    """Load a causal LM onto the GPU for the duration of a ``with`` block.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both
            the model and its tokenizer.

    Yields:
        A ``(model, tokenizer)`` tuple; the model has been moved to ``"cuda"``.
    """
    import gc

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Load the tokenizer first: if it fails, no GPU memory has been claimed
    # yet and there is nothing to clean up.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = None
    try:
        model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda")
        yield model, tokenizer
    finally:
        if model is not None:
            # The caller's ``as`` target usually still references the model,
            # so ``del`` alone would not release its weights. Moving the
            # module to the CPU drops the GPU copies regardless of who still
            # holds the object.
            model.to("cpu")
        del model
        gc.collect()
        torch.cuda.empty_cache()  # Hand cached, now-unused blocks back to the GPU
        print(f"Unloaded {model_name} from GPU")

# Inputs are moved to "cuda", the same device the context manager puts the model on.
with load_model_temporarily("mistralai/Mistral-7B-Instruct-v0.2") as (model, tok):
    inputs = tok("Explain RAG", return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens=200)
# The context manager's cleanup has run by this point.
# NOTE(review): `model`, `inputs` and `output` are still bound in this scope;
# drop them too (e.g. `del model, inputs, output`) if the memory must be freed.

3. Vector store session management

python
from contextlib import contextmanager
from langchain_community.vectorstores import Chroma

@contextmanager
def vector_store_session(persist_dir: str):
    """Open a Chroma store backed by *persist_dir* and persist it on exit.

    Args:
        persist_dir: Directory handed to Chroma as ``persist_directory``.

    Yields:
        The ``Chroma`` store, using ``OpenAIEmbeddings`` as embedding function.
    """
    from langchain_openai import OpenAIEmbeddings

    embedder = OpenAIEmbeddings()
    vector_db = Chroma(persist_directory=persist_dir, embedding_function=embedder)
    try:
        yield vector_db
    finally:
        # Runs whether or not the block raised, so changes made before a
        # failure are persisted as well.
        # NOTE(review): newer Chroma releases are reported to persist
        # automatically — confirm persist() is still required/available.
        vector_db.persist()
        print("Vector store persisted and closed")

# `new_documents` is assumed to be defined by the caller.
with vector_store_session("./chroma_db") as vs:
    vs.add_documents(new_documents)
    results = vs.similarity_search("What is RAG?", k=5)

4. Conversation context (thread-local state)

python
from contextlib import contextmanager
from typing import Generator
import threading

# Per-thread storage for the conversation that is currently active.
_context = threading.local()


@contextmanager
def conversation_context(
    user_id: str, session_id: str
) -> Generator[threading.local, None, None]:
    """Expose per-thread conversation state for the duration of a ``with`` block.

    On entry ``user_id``, ``session_id`` and an empty ``messages`` list are
    stored on the thread-local object. On exit all three are removed again,
    or restored to the values an enclosing ``conversation_context`` had set.

    Args:
        user_id: Identifier of the user owning the conversation.
        session_id: Identifier of the conversation session.

    Yields:
        The thread-local object carrying ``user_id``, ``session_id`` and
        ``messages``.
    """
    fields = ("user_id", "session_id", "messages")
    # Snapshot whatever an enclosing conversation_context already set, so a
    # nested use does not wipe the outer conversation when it exits.
    previous = {
        name: getattr(_context, name) for name in fields if hasattr(_context, name)
    }
    _context.user_id = user_id
    _context.session_id = session_id
    _context.messages = []
    try:
        yield _context
    finally:
        for name in fields:
            if name in previous:
                setattr(_context, name, previous[name])
            elif hasattr(_context, name):
                # Includes ``messages``: leaving it behind would leak one
                # conversation's history into later work on the same thread.
                # The hasattr guard keeps cleanup from raising (and masking
                # the block's own exception) if the body removed the field.
                delattr(_context, name)

# `ctx` is the module-level thread-local object; `messages` starts as an empty list.
with conversation_context("user_123", "session_abc") as ctx:
    ctx.messages.append({"role": "user", "content": "Hello"})

Key benefit: Context managers guarantee cleanup code runs regardless of success or failure — critical for GPU memory, file handles, and API sessions in production LLM applications.