How are conversation histories managed in AI agents?
#gen-ai #agents
Answer
How Conversation History Is Managed in AI Agents
Conversation history (memory) management is one of the most critical design challenges for AI agents. Without history, agents can't maintain context; with too much history, they hit context limits and increase costs.
The Core Challenge
Every LLM API call is stateless, so each request must include the FULL conversation history the model should see. History grows with each turn → eventually exceeds the context window → truncation (or compression) is needed.
Strategy 1: In-Memory Buffer (Simplest)
Keep last N messages:
from collections import deque


class ConversationBuffer:
    """Sliding-window memory that keeps only the most recent messages.

    The history is a ``deque`` created with ``maxlen``, so appending to a
    full buffer evicts the oldest message automatically.
    """

    def __init__(self, max_messages: int = 20):
        """Create a buffer that retains at most *max_messages* messages."""
        self.history = deque(maxlen=max_messages)

    def add(self, role: str, content: str) -> None:
        """Append one message as a ``{"role", "content"}`` dict."""
        self.history.append({"role": role, "content": content})

    def get_messages(self) -> list[dict]:
        """Return the retained messages, oldest first, as a new list."""
        return list(self.history)


# Example usage
buffer = ConversationBuffer(max_messages=10)
buffer.add("user", "What is RAG?")
buffer.add("assistant", "RAG is Retrieval-Augmented Generation...")
buffer.add("user", "Can you show an example?")
Limitation: Once the buffer is full, the oldest messages are dropped, so any context they carried is lost.
Strategy 2: Conversation Summarization
Summarize old turns to compress context:
from anthropic import Anthropic

client = Anthropic()


class SummarizingMemory:
    """Conversation memory that folds older turns into a running summary.

    Recent messages are kept verbatim in ``self.messages``. Once their
    estimated token count exceeds ``summary_threshold``, the oldest half is
    summarized by the model and replaced by ``self.summary``.
    """

    def __init__(self, max_tokens: int = 4000, summary_threshold: int = 3000):
        self.messages = []
        self.summary = ""
        # NOTE(review): max_tokens is stored but nothing in this class reads
        # it; only summary_threshold drives summarization.
        self.max_tokens = max_tokens
        self.summary_threshold = summary_threshold

    def estimate_tokens(self) -> int:
        """Roughly estimate the verbatim messages' tokens (words * 1.3)."""
        total = sum(len(m["content"].split()) * 1.3 for m in self.messages)
        return int(total)

    def maybe_summarize(self):
        """Summarize the oldest half of the messages if over the threshold."""
        if self.estimate_tokens() <= self.summary_threshold:
            return

        split_at = len(self.messages) // 2
        if split_at == 0:
            # A single oversized message: there is nothing older to fold
            # away, so do not call the model or overwrite the summary.
            return

        to_summarize = self.messages[:split_at]
        transcript = "\n".join(
            f"{m['role']}: {m['content']}" for m in to_summarize
        )
        if self.summary:
            # Carry the earlier summary forward; otherwise every new
            # summarization would forget everything summarized before it.
            prompt = (
                "Summarize this conversation concisely, preserving the key "
                "points of the earlier summary.\n\n"
                f"Earlier summary:\n{self.summary}\n\n"
                f"New messages:\n{transcript}"
            )
        else:
            prompt = f"Summarize this conversation concisely:\n{transcript}"

        response = client.messages.create(
            model="claude-opus-4-6",
            max_tokens=300,
            messages=[{"role": "user", "content": prompt}],
        )
        self.summary = response.content[0].text
        # Drop the summarized messages only after the call succeeded, so a
        # failed request does not lose history.
        self.messages = self.messages[split_at:]

    def get_context(self) -> list[dict]:
        """Return the message list to send: summary preamble + recent turns."""
        context = []
        if self.summary:
            context.append({
                "role": "user",
                "content": f"[Previous conversation summary: {self.summary}]",
            })
            context.append(
                {"role": "assistant", "content": "I understand the context."}
            )
        context.extend(self.messages)
        return context

    def add(self, role: str, content: str):
        """Record one message, then summarize if the threshold is exceeded."""
        self.messages.append({"role": role, "content": content})
        self.maybe_summarize()
Strategy 3: Vector DB Long-Term Memory
Store all history in a vector database and retrieve the most relevant past messages for each new query:
import chromadb
from sentence_transformers import SentenceTransformer


class VectorMemory:
    """Long-term memory: every message is embedded and stored in Chroma.

    Context for a new query is built from semantically similar past
    messages plus the last ``recent_window`` messages verbatim.
    """

    def __init__(self, recent_window: int = 6):
        if recent_window < 1:
            raise ValueError("recent_window must be at least 1")
        self.encoder = SentenceTransformer("all-MiniLM-L6-v2")
        self.client = chromadb.Client()
        # get_or_create avoids the error create_collection raises when the
        # collection already exists (e.g. a second instance or a re-run cell).
        # NOTE(review): instances created this way presumably share the same
        # stored history -- confirm that is acceptable for your use case.
        self.collection = self.client.get_or_create_collection(
            "conversation_history"
        )
        self.recent = []  # Last N messages always included
        self.recent_window = recent_window

    def add(self, role: str, content: str, turn_id: str):
        """Record a message in the recent list and in the vector store.

        *turn_id* is used as the Chroma document id.
        NOTE(review): ids presumably must be unique per message, not per
        user/assistant turn pair -- verify against callers.
        """
        self.recent.append({"role": role, "content": content})
        # Store in vector DB for long-term retrieval
        embedding = self.encoder.encode([content])[0].tolist()
        self.collection.add(
            embeddings=[embedding],
            documents=[content],
            metadatas=[{"role": role, "turn_id": turn_id}],
            ids=[turn_id],
        )

    def get_relevant_history(self, query: str, n: int = 3) -> list[str]:
        """Return up to *n* stored message texts most similar to *query*."""
        # Never ask for more results than the collection holds, and skip the
        # embedding + query entirely when there is nothing to retrieve.
        n_results = min(n, self.collection.count())
        if n_results < 1:
            return []
        query_embedding = self.encoder.encode([query])[0].tolist()
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
        )
        return results["documents"][0]

    def build_context(self, current_query: str) -> list[dict]:
        """Build the message list: retrieved context, then recent messages."""
        recent = self.recent[-self.recent_window:]
        # Skip retrieved texts that are already included verbatim below, so
        # the same message is not sent twice.
        recent_contents = {m["content"] for m in recent}
        relevant = [
            doc
            for doc in self.get_relevant_history(current_query)
            if doc not in recent_contents
        ]

        context = []
        if relevant:
            context.append({
                "role": "user",
                "content": f"[Relevant past context: {'; '.join(relevant)}]",
            })
            context.append({"role": "assistant", "content": "I understand."})
        context.extend(recent)  # Last `recent_window` messages always
        return context
LangChain Memory Types
# NOTE(review): these classes may be deprecated in recent LangChain releases
# -- confirm against the installed version before relying on them.
from langchain.memory import (
    ConversationBufferMemory,         # Store all messages
    ConversationSummaryMemory,        # Summarize old messages
    ConversationBufferWindowMemory,   # Keep last N messages
    ConversationSummaryBufferMemory,  # Hybrid: summary + recent
)

# NOTE: `llm` is not defined in this snippet; create a LangChain LLM / chat
# model instance before this point.

# Recommended for most production use cases
memory = ConversationSummaryBufferMemory(
    llm=llm,
    max_token_limit=2000,  # Keep recent messages up to 2000 tokens
    # Older messages are summarized automatically
)
Comparison of Strategies
| Strategy | Memory | Token Cost | Complexity | Best For |
|---|---|---|---|---|
| Buffer (last N) | Recent only | Low | Low | Short sessions |
| Summarization | Summary + recent | Medium | Medium | Long conversations |
| Vector retrieval | All history | Variable | High | Very long sessions |
| Hybrid | Summary + vector | Medium | High | Production chatbots |