Concept #31 · Medium · python-for-gen-ai

How would you implement exponential backoff for API retries?

#gen-ai #python

Answer

Exponential Backoff for API Retries

Exponential backoff retries a failed request after increasingly long delays, giving a recovering service room to breathe. Adding random jitter to each delay spreads the retries out, preventing thundering herd problems where many clients retry simultaneously and overwhelm the service.

Core Algorithm

text
Wait time = min(base_delay × (backoff_factor ^ attempt), max_delay), randomized with jitter

Implementation

python
import logging
import random
import time
from functools import wraps
from typing import Type

logger = logging.getLogger(__name__)


def exponential_backoff(
    max_attempts: int = 5,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    backoff_factor: float = 2.0,
    jitter: bool = True,
    retryable_exceptions: tuple[Type[Exception], ...] = (Exception,),
):
    '''Build a decorator that retries a function with exponential backoff.

    The wrapped function is called up to ``max_attempts`` times. After each
    failure with one of ``retryable_exceptions`` the wrapper sleeps for
    ``base_delay * backoff_factor ** attempt`` seconds (attempt counted from
    0), never longer than ``max_delay``, and then tries again.

    Args:
        max_attempts: Total number of calls, including the first one.
        base_delay: Delay in seconds before the first retry.
        max_delay: Upper bound in seconds for any single sleep, jitter
            included.
        backoff_factor: Multiplier applied to the delay for each further
            attempt.
        jitter: When true, scale each delay by a random factor in
            [0.5, 1.5) so that concurrent clients do not retry in lockstep.
        retryable_exceptions: Exception types that trigger a retry; anything
            else propagates immediately.

    Returns:
        A decorator; the decorated function returns whatever the wrapped
        function returns on its first successful call.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1 (the wrapped
            function would otherwise never be called).
    '''
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be at least 1, got {max_attempts}")

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_attempts):
                try:
                    return func(*args, **kwargs)
                except retryable_exceptions as e:
                    if attempt == max_attempts - 1:
                        # Out of attempts: surface the last error unchanged.
                        logger.error(
                            "%s failed after %d attempts: %s",
                            func.__name__, max_attempts, e,
                        )
                        raise

                    delay = min(base_delay * (backoff_factor ** attempt), max_delay)
                    if jitter:
                        # Jitter may scale the delay up by as much as 1.5x, so
                        # the cap is applied again to keep max_delay a hard
                        # upper bound on the sleep.
                        delay = min(delay * (0.5 + random.random()), max_delay)

                    logger.warning(
                        "%s attempt %d/%d failed: %s. Retrying in %.2fs...",
                        func.__name__, attempt + 1, max_attempts, e, delay,
                    )
                    time.sleep(delay)
        return wrapper
    return decorator

OpenAI-Specific Retry

python
from openai import RateLimitError, APIConnectionError, InternalServerError, OpenAI

# Shared client used by call_openai below.
# NOTE(review): the OpenAI SDK client may perform its own internal retries;
# confirm its max_retries setting so they do not compound with the decorator.
client = OpenAI()

@exponential_backoff(
    retryable_exceptions=(RateLimitError, APIConnectionError, InternalServerError),
    jitter=True,
    max_delay=60.0,
    base_delay=1.0,
    max_attempts=5,
)
def call_openai(messages: list[dict], model: str = "gpt-4o") -> str:
    '''Send chat messages to the model and return the first choice's text.

    Rate-limit, connection and internal-server errors are retried by the
    exponential_backoff decorator (up to 5 attempts in total).
    '''
    completion = client.chat.completions.create(
        messages=messages,
        model=model,
        timeout=30,  # per-request timeout
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Usage: the call goes through the exponential_backoff decorator, so the
# exception types listed in its retryable_exceptions are retried automatically.
# Note that this statement runs at module level, i.e. as soon as the code is executed.
answer = call_openai([{"role": "user", "content": "What is RAG?"}])

Using tenacity Library (Production-Grade)

python
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
    before_sleep_log,
    after_log,
)
from openai import RateLimitError, APIConnectionError
import logging

logger = logging.getLogger(__name__)

# Retry policy: up to 5 attempts, exponential wait bounded to 1-60 seconds,
# only for rate-limit and connection errors; each retry is logged.
# NOTE(review): without reraise=True, tenacity is expected to raise its own
# RetryError once attempts are exhausted - confirm this is what callers want.
@retry(
    retry=retry_if_exception_type((RateLimitError, APIConnectionError)),
    wait=wait_exponential(multiplier=1, min=1, max=60),
    stop=stop_after_attempt(5),
    after=after_log(logger, logging.INFO),
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def resilient_llm_call(prompt: str) -> str:
    '''Send a single user prompt to gpt-4o and return the first choice's text.

    A new OpenAI client is created on every call (and therefore on every
    retry attempt).
    '''
    from openai import OpenAI

    llm = OpenAI()
    completion = llm.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="gpt-4o",
    )
    return completion.choices[0].message.content

Retry with Rate Limit Header

OpenAI returns a `Retry-After` header on 429 errors. Respect it:

python
import httpx
from openai import RateLimitError

def _retry_after_seconds(error):
    '''Return the server-requested wait in seconds, or None if unavailable.

    Looks for a ``retry_after`` attribute on the exception first and then for
    a ``Retry-After`` header on the attached HTTP response. Values that are
    missing, non-numeric (e.g. the HTTP-date form), non-positive or infinite
    yield None so the caller can fall back to its own backoff.
    '''
    raw = getattr(error, 'retry_after', None)
    if raw is None:
        response = getattr(error, 'response', None)
        headers = getattr(response, 'headers', None)
        if headers is not None:
            raw = headers.get('retry-after')
    try:
        seconds = float(raw)
    except (TypeError, ValueError):
        return None
    return seconds if 0 < seconds < float('inf') else None


def smart_retry(prompt: str, max_attempts: int = 5) -> str:
    '''Call gpt-4o with a single user prompt, retrying on rate limits.

    When the rate-limit error carries a usable Retry-After value, that wait
    is honoured; otherwise the wait is ``2 ** attempt`` seconds plus up to one
    second of random jitter.

    Args:
        prompt: Text sent as the only user message.
        max_attempts: Total number of calls, including the first one.

    Returns:
        The message content of the first choice in the response.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        RateLimitError: If the final attempt is still rate limited.
    '''
    # Validate first so a bad argument fails fast, before any client is built.
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be at least 1, got {max_attempts}")

    from openai import OpenAI
    client = OpenAI()

    for attempt in range(max_attempts):
        try:
            return client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}]
            ).choices[0].message.content

        except RateLimitError as e:
            if attempt == max_attempts - 1:
                # No attempts left: propagate the rate-limit error as-is.
                raise

            sleep_time = _retry_after_seconds(e)
            if sleep_time is None:
                sleep_time = (2 ** attempt) + random.uniform(0, 1)

            logger.warning(f"Rate limited. Waiting {sleep_time:.1f}s...")
            time.sleep(sleep_time)

Backoff Delay Schedule

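| Attempt | Base delay | Backoff factor | Max delay | Approx wait |
|---------|------------|----------------|-----------|-------------|
| 1       | 1s         | 2              | 60s       | ~1s         |
| 2       | 2s         | 2              | 60s       | ~2s         |
| 3       | 4s         | 2              | 60s       | ~4s         |
| 4       | 8s         | 2              | 60s       | ~8s         |
| 5       | 16s        | 2              | 60s       | ~16s        |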

Production best practice: Combine exponential backoff with a circuit breaker — if an API fails 10 times in 60 seconds, stop sending requests for 5 minutes to let the service recover. The `pybreaker` library implements this pattern.