Concept #52Hardadvanced-topics

Design a safety filtering system for an LLM chatbot.

#gen-ai#safety#system-design

Answer

Safety Filtering System for an LLM Chatbot

A safety system operates at multiple layers: input filtering (screen user messages), output filtering (screen LLM responses), and content policies (define what's allowed).

Architecture

Layer 1: OpenAI Moderation API

python
from openai import OpenAI

client = OpenAI()

def moderate_content(text: str) -> dict:
    '''Use OpenAI's moderation API (free).'''
    response = client.moderations.create(input=text)
    result = response.results[0]

    return {
        "flagged": result.flagged,
        "categories": {
            cat: score
            for cat, score in result.category_scores.__dict__.items()
            if score > 0.1
        },
        "blocked_categories": [
            cat for cat, flagged in result.categories.__dict__.items() if flagged
        ],
    }

def safe_completion(user_message: str) -> str:
    # Check input
    moderation = moderate_content(user_message)
    if moderation["flagged"]:
        categories = ", ".join(moderation["blocked_categories"])
        return f"I cannot respond to that request. (Reason: {categories})"

    # Generate response
    response = call_llm(user_message)

    # Check output
    output_check = moderate_content(response)
    if output_check["flagged"]:
        return "I'm sorry, I generated an inappropriate response. Please rephrase your question."

    return response

Layer 2: Rule-Based Keyword Filter

python
import re
from enum import Enum

class FilterAction(str, Enum):
    ALLOW = "allow"
    BLOCK = "block"
    WARN = "warn"

# Categorised blocklists
BLOCKED_PATTERNS = {
    "pii_extraction": [
        r"\b(?:social security|SSN|passport)\s*(?:number)?",
        r"credit card\s*(?:number)?",
    ],
    "jailbreak": [
        r"ignore (all |previous |above )?instructions",
        r"you are now (?:DAN|an AI with no restrictions)",
        r"pretend you have no (guidelines|restrictions|filters)",
    ],
    "harmful": [
        r"\bhow to (make|build|create|synthesize)\s+(bomb|weapon|explosive)",
    ],
}

def rule_based_filter(text: str) -> dict:
    text_lower = text.lower()
    violations = []

    for category, patterns in BLOCKED_PATTERNS.items():
        for pattern in patterns:
            if re.search(pattern, text_lower, re.IGNORECASE):
                violations.append({"category": category, "pattern": pattern})

    return {
        "action": FilterAction.BLOCK if violations else FilterAction.ALLOW,
        "violations": violations,
    }

Layer 3: LLM-Based Safety Judge

python
def llm_safety_check(user_input: str, bot_context: str) -> dict:
    '''Use GPT-4o-mini as a safety classifier for edge cases.'''
    import json

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content":
            f'''You are a content safety classifier for a {bot_context}.
Classify this user message:

Message: "{user_input}"

Respond as JSON:
{{"safe": true/false, "risk_level": "low/medium/high",
  "reason": "brief explanation", "suggested_action": "allow/warn/block"}}'''
        }],
        response_format={"type": "json_object"},
        temperature=0,
    )
    return json.loads(response.choices[0].message.content)

Complete Safety Pipeline

python
import logging
logger = logging.getLogger(__name__)

async def safe_chat_response(user_input: str, user_id: str) -> str:
    # Layer 1: Fast rule-based check
    rule_result = rule_based_filter(user_input)
    if rule_result["action"] == FilterAction.BLOCK:
        logger.warning("input_blocked_rule", user_id=user_id,
                       violations=rule_result["violations"])
        return "I can't help with that request."

    # Layer 2: OpenAI moderation (fast, free)
    mod_result = moderate_content(user_input)
    if mod_result["flagged"]:
        logger.warning("input_blocked_moderation", user_id=user_id,
                       categories=mod_result["blocked_categories"])
        return "That request violates our content policy."

    # Generate response
    answer = call_rag_pipeline(user_input)

    # Layer 3: Output check
    output_mod = moderate_content(answer)
    if output_mod["flagged"]:
        logger.error("output_filtered", user_id=user_id)
        return "I apologise, I can't provide that information."

    return answer

Safety Policy Dimensions

DimensionImplementation
Harmful contentOpenAI moderation + keyword filter
PII exposureRegex pattern matching
Prompt injectionJailbreak pattern detection
Scope enforcementTopic classifier
Rate limitingToken bucket per user
Audit loggingLog all flagged attempts
Human reviewFlag edge cases for review queue

Layered defence: No single filter catches everything. Combine fast rule-based filters (microseconds) with ML-based classifiers (milliseconds) and LLM-as-judge (seconds) for robust coverage at reasonable cost.