Input/output filtering systems that enforce safety, policy compliance, and quality constraints on LLM applications — including NVIDIA NeMo Guardrails, Meta's Llama Guard, and Rebuff for prompt injection defense.
LLMs are trained to be helpful and will attempt to answer almost any question. Without guardrails, production applications will inevitably: generate off-brand content, answer questions outside the intended scope, produce confident but factually wrong answers, or be manipulated by adversarial users into bypassing safety measures.
Guardrails are the defensive layer between user input and LLM output. They operate at three points: input validation (reject or modify harmful/off-topic inputs before the LLM sees them), output validation (review and reject/modify LLM outputs before showing to users), and dialogue management (enforce conversation flow rules, e.g., "always recommend consulting a doctor for medical questions").
Guardrails are not a substitute for safety training in the underlying model — they're a complementary layer. A well-trained model with guardrails is safer than either alone. And guardrails let you enforce application-specific policies that weren't part of the model's training.
Rule-based filters: regex, keyword lists, length limits. Fast and deterministic. Fragile to variations ("H3LP M3" bypasses "HELP ME" keyword filter). Good for obvious violations.
Classifier-based guardrails: a separate ML model (often a fine-tuned LLM) that classifies inputs/outputs. Examples: Llama Guard (safe/unsafe categories), Perspective API (toxicity scoring), OpenAI moderation endpoint. More robust than rules but adds latency.
LLM-as-judge guardrails: use a secondary LLM to evaluate the primary LLM's output for quality, factual grounding, or policy compliance. Most flexible but highest latency and cost.
Dialogue flow guardrails: enforce conversation structure and topic boundaries. NeMo Guardrails is the primary tool here — it lets you define what topics the bot can and can't discuss, required disclaimers, and off-topic deflection behaviour.
Structured output validation: validate that LLM outputs conform to a schema (correct JSON format, required fields present, values in valid ranges). Tools: Pydantic, Outlines, Instructor.
pip install nemoguardrails
# NeMo Guardrails uses Colang — a dialogue flow language
# Create config/config.yml and config/main.co
# config/config.yml:
# models:
# - type: main
# engine: anthropic
# model: claude-haiku-4-5-20251001
# config/main.co (Colang flow definitions):
# Colang flow definitions (config/main.co). Colang is whitespace-sensitive
# like Python: the lines under each `define` must be indented.
COLANG_CONTENT = '''
define user ask about medical advice
  "What medication should I take?"
  "Do I have cancer?"
  "Is this symptom serious?"

define bot refuse medical advice
  "I'm not able to provide medical advice. Please consult a qualified healthcare professional."

define flow medical advice
  user ask about medical advice
  bot refuse medical advice

define user ask off topic
  "What's the weather?"
  "Tell me a joke"
  "Write me a poem"

define bot stay on topic
  "I'm a customer service assistant for Acme Corp. I can only help with questions about our products."

define flow stay on topic
  user ask off topic
  bot stay on topic
'''

# Model settings (config/config.yml). RailsConfig.from_path expects this
# file in the config directory — without it the load fails.
YAML_CONTENT = '''\
models:
  - type: main
    engine: anthropic
    model: claude-haiku-4-5-20251001
'''

import os

os.makedirs("config", exist_ok=True)
with open("config/main.co", "w") as f:
    f.write(COLANG_CONTENT)
with open("config/config.yml", "w") as f:
    f.write(YAML_CONTENT)

from nemoguardrails import RailsConfig, LLMRails

config = RailsConfig.from_path("./config")
rails = LLMRails(config)

response = rails.generate(
    messages=[{"role": "user", "content": "What medication should I take for my fever?"}]
)
print(response["content"])
# → "I'm not able to provide medical advice. Please consult..."
# Llama Guard: Meta's safety classifier fine-tuned on safety categories
# Classifies inputs/outputs as safe or unsafe with category labels
# Via HuggingFace (local)
# Load Llama Guard 3 (8B) locally via HuggingFace Transformers.
# NOTE(review): this is a gated model — requires accepting Meta's license
# on the Hub and an authenticated `huggingface-cli login`; confirm access.
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_id = "meta-llama/Llama-Guard-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# bfloat16 halves memory vs float32; device_map="auto" lets accelerate
# place layers across available GPUs / CPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
def check_safety(user_message: str, assistant_response: str | None = None) -> dict:
    """Classify a conversation turn with Llama Guard.

    Args:
        user_message: the user input to classify.
        assistant_response: optional model reply; when given, Llama Guard
            judges the assistant's output rather than just the input.

    Returns:
        dict with "safe" (True iff the verdict starts with "safe") and
        "raw" (the stripped classifier output; unsafe verdicts include
        category codes such as "S2" on the following line).
    """
    # Llama Guard takes a chat transcript: a lone user turn checks the
    # input; a user+assistant pair checks the model's output.
    conversation = [{"role": "user", "content": user_message}]
    if assistant_response:
        conversation.append({"role": "assistant", "content": assistant_response})
    input_ids = tokenizer.apply_chat_template(
        conversation, return_tensors="pt"
    ).to(model.device)
    output = model.generate(input_ids=input_ids, max_new_tokens=100)
    # Decode only the newly generated tokens (slice off the prompt).
    verdict = tokenizer.decode(
        output[0][input_ids.shape[-1]:], skip_special_tokens=True
    ).strip()
    return {
        "safe": verdict.startswith("safe"),
        "raw": verdict,
    }
# Check user input (the raw verdict for unsafe inputs is the word "unsafe",
# a newline, then the category code — e.g. "unsafe\nS2", S2 = Violent crimes)
result = check_safety("How do I make explosives?")
print(result)  # {"safe": False, "raw": "unsafe\nS2"}

# Check model output: pass both turns so Llama Guard judges the reply
result = check_safety(
    user_message="Tell me about historical chemistry",
    assistant_response="Here's how to synthesise dangerous compounds..."
)
print(result)  # {"safe": False, "raw": "unsafe\nS2"}
from typing import Callable
import re
class GuardrailPipeline:
    """Composable guardrail wrapper around an LLM callable.

    A validator is a callable taking a string and returning ``None`` to let
    the text pass unchanged, or a replacement string. For input validators
    a non-None return short-circuits the pipeline (the LLM is never called);
    for output validators it rewrites the response in place.
    """

    def __init__(self, llm_fn: Callable):
        self.llm_fn = llm_fn
        self.input_validators = []
        self.output_validators = []

    def add_input_validator(self, validator: Callable) -> "GuardrailPipeline":
        """Register an input validator. Returns self to allow chaining."""
        self.input_validators.append(validator)
        return self

    def add_output_validator(self, validator: Callable) -> "GuardrailPipeline":
        """Register an output validator. Returns self to allow chaining."""
        self.output_validators.append(validator)
        return self

    def __call__(self, user_input: str) -> str:
        # Input stage: first validator to object wins and blocks the call.
        for check in self.input_validators:
            verdict = check(user_input)
            if verdict is not None:
                return verdict

        text = self.llm_fn(user_input)

        # Output stage: validators run in order, each may rewrite the text.
        for check in self.output_validators:
            verdict = check(text)
            if verdict is not None:
                text = verdict
        return text
# Example validators
def block_pii(text: str):
    """Input validator: refuse any request containing an SSN-shaped number."""
    ssn_pattern = r'\d{3}-\d{2}-\d{4}'
    if re.search(ssn_pattern, text) is None:
        return None
    return "I cannot process requests containing social security numbers."
def enforce_length(max_chars: int):
    """Build an output validator that truncates responses over max_chars.

    The returned validator yields the truncated text plus an ellipsis when
    the response is too long, or None to pass short responses unchanged.
    """
    def validator(text: str):
        if len(text) > max_chars:
            return text[:max_chars] + "..."
        return None
    return validator
def require_disclaimer(text: str):
    """Output validator: append a financial disclaimer when one is missing.

    Returns the amended text when the response discusses investments without
    a "not financial advice" disclaimer; returns None (no change) otherwise.
    """
    if "investment" in text.lower() and "not financial advice" not in text.lower():
        # The original literal was split across source lines (a syntax
        # error); the disclaimer belongs on its own paragraph.
        return text + "\n\n*This is not financial advice. Consult a qualified advisor.*"
    return None
# Assemble a pipeline: PII check on input; disclaimer then truncation on
# output (order matters — truncation runs last so the disclaimer counts
# toward the 2000-char budget).
# NOTE(review): `my_llm` is a placeholder — supply a callable that takes a
# prompt string and returns a response string.
pipeline = (GuardrailPipeline(my_llm)
    .add_input_validator(block_pii)
    .add_output_validator(require_disclaimer)
    .add_output_validator(enforce_length(2000)))
Production guardrail systems layer multiple techniques at different points:
import anthropic
client = anthropic.Anthropic()
def safe_generate(user_input: str, session_context: dict) -> str:
    """Generate a guarded response: cheap checks first, expensive ones later.

    Layers run in ascending latency order so obvious violations never pay
    for a classifier or an LLM call.

    Relies on helpers defined elsewhere in this file/project:
    contains_blocked_keywords, check_safety, is_off_topic,
    has_appropriate_disclaimer, and the module-level `client`.
    """
    # Layer 1: fast rule-based input check (microseconds)
    if contains_blocked_keywords(user_input):
        return "I can't help with that."
    # Layer 2: length / format checks (fast)
    if len(user_input) > 4000:
        return "Your message is too long. Please keep it under 4000 characters."
    # Layer 3: Llama Guard classification (50–200ms)
    safety = check_safety(user_input)
    if not safety["safe"]:
        return "I'm unable to assist with that request."
    # Layer 4: topic classification (100–300ms, or use Llama Guard)
    if is_off_topic(user_input, allowed_topics=["product support", "billing"]):
        return "I'm only able to help with product support and billing questions."
    # Generate response
    response = client.messages.create(
        model="claude-haiku-4-5-20251001", max_tokens=1024,
        system="You are a customer service agent for Acme Corp.",
        messages=[{"role": "user", "content": user_input}]
    ).content[0].text
    # Layer 5: output validation (LLM-as-judge for high-stakes responses)
    if session_context.get("is_medical_domain"):
        if not has_appropriate_disclaimer(response):
            # Disclaimer on its own paragraph (original literal was split
            # across source lines — a syntax error).
            response += "\n\n*Please consult a qualified healthcare professional.*"
    return response
Guardrails add latency. Each validation layer adds time. Llama Guard takes 50–200ms; an LLM-as-judge validator might take 1–2 seconds. For interactive applications, run validators asynchronously where possible, or use fast classifiers for initial filtering and expensive ones only for suspicious cases. Profile your guardrail pipeline before going to production.
Adversarial users probe for gaps. A determined user will find prompts that bypass your guardrails. This is especially true for rule-based filters (easy to rephrase around). Assume bypass is possible and add monitoring to detect unusual patterns. Log blocked attempts and use them to improve classifiers.
Overly aggressive guardrails harm legitimate users. A guardrail that blocks all mentions of "medication" to prevent medical advice will also block legitimate questions like "Can I take this product with medication?". Tune thresholds carefully and measure false positive rates, not just false negatives. A guardrail that blocks 30% of legitimate queries is not a safety feature — it's a product bug.
Enterprise guardrails deployments layer multiple validation mechanisms to achieve defence-in-depth. No single guardrail catches everything — adversarial users iterate on inputs, and classifier models have inherent false-negative rates. The industry standard is a four-layer stack: input filtering, prompt construction controls, output validation, and post-delivery audit logging.
| Layer | What It Catches | Latency Cost | Tools |
|---|---|---|---|
| Input filter | Obvious jailbreaks, PII in prompt | 10–50ms | Llama Guard, keyword filters, regex |
| Prompt controls | Instruction injection, role confusion | 0ms | Structured templates, privilege separation |
| Output validator | Policy violations, schema errors, hallucinations | 50–300ms | NeMo Guardrails, Guardrails AI, custom |
| Audit log | Post-hoc pattern detection, compliance | Async (0ms) | LangSmith, custom logging pipeline |
import anthropic
import re
client = anthropic.Anthropic()
# Regexes for common PII shapes. Kept deliberately simple — they trade
# precision for speed and are meant as a first-pass filter, not a full
# PII-detection system.
PII_PATTERNS = [
    r'\d{3}-\d{2}-\d{4}',  # SSN
    r'\d{4}[\s-]\d{4}[\s-]\d{4}[\s-]\d{4}',  # credit card
    # Fixed: the TLD class was [A-Z|a-z], which wrongly matched a literal
    # '|' inside the character class.
    r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}',  # email
]
def input_guard(text: str) -> tuple[bool, str]:
    """Check input against the PII regexes.

    Returns (True, "") when clean, or (False, reason) when any pattern hits.
    """
    flagged = any(re.search(pattern, text) for pattern in PII_PATTERNS)
    if flagged:
        return False, "Input contains sensitive personal information."
    return True, ""
def output_guard(text: str) -> tuple[bool, str]:
    """Scan model output for policy-violating phrases (case-insensitive).

    Returns (True, "") when clean, or (False, reason) when a banned phrase
    appears anywhere in the text.
    """
    lowered = text.lower()
    banned = ("step-by-step instructions", "how to make", "bypass security")
    if any(phrase in lowered for phrase in banned):
        return False, "Response blocked by output policy."
    return True, ""
def safe_completion(user_input: str) -> str:
    """Guarded completion: input filter → LLM call → output filter.

    Returns either the model's text or a human-readable "blocked" message.
    Relies on the module-level `client` and the input_guard / output_guard
    helpers defined above.
    """
    ok, reason = input_guard(user_input)
    if not ok:
        # Reject before spending tokens on a request we'd refuse anyway.
        return f"Request blocked: {reason}"
    response = client.messages.create(
        model="claude-3-5-haiku-20241022",
        max_tokens=1024,
        messages=[{"role": "user", "content": user_input}]
    )
    output = response.content[0].text
    ok, reason = output_guard(output)
    if not ok:
        return f"Response blocked: {reason}"
    return output
Run your guardrail stack in shadow mode first — log blocks without actually blocking — for at least one week in production. This gives you false-positive rate data before enabling hard blocks. A false-positive rate above 2% on benign traffic will cause user-facing friction that outweighs security benefits.