AI Engineering
Evaluation
A/B Testing
MAB
Production
2026 年,AI Agent 系统已经进入精细化运营阶段。本文系统性地构建了生产级在线评估与 A/B 测试的完整技术体系,涵盖四层评估金字塔、多维度指标体系、统计显著性检验、Thompson Sampling MAB 多臂老虎机、隐式用户反馈采集、自动护栏检测和生产级 Pipeline 全链路实现。包含完整的 Python 工程代码,为 AI 工程团队提供从离线评估到生产实验的全栈实践指南。
| 维度 | 离线评估 | 在线评估 |
|---|---|---|
| 数据来源 | Golden Dataset + LLM-as-Judge | 真实用户流量 |
| 反馈延迟 | 分钟级 | 小时-天级 |
| 指标丰富度 | 准确率、BLEU、ROUGE | 转化率、留存、NPS |
| 安全风险 | 低 | 需降级/兜底策略 |
| 统计效力 | 确定性强 | 需显著性检验 |
┌─────────────────────┐
│ 在线生产评估 │ ← A/B实验、流量分桶、实时指标
└─────────────────────┘
┌─────────────────────┐
│ 离线批量评估 │ ← Golden Dataset、Regression Suite
└─────────────────────┘
┌─────────────────────┐
│ 组件级微评估 │ ← 单个Tool、单个Prompt变体
└─────────────────────┘
┌─────────────────────┐
│ 单元级快照测试 │ ← 确定性断言、Schema校验
└─────────────────────┘
与传统的软件系统不同,Agent 系统的评估需要同时关注输出质量、执行效率、安全性、成本和用户体验五个维度:
from dataclasses import dataclass, field
from typing import Any, Optional
from enum import Enum
import time
import statistics
class AgentMetricType(Enum):
    """Categories used to group agent evaluation metrics."""

    QUALITY = "quality"        # output quality
    EFFICIENCY = "efficiency"  # execution efficiency
    SAFETY = "safety"          # safety
    COST = "cost"              # cost
    EXPERIENCE = "experience"  # user experience
@dataclass
class AgentEvalMetrics:
    """Evaluation metrics recorded for a single agent invocation.

    Every field carries a default, so an instance can be created empty and
    populated as individual measurements become available.
    """

    # --- Quality ---
    task_success: bool = False
    response_coherence: float = 0.0
    tool_selection_accuracy: float = 0.0

    # --- Efficiency ---
    total_latency_ms: float = 0.0
    llm_calls_count: int = 0
    tool_calls_count: int = 0
    reasoning_steps: int = 0

    # --- Safety ---
    has_policy_violation: bool = False
    has_pii_leak: bool = False
    safety_score: float = 1.0

    # --- Cost ---
    total_cost_usd: float = 0.0
    input_tokens: int = 0
    output_tokens: int = 0

    # --- User experience ---
    user_rating: Optional[float] = None  # None until a rating is recorded
    requires_followup: bool = False
Primary Metrics(北极星指标):
Secondary Metrics(辅助诊断指标):
import hashlib
from datetime import datetime, timedelta
from dataclasses import dataclass, field
def _default_tracked_metrics() -> list[str]:
    """Build a fresh list of the metric names tracked by default."""
    return ["task_success_rate", "avg_latency_ms", "avg_cost_usd", "safety_score"]


def _default_guardrails() -> dict[str, float]:
    """Build a fresh mapping of guardrail metric name to its threshold."""
    return {
        "task_success_rate": 0.7,
        "safety_score": 0.9,
        "avg_latency_ms": 30000,
        "avg_cost_usd": 0.50,
    }


@dataclass
class ExperimentConfig:
    """Configuration of a single experiment.

    The first four fields are required; everything else has a default.
    Mutable defaults are produced by factories so that instances never
    share the same list or dict object.
    """

    # Required identification and variant list.
    experiment_id: str
    name: str
    description: str
    variants: list[str]

    # Exposure and stopping parameters.
    traffic_percentage: float = 10.0
    min_sample_size: int = 10000
    max_duration_hours: int = 72

    # start_time defaults to the moment the config object is created.
    start_time: datetime = field(default_factory=datetime.now)
    end_time: Optional[datetime] = None

    # Metric names to track, and per-metric guardrail thresholds.
    metrics_to_track: list[str] = field(default_factory=_default_tracked_metrics)
    guardrail_metrics: dict[str, float] = field(default_factory=_default_guardrails)
class TrafficAllocator:
    """Deterministic, hash-based traffic bucketing keyed on the user ID.

    The same user always lands in the same bucket, and therefore in the same
    variant, for a given experiment. Bucketing for variant assignment is
    salted with the experiment ID so that concurrent experiments do not all
    expose the identical slice of users with correlated assignments.
    """

    def __init__(self, total_buckets: int = 1000):
        """Create an allocator.

        Args:
            total_buckets: Number of hash buckets; must be positive.

        Raises:
            ValueError: If ``total_buckets`` is not positive.
        """
        if total_buckets <= 0:
            raise ValueError(f"total_buckets must be positive, got {total_buckets}")
        self.total_buckets = total_buckets

    def get_bucket(self, user_id: str, salt: str = "") -> int:
        """Map a user to a bucket in ``[0, total_buckets)``.

        Args:
            user_id: Stable user identifier.
            salt: Optional namespace (e.g. an experiment ID). With the
                default empty salt only ``user_id`` is hashed.

        Returns:
            The bucket index.
        """
        key = f"{salt}:{user_id}" if salt else user_id
        # MD5 is used purely as a uniform hash, not for security; the flag
        # keeps it usable on FIPS-restricted Python builds.
        digest = hashlib.md5(key.encode(), usedforsecurity=False).hexdigest()
        return int(digest, 16) % self.total_buckets

    def get_variant(self, user_id: str, config: "ExperimentConfig") -> str:
        """Return the variant a user sees in the given experiment.

        Users whose bucket falls outside the experiment's traffic share get
        ``"holdout"``; the rest are spread over ``config.variants`` by
        bucket index modulo the number of variants.

        Args:
            user_id: Stable user identifier.
            config: Experiment configuration; ``experiment_id``,
                ``traffic_percentage`` and ``variants`` are read.

        Returns:
            A variant name from ``config.variants``, or ``"holdout"``.

        Raises:
            ValueError: If an exposed user needs a variant but
                ``config.variants`` is empty.
        """
        bucket = self.get_bucket(user_id, salt=config.experiment_id)
        effective_buckets = int(self.total_buckets * (config.traffic_percentage / 100.0))
        if bucket >= effective_buckets:
            return "holdout"
        if not config.variants:
            raise ValueError(
                f"experiment {config.experiment_id!r} has no variants to assign"
            )
        return config.variants[bucket % len(config.variants)]
from scipy import stats
import math
class StatisticalSignificance:
    """Statistical significance testing helpers.

    This section provides the pooled two-proportion z-test for rate-style
    metrics (e.g. task success rate).
    """

    # Not consulted by test_proportion; callers may use it as a lower bound
    # on per-arm sample size before trusting a result.
    MIN_SAMPLE_SIZE = 100

    @staticmethod
    def test_proportion(
        control_success: int, control_total: int,
        treatment_success: int, treatment_total: int,
        alpha: float = 0.05,
    ) -> dict:
        """Run a two-sided pooled z-test comparing two success rates.

        Args:
            control_success: Successes observed in the control arm.
            control_total: Samples observed in the control arm.
            treatment_success: Successes observed in the treatment arm.
            treatment_total: Samples observed in the treatment arm.
            alpha: Significance level; the result is significant when
                ``p_value < alpha``.

        Returns:
            A dict that always contains ``p_value``, ``z_score``,
            ``significant`` and ``lift`` (relative change of the treatment
            rate over the control rate, in percent). All values are plain
            Python ``float``/``bool``. When either arm is empty, or the
            pooled rate is exactly 0 or 1, no test is possible and a
            non-significant result with ``p_value == 1.0`` is returned.

        Raises:
            ValueError: If a count is negative or successes exceed totals.
        """
        for successes, total in (
            (control_success, control_total),
            (treatment_success, treatment_total),
        ):
            if successes < 0 or total < 0 or successes > total:
                raise ValueError(
                    f"invalid counts: successes={successes}, total={total}"
                )

        inconclusive = {
            "p_value": 1.0,
            "z_score": 0.0,
            "significant": False,
            "lift": 0.0,
        }
        # An arm without data cannot be compared (and would divide by zero).
        if control_total == 0 or treatment_total == 0:
            return inconclusive

        p1 = control_success / control_total
        p2 = treatment_success / treatment_total
        p_pool = (control_success + treatment_success) / (control_total + treatment_total)
        se = math.sqrt(p_pool * (1 - p_pool) * (1 / control_total + 1 / treatment_total))
        # se is zero only when both arms are all-failure or all-success, in
        # which case the two rates are identical.
        if se == 0:
            return inconclusive

        z_score = (p2 - p1) / se
        # The survival function keeps precision for large |z|, where
        # 1 - cdf would underflow to exactly 0.
        p_value = float(2 * stats.norm.sf(abs(z_score)))
        lift = ((p2 - p1) / p1 * 100) if p1 > 0 else float('inf')
        return {
            "p_value": round(p_value, 6),
            "z_score": round(z_score, 4),
            "significant": bool(p_value < alpha),
            "lift": round(lift, 2),
        }
传统A/B测试在统计显著性达到前浪费了大量流量在低效变体上。Thompson Sampling 算法能够动态分配更多流量给表现更好的变体,节省30-50%样本量。
import numpy as np
class ThompsonSamplingMAB:
    """Thompson Sampling multi-armed bandit over Bernoulli rewards.

    Each variant keeps a Beta(alpha, beta) posterior over its success rate.
    Selection draws one sample per variant and picks the largest, so
    variants with better observed outcomes are chosen more often while
    uncertain ones still get explored.
    """

    def __init__(self, variants: list[str], alpha: float = 1.0, beta: float = 1.0):
        """Initialise every variant with the same Beta prior.

        Args:
            variants: Variant names (arms).
            alpha: Prior alpha parameter; must be positive.
            beta: Prior beta parameter; must be positive.

        Raises:
            ValueError: If ``alpha`` or ``beta`` is not positive.
        """
        if alpha <= 0 or beta <= 0:
            raise ValueError("alpha and beta priors must be positive")
        self.variants = variants
        self.alpha = {v: alpha for v in variants}
        self.beta = {v: beta for v in variants}

    def select_variant(self) -> str:
        """Draw one posterior sample per variant and return the best one.

        Raises:
            ValueError: If no variants are configured.
        """
        if not self.alpha:
            raise ValueError("no variants configured")
        samples = {v: np.random.beta(self.alpha[v], self.beta[v]) for v in self.alpha}
        return max(samples, key=samples.get)

    def update(self, variant: str, success: bool):
        """Record one observed outcome for ``variant``.

        A success increments the variant's alpha, a failure its beta.

        Raises:
            KeyError: If ``variant`` is not a configured variant.
        """
        if success:
            self.alpha[variant] += 1
        else:
            self.beta[variant] += 1

    def get_win_probability(self, n_simulations: int = 10000) -> dict[str, float]:
        """Estimate, by Monte Carlo, the probability each variant is best.

        Args:
            n_simulations: Number of joint posterior draws; must be positive.

        Returns:
            Mapping of variant name to the fraction of simulations in which
            its sample was the largest. Empty when no variants exist.

        Raises:
            ValueError: If ``n_simulations`` is not positive.
        """
        if n_simulations <= 0:
            raise ValueError("n_simulations must be positive")
        arms = list(self.alpha)
        if not arms:
            return {}
        alphas = np.array([self.alpha[v] for v in arms], dtype=float)
        betas = np.array([self.beta[v] for v in arms], dtype=float)
        # One vectorised draw of shape (n_simulations, n_arms) replaces a
        # Python-level loop of scalar draws.
        draws = np.random.beta(alphas, betas, size=(n_simulations, len(arms)))
        win_counts = np.bincount(draws.argmax(axis=1), minlength=len(arms))
        return {v: float(count) / n_simulations for v, count in zip(arms, win_counts)}
| 维度 | 传统A/B测试 | Thompson Sampling MAB |
|---|---|---|
| 流量分配 | 固定50/50 | 动态分配 |
| 收敛速度 | 需要完整统计量 | 快30-50% |
| 探索代价 | 高(浪费50%流量在差变体) | 低(自动减少差变体流量) |
| 实施复杂度 | 低 | 中 |
| 适用场景 | 一次性决策(上线/不上线) | 持续优化场景 |
生产评估Pipeline 包含自动护栏机制,当某个变体的关键指标低于阈值时自动告警并建议停止实验:
class ExperimentManager:
    """Experiment lifecycle manager.

    - Manages multiple parallel A/B experiments.
    - Monitors metrics and raises guardrail alerts.

    NOTE(review): ``self._experiments`` and ``self.generate_report`` are used
    below but defined outside this excerpt; ``_experiments`` is presumably a
    mapping of experiment ID to its config, and ``generate_report`` a
    mapping of variant name to a report object — confirm against the rest
    of the class.
    """

    # Guardrail metrics whose threshold is an upper limit (higher is worse).
    # Every other guardrail metric is treated as a lower limit.
    UPPER_BOUND_METRICS = frozenset({"avg_latency_ms", "avg_cost_usd"})

    def check_guardrails(self, experiment_id: str) -> list[str]:
        """Check every variant of an experiment against its guardrails.

        Metrics listed in ``UPPER_BOUND_METRICS`` alert when the current
        value exceeds the threshold; all other metrics alert when the
        current value falls below it. A metric that is missing from a
        report, or whose value is ``None``, is skipped.

        Args:
            experiment_id: ID of the experiment to check.

        Returns:
            One human-readable alert string per breached guardrail; an
            empty list when the experiment is unknown or nothing breached.
        """
        config = self._experiments.get(experiment_id)
        if not config:
            return []

        reports = self.generate_report(experiment_id)
        alerts = []
        for variant_name, report in reports.items():
            for metric, threshold in config.guardrail_metrics.items():
                current_value = getattr(report, metric, None)
                if current_value is None:
                    continue
                if metric in self.UPPER_BOUND_METRICS:
                    # Latency/cost style metrics: breaching means going over.
                    breached = current_value > threshold
                    direction = "高于阈值"
                else:
                    breached = current_value < threshold
                    direction = "低于阈值"
                if breached:
                    alerts.append(
                        f"[{experiment_id}/{variant_name}] {metric} = {current_value:.4f} "
                        f"{direction} {threshold}"
                    )
        return alerts
显式用户评分(点赞/踩)通常覆盖率不足5%。隐式反馈提供了更丰富的信号源:
@dataclass
class ImplicitFeedbackSignal:
    """Implicit user-behaviour signals, split into positive and negative."""

    user_id: str
    session_id: str

    # Positive signals
    copied_response: bool = False
    continued_conversation: bool = False
    clicked_link: bool = False
    saved_as_reference: bool = False

    # Negative signals
    regenerated: bool = False
    abandoned_session: bool = False
    manually_corrected: bool = False
    switched_to_human: bool = False

    @property
    def satisfaction_score(self) -> float:
        """Collapse the signals into a satisfaction score.

        The score is the share of positive signals among all signals that
        are set. When no signal is set at all, 0.5 is returned.
        """
        upvotes = sum((
            self.copied_response,
            self.continued_conversation,
            self.clicked_link,
            self.saved_as_reference,
        ))
        downvotes = sum((
            self.regenerated,
            self.abandoned_session,
            self.manually_corrected,
            self.switched_to_human,
        ))
        signal_count = upvotes + downvotes
        return upvotes / signal_count if signal_count else 0.5
| 指标 | 显著改善 | 无显著变化 | 显著恶化 |
|---|---|---|---|
| Task Success | ✅ 采纳 | 继续观察 | ❌ 回滚 |
| Latency | ✅ 采纳 | 可接受 | 需trade-off |
| Cost | ✅ 采纳 | 可接受 | ⚠ 结合成功率 |
| Safety | ✅ 采纳 | ✅ 采纳 | ❌ 立即回滚 |
| User Rating | ✅ 采纳 | 继续观察 | ❌ 回滚 |
场景: 电商客服Agent优化回答模板
基线 (control): 当前使用的标准客服Prompt v2.3
实验组 (treatment-a): 加入情感分析+个性化推荐的Prompt v3.0
实验组 (treatment-b): 简化版Prompt v3.0-lite(去掉个性化)

实验结果(24h, N=15,000):

| 指标 | control | treatment-a | treatment-b |
|---|---|---|---|
| Task Success | 82.1% | 89.4% ▲* | 85.2% ▲ |
| Avg Latency | 2.1s | 2.8s ▲* | 1.9s ▼ |
| P95 Latency | 5.8s | 8.9s ▲* | 4.7s ▼ |
| Avg Cost | $0.08 | $0.15 ▲* | $0.07 ▼ |
| User Satisfaction | 4.1 | 4.5 ▲* | 4.2 |
| Regeneration Rate | 12.3% | 7.1% ▼* | 10.2% ▼ |

(* = p < 0.05 统计显著)
结论: treatment-a 虽然延迟和成本有所增加,但任务成功率和用户满意度显著提升。在电商客服场景中,客户满意度优先级高于成本,因此采纳 treatment-a。同时将 treatment-b 作为低成本备选方案在非核心时段使用。
本文系统性地构建了AI Agent生产级在线评估与A/B测试的完整技术栈:
对于AI工程团队来说,建议从以下三个方面逐步落地:
记住:没有测量就没有优化。建立可靠的评估体系是AI工程化的关键里程碑。