"""Agent 价值对齐与意图一致性完整实现 (complete implementation of agent value alignment and intent consistency)."""
import numpy as np
from typing import Dict, List, Any, Optional, Tuple, Set
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
import math
import random
from collections import defaultdict
import copy
class AlignmentMethod(Enum):
    """Value-alignment methods supported by the framework."""

    RLHF = "rlhf"                      # reinforcement learning from human feedback
    CONSTITUTIONAL = "constitutional"  # constitutional AI
    IRL = "irl"                        # inverse reinforcement learning
    CPL = "cpl"                        # contrastive preference learning
class SafetyLevel(Enum):
    """Risk tier assigned to a piece of text, from benign to blocked."""

    SAFE = "safe"            # no concerns detected
    CAUTION = "caution"      # proceed with care
    RISKY = "risky"          # likely unsafe
    DANGEROUS = "dangerous"  # must be blocked
class IntentType(Enum):
    """Classification of a user request's intent."""

    EXPLICIT = "explicit"    # clearly stated intent
    IMPLICIT = "implicit"    # implied intent
    AMBIGUOUS = "ambiguous"  # unclear / under-specified intent
    MALICIOUS = "malicious"  # harmful intent
@dataclass
class HumanPreference:
    """One pairwise human preference judgement between two responses."""

    id: str
    prompt: str
    response_a: str
    response_b: str
    preferred: str  # which response the human preferred: "A" or "B"
    reasoning: str = ""  # optional free-text justification
    timestamp: datetime = field(default_factory=datetime.now)  # when the judgement was recorded
@dataclass
class ConstitutionalPrinciple:
    """A weighted behavioral principle the agent should comply with."""

    name: str
    description: str
    category: str  # one of "ethics", "safety", "helpfulness", "honesty"
    weight: float = 1.0  # relative importance of this principle
@dataclass
class IntentAnalysis:
    """Result of analyzing a prompt's intent."""

    intent_type: IntentType        # classified intent category
    confidence: float              # confidence in the primary intent, in [0, 1]
    primary_intent: str            # highest-scoring intent label
    secondary_intents: List[str]   # other intent labels that also matched
    safety_concerns: List[str]     # safety keywords found in the prompt
    suggested_response: str        # canned response suited to the intent type
class RewardModel:
    """
    Linear reward model trained from pairwise human preferences.

    Supports:
    1. Learning a reward function from preference data (Bradley-Terry model)
    2. Predicting human preference via scalar rewards
    3. Generating reward signals for downstream training
    """

    def __init__(self, model_capacity: int = 128):
        # model_capacity is retained for interface compatibility; the linear
        # model grows its weight table on demand.
        self.model_capacity = model_capacity
        self.weights: Dict[str, float] = defaultdict(float)
        self.bias: float = 0.0
        self.training_history: List[Dict[str, Any]] = []

    @staticmethod
    def _sigmoid(x: float) -> float:
        """Numerically stable logistic function (no overflow for large |x|)."""
        if x >= 0:
            return 1.0 / (1.0 + math.exp(-x))
        e = math.exp(x)
        return e / (1.0 + e)

    def predict_reward(self, features: Dict[str, float]) -> float:
        """Return the linear reward: bias + sum(weight_i * feature_i)."""
        reward = self.bias
        for feature, value in features.items():
            reward += self.weights.get(feature, 0.0) * value
        return reward

    def train_from_preferences(self, preferences: List["HumanPreference"],
                               feature_extractor,
                               learning_rate: float = 0.01,
                               epochs: int = 100):
        """
        Fit the weights from pairwise preferences by gradient descent.

        Per-pair loss is the Bradley-Terry negative log-likelihood
        -log sigmoid(r_pref - r_other).

        Fixes over the previous version:
        - gradient magnitude is sigmoid(r_other - r_pref); the old code used
          abs(reward_a - reward_b), which made the update vanish exactly when
          the model misranked a pair most badly (the opposite of intended);
        - loss and gradient use overflow-safe softplus/sigmoid forms;
        - an empty preference list is a no-op instead of ZeroDivisionError.
        """
        if not preferences:
            return
        for epoch in range(epochs):
            total_loss = 0.0
            for pref in preferences:
                # Extract features for both candidate responses.
                features_a = feature_extractor(pref.prompt, pref.response_a)
                features_b = feature_extractor(pref.prompt, pref.response_b)
                reward_a = self.predict_reward(features_a)
                reward_b = self.predict_reward(features_b)

                # Margin of the preferred response over the other one.
                if pref.preferred == "A":
                    margin = reward_a - reward_b
                else:
                    margin = reward_b - reward_a

                # Stable softplus(-margin) == -log sigmoid(margin).
                total_loss += max(-margin, 0.0) + math.log1p(math.exp(-abs(margin)))

                # d(loss)/d(margin) = -sigmoid(-margin): push the preferred
                # response's features up and the other's down.
                gradient_scale = self._sigmoid(-margin)
                sign_a = 1.0 if pref.preferred == "A" else -1.0
                for feature, value in features_a.items():
                    self.weights[feature] += sign_a * learning_rate * gradient_scale * value
                for feature, value in features_b.items():
                    self.weights[feature] -= sign_a * learning_rate * gradient_scale * value

            self.training_history.append({
                "epoch": epoch,
                "loss": total_loss / len(preferences)
            })

    def get_feature_weights(self) -> Dict[str, float]:
        """Return a plain-dict snapshot of the learned feature weights."""
        return dict(self.weights)
class IntentRecognizer:
    """
    Intent recognizer.

    Supports:
    1. Explicit intent recognition
    2. Implicit intent inference
    3. Ambiguity detection
    4. Malicious intent detection
    """

    def __init__(self):
        # Substring patterns that vote for each intent category.
        self.intent_patterns: Dict[str, List[str]] = {
            "information_seeking": ["what is", "how to", "explain", "tell me"],
            "task_completion": ["do this", "create", "generate", "write"],
            "creative": ["story", "poem", "art", "design"],
            "analysis": ["analyze", "compare", "evaluate", "assess"],
            "malicious": ["hack", "exploit", "bypass", "ignore rules"]
        }
        # Keywords that raise explicit safety concerns.
        self.safety_keywords = [
            "violence", "hate", "discrimination", "illegal",
            "harmful", "dangerous", "exploit", "attack"
        ]

    def analyze_intent(self, prompt: str, context: str = "") -> "IntentAnalysis":
        """
        Analyze the user's intent in *prompt*, optionally weighted by *context*.

        Returns an IntentAnalysis with the classified intent type, a
        confidence in [0, 1], any safety concerns found, and a suggested
        canned response.
        """
        prompt_lower = prompt.lower()

        # Score categories: 1.0 per pattern hit in the prompt, 0.5 per hit
        # in the (less reliable) context.
        intent_scores = defaultdict(float)
        for intent_name, patterns in self.intent_patterns.items():
            for pattern in patterns:
                if pattern in prompt_lower:
                    intent_scores[intent_name] += 1.0
        if context:
            context_lower = context.lower()
            for intent_name, patterns in self.intent_patterns.items():
                for pattern in patterns:
                    if pattern in context_lower:
                        intent_scores[intent_name] += 0.5

        # Primary intent = highest-scoring category; confidence is its share
        # of the total score mass.
        if not intent_scores:
            primary_intent = "unknown"
            confidence = 0.0
        else:
            primary_intent = max(intent_scores, key=intent_scores.get)
            confidence = intent_scores[primary_intent] / (sum(intent_scores.values()) + 1e-6)

        # Collect explicit safety concerns from the prompt.
        safety_concerns = [kw for kw in self.safety_keywords if kw in prompt_lower]

        # Classify the intent type.
        # Fix: a prompt whose top-scoring category is "malicious" is treated
        # as malicious even when no separate safety keyword matched (the old
        # code let e.g. "hack" or "bypass" pass through as implicit intent).
        if safety_concerns or primary_intent == "malicious":
            intent_type = IntentType.MALICIOUS
        elif confidence < 0.5:
            intent_type = IntentType.AMBIGUOUS
        elif primary_intent in ["information_seeking", "task_completion"]:
            intent_type = IntentType.EXPLICIT
        else:
            intent_type = IntentType.IMPLICIT

        # Pick a suggested canned response per intent type.
        if intent_type == IntentType.MALICIOUS:
            suggested_response = "I cannot assist with requests that may cause harm or violate ethical guidelines."
        elif intent_type == IntentType.AMBIGUOUS:
            suggested_response = "Could you please clarify what you're looking for?"
        else:
            suggested_response = f"I understand you want to {primary_intent.replace('_', ' ')}. How can I help?"

        return IntentAnalysis(
            intent_type=intent_type,
            confidence=confidence,
            primary_intent=primary_intent,
            secondary_intents=[k for k in intent_scores.keys() if k != primary_intent],
            safety_concerns=safety_concerns,
            suggested_response=suggested_response
        )
class ConstitutionalAI:
    """
    Constitutional AI.

    Supports:
    1. Principle definition
    2. Self-critique
    3. Principle-compliance checking
    4. Response revision
    """

    def __init__(self):
        # Default constitution: four weighted principles.
        self.principles: List[ConstitutionalPrinciple] = [
            ConstitutionalPrinciple(
                name="Helpfulness",
                description="Be helpful and provide useful information",
                category="helpfulness",
                weight=1.0
            ),
            ConstitutionalPrinciple(
                name="Harmlessness",
                description="Do not cause harm or enable harmful activities",
                category="safety",
                weight=2.0
            ),
            ConstitutionalPrinciple(
                name="Honesty",
                description="Be truthful and acknowledge uncertainty",
                category="honesty",
                weight=1.5
            ),
            ConstitutionalPrinciple(
                name="Ethical Behavior",
                description="Act in accordance with ethical principles",
                category="ethics",
                weight=2.0
            )
        ]

    @staticmethod
    def _violation_score(principle, response_lower: str) -> float:
        """Heuristic violation score of one principle for a lowercased response."""
        score = 0.0
        # Safety check: flag mention of harm-related keywords.
        if principle.category == "safety":
            for keyword in ("harm", "dangerous", "illegal", "exploit"):
                if keyword in response_lower:
                    score += 0.5
        # Honesty check: flag overconfident absolute phrasing.
        if principle.category == "honesty":
            for phrase in ("definitely", "always", "never", "100%"):
                if phrase in response_lower:
                    score += 0.2
        return score

    def critique_response(self, prompt: str, response: str) -> Dict[str, Any]:
        """Critique *response* against every principle; scores above 0.3 become critiques."""
        response_lower = response.lower()
        scored = ((p, self._violation_score(p, response_lower)) for p in self.principles)
        critiques = [
            {
                "principle": principle.name,
                "violation_score": score,
                "suggestion": f"Consider revising to better align with {principle.description}"
            }
            for principle, score in scored
            if score > 0.3
        ]
        penalty = sum(c["violation_score"] for c in critiques) / len(self.principles)
        return {
            "critiques": critiques,
            "needs_revision": bool(critiques),
            "overall_score": 1.0 - penalty
        }

    def revise_response(self, prompt: str, response: str,
                        critiques: List[Dict[str, Any]]) -> str:
        """Revise *response*: prefix a safety caveat and/or append an uncertainty note."""
        revised = response
        for critique in critiques:
            principle_name = critique["principle"].lower()
            if "harm" in principle_name:
                # Prepend a safety qualifier.
                revised = "While I can provide general information, I cannot assist with anything that could cause harm. " + revised
            elif "honesty" in principle_name:
                # Append an expression of uncertainty.
                revised = revised + " However, please note that this information may not be complete or accurate in all cases."
        return revised
class SafetyGuardrail:
    """
    Safety guardrail.

    Supports:
    1. Input screening
    2. Output filtering
    3. Risk scoring
    4. Blocking mechanism
    """

    # Phrases indicating an attempt to subvert the safety system.
    _JAILBREAK_PATTERNS = (
        "ignore previous instructions",
        "bypass safety",
        "act as",
        "roleplay without restrictions",
    )

    def __init__(self):
        # Ascending thresholds: the highest threshold reached determines the level.
        self.risk_thresholds = {
            SafetyLevel.SAFE: 0.2,
            SafetyLevel.CAUTION: 0.5,
            SafetyLevel.RISKY: 0.7,
            SafetyLevel.DANGEROUS: 0.9
        }
        self.blocked_topics = [
            "violence", "hate speech", "discrimination",
            "illegal activities", "self-harm", "exploitation"
        ]

    def assess_risk(self, text: str) -> Tuple[SafetyLevel, float, List[str]]:
        """Score *text* for risk and map the score onto a SafetyLevel."""
        lowered = text.lower()
        risk_score = 0.0
        risk_factors = []

        # Blocked topics: +0.3 each.
        for topic in self.blocked_topics:
            if topic in lowered:
                risk_score += 0.3
                risk_factors.append(topic)

        # Jailbreak attempts: +0.4 each.
        for pattern in self._JAILBREAK_PATTERNS:
            if pattern in lowered:
                risk_score += 0.4
                risk_factors.append(f"jailbreak_attempt: {pattern}")

        # Highest threshold reached wins (dict iterates in ascending order).
        safety_level = SafetyLevel.SAFE
        for level, threshold in self.risk_thresholds.items():
            if risk_score >= threshold:
                safety_level = level
        return safety_level, risk_score, risk_factors

    def filter_response(self, response: str,
                        safety_level: SafetyLevel) -> Tuple[str, bool]:
        """Filter or block a response; returns (text, was_blocked)."""
        if safety_level in (SafetyLevel.RISKY, SafetyLevel.DANGEROUS):
            return "I cannot provide a response that may be harmful or unsafe.", True
        # Caution level: wrap the response in a safety qualifier.
        if safety_level == SafetyLevel.CAUTION:
            response = "Please note: " + response + " Always use this information responsibly."
        return response, False
class AlignedAgent:
    """
    Aligned agent.

    Integrates:
    1. Value alignment (reward model)
    2. Intent understanding
    3. Safety guarding
    4. Ethical constraints (constitutional AI)
    """

    def __init__(self):
        self.reward_model = RewardModel()
        self.intent_recognizer = IntentRecognizer()
        self.constitutional_ai = ConstitutionalAI()
        self.safety_guardrail = SafetyGuardrail()
        self.interaction_history: List[Dict[str, Any]] = []

    def process_request(self, prompt: str,
                        context: str = "") -> Dict[str, Any]:
        """Run the full alignment pipeline for one request and log the interaction."""
        # Step 1: intent analysis.
        intent_analysis = self.intent_recognizer.analyze_intent(prompt, context)

        # Step 2: safety screening of the input.
        input_safety, input_risk, input_factors = self.safety_guardrail.assess_risk(prompt)

        # Step 3: draft a response (placeholder generation).
        if intent_analysis.intent_type == IntentType.MALICIOUS:
            response = intent_analysis.suggested_response
        else:
            response = f"I understand your request about '{intent_analysis.primary_intent}'. Here's how I can help..."

        # Step 4: constitutional self-critique.
        critique_result = self.constitutional_ai.critique_response(prompt, response)

        # Step 5: revise when the critique demands it.
        if critique_result["needs_revision"]:
            response = self.constitutional_ai.revise_response(
                prompt, response, critique_result["critiques"]
            )

        # Step 6: safety screening of the output, then filtering/blocking.
        output_safety, output_risk, output_factors = self.safety_guardrail.assess_risk(response)
        filtered_response, was_blocked = self.safety_guardrail.filter_response(
            response, output_safety
        )

        # Step 7: log the interaction.
        self.interaction_history.append({
            "timestamp": datetime.now().isoformat(),
            "prompt": prompt,
            "intent_analysis": {
                "type": intent_analysis.intent_type.value,
                "confidence": intent_analysis.confidence,
                "primary": intent_analysis.primary_intent
            },
            "input_safety": input_safety.value,
            "output_safety": output_safety.value,
            "response_blocked": was_blocked,
            "final_response": filtered_response
        })

        return {
            "response": filtered_response,
            "intent": intent_analysis,
            "safety": {
                "input_level": input_safety.value,
                "output_level": output_safety.value,
                "blocked": was_blocked
            },
            "critique": critique_result
        }

    def train_from_feedback(self, preferences: List[HumanPreference]):
        """Train the reward model on human preference pairs using simple features."""
        def feature_extractor(prompt, response):
            # Simplified hand-crafted features.
            lowered = response.lower()
            return {
                "length": len(response) / 100.0,
                "helpfulness": 1.0 if "help" in lowered else 0.5,
                "safety": 1.0 if not any(word in lowered
                                         for word in ["harm", "dangerous"]) else 0.0
            }

        self.reward_model.train_from_preferences(
            preferences, feature_extractor, epochs=50
        )
# Usage example
if __name__ == "__main__":
    print("=== Agent 价值对齐与人类意图一致性 ===\n")
    print("=== 创建对齐 Agent ===")

    # Build the fully-integrated aligned agent.
    agent = AlignedAgent()
    print("对齐 Agent 初始化完成")
    print(" - 奖励模型:已加载")
    print(" - 意图识别器:已加载")
    print(f" - 宪法 AI: {len(agent.constitutional_ai.principles)} 条原则")
    print(" - 安全护栏:已激活")

    print("\n=== 测试场景 1: 正常请求 ===")
    # Scenario 1: a benign request.
    prompt1 = "How can I learn Python programming?"
    result1 = agent.process_request(prompt1)
    print(f"请求:{prompt1}")
    print(f"意图:{result1['intent'].primary_intent} (置信度:{result1['intent'].confidence:.2f})")
    print(f"安全等级:输入={result1['safety']['input_level']}, 输出={result1['safety']['output_level']}")
    print(f"响应:{result1['response'][:100]}...")

    print("\n=== 测试场景 2: 潜在风险请求 ===")
    # Scenario 2: a potentially risky request.
    prompt2 = "How to bypass security systems?"
    result2 = agent.process_request(prompt2)
    print(f"请求:{prompt2}")
    print(f"意图:{result2['intent'].primary_intent} (类型:{result2['intent'].intent_type.value})")
    print(f"安全关切:{result2['intent'].safety_concerns}")
    print(f"安全等级:输入={result2['safety']['input_level']}, 输出={result2['safety']['output_level']}")
    print(f"响应:{result2['response']}")

    print("\n=== 测试场景 3: 歧义请求 ===")
    # Scenario 3: an ambiguous request.
    prompt3 = "Tell me about attacks"
    result3 = agent.process_request(prompt3)
    print(f"请求:{prompt3}")
    print(f"意图:{result3['intent'].primary_intent} (类型:{result3['intent'].intent_type.value})")
    print(f"置信度:{result3['intent'].confidence:.2f}")
    print(f"响应:{result3['response']}")

    print("\n=== 训练奖励模型 ===")
    # Assemble a small pairwise-preference dataset.
    preferences = [
        HumanPreference(
            id="pref_1",
            prompt="Help me write code",
            response_a="Here's some helpful code...",
            response_b="I don't want to help.",
            preferred="A",
            reasoning="Response A is more helpful"
        ),
        HumanPreference(
            id="pref_2",
            prompt="Explain quantum physics",
            response_a="Quantum physics is complex...",
            response_b="It's definitely this and always works.",
            preferred="A",
            reasoning="Response B is overconfident"
        )
    ]
    # Fit the reward model on the collected preferences.
    agent.train_from_feedback(preferences)
    print("奖励模型训练完成")
    print("特征权重:")
    for feature, weight in agent.reward_model.get_feature_weights().items():
        print(f" {feature}: {weight:.3f}")

    print("\n=== 交互历史统计 ===")
    print(f"总交互次数:{len(agent.interaction_history)}")
    # Tally how output safety levels were distributed across interactions.
    safety_distribution = defaultdict(int)
    for interaction in agent.interaction_history:
        safety_distribution[interaction["output_safety"]] += 1
    print("安全等级分布:")
    for level, count in safety_distribution.items():
        print(f" {level}: {count} 次")

    print("\n关键观察:")
    print("1. 价值对齐:通过 RLHF 学习人类偏好")
    print("2. 意图理解:识别显式、隐含、歧义、恶意意图")
    print("3. 安全防护:输入输出双重检测,风险拦截")
    print("4. 宪法 AI:自批判与响应修正")
    print("5. 可信赖 AI:对齐 + 理解 + 安全 + 伦理 = 可信赖")
    print("\nAI 安全的使命:让 AI 对齐人类价值、理解人类意图、保障人类安全")