完整教程:LLM Agent 动态 API 调用与多模态推理技术详解

文章目录
作者:北辰alk
关键词:LLM Agent、动态API、多模态推理、工具调用、智能体系统
引言
随着大语言模型能力的不断提升,LLM Agent 正在从简单的对话系统向能够执行复杂任务、处理多模态信息的智能体演进。其中,动态 API 调用和多模态推理是实现高级 Agent 能力的两个核心技术。本文将深入探讨 LLM Agent 如何实现动态 API 调用来处理实时数据,以及如何在多模态任务中进行复杂推理,通过详细的架构图、代码实现和实战案例,帮助开发者掌握这些关键技术。
一、LLM Agent 动态 API 调用技术
1.1 动态 API 调用的核心价值
动态 API 调用使 LLM Agent 能够突破训练数据的限制,访问实时信息、执行具体操作:
- 实时数据获取:天气、股价、新闻等实时信息
- 工具能力扩展:计算、翻译、代码执行等专业功能
- 系统集成:与现有业务系统、数据库、服务的集成
- 状态持久化:保存对话状态、用户偏好等长期信息
1.2 动态 API 调用架构设计
下面先梳理动态 API 调用的完整流程(原文的架构图在此处缺失,以文字说明代替):用户请求 → 检查 API 是否已注册 → 生成缓存键并查询缓存 → 构造请求 URL、请求头与认证信息 → 执行 HTTP 请求 → 处理响应 → 缓存成功结果并返回。
1.3 基础 API 调用系统实现
import json
import requests
import inspect
from typing import Dict, List, Any, Optional
from datetime import datetime
import hashlib
class DynamicAPIClient:
    """Client that calls registered HTTP APIs and caches successful results.

    APIs are registered by name with a config dict that must contain
    ``base_url``, ``endpoints`` and ``authentication``. ``call_api`` always
    returns a result dict with at least ``success`` and ``data``; failures
    additionally carry ``error``.
    """

    # HTTP methods that _execute_request is willing to send.
    _SUPPORTED_METHODS = ('GET', 'POST', 'PUT', 'PATCH', 'DELETE')
    # Methods for which ``data`` is attached as a JSON body.
    _BODY_METHODS = ('POST', 'PUT', 'PATCH')
    # Lifetime of a cached result in seconds (class-level default, 5 minutes).
    cache_ttl = 300

    def __init__(self):
        self.registered_apis = {}
        # Insertion-ordered dict used as an LRU: oldest entry first.
        self.api_cache = {}
        self.request_timeout = 30
        self.max_cache_size = 1000

    def register_api(self, api_name: str, api_config: Dict):
        """Register an API configuration under ``api_name``.

        Raises:
            ValueError: if ``base_url``, ``endpoints`` or ``authentication``
                is missing from ``api_config``.
        """
        required_fields = ['base_url', 'endpoints', 'authentication']
        for field in required_fields:
            if field not in api_config:
                raise ValueError(f"API 配置缺少必要字段: {field}")
        self.registered_apis[api_name] = api_config
        print(f"API 注册成功: {api_name}")

    def call_api(self, api_name: str, endpoint: str,
                 params: Optional[Dict] = None, data: Optional[Dict] = None,
                 use_cache: bool = True) -> Dict[str, Any]:
        """Call ``endpoint`` of a registered API and return a result dict.

        Args:
            api_name: name used in ``register_api``.
            endpoint: a key of the API's ``endpoints`` mapping, or a literal
                path when no such key exists.
            params: query-string parameters.
            data: JSON body, sent only for POST/PUT/PATCH.
            use_cache: when true, a still-valid cached result is returned
                and successful results are stored.

        Returns:
            A dict with ``success``, ``data`` and, on failure, ``error``.
            Exceptions raised while building or sending the request are
            converted into a failure result rather than propagated.
        """
        if api_name not in self.registered_apis:
            return {
                'success': False,
                'error': f"API 未注册: {api_name}",
                'data': None
            }

        cache_key = self._generate_cache_key(api_name, endpoint, params, data)

        if use_cache and cache_key in self.api_cache:
            cached_data = self.api_cache[cache_key]
            if self._is_cache_valid(cached_data):
                # Re-insert so the entry becomes the most recently used one.
                self.api_cache[cache_key] = self.api_cache.pop(cache_key)
                print(f"使用缓存数据: {api_name}/{endpoint}")
                return cached_data
            # Expired entries are dropped instead of lingering in the cache.
            del self.api_cache[cache_key]

        api_config = self.registered_apis[api_name]
        try:
            url = self._build_request_url(api_config, endpoint)
            headers = self._prepare_headers(api_config)
            auth = self._prepare_auth(api_config)
            response = self._execute_request(
                url, headers, auth, params, data, api_config
            )
            result = self._process_response(response, api_config)
            if use_cache and result['success']:
                self._cache_result(cache_key, result)
            return result
        except Exception as e:
            # Boundary handler: callers receive a failure dict, never a raise.
            return {
                'success': False,
                'error': f"API 调用异常: {str(e)}",
                'data': None,
                'timestamp': datetime.now().isoformat()
            }

    def _generate_cache_key(self, api_name: str, endpoint: str,
                            params: Optional[Dict], data: Optional[Dict]) -> str:
        """Return an MD5 hex digest identifying this call's arguments."""
        key_data = {
            'api_name': api_name,
            'endpoint': endpoint,
            'params': params,
            'data': data
        }
        # default=str keeps key generation from raising on values that are
        # not JSON-serializable (this runs outside call_api's try block).
        key_string = json.dumps(key_data, sort_keys=True, default=str)
        return hashlib.md5(key_string.encode()).hexdigest()

    def _is_cache_valid(self, cached_data: Dict) -> bool:
        """Return True when the cached result is younger than ``cache_ttl``."""
        timestamp = cached_data.get('timestamp')
        if not timestamp:
            return False
        cache_age = (datetime.now() - datetime.fromisoformat(timestamp)).total_seconds()
        return cache_age < self.cache_ttl

    def _build_request_url(self, api_config: Dict, endpoint: str) -> str:
        """Join ``base_url`` with the path registered for ``endpoint``.

        ``endpoint`` is first looked up in the config's ``endpoints``
        mapping; when it is not a registered name it is used as a literal
        path, which preserves the previous behaviour for raw paths.
        """
        base_url = api_config['base_url'].rstrip('/')
        endpoints = api_config.get('endpoints')
        if isinstance(endpoints, dict):
            endpoint_path = endpoints.get(endpoint, endpoint)
        else:
            endpoint_path = endpoint
        return f"{base_url}/{str(endpoint_path).lstrip('/')}"

    def _prepare_headers(self, api_config: Dict) -> Dict[str, str]:
        """Return default headers overlaid with the config's ``headers``."""
        headers = {
            'User-Agent': 'LLM-Agent/1.0',
            'Content-Type': 'application/json'
        }
        if 'headers' in api_config:
            headers.update(api_config['headers'])
        return headers

    def _prepare_auth(self, api_config: Dict) -> Any:
        """Build the auth value for the request.

        Returns an ``HTTPBasicAuth`` object for ``api_key`` (key as user
        name, optional ``secret`` as password), an ``Authorization`` header
        string for ``bearer_token``, or ``None`` for any other type.
        """
        auth_config = api_config['authentication']
        auth_type = auth_config.get('type', 'none')
        if auth_type == 'api_key':
            return requests.auth.HTTPBasicAuth(
                auth_config['key'], auth_config.get('secret', '')
            )
        elif auth_type == 'bearer_token':
            return f"Bearer {auth_config['token']}"
        else:
            return None

    def _execute_request(self, url: str, headers: Dict, auth: Any,
                         params: Optional[Dict], data: Optional[Dict],
                         api_config: Dict):
        """Send the HTTP request described by the arguments.

        Raises:
            ValueError: if the configured ``method`` is not supported.
        """
        method = api_config.get('method', 'GET').upper()
        if method not in self._SUPPORTED_METHODS:
            raise ValueError(f"不支持的 HTTP 方法: {method}")

        # Work on a copy so the caller's header dict is not modified.
        headers = dict(headers)
        request_kwargs = {
            'url': url,
            'headers': headers,
            'timeout': self.request_timeout
        }
        if auth:
            if isinstance(auth, str):
                headers['Authorization'] = auth
            else:
                request_kwargs['auth'] = auth
        if params:
            request_kwargs['params'] = params
        if data and method in self._BODY_METHODS:
            request_kwargs['json'] = data

        return requests.request(method, **request_kwargs)

    def _process_response(self, response: "requests.Response",
                          api_config: Dict) -> Dict[str, Any]:
        """Convert an HTTP response into the client's result dict.

        Any 2xx status is a success. JSON bodies are decoded when the
        content type says so; other bodies are returned as text.
        """
        try:
            if 200 <= response.status_code < 300:
                content_type = response.headers.get('content-type', '')
                if 'application/json' in content_type:
                    # An empty body (e.g. 204) has nothing to decode.
                    response_data = response.json() if response.text else None
                else:
                    response_data = response.text
                result = {
                    'success': True,
                    'data': response_data,
                    'status_code': response.status_code,
                    'timestamp': datetime.now().isoformat()
                }
            else:
                result = {
                    'success': False,
                    'error': f"HTTP 错误: {response.status_code}",
                    'status_code': response.status_code,
                    'data': response.text,
                    'timestamp': datetime.now().isoformat()
                }
            return result
        except Exception as e:
            return {
                'success': False,
                'error': f"响应处理异常: {str(e)}",
                'status_code': getattr(response, 'status_code', None),
                'data': None,
                'timestamp': datetime.now().isoformat()
            }

    def _cache_result(self, cache_key: str, result: Dict):
        """Store ``result``, evicting least recently used entries when full."""
        # Overwriting an existing key must not trigger an eviction.
        self.api_cache.pop(cache_key, None)
        while self.api_cache and len(self.api_cache) >= self.max_cache_size:
            oldest_key = next(iter(self.api_cache))
            del self.api_cache[oldest_key]
        self.api_cache[cache_key] = result
# 使用示例
def demo_dynamic_api_client():
    """Register a sample weather API on a fresh client, call it once and print the result."""
    authentication = {
        'type': 'api_key',
        'key': 'your_api_key_here'
    }
    endpoints = {
        'current': '/current.json',
        'forecast': '/forecast.json'
    }

    client = DynamicAPIClient()
    client.register_api('weather', {
        'base_url': 'https://api.weatherapi.com/v1',
        'method': 'GET',
        'authentication': authentication,
        'endpoints': endpoints
    })

    # Query the "current" endpoint for Beijing with Chinese-language output.
    query = {'q': 'Beijing', 'lang': 'zh'}
    result = client.call_api(api_name='weather', endpoint='current', params=query)

    print("API 调用结果:")
    print(json.dumps(result, ensure_ascii=False, indent=2))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo_dynamic_api_client()
1.4 智能 API 路由与参数提取
class APIRouter:
    """Keyword-based router that maps a user utterance to a registered API.

    The router detects an intent from keywords, maps it to an API name,
    extracts call parameters and delegates the call to the API client.
    """

    def __init__(self, api_client: "DynamicAPIClient"):
        self.api_client = api_client
        self.api_descriptions = {}
        self.parameter_extractors = {}

    def register_api_description(self, api_name: str, description: Dict):
        """Register descriptive metadata (including ``default_endpoint``) for an API."""
        self.api_descriptions[api_name] = description

    def register_parameter_extractor(self, api_name: str, extractor_func):
        """Register a callable ``extractor_func(user_input) -> dict`` for an API."""
        self.parameter_extractors[api_name] = extractor_func

    def route_request(self, user_input: str) -> Dict[str, Any]:
        """Route ``user_input`` to the best-matching API and call it.

        Returns:
            On success a dict with ``success``, ``api_used``, ``parameters``
            and ``api_result``. When no intent is detected, or the matched
            API has no registered description with a ``default_endpoint``,
            a failure dict with ``error`` and ``suggestions``.
        """
        intent = self._analyze_intent(user_input)
        best_api = self._find_best_api_match(intent, user_input)

        # An intent can map to an API that was never described (for example
        # 'translation'); treat that as "no suitable API" instead of letting
        # the description lookup raise KeyError.
        description = self.api_descriptions.get(best_api) if best_api else None
        endpoint = description.get('default_endpoint') if description else None
        if not endpoint:
            return {
                'success': False,
                'error': '未找到合适的 API',
                'suggestions': self._get_suggestions(user_input)
            }

        parameters = self._extract_parameters(best_api, user_input)
        api_result = self.api_client.call_api(
            api_name=best_api,
            endpoint=endpoint,
            params=parameters
        )
        return {
            'success': True,
            'api_used': best_api,
            'parameters': parameters,
            'api_result': api_result
        }

    def _analyze_intent(self, user_input: str) -> Dict[str, Any]:
        """Score each known intent by how many of its keywords occur in the input.

        Returns a dict with ``primary_intent`` (highest confidence, or None),
        ``all_intents`` (sorted by confidence, descending) and ``input_text``.
        """
        intent_keywords = {
            'weather': ['天气', '气温', '温度', '气象', '下雨', '下雪'],
            'calculator': ['计算', '算一下', '等于多少', '加减乘除'],
            'translation': ['翻译', '英文', '中文', '日语', '韩语'],
            'search': ['搜索', '查找', '查询', '百度', '谷歌']
        }
        detected_intents = []
        for intent, keywords in intent_keywords.items():
            score = sum(1 for keyword in keywords if keyword in user_input)
            if score > 0:
                detected_intents.append({
                    'intent': intent,
                    'score': score,
                    # Fraction of the intent's keywords that matched.
                    'confidence': min(score / len(keywords), 1.0)
                })
        detected_intents.sort(key=lambda x: x['confidence'], reverse=True)
        return {
            'primary_intent': detected_intents[0] if detected_intents else None,
            'all_intents': detected_intents,
            'input_text': user_input
        }

    def _find_best_api_match(self, intent: Dict, user_input: str) -> Optional[str]:
        """Map the primary intent to an API name, or None when there is none."""
        if not intent['primary_intent']:
            return None
        primary_intent = intent['primary_intent']['intent']
        intent_to_api = {
            'weather': 'weather',
            'calculator': 'calculator',
            'translation': 'translation',
            'search': 'web_search'
        }
        return intent_to_api.get(primary_intent)

    def _extract_parameters(self, api_name: str, user_input: str) -> Dict[str, Any]:
        """Use the API's registered extractor, falling back to the default one."""
        if api_name in self.parameter_extractors:
            return self.parameter_extractors[api_name](user_input)
        return self._default_parameter_extraction(api_name, user_input)

    def _default_parameter_extraction(self, api_name: str, user_input: str) -> Dict:
        """Built-in extraction for the ``weather`` and ``calculator`` APIs.

        Returns an empty dict for any other API or when nothing is found.
        """
        if api_name == 'weather':
            locations = ['北京', '上海', '广州', '深圳', '杭州', '成都']
            for location in locations:
                if location in user_input:
                    return {'q': location}
            return {'q': '北京'}  # default city when none is mentioned
        elif api_name == 'calculator':
            import re
            numbers = re.findall(r'\d+\.?\d*', user_input)
            if numbers:
                return {'numbers': [float(num) for num in numbers]}
        return {}

    def _get_suggestions(self, user_input: str) -> List[str]:
        """Suggest up to three of the APIs that have a registered description."""
        available_apis = list(self.api_descriptions.keys())
        return [f"可以尝试使用 {api} API" for api in available_apis[:3]]
# 使用示例
def demo_api_router():
    """Wire an APIRouter to a fresh client and print the routing decision for sample inputs."""
    router = APIRouter(DynamicAPIClient())

    # Describe the two APIs the router may dispatch to.
    descriptions = {
        'weather': {
            'name': '天气查询',
            'description': '获取实时天气信息',
            'default_endpoint': 'current',
            'parameters': ['location']
        },
        'calculator': {
            'name': '计算器',
            'description': '执行数学计算',
            'default_endpoint': 'calculate',
            'parameters': ['numbers', 'operation']
        }
    }
    for api_name, description in descriptions.items():
        router.register_api_description(api_name, description)

    def weather_extractor(user_input):
        # Translate the first Chinese city name found into its English name.
        city_names = {
            '北京': 'Beijing',
            '上海': 'Shanghai',
            '广州': 'Guangzhou',
            '深圳': 'Shenzhen'
        }
        matched = next(
            (english for chinese, english in city_names.items() if chinese in user_input),
            'Beijing'
        )
        return {'q': matched}

    router.register_parameter_extractor('weather', weather_extractor)

    samples = (
        "今天北京天气怎么样?",
        "计算一下 125 加 38 等于多少",
        "翻译这句话成英文"
    )
    for user_input in samples:
        print(f"用户输入: {user_input}")
        result = router.route_request(user_input)
        print(f"路由结果: {result['api_used'] if result['success'] else '无匹配'}")
        print("-" * 50)
# demo_api_router()
1.5 高级 API 编排系统
class APIOrchestrator:
    """Runs registered workflows: ordered sequences of API calls.

    Each step is a dict with ``api`` and ``endpoint`` and optionally
    ``name``, ``parameters``, ``output_mapping`` and ``condition``. Values
    of the form ``${key}`` inside ``parameters`` are replaced by the value
    stored under ``key`` in the execution context.
    """

    def __init__(self, api_client: "DynamicAPIClient"):
        self.api_client = api_client
        self.workflow_templates = {}
        self.execution_history = []

    def register_workflow(self, workflow_name: str, workflow_steps: List[Dict]):
        """Store ``workflow_steps`` as a template under ``workflow_name``."""
        self.workflow_templates[workflow_name] = {
            'steps': workflow_steps,
            'created_at': datetime.now().isoformat()
        }

    def execute_workflow(self, workflow_name: str,
                         initial_params: Optional[Dict] = None) -> Dict[str, Any]:
        """Execute the steps of a registered workflow in order.

        Args:
            workflow_name: name used in ``register_workflow``.
            initial_params: initial context values; the dict is copied and
                never modified.

        Returns:
            For an unknown workflow, a dict with ``success`` False, ``error``
            and empty ``results``. Otherwise a dict with ``success`` True
            (the workflow ran), ``overall_success`` (every executed step's
            API call succeeded), ``workflow_name``, ``results`` and
            ``final_context``.
        """
        if workflow_name not in self.workflow_templates:
            return {
                'success': False,
                'error': f"工作流未注册: {workflow_name}",
                'results': []
            }

        steps = self.workflow_templates[workflow_name]['steps']
        execution_results = []
        current_context = dict(initial_params) if initial_params else {}

        for step_index, step in enumerate(steps):
            step_name = step.get('name', f'step_{step_index}')
            print(f"执行步骤 {step_index + 1}: {step_name}")

            step_params = self._prepare_step_parameters(step, current_context)
            query_params, body = self._split_request_parameters(step_params)
            api_result = self.api_client.call_api(
                api_name=step['api'],
                endpoint=step['endpoint'],
                params=query_params,
                data=body
            )

            execution_results.append({
                'step_name': step_name,
                'step_index': step_index,
                'api_result': api_result,
                'timestamp': datetime.now().isoformat()
            })

            if api_result['success'] and 'output_mapping' in step:
                self._update_context(current_context, step['output_mapping'], api_result)

            # The condition is evaluated against this step's own result; a
            # false condition stops the workflow after the step has run.
            if 'condition' in step and not self._evaluate_condition(step['condition'], api_result):
                print(f"工作流在步骤 {step_name} 条件终止")
                break

        overall_success = all(
            result['api_result']['success'] for result in execution_results
        )
        self.execution_history.append({
            'workflow_name': workflow_name,
            'execution_time': datetime.now().isoformat(),
            'steps_executed': len(execution_results),
            'overall_success': overall_success
        })
        return {
            'success': True,
            'overall_success': overall_success,
            'workflow_name': workflow_name,
            'results': execution_results,
            'final_context': current_context
        }

    @staticmethod
    def _split_request_parameters(step_params: Dict[str, Any]):
        """Split resolved step parameters into ``(params, data)`` for call_api.

        A step may name the two parts explicitly with ``params`` / ``data``
        keys. Otherwise the whole flat mapping is sent as ``params``, so
        that declared step parameters actually reach the API.
        """
        if 'params' in step_params or 'data' in step_params:
            return step_params.get('params'), step_params.get('data')
        return (step_params or None), None

    def _resolve_value(self, value: Any, context: Dict) -> Any:
        """Replace ``${key}`` references in ``value``, recursing into dicts and lists."""
        if isinstance(value, str) and value.startswith('${') and value.endswith('}'):
            return context.get(value[2:-1])
        if isinstance(value, dict):
            return {key: self._resolve_value(item, context) for key, item in value.items()}
        if isinstance(value, list):
            return [self._resolve_value(item, context) for item in value]
        return value

    def _prepare_step_parameters(self, step: Dict, context: Dict) -> Dict[str, Any]:
        """Return the step's ``parameters`` with context references resolved."""
        return {
            param_key: self._resolve_value(param_value, context)
            for param_key, param_value in step.get('parameters', {}).items()
        }

    @staticmethod
    def _resolve_path(data: Any, path: str) -> Any:
        """Follow a dot-separated ``path`` through nested dicts; None if absent."""
        current = data
        for part in path.split('.'):
            if isinstance(current, dict) and part in current:
                current = current[part]
            else:
                return None
        return current

    def _update_context(self, context: Dict, output_mapping: Dict, api_result: Dict):
        """Copy values out of ``api_result['data']`` into ``context``.

        ``output_mapping`` maps a context key to a (possibly dotted) path in
        the response data. Missing paths, or data that is not a dict, store
        None instead of raising.
        """
        for context_key, result_path in output_mapping.items():
            context[context_key] = self._resolve_path(api_result.get('data'), result_path)

    def _evaluate_condition(self, condition: Dict, api_result: Dict) -> bool:
        """Return whether the workflow may continue after this step.

        Supported ``type`` values: ``always``, ``on_success``,
        ``on_failure`` and ``data_condition``; unknown types count as true.
        """
        condition_type = condition.get('type', 'always')
        if condition_type == 'on_success':
            return api_result['success']
        if condition_type == 'on_failure':
            return not api_result['success']
        if condition_type == 'data_condition':
            return self._evaluate_data_condition(condition, api_result)
        return True

    def _evaluate_data_condition(self, condition: Dict, api_result: Dict) -> bool:
        """Compare a field of the response data with an expected value.

        ``condition`` needs ``field`` (dotted path), ``operator`` (``equals``,
        ``greater_than``, ``less_than`` or ``contains``) and ``value``.
        Failed calls and incomparable values evaluate to False; unknown
        operators evaluate to True.
        """
        if not api_result['success']:
            return False
        operator = condition['operator']
        expected_value = condition['value']
        # Dotted and plain field names are resolved the same way.
        current_data = self._resolve_path(api_result['data'], condition['field'])

        try:
            if operator == 'equals':
                return current_data == expected_value
            elif operator == 'greater_than':
                return current_data > expected_value
            elif operator == 'less_than':
                return current_data < expected_value
            elif operator == 'contains':
                return expected_value in str(current_data)
        except TypeError:
            # e.g. a missing field (None) compared with a number.
            return False
        return True
# 使用示例
def demo_api_orchestration():
    """Register a four-step data-analysis workflow, run it and print the result."""
    orchestrator = APIOrchestrator(DynamicAPIClient())

    # Step 1: fetch the raw data set and keep it in the context as raw_data.
    fetch_step = {
        'name': '数据获取',
        'api': 'data_source',
        'endpoint': 'fetch',
        'parameters': {
            'dataset': 'sales_data',
            'time_range': 'last_30_days'
        },
        'output_mapping': {
            'raw_data': 'data'
        }
    }
    # Step 2: clean the data; the workflow stops here if the call fails.
    clean_step = {
        'name': '数据清洗',
        'api': 'data_processor',
        'endpoint': 'clean',
        'parameters': {
            'input_data': '${raw_data}',
            'methods': ['remove_duplicates', 'fill_missing']
        },
        'output_mapping': {
            'cleaned_data': 'result'
        },
        'condition': {
            'type': 'on_success'
        }
    }
    # Step 3: compute statistics over the cleaned data.
    analyze_step = {
        'name': '统计分析',
        'api': 'analyzer',
        'endpoint': 'statistics',
        'parameters': {
            'data': '${cleaned_data}',
            'metrics': ['mean', 'median', 'std_dev']
        },
        'output_mapping': {
            'stats': 'analysis'
        }
    }
    # Step 4: turn the statistics into a summary report.
    report_step = {
        'name': '生成报告',
        'api': 'report_generator',
        'endpoint': 'create',
        'parameters': {
            'analysis_results': '${stats}',
            'format': 'summary'
        }
    }

    orchestrator.register_workflow(
        'data_analysis',
        [fetch_step, clean_step, analyze_step, report_step]
    )

    result = orchestrator.execute_workflow('data_analysis')
    print("工作流执行结果:")
    print(json.dumps(result, ensure_ascii=False, indent=2))
# demo_api_orchestration()
二、LLM Agent 多模态推理技术
2.1 多模态推理的核心挑战
多模态推理要求 Agent 能够理解和处理不同类型的信息:
- 模态对齐:不同模态信息的语义对齐和关联
- 信息融合:有效整合文本、图像、音频等信息
- 跨模态理解:理解不同模态之间的语义关系
- 推理一致性:确保跨模态推理的逻辑一致性
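2.2 多模态推理架构设计
(原文的架构图在此处缺失,以文字说明代替。)整体流程为:多模态输入 → 各模态特征提取(文本 / 图像 / 音频)→ 特征融合(早期 / 晚期 / 混合)→ 选择并执行推理策略 → 生成解释与置信度。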
2.3 基础多模态处理系统
import base64
import io
from PIL import Image
import numpy as np
from typing import Dict, List, Any, Union
class MultimodalProcessor:
    """Extracts simple features per modality and fuses them.

    Supported modalities are ``text``, ``image`` and ``audio``. Most
    extractors are deliberately simplified placeholders (keyword matching,
    constant values) rather than model-based implementations.
    """

    def __init__(self):
        self.supported_modalities = ['text', 'image', 'audio']
        self.feature_extractors = {}
        self.fusion_strategies = {}
        self._initialize_processors()

    def _initialize_processors(self):
        """Register the per-modality extractors and the fusion strategies."""
        self.feature_extractors['text'] = self._extract_text_features
        self.feature_extractors['image'] = self._extract_image_features
        self.feature_extractors['audio'] = self._extract_audio_features
        self.fusion_strategies['early'] = self._early_fusion
        self.fusion_strategies['late'] = self._late_fusion
        self.fusion_strategies['hybrid'] = self._hybrid_fusion

    def process_input(self, multimodal_input: Dict[str, Any],
                      fusion_strategy: str = 'hybrid') -> Dict[str, Any]:
        """Extract features for every supported modality and fuse them.

        Args:
            multimodal_input: mapping of modality name to raw data; keys
                that are not supported modalities are ignored.
            fusion_strategy: ``early``, ``late`` or ``hybrid``; an unknown
                name falls back to hybrid fusion.

        Returns:
            A dict with ``modalities``, ``features``, ``fusion_strategy``,
            ``timestamp`` and ``fused_features``.
        """
        processed_data = {
            'modalities': [],
            'features': {},
            'fusion_strategy': fusion_strategy,
            'timestamp': datetime.now().isoformat()
        }
        for modality, data in multimodal_input.items():
            if modality in self.supported_modalities:
                processed_data['modalities'].append(modality)
                processed_data['features'][modality] = self.feature_extractors[modality](data)

        processed_data['fused_features'] = self._fuse_modalities(
            processed_data['features'],
            processed_data['fusion_strategy']
        )
        return processed_data

    def _extract_text_features(self, text: str) -> Dict[str, Any]:
        """Return counts, key phrases, sentiment and entities for ``text``.

        ``raw_text`` carries the original string so that downstream
        consumers can inspect the content itself.
        """
        words = text.split()
        # Ignore empty segments so a trailing '。' does not add a sentence.
        sentences = [segment for segment in text.split('。') if segment.strip()]
        return {
            'raw_text': text,
            'word_count': len(words),
            'sentence_count': len(sentences),
            'avg_word_length': sum(len(word) for word in words) / len(words) if words else 0,
            'key_phrases': self._extract_key_phrases(text),
            'sentiment': self._analyze_sentiment(text),
            'entities': self._extract_entities(text)
        }

    def _extract_image_features(self, image_data: Any) -> Dict[str, Any]:
        """Return size, colour and texture features for an image.

        Accepts a PIL image, raw encoded bytes or a ``data:image`` base64
        URL. Any failure is reported as a dict with an ``error`` message
        instead of being raised.
        """
        try:
            if isinstance(image_data, str) and image_data.startswith('data:image'):
                image_data = self._decode_base64_image(image_data)
            if isinstance(image_data, Image.Image):
                image = image_data
            else:
                image = Image.open(io.BytesIO(image_data))

            width, height = image.size
            histogram = image.histogram()
            return {
                'dimensions': {'width': width, 'height': height},
                'mode': image.mode,
                'color_features': {
                    # First 256 bins only, i.e. the first band (R for RGB).
                    'histogram': histogram[:256],
                    'dominant_colors': self._extract_dominant_colors(image)
                },
                'texture_features': self._extract_texture_features(image),
                'contains_text': self._detect_text_in_image(image),
                'object_count': self._estimate_objects(image)
            }
        except Exception as e:
            return {
                'error': f"图像处理失败: {str(e)}",
                'dimensions': None,
                'color_features': None
            }

    def _extract_audio_features(self, audio_data: Any) -> Dict[str, Any]:
        """Return placeholder audio features plus the (simulated) transcript."""
        return {
            'duration': 0,
            'sample_rate': 0,
            'amplitude_features': {},
            'spectral_features': {},
            'transcribed_text': self._transcribe_audio(audio_data)
        }

    def _extract_key_phrases(self, text: str) -> List[str]:
        """Return the whole text for short inputs, else the first two word pairs."""
        words = text.split()
        if len(words) <= 3:
            return [text]
        return [f"{words[i]} {words[i+1]}" for i in range(min(2, len(words) - 1))]

    def _analyze_sentiment(self, text: str) -> str:
        """Classify ``text`` as positive, negative or neutral by keyword counts."""
        positive_words = ['好', '棒', '优秀', '满意', '高兴']
        negative_words = ['差', '坏', '糟糕', '不满', '生气']
        pos_count = sum(1 for word in positive_words if word in text)
        neg_count = sum(1 for word in negative_words if word in text)
        if pos_count > neg_count:
            return 'positive'
        elif neg_count > pos_count:
            return 'negative'
        else:
            return 'neutral'

    def _extract_entities(self, text: str) -> List[Dict]:
        """Return a location entity for each known city name found in ``text``."""
        location_words = ['北京', '上海', '广州', '深圳']
        return [
            {'type': 'location', 'value': location}
            for location in location_words
            if location in text
        ]

    def _decode_base64_image(self, base64_string: str) -> "Image.Image":
        """Decode a base64 string (optionally a ``data:image`` URL) into an image."""
        if base64_string.startswith('data:image'):
            base64_string = base64_string.split(',')[1]
        image_data = base64.b64decode(base64_string)
        return Image.open(io.BytesIO(image_data))

    def _extract_dominant_colors(self, image: "Image.Image", num_colors: int = 3) -> List[tuple]:
        """Return the first ``num_colors`` pixels of a 100x100 thumbnail.

        Placeholder: a real implementation would cluster the colours.
        """
        image = image.resize((100, 100))  # shrink to keep processing cheap
        pixels = list(image.getdata())
        return pixels[:num_colors]

    def _extract_texture_features(self, image: "Image.Image") -> Dict[str, float]:
        """Return contrast (std) and brightness (mean) of the grayscale image."""
        gray_image = image.convert('L')
        pixels = np.array(gray_image)
        return {
            'contrast': np.std(pixels),
            'brightness': np.mean(pixels),
            'smoothness': 0.5  # placeholder constant
        }

    def _detect_text_in_image(self, image: "Image.Image") -> bool:
        """Placeholder for OCR-based detection; always returns False."""
        return False

    def _estimate_objects(self, image: "Image.Image") -> int:
        """Placeholder for object counting; always returns 1."""
        return 1

    def _transcribe_audio(self, audio_data: Any) -> str:
        """Placeholder for speech-to-text; returns a fixed string."""
        return "模拟转写文本"

    def _fuse_modalities(self, features: Dict[str, Any], strategy: str) -> Dict[str, Any]:
        """Apply the named fusion strategy, defaulting to hybrid fusion."""
        if strategy in self.fusion_strategies:
            return self.fusion_strategies[strategy](features)
        return self._hybrid_fusion(features)

    def _early_fusion(self, features: Dict[str, Any]) -> Dict[str, Any]:
        """Feature-level fusion: flatten numeric features into one mapping.

        Numeric values at the top level and one level down are copied under
        ``<modality>_<key>`` / ``<modality>_<key>_<sub_key>`` names.
        """
        fused = {
            'strategy': 'early',
            'combined_features': {},
            'modality_weights': {}
        }
        for modality, feature_dict in features.items():
            for key, value in feature_dict.items():
                if isinstance(value, (int, float)):
                    fused['combined_features'][f"{modality}_{key}"] = value
                elif isinstance(value, dict):
                    for sub_key, sub_value in value.items():
                        if isinstance(sub_value, (int, float)):
                            fused['combined_features'][f"{modality}_{key}_{sub_key}"] = sub_value
        return fused

    def _late_fusion(self, features: Dict[str, Any]) -> Dict[str, Any]:
        """Decision-level fusion: keep per-modality features and a consensus."""
        return {
            'strategy': 'late',
            'modality_specific_features': features.copy(),
            'consensus_decision': self._reach_consensus(features)
        }

    def _hybrid_fusion(self, features: Dict[str, Any]) -> Dict[str, Any]:
        """Run both early and late fusion and combine their decisions."""
        early_fused = self._early_fusion(features)
        late_fused = self._late_fusion(features)
        return {
            'strategy': 'hybrid',
            'early_fusion': early_fused,
            'late_fusion': late_fused,
            'final_decision': self._combine_decisions(early_fused, late_fused)
        }

    def _reach_consensus(self, features: Dict[str, Any]) -> str:
        """Return a consensus string based on the text sentiment, if any."""
        if 'text' in features:
            text_sentiment = features['text'].get('sentiment', 'neutral')
            return f"基于文本情感: {text_sentiment}"
        return "无法达成共识"

    def _combine_decisions(self, early_fused: Dict, late_fused: Dict) -> str:
        """Return the final decision string built from the late-fusion consensus."""
        return f"混合决策: {late_fused['consensus_decision']}"
# 使用示例
def demo_multimodal_processing():
    """Process a text + image + audio sample and print the resulting features."""
    # A solid red 100x100 picture serves as the image input.
    red_square = Image.new('RGB', (100, 100), color='red')
    sample = {
        'text': '这张红色的图片很漂亮,让人感到开心',
        'image': red_square,
        'audio': b'fake_audio_data'
    }

    processed = MultimodalProcessor().process_input(sample)

    print("多模态处理结果:")
    print(json.dumps(processed, ensure_ascii=False, indent=2))
# demo_multimodal_processing()
2.4 跨模态推理引擎
class CrossModalReasoner:
    """Applies a reasoning strategy to the output of a multimodal processor.

    The processor must offer ``process_input(multimodal_input)`` returning
    a dict with ``modalities`` and ``features``. The strategies are
    simplified, rule-based placeholders.
    """

    def __init__(self, multimodal_processor: "MultimodalProcessor"):
        self.processor = multimodal_processor
        self.reasoning_strategies = {}
        self.knowledge_graph = {}
        self._initialize_reasoning_strategies()

    def _initialize_reasoning_strategies(self):
        """Register the available strategies by name."""
        self.reasoning_strategies['consistency_check'] = self._check_consistency
        self.reasoning_strategies['complementary_inference'] = self._complementary_inference
        self.reasoning_strategies['causal_reasoning'] = self._causal_reasoning
        self.reasoning_strategies['temporal_reasoning'] = self._temporal_reasoning

    def perform_reasoning(self, multimodal_input: Dict[str, Any],
                          task: str = "general") -> Dict[str, Any]:
        """Process the input, run the strategy chosen for ``task`` and explain it.

        Returns:
            A dict with ``reasoning_strategy`` (the strategy method's name),
            ``input_modalities``, ``reasoning_result``, ``explanation``,
            ``confidence`` and ``timestamp``.
        """
        processed_data = self.processor.process_input(multimodal_input)
        reasoning_strategy = self._select_reasoning_strategy(task, processed_data)
        reasoning_result = reasoning_strategy(processed_data)
        explanation = self._generate_explanation(reasoning_result, processed_data)
        return {
            'reasoning_strategy': reasoning_strategy.__name__,
            'input_modalities': processed_data['modalities'],
            'reasoning_result': reasoning_result,
            'explanation': explanation,
            'confidence': self._calculate_confidence(reasoning_result),
            'timestamp': datetime.now().isoformat()
        }

    def _select_reasoning_strategy(self, task: str,
                                   processed_data: Dict) -> callable:
        """Pick the strategy for ``task``.

        Known task names are mapped explicitly; a registered strategy name
        (e.g. ``temporal_reasoning``) is also accepted directly. Any other
        task uses complementary inference for two or more modalities and
        the consistency check otherwise.
        """
        task_to_strategy = {
            'consistency_verification': 'consistency_check',
            'information_completion': 'complementary_inference',
            'causal_analysis': 'causal_reasoning'
        }
        strategy_name = task_to_strategy.get(task, task)
        if strategy_name in self.reasoning_strategies:
            return self.reasoning_strategies[strategy_name]

        if len(processed_data['modalities']) >= 2:
            return self.reasoning_strategies['complementary_inference']
        return self.reasoning_strategies['consistency_check']

    def _check_consistency(self, processed_data: Dict) -> Dict[str, Any]:
        """Flag a positive text paired with a dark image.

        Brightness is read from the image's ``texture_features``; when it
        is unavailable no inconsistency is reported.
        """
        features = processed_data['features']
        inconsistencies = []
        consistency_score = 1.0

        if 'text' in features and 'image' in features:
            text_sentiment = features['text'].get('sentiment')
            texture = features['image'].get('texture_features') or {}
            brightness = texture.get('brightness')
            # 100 is an assumed darkness threshold on the grayscale mean.
            if text_sentiment == 'positive' and brightness is not None and brightness < 100:
                inconsistencies.append("文本情感积极但图像较暗")
                consistency_score *= 0.7

        return {
            'is_consistent': len(inconsistencies) == 0,
            'inconsistencies': inconsistencies,
            'consistency_score': consistency_score,
            'suggested_resolutions': self._suggest_resolutions(inconsistencies)
        }

    def _complementary_inference(self, processed_data: Dict) -> Dict[str, Any]:
        """Combine the scenes inferred from text and image into one inference."""
        features = processed_data['features']
        inferences = []
        confidence = 0.8

        if 'text' in features and 'image' in features:
            scene_from_text = self._infer_scene_from_text(features['text'])
            scene_from_image = self._infer_scene_from_image(features['image'])
            combined_scene = self._combine_scenes(scene_from_text, scene_from_image)
            inferences.append(f"推断场景: {combined_scene}")
            # Agreement between modalities raises confidence, disagreement lowers it.
            if scene_from_text and scene_from_image:
                confidence = 0.9 if scene_from_text == scene_from_image else 0.7

        return {
            'inferences': inferences,
            'confidence': confidence,
            'information_gain': len(inferences),
            'complementary_relationships': self._identify_complementary_relationships(features)
        }

    def _causal_reasoning(self, processed_data: Dict) -> Dict[str, Any]:
        """Build causal chains from causal keywords found in the text."""
        features = processed_data['features']
        causal_chains = []
        if 'text' in features:
            causal_chains.extend(self._extract_causal_relations(features['text']))
        return {
            'causal_chains': causal_chains,
            'root_causes': self._identify_root_causes(causal_chains),
            'potential_effects': self._identify_potential_effects(causal_chains),
            'intervention_suggestions': self._suggest_interventions(causal_chains)
        }

    def _temporal_reasoning(self, processed_data: Dict) -> Dict[str, Any]:
        """Placeholder for temporal reasoning; returns empty structures."""
        return {
            'temporal_sequence': [],
            'event_ordering': [],
            'duration_estimates': {}
        }

    def _infer_scene_from_text(self, text_features: Dict) -> str:
        """Infer a scene label from keywords in the features' ``raw_text``.

        Returns ``general`` when ``raw_text`` is absent or has no keyword.
        """
        text_content = text_features.get('raw_text', '')
        if '天气' in text_content or '气温' in text_content:
            return 'weather'
        elif '食物' in text_content or '餐厅' in text_content:
            return 'food'
        elif '交通' in text_content or '道路' in text_content:
            return 'traffic'
        else:
            return 'general'

    def _infer_scene_from_image(self, image_features: Dict) -> str:
        """Return ``outdoor`` when the first dominant colour is bluish, else ``indoor``."""
        # color_features is None when image extraction failed.
        colors = image_features.get('color_features') or {}
        dominant_colors = colors.get('dominant_colors') or []
        if dominant_colors:
            # A strong blue channel is taken as a hint of sky or water.
            if any(isinstance(color, tuple) and len(color) == 3 and color[2] > 100
                   for color in dominant_colors[:1]):
                return 'outdoor'
        return 'indoor'

    def _combine_scenes(self, scene_text: str, scene_image: str) -> str:
        """Merge the text and image scene labels into one label."""
        if scene_text == scene_image:
            return scene_text
        elif scene_text == 'weather' and scene_image == 'outdoor':
            return 'outdoor_weather'
        elif scene_text == 'food' and scene_image == 'indoor':
            return 'indoor_dining'
        else:
            return f"{scene_text}_{scene_image}"

    def _identify_complementary_relationships(self, features: Dict) -> List[str]:
        """Describe how the present modalities complement each other."""
        relationships = []
        if 'text' in features and 'image' in features:
            relationships.append("文本提供语义,图像提供视觉证据")
        if 'audio' in features and 'text' in features:
            relationships.append("音频提供语调信息,文本提供内容")
        return relationships

    def _extract_causal_relations(self, text_features: Dict) -> List[Dict]:
        """Return one generic relation per causal keyword found in ``raw_text``."""
        text_content = text_features.get('raw_text', '')
        causal_keywords = ['因为', '所以', '导致', '造成', '因此']
        return [
            {
                'cause': f"包含 '{keyword}' 的陈述",
                'effect': '相关结果',
                'confidence': 0.7
            }
            for keyword in causal_keywords
            if keyword in text_content
        ]

    def _identify_root_causes(self, causal_chains: List[Dict]) -> List[str]:
        """Return the ``cause`` of every chain."""
        return [chain['cause'] for chain in causal_chains]

    def _identify_potential_effects(self, causal_chains: List[Dict]) -> List[str]:
        """Return the ``effect`` of every chain."""
        return [chain['effect'] for chain in causal_chains]

    def _suggest_interventions(self, causal_chains: List[Dict]) -> List[str]:
        """Return one intervention suggestion per chain."""
        return [f"干预 {chain['cause']} 来改变 {chain['effect']}"
                for chain in causal_chains]

    def _suggest_resolutions(self, inconsistencies: List[str]) -> List[str]:
        """Return one resolution hint per reported inconsistency."""
        resolutions = []
        for inconsistency in inconsistencies:
            if "文本情感积极但图像较暗" in inconsistency:
                resolutions.append("重新评估图像亮度或文本情感")
            else:
                resolutions.append("检查多模态数据的一致性")
        return resolutions

    def _generate_explanation(self, reasoning_result: Dict,
                              processed_data: Dict) -> str:
        """Return a human-readable explanation of ``reasoning_result``.

        An explicit ``reasoning_strategy`` key is honoured; otherwise the
        strategy is recognised from the result's shape, because the
        strategies themselves do not add that key.
        """
        strategy = reasoning_result.get('reasoning_strategy')
        if strategy is None:
            if 'is_consistent' in reasoning_result:
                strategy = 'consistency_check'
            elif 'inferences' in reasoning_result:
                strategy = 'complementary_inference'

        if strategy == 'consistency_check':
            if reasoning_result['is_consistent']:
                return "所有模态的信息是一致的"
            return f"发现不一致: {', '.join(reasoning_result['inconsistencies'])}"
        elif strategy == 'complementary_inference':
            inferences = reasoning_result.get('inferences', [])
            return f"基于多模态互补推理: {', '.join(inferences)}"
        return "执行了跨模态推理分析"

    def _calculate_confidence(self, reasoning_result: Dict) -> float:
        """Return the result's ``confidence``, else ``consistency_score``, else 0.5."""
        if 'confidence' in reasoning_result:
            return reasoning_result['confidence']
        elif 'consistency_score' in reasoning_result:
            return reasoning_result['consistency_score']
        return 0.5
# 使用示例
def demo_cross_modal_reasoning():
    """Run a consistency check on a cheerful text paired with a dark image and print it."""
    reasoner = CrossModalReasoner(MultimodalProcessor())

    # A uniformly dark grey picture contrasts with the upbeat sentence.
    dark_image = Image.new('RGB', (200, 200), color=(50, 50, 50))
    sample = {
        'text': '今天天气真好,阳光明媚让人心情愉快!',
        'image': dark_image
    }

    result = reasoner.perform_reasoning(sample, 'consistency_verification')

    print("跨模态推理结果:")
    print(json.dumps(result, ensure_ascii=False, indent=2))
# demo_cross_modal_reasoning()
2.5 多模态 Agent 完整系统
class MultimodalAgent:
    """End-to-end multimodal agent.

    Combines a ``MultimodalProcessor`` (feature extraction), a
    ``CrossModalReasoner`` (cross-modal reasoning) and a
    ``DynamicAPIClient`` (API registry), and keeps a bounded conversation
    history.

    API calls are simulated in ``_execute_api_actions``; the registered
    client is not used to send real requests.
    """

    # Maximum number of entries retained in ``conversation_history``.
    MAX_HISTORY = 10
    # Confidence reported when the reasoning result does not carry one.
    DEFAULT_CONFIDENCE = 0.5

    def __init__(self, name: str = "多模态助手"):
        """Create the agent's components and register its APIs.

        Args:
            name: Display name reported by ``get_agent_status``.
        """
        self.name = name
        self.multimodal_processor = MultimodalProcessor()
        self.cross_modal_reasoner = CrossModalReasoner(self.multimodal_processor)
        self.api_client = DynamicAPIClient()
        self.conversation_history = []
        self._initialize_apis()

    def _initialize_apis(self):
        """Register the vision and speech API configurations.

        ``register_api`` rejects a config that lacks ``base_url``,
        ``endpoints`` or ``authentication``, so every config declares the
        endpoint this agent refers to.
        """
        # NOTE(review): the per-endpoint entry schema ('path'/'method') is
        # assumed; confirm against DynamicAPIClient._build_request_url.
        vision_api_config = {
            'base_url': 'https://api.vision.example.com/v1',
            'endpoints': {
                'analyze': {'path': '/analyze', 'method': 'POST'}
            },
            'method': 'POST',
            'authentication': {
                'type': 'api_key',
                'key': 'vision_api_key'
            },
            'headers': {
                'Accept': 'application/json'
            }
        }
        self.api_client.register_api('vision', vision_api_config)

        speech_api_config = {
            'base_url': 'https://api.speech.example.com/v1',
            'endpoints': {
                'transcribe': {'path': '/transcribe', 'method': 'POST'}
            },
            'method': 'POST',
            'authentication': {
                'type': 'bearer_token',
                'token': 'speech_token'
            }
        }
        self.api_client.register_api('speech', speech_api_config)

    def process_multimodal_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """Run the full pipeline for one request and return the response.

        Steps: feature extraction, cross-modal reasoning, API action
        selection, (simulated) API execution, response generation. The
        request/response pair is appended to the conversation history.

        Args:
            request_data: Mapping of modality name to raw input.

        Returns:
            The response dict built by ``_generate_multimodal_response``.
        """
        print("\n=== 多模态 Agent 处理开始 ===")

        # Step 1: multimodal feature extraction.
        print("1. 多模态特征提取...")
        processed_data = self.multimodal_processor.process_input(request_data)
        modalities = processed_data['modalities']
        print(f" 检测到模态: {modalities}")

        # Step 2: cross-modal reasoning.
        print("2. 跨模态推理...")
        reasoning_task = self._determine_reasoning_task(request_data)
        reasoning_result = self.cross_modal_reasoner.perform_reasoning(
            request_data, reasoning_task)
        # Tolerate results that omit these keys instead of aborting the
        # whole request on a progress message.
        strategy = reasoning_result.get('reasoning_strategy', 'unknown')
        confidence = reasoning_result.get('confidence', self.DEFAULT_CONFIDENCE)
        print(f" 推理策略: {strategy}")
        print(f" 推理置信度: {confidence:.2f}")

        # Step 3: decide which APIs to call.
        print("3. API 调用决策...")
        api_actions = self._decide_api_actions(processed_data, reasoning_result)

        # Step 4: execute the API calls.
        print("4. 执行 API 调用...")
        api_results = self._execute_api_actions(api_actions)

        # Step 5: build the response.
        print("5. 生成多模态响应...")
        final_response = self._generate_multimodal_response(
            processed_data, reasoning_result, api_results)

        self._update_conversation_history(request_data, final_response)
        print("=== 处理完成 ===\n")
        return final_response

    def _determine_reasoning_task(self, request_data: Dict) -> str:
        """Pick the reasoning task name for a request.

        Every key of ``request_data`` counts as one modality. Two or more
        keys select ``'complementary_inference'``. A text-only request
        selects ``'causal_analysis'`` when it contains a causal keyword and
        ``'consistency_verification'`` otherwise. Anything else is
        ``'general'``.
        """
        if len(request_data) >= 2:
            return 'complementary_inference'
        if 'text' in request_data:
            text = request_data['text']
            if any(word in text for word in ['因为', '所以', '原因']):
                return 'causal_analysis'
            return 'consistency_verification'
        return 'general'

    def _decide_api_actions(self, processed_data: Dict,
                            reasoning_result: Dict) -> List[Dict]:
        """Build the list of API actions for the processed input.

        Args:
            processed_data: Processor output; ``modalities`` and, for each
                image/audio modality present, ``features[modality]`` are read.
            reasoning_result: Reasoner output; ``is_consistent`` and
                ``inconsistencies`` are read when present.

        Returns:
            A list of action dicts with ``api``, ``endpoint``, ``purpose``
            and ``data`` keys.
        """
        actions = []
        modalities = processed_data['modalities']

        # Modality-driven actions.
        if 'image' in modalities:
            actions.append({
                'api': 'vision',
                'endpoint': 'analyze',
                'purpose': '深度图像分析',
                'data': processed_data['features']['image']
            })
        if 'audio' in modalities:
            actions.append({
                'api': 'speech',
                'endpoint': 'transcribe',
                'purpose': '语音转文本',
                'data': processed_data['features']['audio']
            })

        # Reasoning-driven action: only when the result explicitly reports
        # an inconsistency. The inconsistency list itself may be absent.
        if not reasoning_result.get('is_consistent', True):
            actions.append({
                'api': 'knowledge',
                'endpoint': 'verify',
                'purpose': '验证信息一致性',
                'data': reasoning_result.get('inconsistencies', [])
            })
        return actions

    def _execute_api_actions(self, api_actions: List[Dict]) -> List[Dict]:
        """Return one simulated result per action, in the same order.

        No network request is made; each result is a canned dict with
        ``success``, ``data`` and ``api_used`` keys.
        """
        results = []
        for action in api_actions:
            api_name = action['api']
            print(f" 调用 {api_name} API: {action['purpose']}")
            # Simulated API results.
            if api_name == 'vision':
                data = {
                    'objects_detected': ['person', 'car', 'building'],
                    'scene_classification': 'urban',
                    'color_analysis': {'dominant_colors': ['gray', 'blue']}
                }
            elif api_name == 'speech':
                data = {
                    'transcribed_text': '这是模拟的语音转写结果',
                    'confidence': 0.85,
                    'language': 'zh-CN'
                }
            else:
                data = {'verification_result': 'consistent'}
            results.append({
                'success': True,
                'data': data,
                'api_used': api_name
            })
        return results

    def _generate_multimodal_response(self, processed_data: Dict,
                                      reasoning_result: Dict,
                                      api_results: List[Dict]) -> Dict[str, Any]:
        """Assemble the final response dict.

        ``reasoning_summary`` falls back to an empty string and
        ``confidence`` to ``DEFAULT_CONFIDENCE`` when the reasoning result
        does not provide them.
        """
        text_response = self._build_text_response(
            processed_data, reasoning_result, api_results)
        multimodal_elements = self._prepare_multimodal_elements(api_results)
        return {
            'text': text_response,
            'multimodal_elements': multimodal_elements,
            'reasoning_summary': reasoning_result.get('explanation', ''),
            'confidence': reasoning_result.get('confidence', self.DEFAULT_CONFIDENCE),
            'timestamp': datetime.now().isoformat(),
            'modalities_processed': processed_data['modalities']
        }

    def _build_text_response(self, processed_data: Dict,
                             reasoning_result: Dict,
                             api_results: List[Dict]) -> str:
        """Compose the text part of the response.

        One sentence is added per processed modality, followed by the
        matching API findings, and a warning when the reasoning result
        reports an inconsistency. Parts are joined with a single space.
        """
        modalities = processed_data['modalities']
        response_parts = []

        if 'text' in modalities:
            response_parts.append("我已经理解了您的文本内容。")

        if 'image' in modalities:
            response_parts.append("我已经分析了您提供的图像。")
            # Append the detected objects from each vision result.
            for api_result in api_results:
                if api_result['api_used'] == 'vision':
                    objects = api_result['data'].get('objects_detected', [])
                    if objects:
                        response_parts.append(f"在图像中检测到: {', '.join(objects)}")

        if 'audio' in modalities:
            response_parts.append("我已经处理了音频内容。")
            # Append the transcript from each speech result.
            for api_result in api_results:
                if api_result['api_used'] == 'speech':
                    transcribed = api_result['data'].get('transcribed_text')
                    if transcribed:
                        response_parts.append(f"语音转写结果: {transcribed}")

        if not reasoning_result.get('is_consistent', True):
            response_parts.append("注意: 发现多模态信息之间存在不一致。")

        if response_parts:
            return " ".join(response_parts)
        return "我已经处理了您提供的多模态信息。"

    def _prepare_multimodal_elements(self, api_results: List[Dict]) -> Dict[str, Any]:
        """Collect vision and speech result payloads keyed by analysis type.

        A later result of the same API overwrites an earlier one.
        """
        elements = {}
        for api_result in api_results:
            if api_result['api_used'] == 'vision':
                elements['image_analysis'] = api_result['data']
            elif api_result['api_used'] == 'speech':
                elements['speech_analysis'] = api_result['data']
        return elements

    def _update_conversation_history(self, request: Dict, response: Dict):
        """Append a request/response pair and trim the history.

        At most ``MAX_HISTORY`` entries are kept; the oldest are dropped.
        """
        self.conversation_history.append({
            'request': request,
            'response': response,
            'timestamp': datetime.now().isoformat()
        })
        overflow = len(self.conversation_history) - self.MAX_HISTORY
        if overflow > 0:
            del self.conversation_history[:overflow]

    def get_agent_status(self) -> Dict[str, Any]:
        """Return a snapshot of the agent's state.

        ``last_activity`` is the timestamp of the newest history entry, or
        ``'无'`` when the history is empty.
        """
        if self.conversation_history:
            last_activity = self.conversation_history[-1]['timestamp']
        else:
            last_activity = '无'
        return {
            'name': self.name,
            'conversation_count': len(self.conversation_history),
            'supported_modalities': self.multimodal_processor.supported_modalities,
            'registered_apis': list(self.api_client.registered_apis.keys()),
            'last_activity': last_activity
        }
# 使用示例
def demo_multimodal_agent():
    """Drive a ``MultimodalAgent`` through two text+image requests.

    Prints each response's text, processed modalities and confidence,
    then the agent status as JSON.
    """
    agent = MultimodalAgent("智能多模态助手")

    # Both sample requests share the same light-blue test image.
    sample_image = Image.new('RGB', (300, 200), color='lightblue')
    prompts = ('请分析这张图片的内容', '今天的天气和这张图片匹配吗?')
    sample_requests = [{'text': prompt, 'image': sample_image} for prompt in prompts]

    for index, request in enumerate(sample_requests, start=1):
        print(f"\n测试请求 {index}:")
        reply = agent.process_multimodal_request(request)
        print(f"Agent 响应: {reply['text']}")
        print(f"处理模态: {reply['modalities_processed']}")
        print(f"置信度: {reply['confidence']:.2f}")

    # Show the agent status.
    status = agent.get_agent_status()
    print("\nAgent 状态:")
    print(json.dumps(status, ensure_ascii=False, indent=2))


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    demo_multimodal_agent()
三、总结与最佳实践
3.1 核心技术总结
通过本文的详细探讨,我们了解了 LLM Agent 在动态 API 调用和多模态推理方面的核心技术:
动态 API 调用关键点:
- 灵活注册机制:支持动态添加和管理 API
- 智能路由:基于意图识别自动选择合适 API
- 参数提取:从自然语言中自动提取 API 参数
- 错误处理:完善的异常处理和降级方案
- 缓存优化:提高性能并减少 API 调用次数
多模态推理关键点:
- 特征提取:各模态信息的标准化特征表示
- 跨模态对齐:不同模态信息的语义对齐
- 融合策略:早期、晚期和混合融合方法
- 一致性检查:确保多模态信息的一致性
- 互补推理:利用多模态信息的互补性
3.2 最佳实践建议
API 设计原则:
- 提供清晰的 API 描述和文档
- 实现统一的错误处理机制
- 考虑速率限制和配额管理
多模态处理建议:
- 设计可扩展的模态处理框架
- 实现模态间的容错机制
- 考虑计算效率和资源消耗
系统架构考虑:
- 模块化设计便于维护和扩展
- 实现完善的日志和监控
- 考虑安全性和隐私保护
3.3 未来发展方向
随着技术的不断发展,LLM Agent 在动态 API 调用和多模态推理方面还有很大的提升空间:
- 更智能的 API 发现:自动发现和集成新的 API
- 端到端的多模态学习:统一的跨模态表示学习
- 实时自适应:根据上下文动态调整处理策略
- 联邦学习集成:在保护隐私的前提下实现知识共享
- 因果推理增强:更深入地理解因果关系
通过掌握这些核心技术,开发者可以构建出真正智能、实用的 LLM Agent 系统,在各种复杂场景中提供准确、高效的服务。
浙公网安备 33010602011771号