"""
多模态分析API模块
"""
import logging
import os
import uuid
import cv2
import ffmpeg
import numpy as np
from app.api.auth import token_required
from app.models.analysis import MultimodalAnalysis
from app.services.audio import extract_and_evaluate_audio
from flask import current_app, jsonify, request
# 配置日志
logger = logging.getLogger(__name__)
def _cleanup_temp_file(path):
    """Best-effort removal of a temporary file (kept when DEBUG is on)."""
    if current_app.config.get("DEBUG"):
        return
    try:
        os.remove(path)
    except OSError:
        # Cleanup failure is logged but never surfaced to the client.
        logger.warning(f"无法删除临时视频文件: {path}")


def _collect_frame_metrics(video_path, max_frames=300):
    """Scan the video and collect raw per-frame face/eye/motion measurements.

    Reads at most ~max_frames frames to bound the work for long recordings.

    Returns:
        None when OpenCV cannot open the file, otherwise a dict with
        counters (`frame_count`, `face_detected_frames`, `eye_contact_frames`)
        and per-frame measurement lists.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None

    # Haar cascades shipped with OpenCV for face / eye detection.
    face_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
    )
    eye_cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + 'haarcascade_eye.xml'
    )

    metrics = {
        'frame_count': 0,
        'face_detected_frames': 0,
        'eye_contact_frames': 0,
        'facial_expression_variance': [],  # pixel stddev of each face ROI
        'face_positions': [],              # face centre points (x, y)
        'head_poses': [],                  # face width/height ratios
        'frame_diffs': [],                 # mean inter-frame difference
        'upper_body_regions': [],          # pixel stddev below each face
    }
    prev_frame = None
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # Mean inter-frame difference approximates overall motion.
            if prev_frame is not None:
                metrics['frame_diffs'].append(
                    np.mean(cv2.absdiff(prev_frame, gray))
                )
            prev_frame = gray.copy()

            faces = face_cascade.detectMultiScale(gray, 1.3, 5)
            if len(faces) > 0:
                metrics['face_detected_frames'] += 1
                for (x, y, w, h) in faces:
                    metrics['face_positions'].append((x + w // 2, y + h // 2))
                    roi_gray = gray[y:y + h, x:x + w]

                    # Width/height ratio is a crude proxy for head
                    # orientation (≈1.0 means facing the camera).
                    metrics['head_poses'].append(w / h if h > 0 else 1.0)

                    # Upper body assumed to span 1.5 face-heights below the
                    # face.  Clamp the left edge to 0: a negative slice
                    # start would wrap around and yield a bogus region.
                    upper_body_y = y + h
                    upper_body_h = int(h * 1.5)
                    if upper_body_y + upper_body_h < frame.shape[0]:
                        left = max(0, x - w // 2)
                        upper_body = gray[
                            upper_body_y:upper_body_y + upper_body_h,
                            left:x + w + w // 2
                        ]
                        if upper_body.size:
                            metrics['upper_body_regions'].append(
                                np.std(upper_body)
                            )

                    # Both eyes detected in the face ROI ≈ eye contact.
                    eyes = eye_cascade.detectMultiScale(roi_gray)
                    if len(eyes) >= 2:
                        metrics['eye_contact_frames'] += 1

                    # Pixel stddev of the face ROI as a cheap proxy for
                    # expression variability.
                    metrics['facial_expression_variance'].append(
                        np.std(roi_gray)
                    )

            # Cap the number of processed frames to avoid unbounded work
            # on long uploads.
            metrics['frame_count'] += 1
            if metrics['frame_count'] > max_frames:
                break
    finally:
        # Release the capture even if a frame raises mid-loop.
        cap.release()
    return metrics


def _score_body_language(metrics):
    """Score posture, head pose, upper body and motion on a 0-10 scale.

    Returns:
        (aggregate_score, details) — `details` maps each aspect
        ('stability', 'headPose', 'upperBody', 'motion') to its rounded
        sub-score.  When no face was ever detected the aggregate falls
        back to a neutral default of 7.0.
    """
    details = {}

    # 1. Stability: low variance of the face centre means a steady posture.
    face_positions = metrics['face_positions']
    if len(face_positions) > 1:
        std_x = np.std([pos[0] for pos in face_positions])
        std_y = np.std([pos[1] for pos in face_positions])
        # 10-30 px of drift is considered natural; more is penalised.
        score_x = 10 - min(10, max(0, (std_x - 10) / 5))
        score_y = 10 - min(10, max(0, (std_y - 10) / 5))
        details['stability'] = round((score_x + score_y) / 2, 1)
    else:
        details['stability'] = 5.0  # neutral default

    # 2. Head pose: ratio near 1.0 ≈ facing the camera; moderate variation
    #    is fine, excessive movement is penalised.
    head_poses = metrics['head_poses']
    if head_poses:
        pose_score = 10 - min(10, abs(np.mean(head_poses) - 1.0) * 10)
        movement_score = 10 - min(
            10, max(0, (np.std(head_poses) - 0.05) * 20)
        )
        details['headPose'] = round((pose_score + movement_score) / 2, 1)
    else:
        details['headPose'] = 5.0  # neutral default

    # 3. Upper body: moderate variation (10-30) suggests natural gestures.
    upper_body_regions = metrics['upper_body_regions']
    if upper_body_regions:
        ub_std = np.mean(upper_body_regions)
        ub_score = 10 if 10 <= ub_std <= 30 else 10 - \
            min(10, abs(ub_std - 20) / 3)
        details['upperBody'] = round(ub_score, 1)
    else:
        details['upperBody'] = 5.0  # neutral default

    # 4. Motion: mean frame difference in [5, 15] is ideal (too little is
    #    stiff, too much is restless).
    frame_diffs = metrics['frame_diffs']
    if frame_diffs:
        motion_mean = np.mean(frame_diffs)
        motion_score = 10 if 5 <= motion_mean <= 15 else 10 - \
            min(10, abs(motion_mean - 10) / 2)
        details['motion'] = round(motion_score, 1)
    else:
        details['motion'] = 5.0  # neutral default

    if metrics['face_detected_frames'] > 0:
        aggregate = (
            details['stability'] * 0.3 +
            details['headPose'] * 0.3 +
            details['upperBody'] * 0.2 +
            details['motion'] * 0.2
        )
    else:
        aggregate = 7.0  # no face detected — keep a neutral default
    return aggregate, details


def _build_recommendations(eye_contact_score, facial_expressions_score,
                           body_language_details):
    """Translate low sub-scores into concrete coaching suggestions."""
    recommendations = []
    if eye_contact_score < 7:
        recommendations.append("增加与面试官的眼神接触")
    if facial_expressions_score < 6:
        recommendations.append("尝试展示更多自然的面部表情")
    if body_language_details.get('stability', 10) < 6:
        recommendations.append("面试时保持身体稳定,减少不必要的晃动")
    if body_language_details.get('headPose', 10) < 6:
        recommendations.append("保持头部正面朝向面试官,适当点头示意")
    if body_language_details.get('upperBody', 10) < 6:
        recommendations.append("注意上半身姿态,保持挺胸自然的坐姿")
    motion = body_language_details.get('motion')
    if motion is not None:
        if motion < 5:
            recommendations.append("适当增加手势动作,避免过于僵硬")
        elif motion > 8:
            recommendations.append("减少过度频繁的动作,保持沉稳大方")
    return recommendations


@token_required
def multimodal_analysis():
    """
    Multimodal (video + audio) interview analysis endpoint.
    ---
    tags:
      - 分析
    security:
      - CookieAuth: []
    consumes:
      - multipart/form-data
    parameters:
      - name: video
        in: formData
        required: true
        type: file
        description: Video file to analyse (WebM format)
      - name: session_id
        in: formData
        required: true
        type: string
        description: Interview session ID the analysis belongs to
    responses:
      200:
        description: Analysis finished; results are stored server-side
        schema:
          type: object
          properties:
            msg:
              type: string
              example: 分析完成
      400:
        description: Invalid request parameters
        schema:
          type: object
          properties:
            error:
              type: string
              example: 没有提供视频文件
      401:
        description: Unauthorized
        schema:
          type: object
          properties:
            error:
              type: string
              example: 未提供认证令牌
      500:
        description: Server error
        schema:
          type: object
          properties:
            error:
              type: string
              example: 处理视频失败
    """
    if 'video' not in request.files:
        return jsonify({"error": "没有提供视频文件"}), 400
    try:
        video_file = request.files['video']
        session_id = request.form.get('session_id')
        # Documented as required; reject early instead of persisting a
        # record keyed on None.
        if not session_id:
            return jsonify({"error": "没有提供会话ID"}), 400

        # Persist the upload to a temp file so ffmpeg/OpenCV can read it.
        temp_dir = os.path.join(os.getcwd(), 'temp', 'videos')
        os.makedirs(temp_dir, exist_ok=True)
        video_path = os.path.join(temp_dir, f"{uuid.uuid4()}.webm")
        video_file.save(video_path)

        # Validate container integrity with ffmpeg before decoding.
        try:
            probe = ffmpeg.probe(video_path, v='error')
            if not probe or 'streams' not in probe or not probe['streams']:
                logger.warning(f"视频文件 {video_path} 无效或不完整")
                _cleanup_temp_file(video_path)
                return jsonify({"error": "视频文件无效或不完整"}), 400
        except ffmpeg.Error as e:
            logger.error(
                f"视频文件验证失败: {e.stderr.decode() if hasattr(e, 'stderr') else str(e)}"
            )
            _cleanup_temp_file(video_path)
            return jsonify({"error": "无法处理视频文件,格式可能不受支持或文件已损坏"}), 400

        # Frame-by-frame analysis with OpenCV.
        metrics = _collect_frame_metrics(video_path)
        if metrics is None:
            logger.error(f"OpenCV无法打开视频文件: {video_path}")
            _cleanup_temp_file(video_path)
            return jsonify({"error": "无法打开视频文件进行分析"}), 400
        if metrics['frame_count'] == 0:
            logger.warning(f"无法从视频中提取任何有效帧: {video_path}")
            _cleanup_temp_file(video_path)
            return jsonify({"error": "视频中没有检测到有效的面部或眼睛"}), 400

        logger.info(
            "视频分析完成: "
            f"frame_count: {metrics['frame_count']}, "
            f"face_detected_frames: {metrics['face_detected_frames']}, "
            f"eye_contact_frames: {metrics['eye_contact_frames']}"
        )

        # Eye contact: fraction of face frames where both eyes were found.
        face_frames = metrics['face_detected_frames']
        eye_contact_rate = (
            metrics['eye_contact_frames'] / face_frames if face_frames > 0 else 0
        )
        eye_contact_score = min(10, eye_contact_rate * 10)

        # Expression variability: mean pixel stddev of the face ROI,
        # normalised against 50 as the "good" reference value.
        expression_variability = (
            np.mean(metrics['facial_expression_variance'])
            if metrics['facial_expression_variance'] else 0
        )
        facial_expressions_score = min(10, (expression_variability / 50) * 10)

        body_language_score, body_language_details = _score_body_language(metrics)
        logger.info(f"肢体语言详细评分: {body_language_details}")

        # Confidence blends eye contact (50%), body language (30%) and
        # facial expressions (20%).
        confidence_score = (
            0.5 * eye_contact_score +
            0.3 * body_language_score +
            0.2 * facial_expressions_score
        )

        recommendations = _build_recommendations(
            eye_contact_score, facial_expressions_score, body_language_details
        )

        # Final analysis payload (all scores on a 1-10 scale).
        analysis = {
            "eyeContact": round(eye_contact_score, 1),
            "facialExpressions": round(facial_expressions_score, 1),
            "bodyLanguage": round(body_language_score, 1),
            "confidence": round(confidence_score, 1),
            "recommendations": "、".join(recommendations) if recommendations else "保持良好的眼神接触和面部表情。"
        }

        # Audio analysis is best-effort: a failure here must not block the
        # video analysis result.
        audio_analysis = None
        try:
            audio_analysis = extract_and_evaluate_audio(video_path)
        except Exception as audio_error:
            logger.warning(f"从视频提取并分析音频失败: {str(audio_error)}")

        _cleanup_temp_file(video_path)

        # Persist the combined result keyed on the interview session.
        MultimodalAnalysis.create_or_update(
            session_id, analysis, audio_analysis
        )
        return jsonify({"msg": "分析完成"})
    except Exception as e:
        logger.exception(f"视频分析失败: {str(e)}")
        return jsonify({"error": f"视频分析失败: {str(e)}"}), 500
# NOTE(review): stray web-page footer ("浙公网安备 33010602011771号") was pasted
# here as bare text and made the module a syntax error; kept as a comment.