Andrew Ng课程作业第八周(python版)
作业目的:使用高斯模型进行异常检测
作业内容:根据服务器的吞吐量和延时分析异常服务器
提供的数据:ex8data1.mat(mooc可下载),X是训练集,有两列,一列是throughput(mb/s),一列是latency(ms);Xval和yval用于交叉验证
处理过程:
- step1:读数,画出X的散点图
import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sb from scipy.io import loadmat %matplotlib inline data = loadmat('d:/jupyter/ipython-notebooks-master/data/ex8data1.mat') X = data['X'] X.shape #(307, 2) #画图 fig, ax = plt.subplots(figsize=(12,8)) ax.scatter(X[:,0], X[:,1])

- step2:对X的每一列估计一个高斯模型
1、计算每列的均值和方差
def estimate_gaussian(X):
    """Estimate per-column Gaussian parameters of X.

    Args:
        X: array exposing ``mean`` and ``var`` with an ``axis`` argument
            (a NumPy array in this notebook). Statistics are taken down
            each column (``axis=0``).

    Returns:
        Tuple ``(mu, variance)``: the per-column mean and the per-column
        variance (``X.var``), i.e. NOT the standard deviation.
    """
    mu = X.mean(axis=0)
    variance = X.var(axis=0)
    return mu, variance


mu, sigma2 = estimate_gaussian(X)
# scipy.stats.norm(loc, scale) takes the standard deviation as `scale`, so
# convert the variance once here; `sigma` is what the later cells pass as
# the scale of each per-column distribution.
# Outputs recorded elsewhere in this write-up may have been produced with
# the variance passed as scale, so they can differ after this conversion.
sigma = np.sqrt(sigma2)
mu, sigma
2、使用scipy建立高斯分布
# Cross-validation examples and their labels, read from the same loaded
# .mat dictionary as X.
Xval, yval = data['Xval'], data['yval']
Xval.shape, yval.shape
((307, 2), (307, 1))
from scipy import stats

# Frozen normal distribution for the first column, parameterised by
# mu[0] and sigma[0].
# NOTE(review): `scale` in scipy.stats.norm is a standard deviation;
# confirm that `sigma` holds standard deviations rather than variances.
dist = stats.norm(loc=mu[0], scale=sigma[0])

# Value of the probability density function at the point 15.
dist.pdf(15)  # recorded output: 0.1935875044615038

# Densities of the first 50 values in column 0 of X.
dist.pdf(X[:, 0])[:50]
因此,我们可以得到:
# Per-column probability densities of the training set: column j of `p`
# holds the density of X[:, j] under the normal distribution built from
# mu[j] and sigma[j].
# NOTE(review): `scale` must be a standard deviation; confirm that `sigma`
# holds standard deviations rather than variances.
p = np.zeros(X.shape[:2])
p[:, 0] = stats.norm(loc=mu[0], scale=sigma[0]).pdf(X[:, 0])
p[:, 1] = stats.norm(loc=mu[1], scale=sigma[1]).pdf(X[:, 1])
p.shape
(307, 2)
对于交叉验证集,也使用同样的参数(mu和sigma)。结合概率密度和阈值,并与yval对比,可以判断阈值设得是否合理
# Per-column probability densities of the cross-validation set, evaluated
# with the same mu / sigma that were estimated from the training set X.
# NOTE(review): `scale` must be a standard deviation; confirm that `sigma`
# holds standard deviations rather than variances.
pval = np.zeros(Xval.shape[:2])
pval[:, 0] = stats.norm(loc=mu[0], scale=sigma[0]).pdf(Xval[:, 0])
pval[:, 1] = stats.norm(loc=mu[1], scale=sigma[1]).pdf(Xval[:, 1])
pval.shape
- step3:寻找最佳阈值的函数
def select_threshold(pval, yval):
    """Pick the density threshold (epsilon) with the highest F1 score.

    Candidate thresholds run from ``pval.min()`` up to (but excluding)
    ``pval.max()`` in steps of ``(pval.max() - pval.min()) / 1000``. For
    each candidate, every entry of ``pval`` strictly below it is predicted
    to be an anomaly, and the predictions are scored against ``yval``.

    Args:
        pval: NumPy array of probability densities for the validation set.
        yval: NumPy array of labels (1 = anomaly, 0 = normal) that
            broadcasts against ``pval``. When ``pval`` has several columns
            and ``yval`` is a single column, the counts below are taken
            element-wise over the broadcast result.

    Returns:
        Tuple ``(best_epsilon, best_f1)``. Both stay 0 when no candidate
        produces a true positive, or when all densities are equal.
    """
    best_epsilon = 0
    best_f1 = 0

    lowest, highest = pval.min(), pval.max()
    step = (highest - lowest) / 1000
    if step == 0:
        # All densities are identical, so there is no range to scan (and
        # np.arange cannot build a grid with a zero step).
        return best_epsilon, best_f1

    # Label masks do not depend on the threshold; compute them once.
    is_anomaly = yval == 1
    is_normal = yval == 0

    for epsilon in np.arange(lowest, highest, step):
        preds = pval < epsilon

        tp = np.sum(preds & is_anomaly).astype(float)
        if tp == 0:
            # Without true positives, precision/recall/F1 are either 0 or
            # an undefined 0/0, neither of which can beat best_f1. Skipping
            # avoids the division-by-zero warnings and NaN values.
            continue
        fp = np.sum(preds & is_normal).astype(float)
        fn = np.sum(~preds & is_anomaly).astype(float)

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        # A higher F1 means the threshold balances precision and recall
        # better.
        f1 = (2 * precision * recall) / (precision + recall)

        if f1 > best_f1:
            best_f1 = f1
            best_epsilon = epsilon

    return best_epsilon, best_f1
# Choose the threshold using the cross-validation densities and labels.
epsilon, f1 = select_threshold(pval, yval)
# Display the selected threshold together with its F1 score.
epsilon, f1
(0.0095667060059568421, 0.7142857142857143)
- step4:根据最佳阈值找异常点
# Locate every entry of `p` that falls below the chosen threshold. The
# result is a tuple (row indices, column indices); a row appears more than
# once when several of its columns are below the threshold.
outliers = np.nonzero(p < epsilon)
outliers
# recorded output:
# (array([300, 301, 301, 303, 303, 304, 306, 306], dtype=int64),
#  array([1, 0, 1, 0, 1, 0, 0, 1], dtype=int64))
# Draw all training points again, then overlay the rows flagged in
# `outliers` as larger red circles.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X[:, 0], X[:, 1])
ax.scatter(
    X[outliers[0], 0],
    X[outliers[0], 1],
    s=50,
    color='r',
    marker='o',
)

浙公网安备 33010602011771号