# --*-- coding:utf-8 --*--
import math
import itertools
def Mean(t):
    """Return the arithmetic mean of the values in ``t``.

    The sum is converted to ``float`` before dividing, so the result is a
    float even when every element is an integer.

    Raises:
        ZeroDivisionError: if ``t`` is empty.
    """
    total = float(sum(t))
    return total / len(t)
def E(x, p):
    """Return the expectation of a discrete random variable.

    The expectation (also called the mean) is the sum of each value in ``x``
    multiplied by the probability at the same position in ``p``.

    Args:
        x: sequence of values the random variable can take.
        p: sequence of probabilities, indexed in step with ``x``.

    Returns:
        The sum of ``x[i] * p[i]`` over every index of ``x``; ``0`` when
        ``x`` is empty.
    """
    expectation = 0
    for idx, value in enumerate(x):
        # p is indexed by position, so a p shorter than x raises IndexError.
        expectation = expectation + value * p[idx]
    return expectation
def Median(t):
    """Return the median of the values in ``t``.

    The values are sorted first. For an odd number of values the middle
    element is returned unchanged; for an even number the mean of the two
    middle elements is returned as a float.

    The branch is chosen from the parity of the length rather than from the
    type produced by ``/``, so the result does not depend on whether the
    interpreter performs integer or true division.

    Args:
        t: iterable of numbers.

    Returns:
        The median, or ``None`` when ``t`` is empty.
    """
    arr = sorted(t)
    n = len(arr)
    if n == 0:
        return None
    mid = n // 2
    if n % 2:
        return arr[mid]
    # Even length: average the two central elements; 2.0 forces a float result.
    return (arr[mid - 1] + arr[mid]) / 2.0
def Mode(t):
    """Return the mode(s) of ``t`` as a list.

    Returns:
        A list holding every value that shares the highest occurrence count,
        in ascending order. ``None`` is returned when ``t`` is empty or when
        no value occurs more than once.
    """
    if not t:
        return None
    freq = __getfreq(t)
    top_count = freq[0][0]
    if top_count == 1:
        return None
    # freq is ordered by descending count, so the entries matching the first
    # count are exactly the leading run of most frequent values.
    return [value for count, value in freq if count == top_count]
def __getfreq(t):
    """Return ``(count, value)`` pairs for each distinct value in ``t``.

    The pairs are ordered by descending count; values with equal counts stay
    in ascending value order because the sort is stable.
    """
    pairs = []
    for value, run in itertools.groupby(sorted(t)):
        pairs.append((sum(1 for _ in run), value))
    return sorted(pairs, key=lambda pair: pair[0], reverse=True)
def Var(t, mu=None):
    """Return the population variance of ``t``.

    Args:
        t: sequence of numbers.
        mu: value to measure deviations from; when ``None`` it is computed
            as ``Mean(t)``.

    Returns:
        The mean of the squared deviations of each element from ``mu``.
    """
    center = Mean(t) if mu is None else mu
    squared_deviations = [(value - center) ** 2 for value in t]
    return Mean(squared_deviations)
def D(x, p):
    """Return the variance of a discrete random variable.

    By definition the variance is the expectation of the squared difference
    between X and its expectation, E[(X - E[X])^2]. This implementation uses
    the equivalent shortcut formula E[X^2] - (E[X])^2.

    Args:
        x: sequence of values the random variable can take.
        p: sequence of probabilities, indexed in step with ``x``.
    """
    first_moment = E(x, p)
    squares = [value ** 2 for value in x]
    second_moment = E(squares, p)
    return second_moment - first_moment ** 2
def SVar(t):
    """Return the sample variance of ``t`` (divisor ``n - 1``).

    Args:
        t: sequence of numbers.

    Returns:
        The sum of squared deviations from the mean divided by ``n - 1``,
        or ``None`` when ``t`` has fewer than two elements: the sample
        variance is undefined there, and a single element would otherwise
        divide by zero.
    """
    if not t:
        return None
    n = len(t)
    if n < 2:
        return None
    # Same formula as Mean(t); float() keeps the mean exact for integer data.
    mu = float(sum(t)) / n
    return sum((x - mu) ** 2 for x in t) / (n - 1)
def MeanVar(t):
    """Return the mean and the population variance of ``t`` as a tuple.

    The mean is computed once and passed on to ``Var`` so it is not
    recomputed there.

    Returns:
        ``(mean, variance)``.
    """
    average = Mean(t)
    return average, Var(t, average)
def StdVar(t, mu=None):
    """Return the population standard deviation of ``t``.

    Args:
        t: sequence of numbers.
        mu: value to measure deviations from; when ``None`` it is computed
            as ``Mean(t)``.

    Returns:
        The square root of ``Var(t, mu)``.
    """
    center = Mean(t) if mu is None else mu
    variance = Var(t, center)
    return math.sqrt(variance)
def Range(t):
    """Return the range of ``t``: its largest value minus its smallest.

    Returns:
        ``max(t) - min(t)``, or ``None`` when ``t`` is empty.
    """
    if t:
        return max(t) - min(t)
    return None
def Cov(X, Y):
    """Return the population covariance of the paired samples ``X`` and ``Y``.

    The covariance is the mean of the products of the paired deviations
    ``(x - mean(X)) * (y - mean(Y))``. When X and Y tend to move in the same
    direction the paired deviations share a sign, so their products are
    positive.

    Limitation: the result carries the product of both units (for height and
    weight, cm * kg), which is hard to interpret. Converting to standard
    scores removes the units; see ``pearson_correlation``.

    Args:
        X: sequence of numbers.
        Y: sequence of numbers with the same length as ``X``.

    Returns:
        The covariance as a float.

    Raises:
        ValueError: if ``X`` and ``Y`` differ in length. Pairing them with
            ``zip`` would silently drop the surplus values while the means
            and the divisor still used the full lengths.
        ZeroDivisionError: if the inputs are empty.
    """
    n = len(X)
    if n != len(Y):
        raise ValueError(
            'Cov requires X and Y of equal length, got %d and %d' % (n, len(Y)))
    # Same formula as Mean(); n is shared because both lengths are equal.
    mu_x = float(sum(X)) / n
    mu_y = float(sum(Y)) / n
    total = 0.0
    for x, y in zip(X, Y):
        total += (x - mu_x) * (y - mu_y)
    return total / n
def standardsocre(x, mu, sigma):
    """Return the standard score of ``x``.

    ``x - mu`` is the deviation of ``x`` from the mean; dividing it by
    ``sigma`` normalizes it. Applied to a whole variable with its own mean
    and standard deviation, the result is unit-free with mean 0 and
    variance 1.

    Args:
        x: the observation.
        mu: the mean to subtract.
        sigma: the standard deviation to divide by.
    """
    deviation = x - mu
    return deviation / sigma
def pearson_correlation(X, Y):
    """Return the Pearson correlation coefficient of ``X`` and ``Y``.

    Each value is converted to its standard score ``(v - mean) / sigma``;
    the coefficient is the mean of the products of the paired standard
    scores. Unlike the covariance, the coefficient is unit-free, which makes
    it easier to interpret.

    A coefficient of 1 means the two variables are perfectly correlated
    (knowing one value predicts the other exactly); -1 means they are
    perfectly negatively correlated.

    Equivalent shortcut: ``Cov(X, Y) / (sigma_x * sigma_y)``.

    Args:
        X: sequence of numbers.
        Y: sequence of numbers with the same length as ``X``.

    Raises:
        ValueError: if ``X`` and ``Y`` differ in length.
        ZeroDivisionError: if the inputs are empty or either one has a
            standard deviation of zero.
    """
    if len(X) != len(Y):
        raise ValueError(
            'pearson_correlation requires X and Y of equal length, '
            'got %d and %d' % (len(X), len(Y)))
    # Means (expectations).
    mu_x = Mean(X)
    mu_y = Mean(Y)
    # Standard deviations, reusing the means computed above.
    sigma_x = StdVar(X, mu_x)
    sigma_y = StdVar(Y, mu_y)
    # The mean of the paired standard-score products is the coefficient.
    products = [
        standardsocre(x, mu_x, sigma_x) * standardsocre(y, mu_y, sigma_y)
        for x, y in zip(X, Y)
    ]
    return Mean(products)
if __name__ == '__main__':
    # Demo: run each statistic over a fixed sample of scores.
    astr = '93 62 51 93 75 82 93 62 65 51'
    alist = [int(e) for e in astr.split()]
    # Each print() receives one pre-formatted string, so the output is the
    # same "label value" text on Python 2 and the script also runs on Python 3.
    print('%s %s' % ('均值:', Mean(alist)))
    print('%s %s' % ('中位数:', Median(alist)))
    print('%s %s' % ('众数:', Mode(alist)))
    print('%s %s' % ('极差:', Range(alist)))
    print('%s %s' % ('总体方差:', Var(alist)))
    print('%s %s' % ('样本方差', SVar(alist)))
    print('%s %s' % ('标准差:', StdVar(alist)))
    # Sanity check: the covariance of a sample with itself is its variance.
    print('验证协方差 Cov(X, X) == Var(X)')
    print(Cov(alist, alist))
    print(Var(alist))
    print('协方差与方差相等,说明公式编写的函数是正确的')