第十三篇:数理统计基本概念与大数定律

总体和样本与样本概念的二重性

切比雪夫不等式和切比雪夫定理

 

大数定律

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import random

#设置字符集,防止中文乱码
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

#给定随机数种子
random.seed(28)
def genetate_random_int(n):
    '''产生n个0-9的随机数'''
    return [random.randint(1,20) for i in range(n)]
if __name__ == '__main__':
    number = 8000
    x = [i for i in range(1,number+1)]

    #产生numper个1-20的随机数
    total_random_int = genetate_random_int(number)

    #求n个1-20的随机数的平均值
    y = [np.mean(total_random_int[0:n+1]) for n in range(number)]

    #总体平均值
    total_mean = sum(total_random_int)/number
    total_mean_list = [total_mean for i in range(number)]
    plt.plot(x,y,'-b',label = '样本容量为n的样本平均值')
    plt.plot(x,total_mean_list,'-r',label='总体平均值')
    plt.title('n个1-20的随机数的平均值')
    plt.xlim(0,number)
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.show()

中心极限定理

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import Counter #统计数
import random

#设置字符集,防止中文乱码
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

'''
随机抛六面的骰子,计算三次的点数的和,设三次点数的和为事件A.
==>事件A的发生属于什么分布?
==>A = x1+x2+x3,其中x1,x2,x3是分别三次的抛骰子的点数。
根据中心极限定理。由于x1,x2,x3属于独立同分布,所以说最终的事件A属于高斯分布。
'''

#给定随机数种子
random.seed(28)
def genetate_random_int():
    '''产生一个1-6的随机数,表示抛一次骰子的结果'''
    return random.randint(1,6)
def genetate_sum(n):
    '''计算返回n次抛六面骰子结果的和'''
    return np.sum([genetate_random_int() for i in range(n)])
if __name__ == '__main__':
    number = 10000
    n = 3

    #进行number次事件A的操作,每次事件A进行n次抛骰子。
    test_result = [genetate_sum(n) for i in range(number)]

    # res = {}
    # for key in test_result:
    #     count = 1
    #     if key in res:
    #         count += res[key]
    #     res[key] = count

    result = Counter(test_result)
    x = sorted(list(result.keys()))
    print(x)
    y = [result[key]/number for key in x]
    plt.plot(x,y,'-b')
    plt.title('三次点数的和的分布律')
    plt.xlim(x[0]-1,x[-1]+1)
    plt.grid(True)
    plt.show()
    print(y)

统计量

常用的分布

from sympy import summation,product,pprint,Eq,E,simplify,Piecewise,S,expand,oo,radsimp,exp,Symbol,integrate,Integral
from sympy.stats import E,P,density,variance,Normal,cdf,Covariance,ChiSquared,Exponential,Variance
from sympy.abc import p,r,y,x,C
from sympy.matrices import Matrix
n = Symbol('n',positive=True,integer=True)
X = ChiSquared('X',n)
print(simplify(E(X)))
print(simplify(variance(X)))

from scipy.stats import chi2
Y = chi2(8)
print(Y.mean())
print(Y.var())

from sympy import summation,product,pprint,Eq,E,simplify,Piecewise,S,expand,oo,radsimp,exp,Symbol,integrate,Integral,sqrt
from sympy.stats import E,P,density,variance,Normal,cdf,Covariance,ChiSquared,Exponential,Variance,StudentT
from sympy.abc import p,r,y,x,C
from sympy.matrices import Matrix
n = Symbol('n',positive=True,integer=True)
X = StudentT('X',8)
print(E(X))
print(variance(X).evalf())


from scipy.stats import t
Z = t(8)
print(Z.mean())
print(Z.var())

from sympy import summation,product,pprint,Eq,E,simplify,Piecewise,S,expand,oo,radsimp,exp,Symbol,integrate,Integral,sqrt
from sympy.stats import E,P,density,variance,Normal,cdf,Covariance,ChiSquared,Exponential,Variance,FDistribution
from sympy.abc import p,r,y,x,C
from sympy.matrices import Matrix
n = Symbol('n',positive=True,integer=True)
X = FDistribution('X',10,8)
print(E(X).evalf())
print(variance(X).evalf())

from scipy.stats import f
Y = f(10,8)
print(Y.mean())
print(Y.var())

参数估计

点估计

距估计

极大似然估计

连续性总体似然函数的求法

from sympy import summation,product,pprint,Eq,Product,E,simplify,log,expand,solve,exp,Symbol,integrate,expand_log,diff
from sympy.stats import E,P,density,variance,Normal,cdf,Covariance,ChiSquared,Exponential,Variance,Probability
from sympy.abc import p,r,y,C,x
from sympy.matrices import Matrix
n = Symbol('n',positive=True,integer=True)
i = Symbol('i',positive=True,integer=True)
x_i = Symbol('x_i')
X = Normal('X',0,1)
f = Product(Probability(Eq(X,x_i)),(i,1,n))
print('似然函数:')
pprint(f)
L = log(f)
print('对数似然:')
print(L)
L_expand = expand_log(L.subs([(x_i,x*i),(n,5)]).doit(),force=True)
pprint(L_expand)
dx = simplify(diff(L_expand,x).doit())
print(dx)

区间估计

sample = [506, 508, 499, 503, 504, 510, 497, 512, 514, 505, 493, 496, 506, 502, 509, 496]
def confidence_interval_n(data,var,confidence=0.95):
    from scipy.stats import norm
    import numpy as np
    sample_mean = np.mean(data)
    sample_size = len(data)
    alpha = 1 - confidence
    N = norm(0,1)
    z_score = N.isf(alpha/2)
    ME = z_score * np.sqrt(var/sample_size)
    lower_limit = sample_mean - ME
    upper_limit = sample_mean + ME
    
    print( 'mu的置信水平为'+str(confidence*100)+ '%%的置信区间为: ( %.2f, %.2f)' % (lower_limit, upper_limit))
    return lower_limit, upper_limit
confidence_interval_n(sample,6.2)

sample = [506, 508, 499, 503, 504, 510, 497, 512, 514, 505, 493, 496, 506, 502, 509, 496]
def confidence_interval_t(data,confidence=0.95):
    from scipy.stats import t
    import numpy as np
    sample_mean = np.mean(data)
    sample_std = np.std(data,ddof=1) #无偏方差
    sample_size = len(data)
    alpha = 1 - confidence
    t_score = t.isf(alpha / 2, df = (sample_size-1) )

    ME = t_score * sample_std / np.sqrt(sample_size)
    lower_limit = sample_mean - ME
    upper_limit = sample_mean + ME

    print( 'mu的置信水平为'+str(confidence*100)+ '%%的置信区间为: ( %.2f, %.2f)' % (lower_limit, upper_limit))
    return lower_limit, upper_limit
confidence_interval_t(sample)

sample = [506, 508, 499, 503, 504, 510, 497, 512, 514, 505, 493, 496, 506, 502, 509, 496]
def confidence_interval_chi2(data,confidence=0.95):
    from scipy.stats import chi2
    import numpy as np
    sample_var = np.var(data,ddof=1)
    n_1 = len(data)-1
    alpha = 1 - confidence
    N = chi2(n_1)
    lower_limit = (n_1*sample_var)/N.isf(alpha/2)
    upper_limit = (n_1*sample_var)/N.isf(1-alpha/2)
    print( 'var的置信水平为'+str(confidence*100)+ '%%的置信区间为: ( %.2f, %.2f)' % (lower_limit, upper_limit))
    return lower_limit, upper_limit
confidence_interval_chi2(sample)

假设检验

卡方检验x2检验(chi-square test)

 x2检验(chi-square test)或称卡方检验,是一种用途较广的假设检验方法。可以分为成组比较(不配对资料)和个别比较(配对,或同一对象两种处理的比较)两类。

一、四格表资料的x2检验

例20.7某医院分别用化学疗法和化疗结合放射治疗卵巢癌肿患者,结果如表20-11,问两种疗法有无差别?

 

表内用虚线隔开的这四个数据是整个表中的基本资料,其余数据均由此推算出来;这四格资料表就专称四格表(fourfold table),或称2行2列表(2×2 contingency table)从该资料算出的两种疗法有效率分别为44.2%和77.3%,两者的差别可能是抽样误差所致,亦可能是两种治疗有效率(总体率)确有所不同。这里可通过x2检验来区别其差异有无统计学意义,检验的基本公式为:

 

式中A为实际数,以上四格表的四个数据就是实际数。T为理论数,是根据检验假设推断出来的;即假设这两种卵巢癌治疗的有效率本无不同,差别仅是由抽样误差所致。这里可将两种疗法合计有效率作为理论上的有效率,即53/87=60.9%,以此为依据便可推算出四格表中相应的四格的理论数。兹以表20-11资料为例检验如下

检验步骤:

    1.建立检验假设:

    H0:π1=π2

    H1:π1≠π2

    α=0.05

    2.计算理论数(TRC),计算公式为:

    TRC=nR.nc/n 公式(20.13)

    式中TRC是表示第R行C列格子的理论数,nR为理论数同行的合计数,nC为与理论数同列的合计数,n为总例数。

    第1行1列: 43×53/87=26.2

    第1行2列: 43×34/87=16.8

    第2行1列: 44×53/87=26.8

    第2行2列: 4×34/87=17.2

    以推算结果,可与原四项实际数并列成表20-12:

 

因为上表每行和每列合计数都是固定的,所以只要用TRC式求得其中一项理论数(例如T1.1=26.2),则其余三项理论数都可用同行或同列合计数相减,直接求出,示范如下:

    T1.1=26.2

    T1.2=43-26.2=16.8

    T2.1=53-26.2=26.8

    T2.2=44-26.2=17.2

    3.计算x2值 按公式20.12代入

 

4.查x2值表求P值

在查表之前应知本题自由度。按x2检验的自由度v=(行数-1)(列数-1),则该题的自由度v=(2-1)(2-1)=1,查x2界值表(附表20-1),找到x20.001(1)=6.63,而本题x2=10.01即x2>x20.001(1),P<0.01,差异有高度统计学意义,按α=0.05水准,拒绝H0,可以认为采用化疗加放疗治疗卵巢癌的疗效比单用化疗佳。

四格表的专用公式

对于四格表资料,还可用以下专用公式求x2值。

 

 

行×列表x2检验注意事项

1.一般认为行×列表中不宜有1/5以上格子的理论数小于5,或有小于1的理论数。当理论数太小可采取下列方法处理:①增加样本含量以增大理论数;②删去上述理论数太小的行和列;③将太小理论数所在行或列与性质相近的邻行邻列中的实际数合并,使重新计算的理论数增大。由于后两法可能会损失信息,损害样本的随机性,不同的合并方式有可能影响推断结论,故不宜作常规方法。另外,不能把不同性质的实际数合并,如研究血型时,不能把不同的血型资料合并。

2.如检验结果拒绝检验假设,只能认为各总体率或总体构成比之间总的来说有差别,但不能说明它们彼此之间都有差别,或某两者间有差别。

假设检验原理

sample = [0.497, 0.506, 0.518, 0.524, 0.498, 0.511, 0.520, 0.515, 0.512]
# 双边检验函数
def two_tailed_test(data, hypothesis_mu,std=None, alpha=0.05):
    from scipy.stats import norm,t
    import numpy as np
    N = norm(0, 1)
    sample_mean = np.mean(data)
    sample_std = std
    sample_size = len(data)
    if not sample_std:
        N = t(sample_size-1)
        sample_std = np.std(data, ddof=1)
    standardization_value = abs(sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std
    k = N.isf(alpha / 2)
    if standardization_value<k:
        print('可接受原假设')
        return True
    elif standardization_value>=k:
        print('拒绝原假设')
        return False
print(two_tailed_test(sample, 0.5,0.015))

Z检验

sample = [10.4, 10.6, 10.1, 10.4, 10.5, 10.3, 10.3, 10.2, 10.9, 10.6, 10.8, 10.5, 10.7, 10.2, 10.7]
# 双边检验函数
def two_tailed_test(data, hypothesis_mu,std=None, alpha=0.05):
    from scipy.stats import norm,t
    import numpy as np
    N = norm(0, 1)
    sample_mean = np.mean(data)
    sample_std = std
    sample_size = len(data)
    if not sample_std:
        N = t(sample_size-1)
        sample_std = np.std(data, ddof=1)
    standardization_value = abs(sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std
    k = N.isf(alpha / 2)
    print(standardization_value)
    print(k)
    if standardization_value<k:
        print('可接受原假设')
        return True
    elif standardization_value>=k:
        print('拒绝原假设')
        return False
print(two_tailed_test(sample, 10.5,0.15))

T检验

sample = [10.4, 10.6, 10.1, 10.4, 10.5, 10.3, 10.3, 10.2, 10.9, 10.6, 10.8, 10.5, 10.7, 10.2, 10.7]
# 双边检验函数
def two_tailed_test(data, hypothesis_mu,std=None, alpha=0.05):
    from scipy.stats import norm,t
    import numpy as np
    N = norm(0, 1)
    sample_mean = np.mean(data)
    sample_std = std
    sample_size = len(data)
    if not sample_std:
        N = t(sample_size-1)
        sample_std = np.std(data, ddof=1)
    standardization_value = abs(sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std
    k = N.isf(alpha / 2)
    print(standardization_value)
    print(k)
    if standardization_value<k:
        print('可接受原假设')
        return True
    elif standardization_value>=k:
        print('拒绝原假设')
        return False
print(two_tailed_test(sample, 10.5))

# sample = [10.4, 10.6, 10.1, 10.4, 10.5, 10.3, 10.3, 10.2, 10.9, 10.6, 10.8, 10.5, 10.7, 10.2, 10.7]
sample = [0.497, 0.506, 0.518, 0.524, 0.498, 0.511, 0.520, 0.515, 0.512]
# sample = [159,280,101,212,224,379,179,264,222,362,168,250,149,260,485,170]
def one_tailed_test(data, hypothesis_mu,std=None, brim='right', alpha=0.05):
    from scipy.stats import norm,t
    import numpy as np
    N = norm(0, 1)
    sample_mean = np.mean(data)
    sample_std = std
    sample_size = len(data)
    if not sample_std:
        N = t(sample_size-1)
        sample_std = np.std(data, ddof=1)
    standardization_value = (sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std
    k = N.isf(alpha)
    if brim == 'right' and standardization_value >= 0:
        if standardization_value >= k:
            print('拒绝原假设')
            return False
        else:
            print('可接受原假设')
            return True
    elif brim == 'right' and standardization_value <= 0:
        if standardization_value <= -k:
            print('可接受原假设')
            return True
        else:
            print('拒绝原假设')
            return False
    elif brim == 'left' and standardization_value <= 0:
        if standardization_value <= -k:
            print('拒绝原假设')
            return False
        else:
            print('可接受原假设')
            return True
    elif brim == 'left' and standardization_value >= 0:
        if standardization_value >= k:
            print('可接受原假设')
            return True
        else:
            print('拒绝原假设')
            return False
#题目假设属于右边检验,因为H_1:mu>mu_0
res = one_tailed_test(sample, 0.1,brim='left')
# res = one_tailed_test(sample, 225,brim='left')
if res:
    print('接受H_0')
else:
    print('接受H_1')
# sample = [10.4, 10.6, 10.1, 10.4, 10.5, 10.3, 10.3, 10.2, 10.9, 10.6, 10.8, 10.5, 10.7, 10.2, 10.7]
# sample = [0.497, 0.506, 0.518, 0.524, 0.498, 0.511, 0.520, 0.515, 0.512]
sample = [159,280,101,212,224,379,179,264,222,362,168,250,149,260,485,170]
def one_tailed_test(data, hypothesis_mu,std=None, brim='right', alpha=0.05):
    from scipy.stats import norm,t
    import numpy as np
    N = norm(0, 1)
    sample_mean = np.mean(data)
    sample_std = std
    sample_size = len(data)
    if not sample_std:
        N = t(sample_size-1)
        sample_std = np.std(data, ddof=1)
    standardization_value = (sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std
    k = N.isf(alpha)
    if brim == 'right' and standardization_value >= 0:
        if standardization_value >= k:
            print('拒绝原假设')
            return False
        else:
            print('可接受原假设')
            return True
    elif brim == 'left' and standardization_value <= 0:
        if standardization_value <= -k:
            print('拒绝原假设')
            return False
        else:
            print('可接受原假设')
            return True
    else:
        if abs(standardization_value) < k :
            print('拒绝原假设')
            return False
        else:
            print('可接受原假设')
            return True
#题目假设属于右边检验,因为H_1:mu>mu_0
# res = one_tailed_test(sample, 0.5,brim='right')
res = one_tailed_test(sample, 225,brim='right')
if res:
    print('接受H_0')
else:
    print('接受H_1')
# sample = [10.4, 10.6, 10.1, 10.4, 10.5, 10.3, 10.3, 10.2, 10.9, 10.6, 10.8, 10.5, 10.7, 10.2, 10.7]
# sample = [0.497, 0.506, 0.518, 0.524, 0.498, 0.511, 0.520, 0.515, 0.512]
sample = [159,280,101,212,224,379,179,264,222,362,168,250,149,260,485,170]
def one_tailed_test(data, hypothesis_mu,std=None, brim='right', alpha=0.05):
    from scipy.stats import norm,t
    import numpy as np
    N = norm(0, 1)
    sample_mean = np.mean(data)
    sample_std = std
    sample_size = len(data)
    if not sample_std:
        N = t(sample_size-1)
        sample_std = np.std(data, ddof=1)
    standardization_value = (sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std
    k = N.isf(alpha)
    if standardization_value >= k:
        if brim == 'right':
            print('拒绝原假设;'+'总体均值应大于{}'.format(hypothesis_mu))
            return False
        elif brim == 'left':
            print('可接受原假设;'+'总体均值应大于{}'.format(hypothesis_mu))
            return True
    elif standardization_value <= -k:
        if brim == 'left':
            print('拒绝原假设;'+'总体均值应小于{}'.format(hypothesis_mu))
            return False
        elif brim == 'right':
            print('可接受原假设;'+'总体均值应小于{}'.format(hypothesis_mu))
            return True
    elif -k < standardization_value < k:
        if brim == 'left':
            print('没理由可认为总体均值小于假设均值,此假设均值可为总体均值,所以可接受原假设')
            return True
        elif brim == 'right':
            print('没理由可认为总体均值大于假设均值,此假设均值可为总体均值,所以可接受原假设')
            return True

#题目假设属于右边检验,因为H_1:mu>mu_0
# res = one_tailed_test(sample, 0.51,brim='left')
res = one_tailed_test(sample, 250,brim='right')
if res:
    print('接受H_0')
else:
    print('接受H_1')
posted @ 2019-07-26 00:39  码迷-wjz  阅读(1127)  评论(0)    收藏  举报