第十三篇:数理统计基本概念与大数定律
总体和样本与样本概念的二重性



切比雪夫不等式和切比雪夫定理

大数定律


import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt import random #设置字符集,防止中文乱码 mpl.rcParams['font.sans-serif'] = ['SimHei'] mpl.rcParams['axes.unicode_minus'] = False #给定随机数种子 random.seed(28) def genetate_random_int(n): '''产生n个0-9的随机数''' return [random.randint(1,20) for i in range(n)] if __name__ == '__main__': number = 8000 x = [i for i in range(1,number+1)] #产生numper个1-20的随机数 total_random_int = genetate_random_int(number) #求n个1-20的随机数的平均值 y = [np.mean(total_random_int[0:n+1]) for n in range(number)] #总体平均值 total_mean = sum(total_random_int)/number total_mean_list = [total_mean for i in range(number)] plt.plot(x,y,'-b',label = '样本容量为n的样本平均值') plt.plot(x,total_mean_list,'-r',label='总体平均值') plt.title('n个1-20的随机数的平均值') plt.xlim(0,number) plt.legend(loc='lower right') plt.grid(True) plt.show()
中心极限定理

import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt from collections import Counter #统计数 import random #设置字符集,防止中文乱码 mpl.rcParams['font.sans-serif'] = ['SimHei'] mpl.rcParams['axes.unicode_minus'] = False ''' 随机抛六面的骰子,计算三次的点数的和,设三次点数的和为事件A. ==>事件A的发生属于什么分布? ==>A = x1+x2+x3,其中x1,x2,x3是分别三次的抛骰子的点数。 根据中心极限定理。由于x1,x2,x3属于独立同分布,所以说最终的事件A属于高斯分布。 ''' #给定随机数种子 random.seed(28) def genetate_random_int(): '''产生一个1-6的随机数,表示抛一次骰子的结果''' return random.randint(1,6) def genetate_sum(n): '''计算返回n次抛六面骰子结果的和''' return np.sum([genetate_random_int() for i in range(n)]) if __name__ == '__main__': number = 10000 n = 3 #进行number次事件A的操作,每次事件A进行n次抛骰子。 test_result = [genetate_sum(n) for i in range(number)] # res = {} # for key in test_result: # count = 1 # if key in res: # count += res[key] # res[key] = count result = Counter(test_result) x = sorted(list(result.keys())) print(x) y = [result[key]/number for key in x] plt.plot(x,y,'-b') plt.title('三次点数的和的分布律') plt.xlim(x[0]-1,x[-1]+1) plt.grid(True) plt.show() print(y)
统计量



常用的分布


from sympy import summation,product,pprint,Eq,E,simplify,Piecewise,S,expand,oo,radsimp,exp,Symbol,integrate,Integral from sympy.stats import E,P,density,variance,Normal,cdf,Covariance,ChiSquared,Exponential,Variance from sympy.abc import p,r,y,x,C from sympy.matrices import Matrix n = Symbol('n',positive=True,integer=True) X = ChiSquared('X',n) print(simplify(E(X))) print(simplify(variance(X))) from scipy.stats import chi2 Y = chi2(8) print(Y.mean()) print(Y.var())


from sympy import summation,product,pprint,Eq,E,simplify,Piecewise,S,expand,oo,radsimp,exp,Symbol,integrate,Integral,sqrt from sympy.stats import E,P,density,variance,Normal,cdf,Covariance,ChiSquared,Exponential,Variance,StudentT from sympy.abc import p,r,y,x,C from sympy.matrices import Matrix n = Symbol('n',positive=True,integer=True) X = StudentT('X',8) print(E(X)) print(variance(X).evalf()) from scipy.stats import t Z = t(8) print(Z.mean()) print(Z.var())


from sympy import summation,product,pprint,Eq,E,simplify,Piecewise,S,expand,oo,radsimp,exp,Symbol,integrate,Integral,sqrt from sympy.stats import E,P,density,variance,Normal,cdf,Covariance,ChiSquared,Exponential,Variance,FDistribution from sympy.abc import p,r,y,x,C from sympy.matrices import Matrix n = Symbol('n',positive=True,integer=True) X = FDistribution('X',10,8) print(E(X).evalf()) print(variance(X).evalf()) from scipy.stats import f Y = f(10,8) print(Y.mean()) print(Y.var())
参数估计
点估计


距估计





极大似然估计






连续性总体似然函数的求法







from sympy import summation,product,pprint,Eq,Product,E,simplify,log,expand,solve,exp,Symbol,integrate,expand_log,diff from sympy.stats import E,P,density,variance,Normal,cdf,Covariance,ChiSquared,Exponential,Variance,Probability from sympy.abc import p,r,y,C,x from sympy.matrices import Matrix n = Symbol('n',positive=True,integer=True) i = Symbol('i',positive=True,integer=True) x_i = Symbol('x_i') X = Normal('X',0,1) f = Product(Probability(Eq(X,x_i)),(i,1,n)) print('似然函数:') pprint(f) L = log(f) print('对数似然:') print(L) L_expand = expand_log(L.subs([(x_i,x*i),(n,5)]).doit(),force=True) pprint(L_expand) dx = simplify(diff(L_expand,x).doit()) print(dx)
区间估计














sample = [506, 508, 499, 503, 504, 510, 497, 512, 514, 505, 493, 496, 506, 502, 509, 496] def confidence_interval_n(data,var,confidence=0.95): from scipy.stats import norm import numpy as np sample_mean = np.mean(data) sample_size = len(data) alpha = 1 - confidence N = norm(0,1) z_score = N.isf(alpha/2) ME = z_score * np.sqrt(var/sample_size) lower_limit = sample_mean - ME upper_limit = sample_mean + ME print( 'mu的置信水平为'+str(confidence*100)+ '%%的置信区间为: ( %.2f, %.2f)' % (lower_limit, upper_limit)) return lower_limit, upper_limit confidence_interval_n(sample,6.2)



sample = [506, 508, 499, 503, 504, 510, 497, 512, 514, 505, 493, 496, 506, 502, 509, 496] def confidence_interval_t(data,confidence=0.95): from scipy.stats import t import numpy as np sample_mean = np.mean(data) sample_std = np.std(data,ddof=1) #无偏方差 sample_size = len(data) alpha = 1 - confidence t_score = t.isf(alpha / 2, df = (sample_size-1) ) ME = t_score * sample_std / np.sqrt(sample_size) lower_limit = sample_mean - ME upper_limit = sample_mean + ME print( 'mu的置信水平为'+str(confidence*100)+ '%%的置信区间为: ( %.2f, %.2f)' % (lower_limit, upper_limit)) return lower_limit, upper_limit confidence_interval_t(sample)




sample = [506, 508, 499, 503, 504, 510, 497, 512, 514, 505, 493, 496, 506, 502, 509, 496] def confidence_interval_chi2(data,confidence=0.95): from scipy.stats import chi2 import numpy as np sample_var = np.var(data,ddof=1) n_1 = len(data)-1 alpha = 1 - confidence N = chi2(n_1) lower_limit = (n_1*sample_var)/N.isf(alpha/2) upper_limit = (n_1*sample_var)/N.isf(1-alpha/2) print( 'var的置信水平为'+str(confidence*100)+ '%%的置信区间为: ( %.2f, %.2f)' % (lower_limit, upper_limit)) return lower_limit, upper_limit confidence_interval_chi2(sample)
假设检验
卡方检验x2检验(chi-square test)
x2检验(chi-square test)或称卡方检验,是一种用途较广的假设检验方法。可以分为成组比较(不配对资料)和个别比较(配对,或同一对象两种处理的比较)两类。
一、四格表资料的x2检验
例20.7某医院分别用化学疗法和化疗结合放射治疗卵巢癌肿患者,结果如表20-11,问两种疗法有无差别?

表内用虚线隔开的这四个数据是整个表中的基本资料,其余数据均由此推算出来;这四格资料表就专称四格表(fourfold table),或称2行2列表(2×2 contingency table)从该资料算出的两种疗法有效率分别为44.2%和77.3%,两者的差别可能是抽样误差所致,亦可能是两种治疗有效率(总体率)确有所不同。这里可通过x2检验来区别其差异有无统计学意义,检验的基本公式为:

式中A为实际数,以上四格表的四个数据就是实际数。T为理论数,是根据检验假设推断出来的;即假设这两种卵巢癌治疗的有效率本无不同,差别仅是由抽样误差所致。这里可将两种疗法合计有效率作为理论上的有效率,即53/87=60.9%,以此为依据便可推算出四格表中相应的四格的理论数。兹以表20-11资料为例检验如下
检验步骤:
1.建立检验假设:
H0:π1=π2
H1:π1≠π2
α=0.05
2.计算理论数(TRC),计算公式为:
TRC=nR.nc/n 公式(20.13)
式中TRC是表示第R行C列格子的理论数,nR为理论数同行的合计数,nC为与理论数同列的合计数,n为总例数。
第1行1列: 43×53/87=26.2
第1行2列: 43×34/87=16.8
第2行1列: 44×53/87=26.8
第2行2列: 4×34/87=17.2
以推算结果,可与原四项实际数并列成表20-12:

因为上表每行和每列合计数都是固定的,所以只要用TRC式求得其中一项理论数(例如T1.1=26.2),则其余三项理论数都可用同行或同列合计数相减,直接求出,示范如下:
T1.1=26.2
T1.2=43-26.2=16.8
T2.1=53-26.2=26.8
T2.2=44-26.2=17.2
3.计算x2值 按公式20.12代入

4.查x2值表求P值
在查表之前应知本题自由度。按x2检验的自由度v=(行数-1)(列数-1),则该题的自由度v=(2-1)(2-1)=1,查x2界值表(附表20-1),找到x20.001(1)=6.63,而本题x2=10.01即x2>x20.001(1),P<0.01,差异有高度统计学意义,按α=0.05水准,拒绝H0,可以认为采用化疗加放疗治疗卵巢癌的疗效比单用化疗佳。
四格表的专用公式
对于四格表资料,还可用以下专用公式求x2值。


行×列表x2检验注意事项
1.一般认为行×列表中不宜有1/5以上格子的理论数小于5,或有小于1的理论数。当理论数太小可采取下列方法处理:①增加样本含量以增大理论数;②删去上述理论数太小的行和列;③将太小理论数所在行或列与性质相近的邻行邻列中的实际数合并,使重新计算的理论数增大。由于后两法可能会损失信息,损害样本的随机性,不同的合并方式有可能影响推断结论,故不宜作常规方法。另外,不能把不同性质的实际数合并,如研究血型时,不能把不同的血型资料合并。
2.如检验结果拒绝检验假设,只能认为各总体率或总体构成比之间总的来说有差别,但不能说明它们彼此之间都有差别,或某两者间有差别。
假设检验原理







sample = [0.497, 0.506, 0.518, 0.524, 0.498, 0.511, 0.520, 0.515, 0.512] # 双边检验函数 def two_tailed_test(data, hypothesis_mu,std=None, alpha=0.05): from scipy.stats import norm,t import numpy as np N = norm(0, 1) sample_mean = np.mean(data) sample_std = std sample_size = len(data) if not sample_std: N = t(sample_size-1) sample_std = np.std(data, ddof=1) standardization_value = abs(sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std k = N.isf(alpha / 2) if standardization_value<k: print('可接受原假设') return True elif standardization_value>=k: print('拒绝原假设') return False print(two_tailed_test(sample, 0.5,0.015))




Z检验











sample = [10.4, 10.6, 10.1, 10.4, 10.5, 10.3, 10.3, 10.2, 10.9, 10.6, 10.8, 10.5, 10.7, 10.2, 10.7] # 双边检验函数 def two_tailed_test(data, hypothesis_mu,std=None, alpha=0.05): from scipy.stats import norm,t import numpy as np N = norm(0, 1) sample_mean = np.mean(data) sample_std = std sample_size = len(data) if not sample_std: N = t(sample_size-1) sample_std = np.std(data, ddof=1) standardization_value = abs(sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std k = N.isf(alpha / 2) print(standardization_value) print(k) if standardization_value<k: print('可接受原假设') return True elif standardization_value>=k: print('拒绝原假设') return False print(two_tailed_test(sample, 10.5,0.15))
T检验



sample = [10.4, 10.6, 10.1, 10.4, 10.5, 10.3, 10.3, 10.2, 10.9, 10.6, 10.8, 10.5, 10.7, 10.2, 10.7] # 双边检验函数 def two_tailed_test(data, hypothesis_mu,std=None, alpha=0.05): from scipy.stats import norm,t import numpy as np N = norm(0, 1) sample_mean = np.mean(data) sample_std = std sample_size = len(data) if not sample_std: N = t(sample_size-1) sample_std = np.std(data, ddof=1) standardization_value = abs(sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std k = N.isf(alpha / 2) print(standardization_value) print(k) if standardization_value<k: print('可接受原假设') return True elif standardization_value>=k: print('拒绝原假设') return False print(two_tailed_test(sample, 10.5))

# sample = [10.4, 10.6, 10.1, 10.4, 10.5, 10.3, 10.3, 10.2, 10.9, 10.6, 10.8, 10.5, 10.7, 10.2, 10.7] sample = [0.497, 0.506, 0.518, 0.524, 0.498, 0.511, 0.520, 0.515, 0.512] # sample = [159,280,101,212,224,379,179,264,222,362,168,250,149,260,485,170] def one_tailed_test(data, hypothesis_mu,std=None, brim='right', alpha=0.05): from scipy.stats import norm,t import numpy as np N = norm(0, 1) sample_mean = np.mean(data) sample_std = std sample_size = len(data) if not sample_std: N = t(sample_size-1) sample_std = np.std(data, ddof=1) standardization_value = (sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std k = N.isf(alpha) if brim == 'right' and standardization_value >= 0: if standardization_value >= k: print('拒绝原假设') return False else: print('可接受原假设') return True elif brim == 'right' and standardization_value <= 0: if standardization_value <= -k: print('可接受原假设') return True else: print('拒绝原假设') return False elif brim == 'left' and standardization_value <= 0: if standardization_value <= -k: print('拒绝原假设') return False else: print('可接受原假设') return True elif brim == 'left' and standardization_value >= 0: if standardization_value >= k: print('可接受原假设') return True else: print('拒绝原假设') return False #题目假设属于右边检验,因为H_1:mu>mu_0 res = one_tailed_test(sample, 0.1,brim='left') # res = one_tailed_test(sample, 225,brim='left') if res: print('接受H_0') else: print('接受H_1')
# sample = [10.4, 10.6, 10.1, 10.4, 10.5, 10.3, 10.3, 10.2, 10.9, 10.6, 10.8, 10.5, 10.7, 10.2, 10.7] # sample = [0.497, 0.506, 0.518, 0.524, 0.498, 0.511, 0.520, 0.515, 0.512] sample = [159,280,101,212,224,379,179,264,222,362,168,250,149,260,485,170] def one_tailed_test(data, hypothesis_mu,std=None, brim='right', alpha=0.05): from scipy.stats import norm,t import numpy as np N = norm(0, 1) sample_mean = np.mean(data) sample_std = std sample_size = len(data) if not sample_std: N = t(sample_size-1) sample_std = np.std(data, ddof=1) standardization_value = (sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std k = N.isf(alpha) if brim == 'right' and standardization_value >= 0: if standardization_value >= k: print('拒绝原假设') return False else: print('可接受原假设') return True elif brim == 'left' and standardization_value <= 0: if standardization_value <= -k: print('拒绝原假设') return False else: print('可接受原假设') return True else: if abs(standardization_value) < k : print('拒绝原假设') return False else: print('可接受原假设') return True #题目假设属于右边检验,因为H_1:mu>mu_0 # res = one_tailed_test(sample, 0.5,brim='right') res = one_tailed_test(sample, 225,brim='right') if res: print('接受H_0') else: print('接受H_1')
# sample = [10.4, 10.6, 10.1, 10.4, 10.5, 10.3, 10.3, 10.2, 10.9, 10.6, 10.8, 10.5, 10.7, 10.2, 10.7] # sample = [0.497, 0.506, 0.518, 0.524, 0.498, 0.511, 0.520, 0.515, 0.512] sample = [159,280,101,212,224,379,179,264,222,362,168,250,149,260,485,170] def one_tailed_test(data, hypothesis_mu,std=None, brim='right', alpha=0.05): from scipy.stats import norm,t import numpy as np N = norm(0, 1) sample_mean = np.mean(data) sample_std = std sample_size = len(data) if not sample_std: N = t(sample_size-1) sample_std = np.std(data, ddof=1) standardization_value = (sample_mean - hypothesis_mu)*np.sqrt(sample_size)/sample_std k = N.isf(alpha) if standardization_value >= k: if brim == 'right': print('拒绝原假设;'+'总体均值应大于{}'.format(hypothesis_mu)) return False elif brim == 'left': print('可接受原假设;'+'总体均值应大于{}'.format(hypothesis_mu)) return True elif standardization_value <= -k: if brim == 'left': print('拒绝原假设;'+'总体均值应小于{}'.format(hypothesis_mu)) return False elif brim == 'right': print('可接受原假设;'+'总体均值应小于{}'.format(hypothesis_mu)) return True elif -k < standardization_value < k: if brim == 'left': print('没理由可认为总体均值小于假设均值,此假设均值可为总体均值,所以可接受原假设') return True elif brim == 'right': print('没理由可认为总体均值大于假设均值,此假设均值可为总体均值,所以可接受原假设') return True #题目假设属于右边检验,因为H_1:mu>mu_0 # res = one_tailed_test(sample, 0.51,brim='left') res = one_tailed_test(sample, 250,brim='right') if res: print('接受H_0') else: print('接受H_1')

浙公网安备 33010602011771号