数据挖掘第三周作业
一、飞机客户数据分析预测
1、读取数据
代码如下
import pandas as pd

# Input data set and the destination file for the exploration summary.
datafile = "D:\\python_data\\air_data.csv"
resultfile = "D:\\python_data\\air_data_explore.csv"

data = pd.read_csv(datafile, encoding='utf-8')

# describe() produces one column per field; transpose so each field is a row.
explore = data.describe(percentiles=[], include='all').T
# Number of nulls per field = total row count - non-null count.
explore['null'] = len(data) - explore['count']
# Keep only the null count, maximum and minimum, under Chinese headers.
explore = explore[['null', 'max', 'min']].rename(
    columns={'null': '空值数', 'max': '最大值', 'min': '最小值'}
)
explore.to_csv(resultfile)
2、绘制图像
各年份会员入会人数#直方图
代码如下
# Extract the year in which each member joined and plot the yearly counts.
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

# FFP_DATE is parsed with the '%Y/%m/%d' format, then reduced to its year.
ffp = data['FFP_DATE'].apply(lambda x: datetime.strptime(x, '%Y/%m/%d'))
ffp_year = ffp.map(lambda x: x.year)

fig = plt.figure(figsize=(8, 5))
plt.rcParams['font.sans-serif'] = 'SimHei'    # font able to render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign rendering correctly
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.xlabel('年份')
plt.ylabel('入会人数')
plt.title('各年份会员入会人数3129')
plt.show()
# The original wrote `plt.close` without parentheses, which never closed the figure.
plt.close()
运行结果
会员性别比例#扇形图
代码如下
# Pie chart of the member gender ratio.
# Count each gender once with Series.value_counts(); the top-level
# pd.value_counts() used originally is deprecated and was evaluated twice.
gender_counts = data['GENDER'].value_counts()
male = gender_counts['男']
female = gender_counts['女']

fig = plt.figure(figsize=(7, 4))
plt.pie(
    [male, female],
    labels=['男', '女'],
    colors=['lightskyblue', 'lightcoral'],
    autopct='%1.1f%%',
)
plt.title('会员性别比例3129')
plt.show()
plt.close()
运行结果

会员各级别人数#条形图
代码如下
# Bar chart of the number of members in tiers 4, 5 and 6.
# Count the tiers once with Series.value_counts(); the top-level
# pd.value_counts() used originally is deprecated and was evaluated three times.
tier_counts = data['FFP_TIER'].value_counts()
lv_four = tier_counts[4]
lv_five = tier_counts[5]
lv_six = tier_counts[6]

fig = plt.figure(figsize=(8, 5))
plt.bar(
    x=range(3),
    height=[lv_four, lv_five, lv_six],
    width=0.4,
    alpha=0.8,
    color='skyblue',
)
# Label the three bar positions with their tier numbers.
plt.xticks(list(range(3)), ['4', '5', '6'])
plt.xlabel('会员等级')
plt.ylabel('会员人数')
plt.title('会员各级别人数3129')
plt.show()
plt.close()
运行结果

会员年龄分布#箱型图
代码如下
# Box plot of member ages: drop missing ages, then cast to integers.
age = data['AGE'].dropna().astype('int64')

fig = plt.figure(figsize=(5, 10))
box_style = {'facecolor': 'lightblue'}
plt.boxplot(
    age,
    patch_artist=True,      # filled boxes so that facecolor takes effect
    labels=['会员年龄'],
    boxprops=box_style,
)
plt.title('会员年龄分布箱形图3129')
plt.grid(axis='y')
plt.show()
plt.close()
运行结果
会员最后乘机至结束时长分布#箱型图
代码如下
# Reload the data set and pull out the three columns used by this box plot
# and by the flight-count and mileage box plots that follow.
datafile = "D:\\python_data\\air_data.csv"
resultfile = "D:\\python_data\\air_data_explore.csv"
data = pd.read_csv(datafile, encoding='utf-8')

lte = data['LAST_TO_END']
fc = data['FLIGHT_COUNT']
sks = data['SEG_KM_SUM']

# Box plot of the LAST_TO_END column.
fig = plt.figure(figsize=(5, 8))
plt.boxplot(lte, patch_artist=True, labels=['时长'], boxprops={'facecolor': 'lightblue'})
plt.title('会员最后乘机至结束时长分布箱型图3129')
plt.rcParams['font.sans-serif'] = 'SimHei'    # font able to render the Chinese labels
plt.grid(axis='y')
plt.show()
# The original wrote `plt.close` without parentheses, which never closed the figure.
plt.close()
运行结果
会员飞行次数分布#箱型图
代码如下
# Box plot of the FLIGHT_COUNT column (`fc`, extracted in the preceding snippet).
fig = plt.figure(figsize=(5, 8))
plt.boxplot(fc, patch_artist=True, labels=['飞行次数'], boxprops={'facecolor': 'lightblue'})
plt.title('会员飞行次数分布箱型图3129')
plt.grid(axis='y')
plt.show()
# The original wrote `plt.close` without parentheses, which never closed the figure.
plt.close()
运行结果
客户飞行公里数#箱型图
代码如下
# Box plot of the SEG_KM_SUM column (`sks`, extracted in an earlier snippet).
fig = plt.figure(figsize=(5, 10))
plt.boxplot(sks, patch_artist=True, labels=['总飞行公里数'], boxprops={'facecolor': 'lightblue'})
plt.title('客户飞行公里数箱型图3129')
plt.grid(axis='y')
plt.show()
# The original wrote `plt.close` without parentheses, which never closed the figure.
plt.close()
运行结果
会员兑换积分次数分布#直方图
代码如下
# Histogram of the EXCHANGE_COUNT column, split into 5 bins.
ec = data['EXCHANGE_COUNT']

fig = plt.figure(figsize=(8, 5))
plt.hist(ec, bins=5, color='#0405aa')
plt.xlabel('兑换次数')
plt.ylabel('会员人数')
plt.title('会员兑换积分次数分布直方图3129')
plt.show()
# The original wrote `plt.close` without parentheses, which never closed the figure.
plt.close()
运行结果
客户总累计积分#箱型图
代码如下
# Box plot of the Points_Sum column.
ps = data['Points_Sum']

fig = plt.figure(figsize=(5, 8))
plt.boxplot(ps, patch_artist=True, labels=['总累计积分'], boxprops={'facecolor': 'lightblue'})
plt.title('客户总累计积分箱型图3129')
plt.grid(axis='y')
plt.show()
# The original wrote `plt.close` without parentheses, which never closed the figure.
plt.close()
运行结果

3、相关矩阵及热力图
代码如下
# Pearson correlation matrix of selected features, drawn as a heat map.
import seaborn as sns

# Take an explicit copy: the original assigned new columns to a slice of
# `data`, which triggers SettingWithCopyWarning and may not write through.
data_corr = data[['FFP_TIER', 'FLIGHT_COUNT', 'LAST_TO_END',
                  'SEG_KM_SUM', 'EXCHANGE_COUNT', 'Points_Sum']].copy()

# Missing ages are filled with 0 so the column can be cast to integers.
age1 = data['AGE'].fillna(0)
data_corr['AGE'] = age1.astype('int64')
# `ffp_year` (joining year) comes from the earlier membership-year snippet.
data_corr['ffp_year'] = ffp_year

dt_corr = data_corr.corr(method='pearson')
print('相关性矩阵为:\n', dt_corr)

plt.subplots(figsize=(10, 10))
sns.heatmap(dt_corr, annot=True, vmax=1, square=True, cmap='Blues')
plt.title('3129')
plt.show()
# The original wrote `plt.close` without parentheses, which never closed the figure.
plt.close()
运行结果


客户分群#雷达图
代码如下
%matplotlib inline import matplotlib.pyplot as plt labels=['ZL','ZR','ZF','ZM','ZC'] legen=['客户群'+str(i+1) for i in cluster_center.index]#客户群命名 lstype=['-','--',(0,(3,5,1,5,1,5)),':','-.'] kinds=list(cluster_center.iloc[:,0]) #由于雷达图要保证数据闭合,因此再添加L列,并转换为np.ndarry cluster_center=pd.concat([cluster_center,cluster_center[['ZL']]],axis=1) centers=np.array(cluster_center.iloc[:,0:]) #分割圆周长,并让其闭合 n=len(labels) angle=np.linspace(0,2*np.pi,n,endpoint=False) angle=np.concatenate((angle,[angle[0]])) feature=np.concatenate((feature,[feature[0]])) #绘图 fig=plt.figure(figsize=(8,6)) ax=fig.add_subplot(111,polar=True) plt.rcParams['font.sans-serif']=['SimHei'] plt.rcParams['axes.unicode_minus']=False #画线 for i in range(len(kinds)): ax.plot(angle,centers[i],linestyle=lstype[i],linewidth=2,label=kinds[i]) #添加属性标签 ax.set_thetagrids(angle* 180/np.pi, labels) plt.legend(legen) plt.show() plt.close
运行结果

二、电信客户流失分析预测
代码1:读取并简单分析数据
plt.rc("font", family="SimHei", size="12")  # use a font that can display Chinese text
# Raw string: the original "D:\python_data\..." relied on the invalid escape
# sequences \p and \d, which newer Python versions warn about. The resulting
# path is unchanged.
data = pd.read_csv(r"D:\python_data\dianxin_kehuliushi.csv")  # load the data set
data.shape  # dimensions of the data set
(7043, 21)
# Preview the first rows of the data set.
data.head()
运行结果
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
data.describe() # descriptive statistics of the numeric columns
| SeniorCitizen | tenure | MonthlyCharges | |
|---|---|---|---|
| count | 7043.000000 | 7043.000000 | 7043.000000 |
| mean | 0.162147 | 32.371149 | 64.761692 |
| std | 0.368612 | 24.559481 | 30.090047 |
| min | 0.000000 | 0.000000 | 18.250000 |
| 25% | 0.000000 | 9.000000 | 35.500000 |
| 50% | 0.000000 | 29.000000 | 70.350000 |
| 75% | 0.000000 | 55.000000 | 89.850000 |
| max | 1.000000 | 72.000000 | 118.750000 |
代码2:客户流失数据分析
data['Churn'].value_counts() # count the customers for each Churn value (class distribution)
No 5174 Yes 1869 Name: Churn, dtype: int64
#数据集中有5174名用户没流失,有1869名客户流失,数据集不均衡。
data.dtypes # inspect the dtype of every column
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges object Churn object dtype: object
# TotalCharges is the total amount charged; it is loaded as object dtype and
# has to be converted to float.
# errors="coerce" turns entries that cannot be parsed (such as blank strings)
# into NaN. The original element-wise apply with errors="ignore" left those
# entries untouched, so the column stayed object dtype.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].describe()
count 7043 unique 6531 top freq 11 Name: TotalCharges, dtype: object
# Encode the target label (this is label encoding, not normalization).
# 'Yes' and 'No' in the Churn column become 1 and 0 for later processing.
# The result is assigned back to the column: the original chained
# `data['Churn'].replace(..., inplace=True)` acts on a column selection and
# is not guaranteed to modify `data` under pandas copy-on-write.
# Any value other than 'Yes'/'No' maps to NaN.
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})
data['Churn'].describe()
count 7043.000000 mean 0.265370 std 0.441561 min 0.000000 25% 0.000000 50% 0.000000 75% 1.000000 max 1.000000 Name: Churn, dtype: float64
data.info() # overview: column names, non-null counts and dtypes
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7043 entries, 0 to 7042 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customerID 7043 non-null object 1 gender 7043 non-null object 2 SeniorCitizen 7043 non-null int64 3 Partner 7043 non-null object 4 Dependents 7043 non-null object 5 tenure 7043 non-null int64 6 PhoneService 7043 non-null object 7 MultipleLines 7043 non-null object 8 InternetService 7043 non-null object 9 OnlineSecurity 7043 non-null object 10 OnlineBackup 7043 non-null object 11 DeviceProtection 7043 non-null object 12 TechSupport 7043 non-null object 13 StreamingTV 7043 non-null object 14 StreamingMovies 7043 non-null object 15 Contract 7043 non-null object 16 PaperlessBilling 7043 non-null object 17 PaymentMethod 7043 non-null object 18 MonthlyCharges 7043 non-null float64 19 TotalCharges 7043 non-null object 20 Churn 7043 non-null int64 dtypes: float64(1), int64(3), object(17) memory usage: 1.1+ MB
#在数据预览过后,我们发现各列的非空计数均为7043,表面上不存在缺失值,并且许多特征维度的数据类型均为python默认的object对象类型。需要注意的是,TotalCharges列中有11条记录为空白字符串(见上文describe结果中的top与freq),它们实际上属于缺失值。
代码3:绘制电信客户性别饼图和绘制客户流失情况饼图
plt.rcParams['font.sans-serif'] = 'SimHei'
# Use a real boolean; the original passed the string 'False'.
plt.rcParams['axes.unicode_minus'] = False

# Count customers by gender. The original assigned the 'Female' count to
# `male` and the 'Male' count to `female`, so the pie labels were swapped.
gender_counts = data['gender'].value_counts()
male = gender_counts['Male']
female = gender_counts['Female']

# Pie chart of the customer gender ratio.
fig = plt.figure(figsize=(10, 6))
plt.pie(
    [male, female],
    labels=['男', '女'],
    colors=['lightskyblue', 'lightcoral'],
    autopct='%1.1f%%',
)
plt.title('电信用户性别比例3129', fontsize=15)
plt.show()
plt.close()

# Pie chart of the churn distribution; the first slice is pulled out.
churnvalue = data["Churn"].value_counts()
labels = churnvalue.index
plt.figure(figsize=(6, 6))
plt.pie(
    churnvalue,
    labels=labels,
    colors=["blue", "yellow"],
    explode=(0.1, 0),
    autopct='%1.1f%%',  # percent sign added to match the gender pie above
    shadow=True,
)
plt.title('客户流失情况饼图3129', fontsize=15)
# The original wrote `plt.show` without parentheses, so this chart was never displayed.
plt.show()
plt.close()

#由图中结果可以看出,流失客户占整体客户的26.5%。
代码4:客户流失影响直方图
# Effect of gender, senior-citizen status, partner and dependents on churn:
# one count plot per feature in a 2x2 grid, bars split by the Churn value.
plt.figure(figsize=(10, 10))

# (column name, x-axis label, panel title) for each of the four panels.
_churn_panels = [
    ('gender', '性别', '不同性别客户流失直方图3129'),
    ('SeniorCitizen', '老年人', '老年人客户流失直方图3129'),
    ('Partner', '配偶', '是否有配偶客户流失直方图3129'),
    ('Dependents', '亲属', '亲属客户流失直方图3129'),
]

_churn_axes = []
for _position, (_column, _xlabel, _title) in enumerate(_churn_panels, start=1):
    plt.subplot(2, 2, _position)
    # palette selects the colour scheme used for the hue levels ('Set2' here).
    _churn_axes.append(
        sns.countplot(x=_column, hue='Churn', data=data, palette='Set2')
    )
    plt.xlabel(_xlabel)
    plt.title(_title, fontsize=15)

# Keep the per-panel axes available under their original names.
gender, seniorcitizen, partner, dependents = _churn_axes
plt.show()

浙公网安备 33010602011771号