Section 7

import pandas as pd
import numpy as np
import pymysql
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline
# database engine
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')

Reading the data

# read the data
data = 'data/section7-dau.csv'
dau = pd.read_csv(data)

# write to MySQL (one-off)
# dau.to_sql('s7_dau',engine,index=False)
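# once written, the table could be read back through the same engine
# (a sketch; assumes the s7_dau table exists in the datascience database):
# dau = pd.read_sql('s7_dau', engine)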

dau.head()
region_month region_day app_name user_id device
0 2013-01 2013-01-01 game-02 10061580 FP
1 2013-01 2013-01-01 game-02 10154440 FP
2 2013-01 2013-01-01 game-02 10164762 SP
3 2013-01 2013-01-01 game-02 10165615 FP
4 2013-01 2013-01-01 game-02 10321356 FP
# inspect the data
dau.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48988 entries, 0 to 48987
Data columns (total 5 columns):
region_month    48988 non-null object
region_day      48988 non-null object
app_name        48988 non-null object
user_id         48988 non-null int64
device          48988 non-null object
dtypes: int64(1), object(4)
memory usage: 1.9+ MB
print(dau.region_month.value_counts())
print(dau.region_day.unique())
print(dau.device.value_counts())
2013-01    25847
2013-02    23141
Name: region_month, dtype: int64
['2013-01-01' '2013-01-02' '2013-01-03' '2013-01-04' '2013-01-05'
 '2013-01-06' '2013-01-07' '2013-01-08' '2013-01-09' '2013-01-10'
 '2013-01-11' '2013-01-12' '2013-01-13' '2013-01-14' '2013-01-15'
 '2013-01-16' '2013-01-17' '2013-01-18' '2013-01-19' '2013-01-20'
 '2013-01-21' '2013-01-22' '2013-01-23' '2013-01-24' '2013-01-25'
 '2013-01-26' '2013-01-27' '2013-01-28' '2013-01-29' '2013-01-30'
 '2013-01-31' '2013-02-01' '2013-02-02' '2013-02-03' '2013-02-04'
 '2013-02-05' '2013-02-06' '2013-02-07' '2013-02-08' '2013-02-09'
 '2013-02-10' '2013-02-11' '2013-02-12' '2013-02-13' '2013-02-14'
 '2013-02-15' '2013-02-16' '2013-02-17' '2013-02-18' '2013-02-19'
 '2013-02-20' '2013-02-21' '2013-02-22' '2013-02-23' '2013-02-24'
 '2013-02-25' '2013-02-26' '2013-02-27' '2013-02-28']
FP    30331
SP    18657
Name: device, dtype: int64

Preparing the data on whether users migrated their accounts

Extract the needed columns and drop duplicates to get each user's logins by month and device.

mau = dau[['region_month','user_id','device']].copy()
mau.head()
region_month user_id device
0 2013-01 10061580 FP
1 2013-01 10154440 FP
2 2013-01 10164762 SP
3 2013-01 10165615 FP
4 2013-01 10321356 FP
# duplicate rows (the same user logging in with the same device within the same month)
print(mau.duplicated().sum())
mau.drop_duplicates(inplace=True)
print(mau.duplicated().sum())
46007
0



Separating feature-phone (FP) and smartphone (SP) users

fp = dau[dau['device']=='FP'][['region_month','user_id','device']].drop_duplicates()
sp = dau[dau['device']=='SP'][['region_month','user_id','device']].drop_duplicates()
print(fp.info())
print(sp.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1755 entries, 0 to 48901
Data columns (total 3 columns):
region_month    1755 non-null object
user_id         1755 non-null int64
device          1755 non-null object
dtypes: int64(1), object(2)
memory usage: 54.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1226 entries, 2 to 48834
Data columns (total 3 columns):
region_month    1226 non-null object
user_id         1226 non-null int64
device          1226 non-null object
dtypes: int64(1), object(2)
memory usage: 38.3+ KB
None
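
As a sanity check, the FP and SP tables together should reproduce the deduplicated mau table (1755 + 1226 = 2981 rows, matching 48988 - 46007):

print(len(fp) + len(sp), len(mau))   # 2981 2981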

Getting the January and February data separately

# get the January and February slices for each device (as copies, so columns can be added safely later)

fp_m1 = fp[fp['region_month']=='2013-01'].copy()
fp_m2 = fp[fp['region_month']=='2013-02'].copy()

sp_m1 = sp[sp['region_month']=='2013-01'].copy()
sp_m2 = sp[sp['region_month']=='2013-02'].copy()
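
A quick look at the size of each device/month slice:

print(len(fp_m1),len(fp_m2),len(sp_m1),len(sp_m2))   # row counts per slice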

February access by January's feature-phone users

# did January's feature-phone users access the game (on any device) in February?

mau['is_access'] = 1
# note: a user active on both devices in February appears twice in mau, so this merge can duplicate rows
fp_m1 = pd.merge(fp_m1,mau[mau['region_month']=='2013-02'][['user_id','is_access']],how='left',on='user_id')
fp_m1['is_access'] = fp_m1['is_access'].fillna(0)

fp_m1.head()
region_month user_id device is_access
0 2013-01 10061580 FP 1.0
1 2013-01 10154440 FP 0.0
2 2013-01 10165615 FP 1.0
3 2013-01 10321356 FP 1.0
4 2013-01 10447112 FP 1.0

Did the feature-phone users who played in January keep accessing via feature phone in February?

# did the feature-phone users who played in January keep accessing via feature phone in February?

fp_m2['is_fp'] = 1
fp_m1 = pd.merge(fp_m1,fp_m2[['user_id','is_fp']],how='left',on='user_id')
fp_m1['is_fp'] = fp_m1['is_fp'].fillna(0)

fp_m1.head()
region_month user_id device is_access is_fp
0 2013-01 10061580 FP 1.0 1.0
1 2013-01 10154440 FP 0.0 0.0
2 2013-01 10165615 FP 1.0 1.0
3 2013-01 10321356 FP 1.0 1.0
4 2013-01 10447112 FP 1.0 1.0

Did the feature-phone users who played in January switch to smartphone access in February?

# did the feature-phone users who played in January switch to smartphone access in February?

sp_m2['is_sp'] = 1
fp_m1 = pd.merge(fp_m1,sp_m2[['user_id','is_sp']],how='left',on='user_id')
fp_m1['is_sp'] = fp_m1['is_sp'].fillna(0)

fp_m1.head()
region_month user_id device is_access is_fp is_sp
0 2013-01 10061580 FP 1.0 1.0 0.0
1 2013-01 10154440 FP 0.0 0.0 0.0
2 2013-01 10165615 FP 1.0 1.0 0.0
3 2013-01 10321356 FP 1.0 1.0 0.0
4 2013-01 10447112 FP 1.0 1.0 0.0

Users who accessed via feature phone in January but either did not access at all in February or switched to smartphone access

# keep users who either did not access in February (is_access==0) or accessed via smartphone (is_sp==1);
# users who stayed on feature phones are dropped

fp_m1 = fp_m1[(fp_m1['is_access']==0) | (fp_m1['is_sp']==1)]
fp_m1.head()
region_month user_id device is_access is_fp is_sp
1 2013-01 10154440 FP 0.0 0.0 0.0
7 2013-01 10528830 FP 0.0 0.0 0.0
20 2013-01 1163733 FP 1.0 0.0 1.0
21 2013-01 11727630 FP 0.0 0.0 0.0
43 2013-01 13401362 FP 1.0 0.0 1.0

The rows above provide the label data for the logistic regression: is_sp is the target.

Preparing the per-day access data

# flag every daily login record

fp_dau = dau[(dau['device']=='FP') & (dau['region_month']=='2013-01')].copy()
fp_dau['is_access'] = 1
fp_dau.head()
region_month region_day app_name user_id device is_access
0 2013-01 2013-01-01 game-02 10061580 FP 1
1 2013-01 2013-01-01 game-02 10154440 FP 1
3 2013-01 2013-01-01 game-02 10165615 FP 1
4 2013-01 2013-01-01 game-02 10321356 FP 1
6 2013-01 2013-01-01 game-02 10447112 FP 1
# column names: X1day ... X31day
b = ['X' + str(a) + 'day' for a in range(1, 32)]
# b.insert(0,'user_id')

# pivot: one row per user, one 0/1 login flag per January day
fp_dau_pivot = pd.pivot_table(fp_dau, values='is_access', columns='region_day', index='user_id', fill_value=0)
fp_dau_pivot.columns = b
fp_dau_pivot.reset_index(inplace=True)
fp_dau_pivot.head()
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X22day X23day X24day X25day X26day X27day X28day X29day X30day X31day
0 397286 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 1 1
1 471341 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 503874 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 512250 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 1 1
4 513811 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 1 1 0 1

5 rows × 32 columns
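
Each row of the pivot table is now a user's January attendance vector, so summing across the day columns gives that user's number of active days; a quick sketch:

active_days = fp_dau_pivot.iloc[:,1:].sum(axis=1)   # skip the user_id column
print(active_days.describe())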

# merge the January daily-login matrix with the is_sp labels; note the inner join keeps only labeled users

fp_dau_m = pd.merge(fp_dau_pivot, fp_m1[['user_id','is_sp']], how='inner', on='user_id')

fp_dau_m.head()
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X23day X24day X25day X26day X27day X28day X29day X30day X31day is_sp
0 471341 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1.0
1 503874 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0.0
2 1073544 0 0 0 0 0 0 0 0 0 ... 1 1 1 0 0 0 0 0 0 0.0
3 1073864 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0.0
4 1163733 1 1 0 0 0 0 0 0 0 ... 1 1 1 1 1 1 0 0 0 1.0

5 rows × 33 columns

fp_dau_m.isna().sum().sum()
0
fp_dau_m.is_sp.value_counts()
0.0    190
1.0     62
Name: is_sp, dtype: int64

The is_sp flag above indicates: 1 means the user accessed via smartphone in February, 0 means the user churned.

In February 190 users churned, while 62 switched to smartphones.
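
Of those 252 labeled rows, roughly a quarter are migrations:

print(62/(190+62))   # ≈ 0.246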

Logistic regression

1. sklearn

Adjusting the solver and the regularization strength C tunes the fit; 100% training accuracy is out of reach here, though, since some users share identical login patterns but opposite labels (compare rows 0 and 14 in the table below).

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs',C=10)

x = fp_dau_m.iloc[:,1:-1]
y = fp_dau_m.iloc[:,-1]

lr.fit(x,y)

print('Coefficients:',lr.coef_)
print('Intercept:',lr.intercept_)
print('Score:',lr.score(x,y))
Coefficients: [[ 1.64264315  0.38232509  0.27375659  1.77818234 -1.2604587  -0.62425027
   1.64964331  0.94366796 -0.30971957 -2.45689215  1.05453162 -0.49567095
   1.37452985 -0.79198757 -1.39648934  0.18038175 -0.34026571  1.01401641
  -0.49919155 -0.25791649  0.98296119  1.03952236 -1.03446927  1.53177282
  -0.12212919  0.30942289  0.31267693 -0.08203749  1.32893163  1.57890364
   1.29380472]]
Intercept: [-3.9031072]
Score: 0.9047619047619048
yp = lr.predict_proba(x)[:,1]
df = fp_dau_m.copy()
df['prob'] = yp
df['pred'] = df['prob'].apply(lambda x: 1 if x > 0.5 else 0)
df.head(15)
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X25day X26day X27day X28day X29day X30day X31day is_sp prob pred
0 471341 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.543341 1
1 503874 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.094451 0
2 1073544 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0.0 0.002510 0
3 1073864 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.025567 0
4 1163733 1 1 0 0 0 0 0 0 0 ... 1 1 1 1 0 0 0 1.0 0.849838 1
5 1454629 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.073879 0
6 1557628 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 1 0.0 0.051221 0
7 2241462 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.094451 0
8 2313236 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.085385 0
9 2477685 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0.0 0.017546 0
10 2541741 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.001726 0
11 2628661 0 0 0 0 0 1 0 0 0 ... 0 1 0 0 0 0 0 0.0 0.014515 0
12 3509436 0 1 0 1 1 1 0 1 1 ... 1 1 1 1 1 1 1 1.0 0.987940 1
13 3509436 0 1 0 1 1 1 0 1 1 ... 1 1 1 1 1 1 1 1.0 0.987940 1
14 3955950 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.543341 1

15 rows × 35 columns
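
The manual 0.5 cutoff above is the same rule lr.predict applies internally for a binary model, so an equivalent sketch is:

df['pred'] = lr.predict(x).astype(int)   # identical to thresholding predict_proba at 0.5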

df.groupby(['is_sp','pred'])['user_id'].count().reset_index()
is_sp pred user_id
0 0.0 0 181
1 0.0 1 9
2 1.0 0 15
3 1.0 1 47
len(df[df['is_sp']==df['pred']])/len(df)
0.9047619047619048
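
sklearn.metrics produces the same tabulation directly; a sketch:

from sklearn.metrics import confusion_matrix, accuracy_score

print(confusion_matrix(df['is_sp'],df['pred']))   # rows: actual is_sp, columns: predicted
print(accuracy_score(df['is_sp'],df['pred']))     # same 0.9047... as above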

This model needs no manual parameter tuning: LogisticRegressionCV selects C by cross-validation. Its training score here is about 89%.

from sklearn.linear_model import LogisticRegressionCV

lr = LogisticRegressionCV(cv=10)

x = fp_dau_m.iloc[:,1:-1]
y = fp_dau_m.iloc[:,-1]

lr.fit(x,y)

print('Coefficients:',lr.coef_)
print('Intercept:',lr.intercept_)
print('-----------------------------------------------')
print('Score: ',lr.score(x,y))
Coefficients: [[ 0.66247469  0.39566209  0.12089587  0.72621501 -0.14485039 -0.11496137
   0.50433275  0.25667173  0.11561233 -0.48159577  0.23713178 -0.12897139
   0.31542595 -0.16714406 -0.1914315  -0.09390318 -0.05036135  0.0924934
  -0.14949742 -0.05918408  0.52355482  0.58543392  0.0882812   0.39783666
   0.07477356  0.14874974  0.39921228  0.38402639  0.68729765  0.6331324
   0.55885631]]
Intercept: [-2.95546571]
-----------------------------------------------
Score:  0.8928571428571429
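
Both scores so far are computed on the training data itself; a held-out split gives a less optimistic estimate. A minimal sketch (the 0.3 test fraction and random_state are arbitrary choices):

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=0)
lr_holdout = LogisticRegression(solver='lbfgs',C=10).fit(x_train,y_train)
print(lr_holdout.score(x_test,y_test))   # accuracy on unseen rows
# caveat: some users appear twice in fp_dau_m, so rows of the same user can leak across the split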

2. statsmodels

import statsmodels.api as sm
import statsmodels.formula.api as fsm     # formula-style fitting; found awkward to use

x = fp_dau_m.iloc[:,1:-1]
x['intercept'] = 1.0          # add an intercept column manually; sm.Logit does not add one
y = fp_dau_m.iloc[:,-1]

logit = sm.Logit(y, x)
result = logit.fit(method='bfgs',maxiter=100)
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.222887
         Iterations: 100
         Function evaluations: 101
         Gradient evaluations: 101


C:\Users\sylva\AppData\Roaming\Python\Python36\site-packages\statsmodels\base\model.py:508: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
# result1 = logit.fit_regularized(alpha=5)
result.pred_table()
array([[180.,  10.],
       [ 14.,  48.]])
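
The diagonal of pred_table already yields the overall accuracy reported at the end:

print((180+48)/252)   # 0.9047..., correct predictions over 252 rows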
# result1.pred_table()
print(result.summary2())
                         Results: Logit
=================================================================
Model:              Logit            Pseudo R-squared: 0.601     
Dependent Variable: is_sp            AIC:              176.3352  
Date:               2018-08-24 12:07 BIC:              289.2770  
No. Observations:   252              Log-Likelihood:   -56.168   
Df Model:           31               LL-Null:          -140.60   
Df Residuals:       220              LLR p-value:      6.6358e-21
Converged:          0.0000           Scale:            1.0000    
------------------------------------------------------------------
               Coef.   Std.Err.     z     P>|z|    [0.025   0.975]
------------------------------------------------------------------
X1day          1.9894    0.8047   2.4720  0.0134   0.4121   3.5666
X2day          0.3311    1.0705   0.3093  0.7571  -1.7671   2.4293
X3day          0.3793    0.9406   0.4033  0.6867  -1.4641   2.2227
X4day          2.0422    0.8359   2.4430  0.0146   0.4038   3.6805
X5day         -1.7597    1.1991  -1.4675  0.1422  -4.1100   0.5906
X6day         -0.6679    1.1717  -0.5701  0.5686  -2.9643   1.6285
X7day          2.0157    1.1176   1.8036  0.0713  -0.1747   4.2061
X8day          1.2119    1.3505   0.8974  0.3695  -1.4350   3.8589
X9day         -0.4495    1.1874  -0.3786  0.7050  -2.7768   1.8778
X10day        -3.2374    1.5580  -2.0779  0.0377  -6.2911  -0.1837
X11day         1.4392    1.2234   1.1764  0.2394  -0.9586   3.8370
X12day        -0.6389    1.5297  -0.4176  0.6762  -3.6370   2.3592
X13day         1.7797    1.1424   1.5579  0.1193  -0.4594   4.0188
X14day        -1.1242    1.2455  -0.9026  0.3668  -3.5653   1.3170
X15day        -1.8115    1.3050  -1.3881  0.1651  -4.3694   0.7463
X16day         0.4940    1.1666   0.4234  0.6720  -1.7925   2.7804
X17day        -0.4448    1.2234  -0.3636  0.7162  -2.8427   1.9531
X18day         1.4321    1.1465   1.2491  0.2116  -0.8150   3.6791
X19day        -0.6132    1.1990  -0.5114  0.6091  -2.9632   1.7369
X20day        -0.3130    1.4007  -0.2235  0.8232  -3.0585   2.4324
X21day         0.9587    1.2558   0.7634  0.4452  -1.5027   3.4201
X22day         1.1954    1.1238   1.0637  0.2875  -1.0072   3.3980
X23day        -1.5371    1.2303  -1.2494  0.2115  -3.9486   0.8743
X24day         1.8445    1.1038   1.6710  0.0947  -0.3190   4.0080
X25day         0.1292    1.5317   0.0844  0.9328  -2.8727   3.1312
X26day         0.3131    1.4280   0.2192  0.8265  -2.4858   3.1119
X27day         0.3365    1.2965   0.2596  0.7952  -2.2045   2.8776
X28day        -0.3918    1.8515  -0.2116  0.8324  -4.0207   3.2372
X29day         1.5941    1.0565   1.5088  0.1314  -0.4767   3.6648
X30day         1.9943    1.2117   1.6459  0.0998  -0.3806   4.3692
X31day         1.5214    1.1798   1.2896  0.1972  -0.7908   3.8337
intercept     -4.2502    0.5904  -7.1985  0.0000  -5.4074  -3.0930
=================================================================
# print(result1.summary2())
xx = fp_dau_m.iloc[:,1:-1]
xx['intercept'] = 1.0         # the intercept column must also be present at prediction time

y_p = result.predict(xx)

ydf = fp_dau_m.copy()
ydf['prob'] = y_p
ydf['pred'] = ydf['prob'].apply(lambda x: 1 if x > 0.5 else 0)
ydf.head(15)
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X25day X26day X27day X28day X29day X30day X31day is_sp prob pred
0 471341 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.620506 1
1 503874 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.094416 0
2 1073544 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0.0 0.000866 0
3 1073864 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.019167 0
4 1163733 1 1 0 0 0 0 0 0 0 ... 1 1 1 1 0 0 0 1.0 0.870576 1
5 1454629 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.077951 0
6 1557628 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 1 0.0 0.039991 0
7 2241462 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.094416 0
8 2313236 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.082739 0
9 2477685 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0.0 0.015969 0
10 2541741 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.000560 0
11 2628661 0 0 0 0 0 1 0 0 0 ... 0 1 0 0 0 0 0 0.0 0.009902 0
12 3509436 0 1 0 1 1 1 0 1 1 ... 1 1 1 1 1 1 1 1.0 0.992456 1
13 3509436 0 1 0 1 1 1 0 1 1 ... 1 1 1 1 1 1 1 1.0 0.992456 1
14 3955950 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.620506 1

15 rows × 35 columns

ydf.groupby(['is_sp','pred'])['user_id'].count().reset_index()
is_sp pred user_id
0 0.0 0 180
1 0.0 1 10
2 1.0 0 14
3 1.0 1 48
len(ydf[ydf['is_sp']==ydf['pred']])/len(ydf)
0.9047619047619048

Examining the results

According to the sklearn predictions, 9 users were predicted as 1, i.e. as having migrated their accounts, when in fact they had not. Judging from their past access patterns these users looked like migrators, yet in reality they belong to the churned group.

df.head(10)
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X25day X26day X27day X28day X29day X30day X31day is_sp prob pred
0 471341 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.543341 1
1 503874 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.094451 0
2 1073544 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0.0 0.002510 0
3 1073864 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.025567 0
4 1163733 1 1 0 0 0 0 0 0 0 ... 1 1 1 1 0 0 0 1.0 0.849838 1
5 1454629 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.073879 0
6 1557628 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 1 0.0 0.051221 0
7 2241462 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.094451 0
8 2313236 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.085385 0
9 2477685 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0.0 0.017546 0

10 rows × 35 columns

df1 = df[(df['is_sp']==1) & (df['pred']==1)]
df1.sort_values(by='prob',ascending=True).head(15)
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X25day X26day X27day X28day X29day X30day X31day is_sp prob pred
228 52776438 1 1 1 1 1 1 1 1 1 ... 0 0 0 0 0 0 0 1.0 0.512293 1
171 32762652 1 1 1 1 1 1 1 1 1 ... 0 0 0 0 0 0 0 1.0 0.512293 1
155 27800629 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.543341 1
0 471341 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.543341 1
36 8645980 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 1 0 0 1.0 0.551574 1
37 8645980 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 1 0 0 1.0 0.551574 1
169 32500332 1 1 1 1 1 1 1 1 1 ... 1 0 0 0 0 0 0 1.0 0.587923 1
55 11600349 0 1 1 1 1 1 1 1 1 ... 0 0 0 1 1 1 1 1.0 0.684198 1
56 11600349 0 1 1 1 1 1 1 1 1 ... 0 0 0 1 1 1 1 1.0 0.684198 1
146 25787360 0 0 0 0 1 0 1 1 1 ... 0 0 1 0 0 0 0 1.0 0.696295 1
145 25787360 0 0 0 0 1 0 1 1 1 ... 0 0 1 0 0 0 0 1.0 0.696295 1
4 1163733 1 1 0 0 0 0 0 0 0 ... 1 1 1 1 0 0 0 1.0 0.849838 1
48 10406653 0 1 1 1 1 1 1 1 0 ... 1 0 1 1 1 1 1 1.0 0.865393 1
49 10406653 0 1 1 1 1 1 1 1 0 ... 1 0 1 1 1 1 1 1.0 0.865393 1
165 31066299 0 1 1 1 0 1 1 1 1 ... 1 1 1 0 1 1 0 1.0 0.951970 1

15 rows × 35 columns

df2 = df[(df['is_sp']==1) & (df['pred']==1)]
df2.sort_values(by='prob',ascending=False).head(15)
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X25day X26day X27day X28day X29day X30day X31day is_sp prob pred
136 24791702 1 1 0 1 0 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.998618 1
137 24791702 1 1 0 1 0 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.998618 1
44 9567562 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.996302 1
43 9567562 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.996302 1
139 24900784 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.993923 1
124 23113079 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.993923 1
133 24581383 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.993923 1
134 24581383 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.993923 1
138 24900784 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.993923 1
123 23113079 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.993923 1
114 21551429 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.993923 1
147 27003770 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.993923 1
148 27003770 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.993923 1
150 27602710 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.993923 1
151 27602710 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1.0 0.993923 1

15 rows × 35 columns

df3 = df[(df['is_sp']==0) & (df['pred']==1)]
df3.sort_values(by='prob',ascending=False).head(15)
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X25day X26day X27day X28day X29day X30day X31day is_sp prob pred
194 41590801 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 1 0.0 0.677458 1
108 19432099 1 1 1 1 0 1 1 1 1 ... 0 0 0 0 0 0 0 0.0 0.643061 1
203 43451947 1 1 1 1 1 0 1 1 1 ... 1 0 0 1 1 0 0 0.0 0.599921 1
197 42276142 1 1 1 1 1 1 0 1 1 ... 1 1 1 1 1 0 0 0.0 0.577420 1
209 46285446 0 0 0 0 1 1 1 1 1 ... 1 1 1 0 1 0 0 0.0 0.576873 1
14 3955950 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.543341 1
158 28391896 1 1 1 1 1 1 1 1 1 ... 0 0 0 0 0 0 0 0.0 0.512293 1
240 59561276 1 1 1 1 1 1 1 1 1 ... 0 0 0 0 0 0 0 0.0 0.512293 1
27 6147878 1 0 0 1 1 1 1 1 1 ... 1 1 0 0 0 0 0 0.0 0.502182 1

9 rows × 35 columns

df4 = df[(df['is_sp']==0) & (df['pred']==1)]
df4.sort_values(by='prob',ascending=True).head(15)
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X25day X26day X27day X28day X29day X30day X31day is_sp prob pred
27 6147878 1 0 0 1 1 1 1 1 1 ... 1 1 0 0 0 0 0 0.0 0.502182 1
158 28391896 1 1 1 1 1 1 1 1 1 ... 0 0 0 0 0 0 0 0.0 0.512293 1
240 59561276 1 1 1 1 1 1 1 1 1 ... 0 0 0 0 0 0 0 0.0 0.512293 1
14 3955950 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.543341 1
209 46285446 0 0 0 0 1 1 1 1 1 ... 1 1 1 0 1 0 0 0.0 0.576873 1
197 42276142 1 1 1 1 1 1 0 1 1 ... 1 1 1 1 1 0 0 0.0 0.577420 1
203 43451947 1 1 1 1 1 0 1 1 1 ... 1 0 0 1 1 0 0 0.0 0.599921 1
108 19432099 1 1 1 1 0 1 1 1 1 ... 0 0 0 0 0 0 0 0.0 0.643061 1
194 41590801 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 1 0.0 0.677458 1

9 rows × 35 columns

df5 = df[(df['is_sp']==0) & (df['pred']==0)]
df5.sort_values(by='prob',ascending=True).head(15)
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X25day X26day X27day X28day X29day X30day X31day is_sp prob pred
149 27249550 0 0 0 1 1 1 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.000946 0
10 2541741 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.001726 0
242 60725457 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.001726 0
101 18408297 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.001745 0
172 33766090 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.002257 0
2 1073544 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0.0 0.002510 0
227 52612953 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.003087 0
63 12582684 0 0 0 1 1 0 1 0 0 ... 0 0 0 0 0 0 0 0.0 0.004780 0
208 46056688 0 0 0 0 0 1 1 0 0 ... 0 0 0 0 0 0 0 0.0 0.004799 0
66 13157777 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.004969 0
190 40654033 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.004969 0
120 22437652 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.005689 0
87 16601600 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.005689 0
70 13967453 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.005689 0
112 20955934 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0.0 0.005689 0

15 rows × 35 columns

df6 = df[(df['is_sp']==1) & (df['pred']==0)]
df6.sort_values(by='prob',ascending=False).head(15)
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X25day X26day X27day X28day X29day X30day X31day is_sp prob pred
198 42438713 1 1 1 1 1 1 1 0 0 ... 0 0 0 0 0 0 0 1.0 0.484688 0
127 23689923 1 1 0 1 1 1 1 1 1 ... 0 0 0 0 0 0 0 1.0 0.359100 0
213 47332069 0 0 0 0 0 0 0 0 0 ... 1 1 0 0 0 0 0 1.0 0.281079 0
140 24914421 1 1 1 0 0 0 0 1 0 ... 0 1 0 0 0 0 0 1.0 0.278119 0
226 52131958 0 0 1 1 1 1 1 1 1 ... 1 1 1 0 0 0 0 1.0 0.259709 0
212 47266966 1 0 0 1 0 1 1 1 1 ... 0 0 0 0 0 0 0 1.0 0.232730 0
236 57869405 0 0 0 0 0 0 1 1 0 ... 0 0 0 0 0 0 0 1.0 0.212521 0
161 29698758 1 1 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.167370 0
30 7177251 1 1 1 1 1 1 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.153046 0
7 2241462 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.094451 0
67 13401362 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.094451 0
80 15569351 0 0 0 0 0 0 1 0 1 ... 0 0 0 0 0 0 0 1.0 0.071546 0
93 17388480 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 1.0 0.070819 0
94 17388480 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 0 0 1.0 0.070819 0
163 30103279 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1.0 0.028795 0

15 rows × 35 columns

The copy problem surfaced here: the = sign only binds another name to the same object in memory, so when an independent DataFrame is needed, use .copy().
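
A minimal illustration of the difference:

alias = fp_dau_m              # plain assignment: both names refer to the same object
print(alias is fp_dau_m)      # True
snapshot = fp_dau_m.copy()    # an independent DataFrame
print(snapshot is fp_dau_m)   # False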

fp_dau_m.head()
user_id X1day X2day X3day X4day X5day X6day X7day X8day X9day ... X23day X24day X25day X26day X27day X28day X29day X30day X31day is_sp
0 471341 1 1 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1.0
1 503874 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0.0
2 1073544 0 0 0 0 0 0 0 0 0 ... 1 1 1 0 0 0 0 0 0 0.0
3 1073864 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0.0
4 1163733 1 1 0 0 0 0 0 0 0 ... 1 1 1 1 1 1 0 0 0 1.0

5 rows × 33 columns

df.equals(fp_dau_m)
False
df.equals(ydf)
False
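
Both False results are expected: df gained the prob and pred columns relative to fp_dau_m, and df and ydf hold different probabilities (sklearn vs statsmodels):

print(set(df.columns) - set(fp_dau_m.columns))   # {'prob', 'pred'}
print((df['prob'] != ydf['prob']).any())         # True: the two models' probabilities differ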