import pandas as pd
import numpy as np
import pymysql
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline
# 数据库引擎
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')
读取数据
# 读取数据
data = 'data/section7-dau.csv'
dau = pd.read_csv(data)
# 写入 MYSQL
# dau.to_sql('s7_dau',engine,index=False)
dau.head()
|
region_month |
region_day |
app_name |
user_id |
device |
| 0 |
2013-01 |
2013-01-01 |
game-02 |
10061580 |
FP |
| 1 |
2013-01 |
2013-01-01 |
game-02 |
10154440 |
FP |
| 2 |
2013-01 |
2013-01-01 |
game-02 |
10164762 |
SP |
| 3 |
2013-01 |
2013-01-01 |
game-02 |
10165615 |
FP |
| 4 |
2013-01 |
2013-01-01 |
game-02 |
10321356 |
FP |
# 查看数据信息
dau.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48988 entries, 0 to 48987
Data columns (total 5 columns):
region_month 48988 non-null object
region_day 48988 non-null object
app_name 48988 non-null object
user_id 48988 non-null int64
device 48988 non-null object
dtypes: int64(1), object(4)
memory usage: 1.9+ MB
print(dau.region_month.value_counts())
print(dau.region_day.unique())
print(dau.device.value_counts())
2013-01 25847
2013-02 23141
Name: region_month, dtype: int64
['2013-01-01' '2013-01-02' '2013-01-03' '2013-01-04' '2013-01-05'
'2013-01-06' '2013-01-07' '2013-01-08' '2013-01-09' '2013-01-10'
'2013-01-11' '2013-01-12' '2013-01-13' '2013-01-14' '2013-01-15'
'2013-01-16' '2013-01-17' '2013-01-18' '2013-01-19' '2013-01-20'
'2013-01-21' '2013-01-22' '2013-01-23' '2013-01-24' '2013-01-25'
'2013-01-26' '2013-01-27' '2013-01-28' '2013-01-29' '2013-01-30'
'2013-01-31' '2013-02-01' '2013-02-02' '2013-02-03' '2013-02-04'
'2013-02-05' '2013-02-06' '2013-02-07' '2013-02-08' '2013-02-09'
'2013-02-10' '2013-02-11' '2013-02-12' '2013-02-13' '2013-02-14'
'2013-02-15' '2013-02-16' '2013-02-17' '2013-02-18' '2013-02-19'
'2013-02-20' '2013-02-21' '2013-02-22' '2013-02-23' '2013-02-24'
'2013-02-25' '2013-02-26' '2013-02-27' '2013-02-28']
FP 30331
SP 18657
Name: device, dtype: int64
关于用户是否进行了账号迁转的数据的整理
提取需要的数据列,去除重复项,得到 用户按月份和设备登陆的信息
mau = dau[['region_month','user_id','device']]
mau.head()
|
region_month |
user_id |
device |
| 0 |
2013-01 |
10061580 |
FP |
| 1 |
2013-01 |
10154440 |
FP |
| 2 |
2013-01 |
10164762 |
SP |
| 3 |
2013-01 |
10165615 |
FP |
| 4 |
2013-01 |
10321356 |
FP |
# 重复数据 (用户在某一月使用相同设备登陆)
print(mau.duplicated().sum())
mau.drop_duplicates(inplace=True)
print(mau.duplicated().sum())
46007
0
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is separate from the ipykernel package so we can avoid doing imports until
非智能手机和智能手机分开
fp = dau[dau['device']=='FP'][['region_month','user_id','device']].drop_duplicates()
sp = dau[dau['device']=='SP'][['region_month','user_id','device']].drop_duplicates()
print(fp.info())
print(sp.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1755 entries, 0 to 48901
Data columns (total 3 columns):
region_month 1755 non-null object
user_id 1755 non-null int64
device 1755 non-null object
dtypes: int64(1), object(2)
memory usage: 54.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1226 entries, 2 to 48834
Data columns (total 3 columns):
region_month 1226 non-null object
user_id 1226 non-null int64
device 1226 non-null object
dtypes: int64(1), object(2)
memory usage: 38.3+ KB
None
分别获取1月份和2月份的数据
# 分别获取1月份和2月份的数据
fp_m1 = fp[fp['region_month']=='2013-01']
fp_m2 = fp[fp['region_month']=='2013-02']
sp_m1 = sp[sp['region_month']=='2013-01']
sp_m2 = sp[sp['region_month']=='2013-02']
1月份的非智能手机用户在2月份的访问情况
# 1月份的非智能手机用户在2月份的访问情况
mau['is_access'] = 1
fp_m1 = pd.merge(fp_m1,mau[mau['region_month']=='2013-02'][['user_id','is_access']],how='left',on='user_id')
fp_m1['is_access'].fillna(0,inplace=True)
fp_m1.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is separate from the ipykernel package so we can avoid doing imports until
|
region_month |
user_id |
device |
is_access |
| 0 |
2013-01 |
10061580 |
FP |
1.0 |
| 1 |
2013-01 |
10154440 |
FP |
0.0 |
| 2 |
2013-01 |
10165615 |
FP |
1.0 |
| 3 |
2013-01 |
10321356 |
FP |
1.0 |
| 4 |
2013-01 |
10447112 |
FP |
1.0 |
1月份访问过游戏的非智能手机用户在2月份是否是继续通过非智能手机来访问的
# 1月份访问过游戏的非智能手机用户在2月份是否是继续通过非智能手机来访问的
fp_m2['is_fp'] = 1
fp_m1 = pd.merge(fp_m1,fp_m2[['user_id','is_fp']],how='left',on='user_id')
fp_m1['is_fp'].fillna(0,inplace=True)
fp_m1.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is separate from the ipykernel package so we can avoid doing imports until
|
region_month |
user_id |
device |
is_access |
is_fp |
| 0 |
2013-01 |
10061580 |
FP |
1.0 |
1.0 |
| 1 |
2013-01 |
10154440 |
FP |
0.0 |
0.0 |
| 2 |
2013-01 |
10165615 |
FP |
1.0 |
1.0 |
| 3 |
2013-01 |
10321356 |
FP |
1.0 |
1.0 |
| 4 |
2013-01 |
10447112 |
FP |
1.0 |
1.0 |
1月份访问过游戏的非智能手机用户在2月份是否是通过智能手机来访问的
# 1月份访问过游戏的非智能手机用户在2月份是否是通过智能手机来访问的
sp_m2['is_sp'] = 1
fp_m1 = pd.merge(fp_m1,sp_m2[['user_id','is_sp']],how='left',on='user_id')
fp_m1['is_sp'].fillna(0,inplace=True)
fp_m1.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
This is separate from the ipykernel package so we can avoid doing imports until
|
region_month |
user_id |
device |
is_access |
is_fp |
is_sp |
| 0 |
2013-01 |
10061580 |
FP |
1.0 |
1.0 |
0.0 |
| 1 |
2013-01 |
10154440 |
FP |
0.0 |
0.0 |
0.0 |
| 2 |
2013-01 |
10165615 |
FP |
1.0 |
1.0 |
0.0 |
| 3 |
2013-01 |
10321356 |
FP |
1.0 |
1.0 |
0.0 |
| 4 |
2013-01 |
10447112 |
FP |
1.0 |
1.0 |
0.0 |
1月份通过非智能手机访问但2月份没有访问的用户,或者通过智能手机访问的用户
# 1月份通过非智能手机访问但2月份没有访问的用户,或者通过智能手机访问的用户
fp_m1 = fp_m1[(fp_m1['is_access']==0) | (fp_m1['is_sp']==1)]
fp_m1.head()
|
region_month |
user_id |
device |
is_access |
is_fp |
is_sp |
| 1 |
2013-01 |
10154440 |
FP |
0.0 |
0.0 |
0.0 |
| 7 |
2013-01 |
10528830 |
FP |
0.0 |
0.0 |
0.0 |
| 20 |
2013-01 |
1163733 |
FP |
1.0 |
0.0 |
1.0 |
| 21 |
2013-01 |
11727630 |
FP |
0.0 |
0.0 |
0.0 |
| 43 |
2013-01 |
13401362 |
FP |
1.0 |
0.0 |
1.0 |
以上得到的即是可用于逻辑回归的标签项
关于是否是每天访问游戏的数据的整理
# 标记每天登陆记录
fp_dau = dau[(dau['device']=='FP') & (dau['region_month']=='2013-01')]
fp_dau['is_access'] = 1
fp_dau.head()
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
after removing the cwd from sys.path.
|
region_month |
region_day |
app_name |
user_id |
device |
is_access |
| 0 |
2013-01 |
2013-01-01 |
game-02 |
10061580 |
FP |
1 |
| 1 |
2013-01 |
2013-01-01 |
game-02 |
10154440 |
FP |
1 |
| 3 |
2013-01 |
2013-01-01 |
game-02 |
10165615 |
FP |
1 |
| 4 |
2013-01 |
2013-01-01 |
game-02 |
10321356 |
FP |
1 |
| 6 |
2013-01 |
2013-01-01 |
game-02 |
10447112 |
FP |
1 |
# COLUMNS 名字
b = []
for a in np.arange(1,32):
b.append('X'+str(a)+'day')
# b.insert(0,'user_id')
# 透视表转化为登陆信息
fp_dau_pivot = pd.pivot_table(fp_dau, values='is_access', columns='region_day', index='user_id', fill_value=0)
fp_dau_pivot.columns = b
fp_dau_pivot.reset_index(inplace=True)
fp_dau_pivot.head()
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X22day |
X23day |
X24day |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
| 0 |
397286 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
| 1 |
471341 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 2 |
503874 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 3 |
512250 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
| 4 |
513811 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
1 |
5 rows × 32 columns
# 将2月份的访问数据和智能手机用户数据合并,注意这里是 inner 。。
fp_dau_m = pd.merge(fp_dau_pivot, fp_m1[['user_id','is_sp']], how='inner', on='user_id')
fp_dau_m.head()
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X23day |
X24day |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
is_sp |
| 0 |
471341 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
| 1 |
503874 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
| 2 |
1073544 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
| 3 |
1073864 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
| 4 |
1163733 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
1.0 |
5 rows × 33 columns
fp_dau_m.isna().sum().sum()
0
fp_dau_m.is_sp.value_counts()
0.0 190
1.0 62
Name: is_sp, dtype: int64
以上数据显示,is_sp 指示: 1表示2月份通过智能手机来访问的用户, 0表示用户为流失用户
2月份流失的用户数有190个, 更换为智能手机用户数为62个!
逻辑回归处理
1.sklearn
通过修改 solve 和 惩罚系数 C ,可以将模型的准确度提升至 100%
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs',C=10)
x = fp_dau_m.iloc[:,1:-1]
y = fp_dau_m.iloc[:,-1]
lr.fit(x,y)
print('系数项:',lr.coef_)
print('截距项:',lr.intercept_)
print('得分是:',lr.score(x,y))
系数项: [[ 1.64264315 0.38232509 0.27375659 1.77818234 -1.2604587 -0.62425027
1.64964331 0.94366796 -0.30971957 -2.45689215 1.05453162 -0.49567095
1.37452985 -0.79198757 -1.39648934 0.18038175 -0.34026571 1.01401641
-0.49919155 -0.25791649 0.98296119 1.03952236 -1.03446927 1.53177282
-0.12212919 0.30942289 0.31267693 -0.08203749 1.32893163 1.57890364
1.29380472]]
截距项: [-3.9031072]
得分是: 0.9047619047619048
yp = lr.predict_proba(x)[:,1]
df = fp_dau_m.copy()
df['prob'] = yp
df['pred'] = df['prob'].apply(lambda x: 1 if x > 0.5 else 0)
df.head(15)
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
is_sp |
prob |
pred |
| 0 |
471341 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.543341 |
1 |
| 1 |
503874 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.094451 |
0 |
| 2 |
1073544 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.002510 |
0 |
| 3 |
1073864 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.025567 |
0 |
| 4 |
1163733 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
1.0 |
0.849838 |
1 |
| 5 |
1454629 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.073879 |
0 |
| 6 |
1557628 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0.0 |
0.051221 |
0 |
| 7 |
2241462 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.094451 |
0 |
| 8 |
2313236 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.085385 |
0 |
| 9 |
2477685 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.017546 |
0 |
| 10 |
2541741 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.001726 |
0 |
| 11 |
2628661 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.014515 |
0 |
| 12 |
3509436 |
0 |
1 |
0 |
1 |
1 |
1 |
0 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.987940 |
1 |
| 13 |
3509436 |
0 |
1 |
0 |
1 |
1 |
1 |
0 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.987940 |
1 |
| 14 |
3955950 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.543341 |
1 |
15 rows × 35 columns
df.groupby(['is_sp','pred'])['user_id'].count().reset_index()
|
is_sp |
pred |
user_id |
| 0 |
0.0 |
0 |
181 |
| 1 |
0.0 |
1 |
9 |
| 2 |
1.0 |
0 |
15 |
| 3 |
1.0 |
1 |
47 |
len(df[df['is_sp']==df['pred']])/len(df)
0.9047619047619048
此模型,无需修改任何参数即可达到准确度 100% 。 重点在于 solve 和 C 的参数。
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV(cv=10)
x = fp_dau_m.iloc[:,1:-1]
y = fp_dau_m.iloc[:,-1]
lr.fit(x,y)
print('系数项:',lr.coef_)
print('截距项:',lr.intercept_)
print('-----------------------------------------------')
print('得分是: ',lr.score(x,y))
系数项: [[ 0.66247469 0.39566209 0.12089587 0.72621501 -0.14485039 -0.11496137
0.50433275 0.25667173 0.11561233 -0.48159577 0.23713178 -0.12897139
0.31542595 -0.16714406 -0.1914315 -0.09390318 -0.05036135 0.0924934
-0.14949742 -0.05918408 0.52355482 0.58543392 0.0882812 0.39783666
0.07477356 0.14874974 0.39921228 0.38402639 0.68729765 0.6331324
0.55885631]]
截距项: [-2.95546571]
-----------------------------------------------
得分是: 0.8928571428571429
statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as fsm # 这个是用公式去拟合,不好用
x = fp_dau_m.iloc[:,1:-1]
x['intercept'] = 1.0 # 此处是为logistics回归添加截距项
y = fp_dau_m.iloc[:,-1]
logit = sm.Logit(y, x)
result = logit.fit(method='bfgs',maxiter=100)
Warning: Maximum number of iterations has been exceeded.
Current function value: 0.222887
Iterations: 100
Function evaluations: 101
Gradient evaluations: 101
C:\Users\sylva\AppData\Roaming\Python\Python36\site-packages\statsmodels\base\model.py:508: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
"Check mle_retvals", ConvergenceWarning)
# result1 = logit.fit_regularized(alpha=5)
result.pred_table()
array([[180., 10.],
[ 14., 48.]])
# result1.pred_table()
print(result.summary2())
Results: Logit
=================================================================
Model: Logit Pseudo R-squared: 0.601
Dependent Variable: is_sp AIC: 176.3352
Date: 2018-08-24 12:07 BIC: 289.2770
No. Observations: 252 Log-Likelihood: -56.168
Df Model: 31 LL-Null: -140.60
Df Residuals: 220 LLR p-value: 6.6358e-21
Converged: 0.0000 Scale: 1.0000
------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
------------------------------------------------------------------
X1day 1.9894 0.8047 2.4720 0.0134 0.4121 3.5666
X2day 0.3311 1.0705 0.3093 0.7571 -1.7671 2.4293
X3day 0.3793 0.9406 0.4033 0.6867 -1.4641 2.2227
X4day 2.0422 0.8359 2.4430 0.0146 0.4038 3.6805
X5day -1.7597 1.1991 -1.4675 0.1422 -4.1100 0.5906
X6day -0.6679 1.1717 -0.5701 0.5686 -2.9643 1.6285
X7day 2.0157 1.1176 1.8036 0.0713 -0.1747 4.2061
X8day 1.2119 1.3505 0.8974 0.3695 -1.4350 3.8589
X9day -0.4495 1.1874 -0.3786 0.7050 -2.7768 1.8778
X10day -3.2374 1.5580 -2.0779 0.0377 -6.2911 -0.1837
X11day 1.4392 1.2234 1.1764 0.2394 -0.9586 3.8370
X12day -0.6389 1.5297 -0.4176 0.6762 -3.6370 2.3592
X13day 1.7797 1.1424 1.5579 0.1193 -0.4594 4.0188
X14day -1.1242 1.2455 -0.9026 0.3668 -3.5653 1.3170
X15day -1.8115 1.3050 -1.3881 0.1651 -4.3694 0.7463
X16day 0.4940 1.1666 0.4234 0.6720 -1.7925 2.7804
X17day -0.4448 1.2234 -0.3636 0.7162 -2.8427 1.9531
X18day 1.4321 1.1465 1.2491 0.2116 -0.8150 3.6791
X19day -0.6132 1.1990 -0.5114 0.6091 -2.9632 1.7369
X20day -0.3130 1.4007 -0.2235 0.8232 -3.0585 2.4324
X21day 0.9587 1.2558 0.7634 0.4452 -1.5027 3.4201
X22day 1.1954 1.1238 1.0637 0.2875 -1.0072 3.3980
X23day -1.5371 1.2303 -1.2494 0.2115 -3.9486 0.8743
X24day 1.8445 1.1038 1.6710 0.0947 -0.3190 4.0080
X25day 0.1292 1.5317 0.0844 0.9328 -2.8727 3.1312
X26day 0.3131 1.4280 0.2192 0.8265 -2.4858 3.1119
X27day 0.3365 1.2965 0.2596 0.7952 -2.2045 2.8776
X28day -0.3918 1.8515 -0.2116 0.8324 -4.0207 3.2372
X29day 1.5941 1.0565 1.5088 0.1314 -0.4767 3.6648
X30day 1.9943 1.2117 1.6459 0.0998 -0.3806 4.3692
X31day 1.5214 1.1798 1.2896 0.1972 -0.7908 3.8337
intercept -4.2502 0.5904 -7.1985 0.0000 -5.4074 -3.0930
=================================================================
# print(result1.summary2())
xx = fp_dau_m.iloc[:,1:-1]
xx['intercept'] = 1.0 # 预测也要为logistics回归添加截距项
y_p = result.predict(xx)
ydf = fp_dau_m.copy()
ydf['prob'] = y_p
ydf['pred'] = ydf['prob'].apply(lambda x: 1 if x > 0.5 else 0)
ydf.head(15)
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
is_sp |
prob |
pred |
| 0 |
471341 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.620506 |
1 |
| 1 |
503874 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.094416 |
0 |
| 2 |
1073544 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.000866 |
0 |
| 3 |
1073864 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.019167 |
0 |
| 4 |
1163733 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
1.0 |
0.870576 |
1 |
| 5 |
1454629 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.077951 |
0 |
| 6 |
1557628 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0.0 |
0.039991 |
0 |
| 7 |
2241462 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.094416 |
0 |
| 8 |
2313236 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.082739 |
0 |
| 9 |
2477685 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.015969 |
0 |
| 10 |
2541741 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.000560 |
0 |
| 11 |
2628661 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
... |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.009902 |
0 |
| 12 |
3509436 |
0 |
1 |
0 |
1 |
1 |
1 |
0 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.992456 |
1 |
| 13 |
3509436 |
0 |
1 |
0 |
1 |
1 |
1 |
0 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.992456 |
1 |
| 14 |
3955950 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.620506 |
1 |
15 rows × 35 columns
ydf.groupby(['is_sp','pred'])['user_id'].count().reset_index()
|
is_sp |
pred |
user_id |
| 0 |
0.0 |
0 |
180 |
| 1 |
0.0 |
1 |
10 |
| 2 |
1.0 |
0 |
14 |
| 3 |
1.0 |
1 |
48 |
len(ydf[ydf['is_sp']==ydf['pred']])/len(ydf)
0.9047619047619048
结果观察
根据 sklearn 预测的结果,有9名用户预测为1,即进行了账号迁转,但实际并没有。 根据过去的访问情况来推断,这些用户应该进行了账号迁转,然而实际却是流失的用户群体。
df.head(10)
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
is_sp |
prob |
pred |
| 0 |
471341 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.543341 |
1 |
| 1 |
503874 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.094451 |
0 |
| 2 |
1073544 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.002510 |
0 |
| 3 |
1073864 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.025567 |
0 |
| 4 |
1163733 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
1.0 |
0.849838 |
1 |
| 5 |
1454629 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.073879 |
0 |
| 6 |
1557628 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0.0 |
0.051221 |
0 |
| 7 |
2241462 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.094451 |
0 |
| 8 |
2313236 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.085385 |
0 |
| 9 |
2477685 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.017546 |
0 |
10 rows × 35 columns
df1 = df[(df['is_sp']==1) & (df['pred']==1)]
df1.sort_values(by='prob',ascending=True).head(15)
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
is_sp |
prob |
pred |
| 228 |
52776438 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.512293 |
1 |
| 171 |
32762652 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.512293 |
1 |
| 155 |
27800629 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.543341 |
1 |
| 0 |
471341 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.543341 |
1 |
| 36 |
8645980 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1.0 |
0.551574 |
1 |
| 37 |
8645980 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1.0 |
0.551574 |
1 |
| 169 |
32500332 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.587923 |
1 |
| 55 |
11600349 |
0 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
1 |
1 |
1 |
1 |
1.0 |
0.684198 |
1 |
| 56 |
11600349 |
0 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
1 |
1 |
1 |
1 |
1.0 |
0.684198 |
1 |
| 146 |
25787360 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
1 |
1 |
... |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
1.0 |
0.696295 |
1 |
| 145 |
25787360 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
1 |
1 |
... |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
1.0 |
0.696295 |
1 |
| 4 |
1163733 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
1.0 |
0.849838 |
1 |
| 48 |
10406653 |
0 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
... |
1 |
0 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.865393 |
1 |
| 49 |
10406653 |
0 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
... |
1 |
0 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.865393 |
1 |
| 165 |
31066299 |
0 |
1 |
1 |
1 |
0 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
0 |
1 |
1 |
0 |
1.0 |
0.951970 |
1 |
15 rows × 35 columns
df2 = df[(df['is_sp']==1) & (df['pred']==1)]
df2.sort_values(by='prob',ascending=False).head(15)
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
is_sp |
prob |
pred |
| 136 |
24791702 |
1 |
1 |
0 |
1 |
0 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.998618 |
1 |
| 137 |
24791702 |
1 |
1 |
0 |
1 |
0 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.998618 |
1 |
| 44 |
9567562 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.996302 |
1 |
| 43 |
9567562 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.996302 |
1 |
| 139 |
24900784 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.993923 |
1 |
| 124 |
23113079 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.993923 |
1 |
| 133 |
24581383 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.993923 |
1 |
| 134 |
24581383 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.993923 |
1 |
| 138 |
24900784 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.993923 |
1 |
| 123 |
23113079 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.993923 |
1 |
| 114 |
21551429 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.993923 |
1 |
| 147 |
27003770 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.993923 |
1 |
| 148 |
27003770 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.993923 |
1 |
| 150 |
27602710 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.993923 |
1 |
| 151 |
27602710 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1.0 |
0.993923 |
1 |
15 rows × 35 columns
df3 = df[(df['is_sp']==0) & (df['pred']==1)]
df3.sort_values(by='prob',ascending=False).head(15)
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
is_sp |
prob |
pred |
| 194 |
41590801 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
0.0 |
0.677458 |
1 |
| 108 |
19432099 |
1 |
1 |
1 |
1 |
0 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.643061 |
1 |
| 203 |
43451947 |
1 |
1 |
1 |
1 |
1 |
0 |
1 |
1 |
1 |
... |
1 |
0 |
0 |
1 |
1 |
0 |
0 |
0.0 |
0.599921 |
1 |
| 197 |
42276142 |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
0 |
0 |
0.0 |
0.577420 |
1 |
| 209 |
46285446 |
0 |
0 |
0 |
0 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
0 |
1 |
0 |
0 |
0.0 |
0.576873 |
1 |
| 14 |
3955950 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.543341 |
1 |
| 158 |
28391896 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.512293 |
1 |
| 240 |
59561276 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.512293 |
1 |
| 27 |
6147878 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.502182 |
1 |
9 rows × 35 columns
df4 = df[(df['is_sp']==0) & (df['pred']==1)]
df4.sort_values(by='prob',ascending=True).head(15)
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
is_sp |
prob |
pred |
| 27 |
6147878 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.502182 |
1 |
| 158 |
28391896 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.512293 |
1 |
| 240 |
59561276 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.512293 |
1 |
| 14 |
3955950 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.543341 |
1 |
| 209 |
46285446 |
0 |
0 |
0 |
0 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
0 |
1 |
0 |
0 |
0.0 |
0.576873 |
1 |
| 197 |
42276142 |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
1 |
1 |
... |
1 |
1 |
1 |
1 |
1 |
0 |
0 |
0.0 |
0.577420 |
1 |
| 203 |
43451947 |
1 |
1 |
1 |
1 |
1 |
0 |
1 |
1 |
1 |
... |
1 |
0 |
0 |
1 |
1 |
0 |
0 |
0.0 |
0.599921 |
1 |
| 108 |
19432099 |
1 |
1 |
1 |
1 |
0 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.643061 |
1 |
| 194 |
41590801 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
0.0 |
0.677458 |
1 |
9 rows × 35 columns
df5 = df[(df['is_sp']==0) & (df['pred']==0)]
df5.sort_values(by='prob',ascending=True).head(15)
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
is_sp |
prob |
pred |
| 149 |
27249550 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.000946 |
0 |
| 10 |
2541741 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.001726 |
0 |
| 242 |
60725457 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.001726 |
0 |
| 101 |
18408297 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.001745 |
0 |
| 172 |
33766090 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.002257 |
0 |
| 2 |
1073544 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.002510 |
0 |
| 227 |
52612953 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.003087 |
0 |
| 63 |
12582684 |
0 |
0 |
0 |
1 |
1 |
0 |
1 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.004780 |
0 |
| 208 |
46056688 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.004799 |
0 |
| 66 |
13157777 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.004969 |
0 |
| 190 |
40654033 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.004969 |
0 |
| 120 |
22437652 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.005689 |
0 |
| 87 |
16601600 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.005689 |
0 |
| 70 |
13967453 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.005689 |
0 |
| 112 |
20955934 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
0.005689 |
0 |
15 rows × 35 columns
df6 = df[(df['is_sp']==1) & (df['pred']==0)]
df6.sort_values(by='prob',ascending=False).head(15)
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
is_sp |
prob |
pred |
| 198 |
42438713 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.484688 |
0 |
| 127 |
23689923 |
1 |
1 |
0 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.359100 |
0 |
| 213 |
47332069 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.281079 |
0 |
| 140 |
24914421 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
1 |
0 |
... |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.278119 |
0 |
| 226 |
52131958 |
0 |
0 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
... |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
1.0 |
0.259709 |
0 |
| 212 |
47266966 |
1 |
0 |
0 |
1 |
0 |
1 |
1 |
1 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.232730 |
0 |
| 236 |
57869405 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.212521 |
0 |
| 161 |
29698758 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.167370 |
0 |
| 30 |
7177251 |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.153046 |
0 |
| 7 |
2241462 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.094451 |
0 |
| 67 |
13401362 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.094451 |
0 |
| 80 |
15569351 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.071546 |
0 |
| 93 |
17388480 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1.0 |
0.070819 |
0 |
| 94 |
17388480 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1.0 |
0.070819 |
0 |
| 163 |
30103279 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
0.028795 |
0 |
15 rows × 35 columns
copy 问题的出现了,!!! = 等号只是引用内存地址, 变量最好用 copy() 属性!!
fp_dau_m.head()
|
user_id |
X1day |
X2day |
X3day |
X4day |
X5day |
X6day |
X7day |
X8day |
X9day |
... |
X23day |
X24day |
X25day |
X26day |
X27day |
X28day |
X29day |
X30day |
X31day |
is_sp |
| 0 |
471341 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1.0 |
| 1 |
503874 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
| 2 |
1073544 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
| 3 |
1073864 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0.0 |
| 4 |
1163733 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
... |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
0 |
0 |
1.0 |
5 rows × 33 columns
df.equals(fp_dau_m)
False
df.equals(ydf)
False