练习

1.复购率

SELECT

count( t.userid) 总共消费人数,

count(DISTINCT case when t.`消费次数`>1 then t.userid else null end) as 总共复购人数,

concat(round(count(DISTINCT case when t.`消费次数`>1 then t.userid else null end)/count(DISTINCT t.userid)*100,2),'%')as 复购率 FROM

(

select userid,

count(userid) as 消费次数

from order_info_utf

where ispaid = '已支付'

group by userid

) t

SELECT

count( b.userid) 总共消费人数,

count(DISTINCT case when b.`消费次数`>1 then b.userid else null end) as 总共复购人数,

concat(round(count(DISTINCT case when b.`消费次数`>1 then b.userid else null end)/count(DISTINCT b.userid)*100,2),'%')as 复购率 FROM

( select userid,count(日期) as 消费次数

from

(

select userid,date_format(paidtime,"%Y-%m-%d") as 日期

from order_info_utf

where ispaid = '已支付'

group by userid,日期

) t

group by userid

) b

SELECT t.`日期`,count( t.userid) 当日消费人数,

count(DISTINCT case when t.`消费次数`>1 then t.userid else null end) as 当日复购人数,

concat(round(count(DISTINCT case when t.`消费次数`>1 then t.userid else null end)/count(DISTINCT t.userid)*100,2),'%')as 复购率 FROM

(

select userid,date_format(paidtime,"%Y-%m-%d") as 日期,

count(userid) as 消费次数

from order_info_utf

where ispaid = '已支付'

group by userid,日期

) t

group by t.日期

SELECT t.`日期`,count(distinct t.userid) 当日消费人数,

count(DISTINCT case when t.`消费次数`>1 then t.userid else null end) as 当日复购人数,

concat(round(count(DISTINCT case when t.`消费次数`>1 then t.userid else null end)/count(DISTINCT t.userid)*100,2),'%')as 复购率 FROM

(

select userid,date_format(paidtime,"%Y-%m-%d") as 日期,

count(orderid) as 消费次数

from order_info_utf

where ispaid = '已支付'

group by userid,日期

) t

group by t.日期

2.回购率

select date1,

count( userid) as 当天购买人数,

sum( case when 天数=1 then 1 else 0 end)as 次日购买人数,

sum( case when 天数=1 then 1 else 0 end)/count(DISTINCT userid) as 回购率

from

( select a.userid,a.date1 as date1,datediff(date2,date1) as 天数

from

( (

select date_format(paidtime,"%Y-%m-%d") as date1,userid

from order_info_utf

where ispaid = '已支付'

group by date_format(paidtime,"%Y-%m-%d"),userid

) a

left join

(

select date_format(paidtime,"%Y-%m-%d") as date2,userid

from order_info_utf

where ispaid = '已支付'

group by date_format(paidtime,"%Y-%m-%d"),userid

) b

on a.userid = b.userid

)

) d

group by d.date1

注意先去重再连接，比如用户1第一天购买100次,时间作差不要直接减会出错，使用datediff函数。

（2）python

import pandas as pd

df = pd.read_csv('order_info_utf.csv')

df['paidtime']=pd.to_datetime(df['paidtime'],format="%Y-%m-%d %H:%M:%S")

df['date']=df['paidtime'].dt.strftime('%Y-%m-%d').astype('datetime64[ns]')

#复购率

data=df.groupby(['userid','date'])['orderid'].count().reset_index()

s1=data.groupby('date')['userid'].count() #当日消费人数

s2=data[data['orderid']>1].groupby('date')['userid'].count()#当日复购人数

df_new=pd.concat([s1,s2],axis=1)

df_new.columns=['当日消费人数','当日多次消费人数']

df_new['复购率']=(df_new['当日多次消费人数']/df_new['当日消费人数']).apply(lambda x: format(x,'.2%')) #两位小数百分数

df_new

#回购率

df3=df

df3=df3.drop_duplicates(subset=['userid','date'])

df3=df3[['userid','date']]

order2=pd.merge(df3,df3,on='userid',how='left')

order2['diff']=order2['date_y']-order2['date_x']

order2['diff']=order2['diff'].dt.days

a=order2.groupby(['date_x'])['userid'].count() #当月消费人数

b=order2[order2['diff']==1].groupby(['date_x'])['userid'].count() #次月消费人数

data2=pd.concat([a,b],axis=1)

data2.columns=['当日购买人数','次日回购人数']

data2['回购率']=data2['次日回购人数']/data2['当日购买人数']

3.不同日期的下单数

（1）mysql

select date_format(paidtime,'%Y-%m-%d') as 日期,

count(distinct userid) as 下单人数,

count(userid) as 下单数

from order_info_utf

group by date_format(paidtime,'%Y-%m-%d')

(2)python

df4=df

df6=df4.groupby(['date','userid']).count().reset_index().groupby('date').agg({'userid':'count','orderid':'sum'})

df6=df6.rename(columns={'userid':'下单人数','orderid':'下单次数'})

4.不同性别的下单数

select sex,count(distinct b.userid) as 下单人数,

count(b.userid) as 下单次数

from user_info a

join order_info_utf b

on a.userId = b.userid

group by sex

(2)python

order=df

user=pd.read_csv('user_info.csv')

df6=pd.merge(order,user,left_on='userid',right_on='userId',how='left')

df6=df6.groupby(['sex','userId']).count().reset_index().groupby('sex').agg({'userId':'count','orderid':'sum'})

df6=df6.rename(columns={'userId':'消费人数','orderid':'消费次数'})

5.统计多次消费的用户，第一次和最后一次消费时间的间隔

select userid,

datediff(max(date_format(paidtime,'%Y-%m-%d')),min(date_format(paidtime,'%Y-%m-%d'))) as 间隔

from order_info_utf

where ispaid = '已支付'

group by userid

having count(orderid) > 1

order by userid

(2)python

df8=order.pivot_table(index='userid',values='paidtime',aggfunc=['min','max'])

df8['diff']=df8['max']-df8['min']

df8=df8[df8['diff']>'0 days 00:00:00']

6.不同年龄段的消费频次

SELECT t.`年龄段`,COUNT(t.orderid) as 消费次数,

count(DISTINCT t.userid) as 消费人数,

round(COUNT(t.orderid)/count(DISTINCT t.userid),2) as 人均消费频次 from

(SELECT o.userid,o.orderid,

case

WHEN 2022-year(u.birth)<10 then '[0,10)'

WHEN 2022-year(u.birth)<20 then '[10,20)'

WHEN 2022-year(u.birth)<30 then '[20,30)'

WHEN 2022-year(u.birth)<40 then '[30,40)'

WHEN 2022-year(u.birth)<50 then '[40,50)'

WHEN 2022-year(u.birth)<60 then '[50,60)'

WHEN 2022-year(u.birth)<70 then '[60,70)'

WHEN 2022-year(u.birth)<80 then '[70,80)'

WHEN 2022-year(u.birth)<90 then '[80,90)'

else '90+'

end as 年龄段

from order_info_utf o

LEFT JOIN user_info u

on o.userid=u.userid

where o.ispaid='已支付' and year(u.birth)>1900) t

GROUP BY t.`年龄段`

(2)python

us=pd.read_csv('user_info.csv')

us['birth']=pd.to_datetime(us['birth'],format="%Y-%m-%d")

us['birth']=us['birth'].dt.strftime('%Y-%m-%d').astype('datetime64[ns]')

us['birth']=us['birth'].dt.year

from datetime import datetime

us['age']=datetime(2022,8,30).year-us['birth']

us['label']=pd.cut(us['age'],bins=[0,10,20,30,40,50,60,70,80,90,150],right=False).astype(str)#要把category转换成str类型，才能groupby

#表连接

df=pd.merge(order,us,left_on='userid',right_on='userId',how='left').dropna() #删除缺失值所在行

#数据分组求解

df=df[['userid','orderid','label']]

df=df.groupby(['label','userid']).count().reset_index().groupby('label').agg({'userid':'count','orderid':'sum'})

df=df.rename(columns={'userid':'消费人数','orderid':'消费次数'})

df['消费频次']=df['消费次数']/df['消费人数']

7.消费金额前20%的用户，贡献了多少额度

SELECT * FROM

(SELECT userid,round(sum(price),2) as 'sum',

row_number() over (order by sum(price) desc) as 'ranking'

from order_info_utf

where ispaid='已支付'

GROUP BY userid) t

对每位用户的消费金额进行排名

选出前20%的用户

SELECT sum(t.sum) FROM

(SELECT userid,round(sum(price),2) as 'sum',

row_number() over (order by sum(price) desc) as 'ranking'

from order_info_utf

where ispaid='已支付'

GROUP BY userid) t

#where t.ranking<(SELECT 0.2*count(DISTINCT userid) from order_info_utf

#where ispaid='已支付')

;

(2)python

#计算每个用户的消费金额

df=order.groupby('userid')['price'].sum().reset_index()

#计算贡献排在前20%的分位数

data=df['price'].quantile(0.8,interpolation='nearest') #分位数为1314

#筛选出前20%贡献数据，并计算求和

df=df[df['price']>=data]

posted @ 2022-08-30 22:48 萧六弟阅读(57) 评论(0) 收藏举报

刷新页面返回顶部

戴小帅的学习笔记

练习

1.复购率

2.回购率

3.不同日期的下单数

4.不同性别的下单数

5.统计多次消费的用户，第一次和最后一次消费时间的间隔

6.不同年龄段的消费频次

7.消费金额前20%的用户，贡献了多少额度

公告