Big Data Product Project Notes

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

def Feat(priors: DataFrame, orders: DataFrame): DataFrame = {
  /**
    * Product features:
    * 1. sales volume: prod_cnt
    * 2. number of times the product was reordered: prod_sum_rod
    * 3. reorder rate: prod_rod_rate
    */
  // Sales volume per product
  val productCnt = priors.groupBy("product_id").count()

  // Reorder count: sum("reordered"); reorder rate: avg("reordered")
  val productRodCnt = priors.selectExpr("product_id", "cast(reordered as int)")
    .groupBy("product_id")
    .agg(sum("reordered").as("prod_sum_rod"), avg("reordered").as("prod_rod_rate"))

  val productFeat = productCnt.join(productRodCnt, "product_id")
    .selectExpr("product_id",
      "count as prod_cnt",
      "prod_sum_rod",
      "prod_rod_rate")

  /**
    * User features:
    * 1. average gap in days between a user's orders: user_avg_day_gap
    * 2. total number of orders per user
    * 3. deduplicated set of products each user has bought
    * 4. number of distinct products per user
    * 5. average number of products per order for each user
    */
  // Handle bad values: map empty days_since_prior_order to 0
  val ordersNew = orders
    .selectExpr("*", "if(days_since_prior_order='',0,days_since_prior_order) as dspo")
    .drop("days_since_prior_order")

  // 1. Average gap between a user's orders: avg("dspo")
  val userGap = ordersNew.selectExpr("user_id", "cast(dspo as int)")
    .groupBy("user_id")
    .avg("dspo")
    .withColumnRenamed("avg(dspo)", "user_avg_day_gap")

  // 2. Total number of orders per user
  val userOrdCnt = orders.groupBy("user_id").count()

  // 3. Deduplicated set of products each user has bought
  val op = orders.join(priors, "order_id").select("user_id", "product_id")

  // RDD -> DataFrame needs the implicit conversions for toDF
  import priors.sparkSession.implicits._

  val userUniOrdRecs = op.rdd.map(x => (x(0).toString, x(1).toString))
    .groupByKey()
    .mapValues(_.toSet.mkString(","))
    .toDF("user_id", "product_records")

  // 4. Number of distinct products per user.
  //    Steps 3 and 4 can be merged: compute the deduplicated product set
  //    and its size in one pass (this supersedes userUniOrdRecs above).
  val userProRcdSize = op.rdd.map(x => (x(0).toString, x(1).toString))
    .groupByKey()
    .mapValues { records =>
      val rs = records.toSet
      (rs.size, rs.mkString(","))
    }
    .toDF("user_id", "tuple")
    .selectExpr("user_id", "tuple._1 as prod_dist_cnt", "tuple._2 as prod_records")

  // 5. Average number of products per order for each user:
  //    1) count the products in each order (aggregate on order_id)
  val ordProCnt = priors.groupBy("order_id").count()
  //    2) average that per-order count over each user's orders
  val userPerOrdProdCnt = orders.join(ordProCnt, "order_id")
    .groupBy("user_id")
    .avg("count")
    .withColumnRenamed("avg(count)", "user_avg_ord_prods")

  val userFeat = userGap.join(userOrdCnt, "user_id")
    .join(userProRcdSize, "user_id")
    .join(userPerOrdProdCnt, "user_id")
    .selectExpr("user_id",
      "user_avg_day_gap",
      "count as user_ord_cnt",
      "prod_dist_cnt as user_prod_dist_cnt",
      "prod_records as user_prod_records",
      "user_avg_ord_prods")

  // Final assembly: (user, product) rows with product and user features attached
  orders.join(priors, "order_id")
    .select("user_id", "product_id", "eval_set")
    .join(productFeat, "product_id")
    .join(userFeat, "user_id")
}

 

scala> val orders = spark.sql("select * from badou.orders")
orders: org.apache.spark.sql.DataFrame = [order_id: string, user_id: string ... 5 more fields]

scala> val priors = spark.sql("select * from badou.priors")
priors: org.apache.spark.sql.DataFrame = [order_id: string, product_id: string ... 2 more fields]

scala> val dfFeat = Feat(priors,orders)
dfFeat: org.apache.spark.sql.DataFrame = [user_id: string, product_id: string ... 9 more fields]
scala> dfFeat.show()
+-------+----------+--------+--------+------------+-------------------+----------------+------------+------------------+--------------------+------------------+
|user_id|product_id|eval_set|prod_cnt|prod_sum_rod|      prod_rod_rate|user_avg_day_gap|user_ord_cnt|user_prod_dist_cnt|   user_prod_records|user_avg_ord_prods|
+-------+----------+--------+--------+------------+-------------------+----------------+------------+------------------+--------------------+------------------+
| 100010|     48370|   prior|    3934|        2751|  0.699288256227758|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     48370|   prior|    3934|        2751|  0.699288256227758|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     48370|   prior|    3934|        2751|  0.699288256227758|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     48370|   prior|    3934|        2751|  0.699288256227758|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     48370|   prior|    3934|        2751|  0.699288256227758|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     26079|   prior|     406|         236| 0.5812807881773399|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|       169|   prior|    1877|        1016| 0.5412892914224827|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     33129|   prior|   12196|        6436| 0.5277140045916694|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     22285|   prior|     276|         196| 0.7101449275362319|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     14811|   prior|     249|         110|0.44176706827309237|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     31506|   prior|   50255|       23954| 0.4766490896428216|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     31506|   prior|   50255|       23954| 0.4766490896428216|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     21616|   prior|   72829|       49798| 0.6837660821925332|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     40516|   prior|   12495|        5867|0.46954781912765103|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     40516|   prior|   12495|        5867|0.46954781912765103|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     40516|   prior|   12495|        5867|0.46954781912765103|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     40516|   prior|   12495|        5867|0.46954781912765103|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     40516|   prior|   12495|        5867|0.46954781912765103|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     16168|   prior|    4593|        2568| 0.5591116917047682|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
| 100010|     44142|   prior|   43009|       23609| 0.5489316189634728|             6.5|          12|               119|22285,48825,44325...|18.727272727272727|
+-------+----------+--------+--------+------------+-------------------+----------------+------------+------------------+--------------------+------------------+
only showing top 20 rows
scala> val dfFeat = Feat(priors,orders)
<console>:35: error: not found: value priors
       val dfFeat = Feat(priors,orders)
                         ^

scala> val priors = spark.sql("select * from badou.priors")
priors: org.apache.spark.sql.DataFrame = [order_id: string, product_id: string ... 2 more fields]

scala> val dfFeat = Feat(priors,orders)
dfFeat: org.apache.spark.sql.DataFrame = [user_id: string, product_id: string ... 9 more fields]

scala> dfFeat.show()

 

Notes:

Label data:

In orders, eval_set takes the values prior, train, test.

train serves as the label data.
The label data marks what was actually bought in the user's next order:
a (user, product) pair that appears in train gets label 1, otherwise 0.
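
A minimal sketch of that labeling, assuming trains holds (order_id, product_id) rows for each user's train order (table and column names follow the notes below; the rest is illustrative):

import org.apache.spark.sql.functions.lit

val trains = spark.sql("select * from badou.trains")

// Candidate (user, product) pairs from the prior purchase history
val candidates = orders.join(priors, "order_id")
  .select("user_id", "product_id")
  .distinct()

// Pairs that also appear in the user's train (next) order get label 1
val trainPairs = orders.filter("eval_set = 'train'")
  .join(trains, "order_id")
  .select("user_id", "product_id")
  .withColumn("label", lit(1.0))

// Everything else gets label 0
val labeled = candidates.join(trainPairs, Seq("user_id", "product_id"), "left")
  .na.fill(0.0, Seq("label"))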


So what is LR (logistic regression) used for?
Predicting (scoring) the probability that each remaining product shows up in this user's next order:
p(l)

The end goal: predict what a user will buy in their next order.
We are given some products already bought for that order; the remaining ones have to be predicted.

Given orders 1-10, what will the products of order 11 be?
Use orders 1-9 for training and order 10 for prediction:
orders 1-9 produce a model; that model predicts which products order 10 will contain (predicted product set 1).

The actual 10th order gives the actual product set 2 to compare against.
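
A minimal Spark ML sketch of that training step, assuming the dfFeat features from the session above and the trainPairs labels from the earlier labeling sketch (the feature column choice here is illustrative, not the course's exact pipeline):

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.classification.LogisticRegression

// Attach the 0/1 labels to the feature rows produced by Feat()
val data = dfFeat.join(trainPairs, Seq("user_id", "product_id"), "left")
  .na.fill(0.0, Seq("label"))

// Pack the numeric feature columns into a single vector column
val assembler = new VectorAssembler()
  .setInputCols(Array("prod_cnt", "prod_sum_rod", "prod_rod_rate",
    "user_avg_day_gap", "user_ord_cnt", "user_prod_dist_cnt", "user_avg_ord_prods"))
  .setOutputCol("features")

val lr = new LogisticRegression()
  .setLabelCol("label")
  .setFeaturesCol("features")

val model = lr.fit(assembler.transform(data))

// "probability" is p(l) per (user, product); the top-scored products
// per user form the predicted basket for the next order
val scored = model.transform(assembler.transform(data))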

orders: the users' order data; eval_set = prior, train, test
products: product information
priors: the products in each order, about 30 million rows
trains: the products in each order, restricted to each user's last (train) order

Feature naming: "feature" is abbreviated as "feat" in the code.

 

create table products
(
product_id string,
product_name string,
aisle_id string,
department_id string
)
row format delimited fields terminated by ',' lines terminated by '\n';



Columns of the priors/trains CSV files:

order_id,product_id,add_to_cart_order,reordered



create table trains
(
order_id string,
product_id string,
add_to_cart_order string,
reordered string
)
row format delimited fields terminated by ',' lines terminated by '\n';
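
The notes don't show the load step; a minimal sketch of populating these tables from the raw CSV files (the file paths are hypothetical):

// Hypothetical local paths; point these at the actual CSV exports
spark.sql("load data local inpath '/data/products.csv' into table products")
spark.sql("load data local inpath '/data/trains.csv' into table trains")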

One-hot encoding the day-of-week column (order_dow):

import org.apache.spark.ml.feature.OneHotEncoder

// Spark 2.x OneHotEncoder is a plain Transformer (no fit step);
// its input must be a numeric category index, hence the cast
val oh = new OneHotEncoder().setInputCol("order_dow").setOutputCol("dow")
val ohDf = oh.transform(orders.selectExpr("cast(order_dow as double)"))
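
As written, the transform keeps only the single cast column. To actually use the encoded vector as a feature it needs to stay attached to its keys; a minimal sketch (which keys to keep is an assumption):

// Keep the join keys next to the cast column so that the encoded
// "dow" vector can later be joined back onto the feature table
val ohFeat = oh.transform(
  orders.selectExpr("order_id", "user_id", "cast(order_dow as double) as order_dow"))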

 
