#列表转成 spark dataframe 几种方法
data = [1,5,7,10,23,20,6,5,10,7,10]
(1)转成rdd
rdd = sc.parallelize(data)
rdd = rdd.map(lambda x:(x,))
(2)dfdata = spark.createDataFrame([(x,) for x in data]).toDF("value")  # toDF 用于指定列名
(3)rdd = sc.parallelize(data)
rdd = rdd.map(lambda x:Row(x)) 使用Row
#df筛选
1、df.filter()
2、df.where('这里是sql where形式') 如 df.where("count={}".format(max_count)) df.where("avg_score>75.0")
#嵌套列表元组转成 spark df
#任务:有一批学生信息表格,包括name,age,score, 找出score排名前3的学生, score相同可以任取
students = [("LiLei",18,87),("HanMeiMei",16,77),("DaChui",16,66),("Jim",18,77),("RuHua",18,50)]
n = 3
(1)
dfstudents = spark.createDataFrame(students).toDF("name","age","score")
(2)
df = spark.createDataFrame(students, ['name','age','score'])
#取头部几行
1、head(n) take(n):返回list格式数据
df.sort(df['score'].desc()).head(3)
out:
[Row(name='LiLei', age=18, score=87),
Row(name='HanMeiMei', age=16, score=77),
Row(name='Jim', age=18, score=77)]
2、limit(n): 返回dataframe
#窗口函数 (从序号1开始的,如果需要从0开始需要 -1)
row_number over(partition by colname1 order by colname2)
partition by:按照分区排序,如果按照全部数据排序则去掉 partition by
order by:按照某列或某几列排序
#排序
sort(df['col'])
orderBy([col1,col2,...],ascending=False)
如 : dfstudents.orderBy(dfstudents["score"].desc(),dfstudents["age"].desc())
#排序后取n个ROW df.limit(n)
#连接操作
dfstudents = dfclass.join(dfscore,on ="name" ,how = "left")
#UDF 自定义函数使用
def mode(arr):
    """Return the most frequent element of *arr* (the statistical mode).

    Intended to be registered as a Spark UDF (see the selectExpr example
    below). Ties are broken arbitrarily by Counter ordering.

    Parameters:
        arr: an iterable of hashable values (e.g. a list of scores).

    Returns:
        The most common element, or None when *arr* is empty/None.
    """
    from collections import Counter  # local import keeps the notes file dependency-free at top level
    if not arr:
        return None
    # most_common(1) -> [(value, count)]; take the value of the top pair
    return Counter(arr).most_common(1)[0][0]
dfmode = dfscores.selectExpr("class","udf_mode(scores) as mode_score")
注意:需先注册 UDF,如 spark.udf.register("udf_mode", mode),再在 selectExpr 中用注册名调用;习惯上注册名为 'udf_' + 你定义的函数名称