Getting Spark to run on Windows is exhausting, so my advice is to use Linux. Don't ask why... you will simply hit far too many configuration problems.
Install PySpark:
pip install pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
spark
Check the session info:
SparkSession - in-memory
SparkContext
Spark UI: http://192.168.31.166:4040
Version: v3.1.1
Master: local[*]
AppName: pyspark-shell
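The bare getOrCreate() above just picks up defaults. If you want to set the master, application name, or other options explicitly, the builder accepts them; a minimal sketch (the app name and memory value here are only placeholders):
spark = (
    SparkSession.builder
    .master("local[*]")                   # run locally using all cores
    .appName("pyspark-demo")              # placeholder application name
    .config("spark.driver.memory", "2g")  # example of an arbitrary config entry
    .getOrCreate()
)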
Some commonly used configuration options. Enabling eager evaluation makes DataFrames render as formatted tables directly in the notebook, and maxNumRows caps how many rows are displayed:
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark.conf.set("spark.sql.repl.eagerEval.maxNumRows", 1000)
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row
df = spark.createDataFrame([
Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])
df
| a | b | c | d | e |
|---|---|---|---|---|
| 1 | 2.0 | string1 | 2000-01-01 | 2000-01-01 12:00:00 |
| 2 | 3.0 | string2 | 2000-02-01 | 2000-01-02 12:00:00 |
| 4 | 5.0 | string3 | 2000-03-01 | 2000-01-03 12:00:00 |
df.show()
df.printSchema()
+---+---+-------+----------+-------------------+
| a| b| c| d| e|
+---+---+-------+----------+-------------------+
| 1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
| 2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
| 4|5.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+
root
|-- a: long (nullable = true)
|-- b: double (nullable = true)
|-- c: string (nullable = true)
|-- d: date (nullable = true)
|-- e: timestamp (nullable = true)
Creating DataFrames
# Firstly, you can create a PySpark DataFrame from a list of rows
df1 = spark.createDataFrame([
    Row(a=1, b=2, c="name"),
    Row(a=11, b=22, c="test")
])
# Create a PySpark DataFrame with an explicit schema
df2 = spark.createDataFrame(
    [
        (1, 2, 3),
        (11, 22, 33)
    ],
    schema='a int, b int, c int'
)
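For more complex schemas, the DDL string above can also be spelled out as a StructType; a minimal sketch (the names explicit_schema and df2b are just illustrative):
from pyspark.sql.types import StructType, StructField, IntegerType

# Same schema as 'a int, b int, c int', written out explicitly
explicit_schema = StructType([
    StructField("a", IntegerType(), True),
    StructField("b", IntegerType(), True),
    StructField("c", IntegerType(), True),
])
df2b = spark.createDataFrame([(1, 2, 3), (11, 22, 33)], schema=explicit_schema)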
# Create a PySpark DataFrame from a pandas DataFrame
pandas_df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
    'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)]
})
df3 = spark.createDataFrame(pandas_df)
# Create a PySpark DataFrame from an RDD consisting of a list of tuples.
rdd = spark.sparkContext.parallelize([
(1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
(2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
(3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
])
df = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e'])
df
| a | b | c | d | e |
|---|---|---|---|---|
| 1 | 2.0 | string1 | 2000-01-01 | 2000-01-01 12:00:00 |
| 2 | 3.0 | string2 | 2000-02-01 | 2000-01-02 12:00:00 |
| 3 | 4.0 | string3 | 2000-03-01 | 2000-01-03 12:00:00 |
Viewing the data
df.show(2)
+---+---+-------+----------+-------------------+
| a| b| c| d| e|
+---+---+-------+----------+-------------------+
| 1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
| 2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
+---+---+-------+----------+-------------------+
only showing top 2 rows
df.columns
['a', 'b', 'c', 'd', 'e']
df.printSchema()
root
|-- a: long (nullable = true)
|-- b: double (nullable = true)
|-- c: string (nullable = true)
|-- d: date (nullable = true)
|-- e: timestamp (nullable = true)
df.select("a","b").describe()
# 查看DataFrame的摘要统计
| summary | a | b |
|---------|---|---|
| count | 3 | 3 |
| mean | 2.0 | 3.0 |
| stddev | 1.0 | 1.0 |
| min | 1 | 2.0 |
| max | 3 | 4.0 |
# DataFrame.collect() gathers the distributed data to the driver as local Python objects.
# Beware: this can raise an out-of-memory error when the dataset is too large to fit on the
# driver, because it pulls all the data from the executors onto the driver.
tmp_df = df.collect()
df.take(1)
[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))]
df.tail(1)
[Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]
To avoid out-of-memory problems, use take() or tail() to collect only a small sample of the data, as in the examples above.
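Another option (not shown in the original) is DataFrame.toLocalIterator(), which streams rows to the driver one partition at a time instead of materializing everything at once:
# Iterate over rows without holding the whole dataset on the driver
for row in df.toLocalIterator():
    print(row.a, row.c)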
# Convert to a pandas DataFrame
pddf = df.toPandas()
pddf
|   | a | b   | c       | d          | e                   |
|---|---|-----|---------|------------|---------------------|
| 0 | 1 | 2.0 | string1 | 2000-01-01 | 2000-01-01 12:00:00 |
| 1 | 2 | 3.0 | string2 | 2000-02-01 | 2000-01-02 12:00:00 |
| 2 | 3 | 4.0 | string3 | 2000-03-01 | 2000-01-03 12:00:00 |
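toPandas() also pulls the entire dataset onto the driver. For results that do fit, enabling Arrow-based conversion usually makes it much faster; a sketch, assuming pyarrow is installed:
# Use Apache Arrow for Spark <-> pandas conversion (requires pyarrow)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
pddf = df.toPandas()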
Selecting and accessing data
# A PySpark DataFrame is lazily evaluated: selecting a column does not trigger any computation,
# it simply returns a Column instance.
df.a
Column<'a'>
from pyspark.sql import Column
from pyspark.sql.functions import upper
df.c.isNull()
Column<'(c IS NULL)'>
df.select(df.c).show()
+-------+
| c|
+-------+
|string1|
|string2|
|string3|
+-------+
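The upper function imported earlier can be combined with withColumn to derive a new column; a short sketch (the column name upper_c is just illustrative):
# Add a new column computed from an existing one
df.withColumn('upper_c', upper(df.c)).show()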
df.filter(df.a==1).show()
+---+---+-------+----------+-------------------+
| a| b| c| d| e|
+---+---+-------+----------+-------------------+
| 1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+
Applying functions
# pandas UDFs require pyarrow:
# !pip install PyArrow==1.0.0
from pyspark.sql.functions import pandas_udf

@pandas_udf("long")
def addone(series: pd.Series) -> pd.Series:
    return series + 1
df.select(addone(df.a)).show()
+---------+
|addone(a)|
+---------+
| 2|
| 3|
| 4|
+---------+
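For comparison (not part of the original), the same logic written as a plain row-at-a-time Python UDF with pyspark.sql.functions.udf; pandas UDFs are generally faster because they process whole pd.Series batches through Arrow:
from pyspark.sql.functions import udf

@udf("long")  # row-at-a-time UDF returning a long
def addone_plain(x):
    return x + 1 if x is not None else None

df.select(addone_plain(df.a)).show()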
def pandas_filter_func(iterator):
    for pandas_df in iterator:
        yield pandas_df[pandas_df.a == 1]

df.mapInPandas(pandas_filter_func, schema=df.schema).show()
+---+---+-------+----------+-------------------+
| a| b| c| d| e|
+---+---+-------+----------+-------------------+
| 1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+
Grouping and aggregation
df = spark.createDataFrame([
['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.show()
+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
| red|banana| 1| 10|
| blue|banana| 2| 20|
| red|carrot| 3| 30|
| blue| grape| 4| 40|
| red|carrot| 5| 50|
|black|carrot| 6| 60|
| red|banana| 7| 70|
| red| grape| 8| 80|
+-----+------+---+---+
df.groupBy('color').count().show()
+-----+-----+
|color|count|
+-----+-----+
| red| 5|
|black| 1|
| blue| 2|
+-----+-----+
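groupBy() supports other built-in aggregations besides count(); for example, averaging the numeric columns per color:
# Average of the numeric columns (v1, v2) for each color
df.groupby('color').avg().show()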
def plus_mean(df):
    # center v1 by subtracting the group mean (each color group arrives as a pandas DataFrame)
    return df.assign(v1=df.v1 - df.v1.mean())

df.groupBy('color').applyInPandas(plus_mean, schema=df.schema)
| color | fruit | v1 | v2 |
|-------|-------|----|----|
| red | banana | -3 | 10 |
| red | carrot | -1 | 30 |
| red | carrot | 0 | 50 |
| red | banana | 2 | 70 |
| red | grape | 3 | 80 |
| black | carrot | 0 | 60 |
| blue | banana | -1 | 20 |
| blue | grape | 1 | 40 |
df1 = spark.createDataFrame(
[(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
('time', 'id', 'v1'))
df2 = spark.createDataFrame(
[(20000101, 1, 'x'), (20000101, 2, 'y')],
('time', 'id', 'v2'))
def asof_join(l, r):
    # as-of merge of the two cogrouped pandas DataFrames on time, within each id
    return pd.merge_asof(l, r, on='time', by='id')

df1.groupby('id').cogroup(df2.groupby('id')).applyInPandas(
    asof_join, schema='time int, id int, v1 double, v2 string')
| time | id | v1 | v2 |
|------|----|----|----|
| 20000101 | 1 | 1.0 | x |
| 20000102 | 1 | 3.0 | x |
| 20000101 | 2 | 2.0 | y |
| 20000102 | 2 | 4.0 | y |
Getting data in and out
df.write.csv("demo.csv",header=True)
spark.read.csv("demo.csv",header=True)
| color | fruit | v1 | v2 |
|-------|-------|----|----|
| black | carrot | 6 | 60 |
| blue | banana | 2 | 20 |
| red | banana | 7 | 70 |
| red | carrot | 3 | 30 |
| red | banana | 1 | 10 |
| blue | grape | 4 | 40 |
| red | carrot | 5 | 50 |
| red | grape | 8 | 80 |
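Parquet works the same way and is a common default format for Spark; a sketch with an illustrative path:
# Write and read back Parquet (the path is illustrative)
df.write.parquet("df.parquet")
spark.read.parquet("df.parquet").show()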
df.write.orc("df.orc")
spark.read.orc("df.orc")
| color | fruit | v1 | v2 |
|-------|-------|----|----|
| red | banana | 7 | 70 |
| red | grape | 8 | 80 |
| black | carrot | 6 | 60 |
| blue | banana | 2 | 20 |
| red | carrot | 5 | 50 |
| red | banana | 1 | 10 |
| blue | grape | 4 | 40 |
| red | carrot | 3 | 30 |
Working with SQL
df.createOrReplaceTempView("tba")
spark.sql("select count(1) as cnt from tba")
@pandas_udf("integer")
def add_one(s:pd.Series)->pd.Series:
return s+1
spark.udf.register("add_one",add_one)
<function __main__.add_one(s: pandas.core.series.Series) -> pandas.core.series.Series>
spark.sql("select add_one(v1) as inf from tba")
from pyspark.sql.functions import expr
df.selectExpr('add_one(v1)').show()
df.select(expr('count(*)') > 0).show()
+-----------+
|add_one(v1)|
+-----------+
| 2|
| 3|
| 4|
| 5|
| 6|
| 7|
| 8|
| 9|
+-----------+
+--------------+
|(count(1) > 0)|
+--------------+
| true|
+--------------+