2.2 Summary
Day 14
Today I practiced SparkSQL: four ways to get a DataFrame, namely converting an RDD with toDF, converting a pandas DataFrame, and reading from the text and JSON data sources.
```python
# coding:utf8
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType

if __name__ == '__main__':
    # 0. Build the SparkSession entry point
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    # Convert an RDD into a DataFrame: split each "name,age" line into a tuple
    rdd = sc.textFile("../data/input/sql/people.txt").\
        map(lambda x: x.split(",")).\
        map(lambda x: (x[0], int(x[1])))

    # toDF, option 1: pass the column names directly
    df1 = rdd.toDF(["name", "age"])
    df1.printSchema()
    df1.show()

    # toDF, option 2: pass a StructType schema
    schema = StructType().add("name", StringType(), nullable=True).\
        add("age", IntegerType(), nullable=False)
    df2 = rdd.toDF(schema=schema)
    df2.printSchema()
    df2.show()
```
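Spark also offers spark.createDataFrame for the same RDD-to-DataFrame conversion; a minimal sketch, assuming the same name,age layout of people.txt as above:

```python
# coding:utf8
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    # Assumes people.txt holds "name,age" lines, as in the example above
    rdd = sc.textFile("../data/input/sql/people.txt").\
        map(lambda x: x.split(",")).\
        map(lambda x: (x[0], int(x[1])))

    schema = StructType().add("name", StringType(), nullable=True).\
        add("age", IntegerType(), nullable=False)

    # createDataFrame accepts an RDD of tuples plus a schema, like toDF
    df = spark.createDataFrame(rdd, schema=schema)
    df.printSchema()
    df.show()
```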
```python
# coding:utf8
from pyspark.sql import SparkSession
import pandas as pd

if __name__ == '__main__':
    # 0. Build the SparkSession entry point
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    # Build a SparkSQL DataFrame from a pandas DataFrame
    pdf = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "name": ["张大仙", "王晓晓", "吕不为"],
            "age": [11, 21, 11]
        }
    )

    df = spark.createDataFrame(pdf)
    df.printSchema()
    df.show()
```
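The conversion also runs the other way: toPandas() collects a Spark DataFrame back into a pandas one, pulling every row to the driver, so it only suits small results. A minimal sketch:

```python
# coding:utf8
from pyspark.sql import SparkSession
import pandas as pd

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()

    pdf = pd.DataFrame({"id": [1, 2, 3], "age": [11, 21, 11]})
    df = spark.createDataFrame(pdf)

    # toPandas() collects all rows to the driver; small results only
    back = df.toPandas()
    print(back)
```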
```python
# coding:utf8
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType

if __name__ == '__main__':
    # 0. Build the SparkSession entry point
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    # Build a StructType for the text data source. This source reads each
    # whole line as a single column; the default column name is "value",
    # typed String.
    schema = StructType().add("data", StringType(), nullable=True)
    df = spark.read.format("text").\
        schema(schema=schema).\
        load("../data/input/sql/people.txt")
    df.printSchema()
    df.show()
```
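For a delimited file like people.txt, the csv data source splits each line into real columns instead of a single value column; a minimal sketch, assuming comma-separated name,age lines with no header row:

```python
# coding:utf8
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, IntegerType

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()

    # Assumes comma-separated "name,age" lines with no header row
    schema = StructType().add("name", StringType(), nullable=True).\
        add("age", IntegerType(), nullable=True)
    df = spark.read.format("csv").\
        option("sep", ",").\
        option("header", False).\
        schema(schema).\
        load("../data/input/sql/people.txt")
    df.printSchema()
    df.show()
```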
```python
# coding:utf8
from pyspark.sql import SparkSession

if __name__ == '__main__':
    # 0. Build the SparkSession entry point
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()
    sc = spark.sparkContext

    # The JSON data source carries its own schema information
    df = spark.read.format("json").load("../data/input/sql/people.json")
    df.printSchema()
    df.show()
```
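With a DataFrame in hand, it can be queried with SQL by registering a temporary view; a minimal sketch, assuming the people.json records carry name and age fields:

```python
# coding:utf8
from pyspark.sql import SparkSession

if __name__ == '__main__':
    spark = SparkSession.builder.\
        appName("test").\
        master("local[*]").\
        getOrCreate()

    df = spark.read.format("json").load("../data/input/sql/people.json")

    # Register a temp view so the DataFrame can be queried with SQL
    # (assumes the JSON records carry "name" and "age" fields)
    df.createOrReplaceTempView("people")
    spark.sql("SELECT name, age FROM people WHERE age IS NOT NULL").show()
```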
