from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
StructType, StructField, StringType, IntegerType, DateType,
TimestampType, DoubleType, ArrayType, MapType
)
from pyspark.sql.window import Window
import pandas as pd
import json
# Initialize the SparkSession
spark = SparkSession.builder \
.appName("PySpark Function Demo") \
.master("local[*]") \
.getOrCreate()
sc = spark.sparkContext
# ======================
# 1. Raw data preparation (multiple field types, with messy values to clean)
# ======================
raw_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("name", StringType(), True),        # needs string cleanup
    StructField("age", IntegerType(), True),        # contains nulls
    StructField("phone", StringType(), True),       # to be masked (string ops)
    StructField("join_date", StringType(), True),   # non-standard date formats
    StructField("last_login", StringType(), True),  # timestamp, kept as a string here and parsed right after load
    StructField("comments", StringType(), True),    # mixed case, stray whitespace
    StructField("salary", IntegerType(), True),     # for aggregations
    StructField("bonus", IntegerType(), True),      # for null handling
    StructField("other_info", StringType(), True)   # JSON string holding extra attributes
])
# Sample rows (deliberately messy to exercise the cleaning steps below)
raw_data = [
(1, " Alice ", None, "13812345678", "2023-01-05", "2024-03-15 08:30:00", " GREAT! ", 18000, None, '{"hobbies": ["reading", "swimming"], "city": "New York"}'),
(2, "bob", 28, "139 8765 4321", "2023/02/15", "2024-03-14 14:15:00", " good ", 22000, 3000, '{"hobbies": ["running", "painting"], "city": "Los Angeles"}'),
(3, " CHARLIE ", 35, "136****1234", "2023-03-25", "2024-03-13 09:45:00", "BAD", None, 2000, '{"hobbies": ["singing", "dancing"], "city": "Chicago"}'),
(4, "diana", 40, None, "20230401", "2024-03-12 16:20:00", " okay ", 23000, None, '{"hobbies": ["writing", "cooking"], "city": "Houston"}'),
(5, " Eve ", 29, "18600001111", "2023-05-20", None, " FANTASTIC ", 22000, 1500, '{"hobbies": ["reading", "traveling"], "city": "Miami"}'),
(6, "Frank", 32, "15512345678", "2023-06-10", "2024-03-10 10:30:00", " excellent ", 25000, 4000, '{"hobbies": ["photography", "cycling"], "city": "Seattle"}'),
(7, "Grace", 27, "13387654321", "2023-07-15", "2024-03-09 14:15:00", " nice ", 19000, 1000, '{"hobbies": ["hiking", "yoga"], "city": "Denver"}')
]
raw_df = spark.createDataFrame(raw_data, schema=raw_schema)
# last_login arrives as a string in the raw data; parse it into a real timestamp up front
raw_df = raw_df.withColumn("last_login", F.to_timestamp("last_login", "yyyy-MM-dd HH:mm:ss"))
print("===== Raw Data =====")
raw_df.show(truncate=False)
# Pandas equivalent: create the DataFrame
# pd_raw_df = pd.DataFrame(raw_data, columns=[field.name for field in raw_schema.fields])
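# Sanity check (added sketch): confirm the schema after the timestamp parse above,
# i.e. join_date still a string, last_login now a real timestamp.
raw_df.printSchema()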
# ======================
# 2. String functions (TRIM, SUBSTRING, CONCAT, ...)
# ======================
string_processed_df = (
    raw_df
    # TRIM: strip leading/trailing whitespace
    .withColumn("name_clean", F.trim("name"))
    # LOWER/UPPER: normalize case
    .withColumn("name_lower", F.lower("name_clean"))
    .withColumn("name_upper", F.upper("name_clean"))
    # SUBSTRING: first 3 digits of the phone number (1-based position, length 3)
    .withColumn("phone_prefix", F.substring("phone", 1, 3))
    # CONCAT: join name and age (null-safe via COALESCE)
    .withColumn("name_age", F.concat("name_clean", F.lit("_"),
                                     F.coalesce(F.col("age").cast(StringType()), F.lit("unknown"))))
    # REGEXP_REPLACE: mask the middle 4 digits of the phone number with *
    .withColumn("phone_masked", F.regexp_replace("phone", r"(\d{3})\d{4}(\d{4})", "$1****$2"))
    # SPLIT: tokenize the comment on whitespace
    .withColumn("comment_words", F.split(F.trim("comments"), r"\s+"))
    # ELEMENT_AT: take the first token (1-based)
    .withColumn("comment_first_word", F.element_at("comment_words", 1))
)
print("\n===== 字符串处理结果 =====")
string_processed_df.select(
"name", "name_clean", "name_lower", "phone_prefix",
"phone_masked", "comment_first_word"
).show(truncate=False)
# Pandas equivalent: string processing
# pd_string_processed_df = pd_raw_df.copy()
# pd_string_processed_df['name_clean'] = pd_string_processed_df['name'].str.strip()
# pd_string_processed_df['name_lower'] = pd_string_processed_df['name_clean'].str.lower()
# pd_string_processed_df['name_upper'] = pd_string_processed_df['name_clean'].str.upper()
# pd_string_processed_df['phone_prefix'] = pd_string_processed_df['phone'].str[:3]
# pd_string_processed_df['name_age'] = (pd_string_processed_df['name_clean'] + '_' +
#     pd_string_processed_df['age'].astype('Int64').astype('string').fillna('unknown'))
# pd_string_processed_df['phone_masked'] = pd_string_processed_df['phone'].str.replace(r'(\d{3})\d{4}(\d{4})', r'\1****\2', regex=True)
# pd_string_processed_df['comment_words'] = pd_string_processed_df['comments'].str.strip().str.split()
# pd_string_processed_df['comment_first_word'] = pd_string_processed_df['comment_words'].str[0]
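# The same string built-ins are exposed as SQL expressions; a minimal sketch of
# the selectExpr form producing a few of the columns computed above with F.*.
raw_df.selectExpr(
    "trim(name) AS name_clean",
    "upper(trim(name)) AS name_upper",
    "substring(phone, 1, 3) AS phone_prefix"
).show(truncate=False)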
# ======================
# 3. Null handling (COALESCE, NVL, NULLIF, ...)
# ======================
null_processed_df = (
    string_processed_df
    # COALESCE: first non-null value (fill missing salary with 0)
    .withColumn("salary_filled", F.coalesce("salary", F.lit(0)))
    # NVL: Hive-compatible two-argument COALESCE (F.nvl only exists in Spark 3.5+,
    # so the SQL form via expr works across versions)
    .withColumn("bonus_filled", F.expr("nvl(bonus, 0)"))
    # IFNULL: another Hive-compatible alias for COALESCE
    .withColumn("age_filled", F.expr("ifnull(age, 0)"))
    # NULLIF: return null when the two arguments are equal (here: hide zero bonuses)
    .withColumn("bonus_nullif", F.expr("nullif(bonus_filled, 0)"))
    # Drop rows whose phone is null (na.drop)
    .na.drop(subset=["phone"])
)
print("\n===== 空值处理结果 =====")
null_processed_df.select("salary", "salary_filled", "bonus", "bonus_filled", "age_filled").show()
# Pandas equivalent: null handling
# pd_null_processed_df = pd_string_processed_df.copy()
# pd_null_processed_df['salary_filled'] = pd_null_processed_df['salary'].fillna(0)
# pd_null_processed_df['bonus_filled'] = pd_null_processed_df['bonus'].fillna(0)
# pd_null_processed_df['age_filled'] = pd_null_processed_df['age'].fillna(0)
# pd_null_processed_df['bonus_nullif'] = pd_null_processed_df['bonus_filled'].where(pd_null_processed_df['bonus_filled'] != 0)
# pd_null_processed_df = pd_null_processed_df.dropna(subset=['phone'])
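# Bulk alternative (sketch): DataFrame.na.fill fills several columns in one call,
# equivalent to the per-column COALESCE/NVL/IFNULL above.
string_processed_df.na.fill({"age": 0, "salary": 0, "bonus": 0}) \
    .select("age", "salary", "bonus").show()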
# ======================
# 4. Date functions (TO_DATE, DATE_FORMAT, DATEDIFF, ...)
# ======================
date_processed_df = (
    null_processed_df
    # CURRENT_DATE: today's date
    .withColumn("current_date", F.current_date())
    # TO_DATE: normalize the mixed raw formats into a single DateType column
    .withColumn("join_date_std", F.coalesce(
        F.to_date("join_date", "yyyy-MM-dd"),
        F.to_date("join_date", "yyyy/MM/dd"),
        F.to_date("join_date", "yyyyMMdd")
    ))
    # DATEDIFF: days since joining (current date minus join date)
    .withColumn("days_since_join", F.datediff("current_date", "join_date_std"))
    # MONTHS_BETWEEN: months since joining (rounded to 2 decimals)
    .withColumn("months_since_join", F.round(F.months_between("current_date", "join_date_std"), 2))
    # ADD_MONTHS: date one year after joining
    .withColumn("join_anniversary", F.add_months("join_date_std", 12))
    # Fill missing last_login with the join date (cast to timestamp)
    .withColumn("last_login_filled", F.coalesce("last_login", F.col("join_date_std").cast(TimestampType())))
)
print("\n===== 日期处理结果 =====")
date_processed_df.select(
"join_date", "join_date_std", "current_date",
"days_since_join", "join_anniversary"
).show(truncate=False)
# Pandas equivalent: date handling
# pd_date_processed_df = pd_null_processed_df.copy()
# today = pd.Timestamp.now().normalize()
# pd_date_processed_df['current_date'] = today
# # pandas 2.x parses the mixed formats directly with format='mixed'
# pd_date_processed_df['join_date_std'] = pd.to_datetime(pd_date_processed_df['join_date'], format='mixed', errors='coerce')
# pd_date_processed_df['days_since_join'] = (today - pd_date_processed_df['join_date_std']).dt.days
# pd_date_processed_df['months_since_join'] = round((today - pd_date_processed_df['join_date_std']).dt.days / 30, 2)  # rough approximation
# pd_date_processed_df['join_anniversary'] = pd_date_processed_df['join_date_std'] + pd.DateOffset(years=1)
# pd_date_processed_df['last_login_filled'] = pd.to_datetime(pd_date_processed_df['last_login']).fillna(pd_date_processed_df['join_date_std'])
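# DATE_FORMAT goes the opposite direction to TO_DATE: it renders a DateType
# column back into a string pattern (pattern here chosen just for illustration).
date_processed_df.select(
    "join_date_std",
    F.date_format("join_date_std", "yyyy/MM/dd").alias("join_date_fmt")
).show()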
# ======================
# 5. Conditional logic (CASE WHEN, IF, ...)
# ======================
condition_df = (
    date_processed_df
    # CASE WHEN: bucket salaries into tiers (multi-branch condition)
    .withColumn("salary_level", F.when(F.col("salary_filled") < 20000, "junior")
                                 .when(F.col("salary_filled") < 25000, "mid")
                                 .otherwise("senior"))
    # IF: single-branch shorthand (SQL IF via expr; `if` is a Python keyword, so there is no F.if)
    .withColumn("is_high_salary", F.expr("IF(salary_filled >= 25000, 'yes', 'no')"))
    # Comment sentiment classification (building on the string processing above)
    .withColumn("comment_sentiment", F.when(F.lower("comment_first_word") == "great", "positive")
                                      .when(F.lower("comment_first_word") == "bad", "negative")
                                      .otherwise("neutral"))
)
print("\n===== 条件逻辑结果 =====")
condition_df.select("salary_filled", "salary_level", "is_high_salary", "comment_sentiment").show()
# Pandas equivalent: conditional logic
# pd_condition_df = pd_date_processed_df.copy()
# pd_condition_df['salary_level'] = pd.cut(pd_condition_df['salary_filled'], bins=[0, 20000, 25000, float('inf')],
#                                          labels=['junior', 'mid', 'senior'], right=False)
# pd_condition_df['is_high_salary'] = pd_condition_df['salary_filled'].apply(lambda x: 'yes' if x >= 25000 else 'no')
# pd_condition_df['comment_sentiment'] = pd_condition_df['comment_first_word'].str.lower().map(
#     {'great': 'positive', 'bad': 'negative'}).fillna('neutral')
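# The F.when chain is equivalent to a SQL CASE WHEN; a sketch of the expr form
# for readers coming from Hive/SQL.
condition_df.selectExpr(
    "salary_filled",
    "CASE WHEN salary_filled < 20000 THEN 'junior' "
    "WHEN salary_filled < 25000 THEN 'mid' "
    "ELSE 'senior' END AS salary_level_sql"
).show()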
# ======================
# 6. JSON parsing and explode
# ======================
# Schema for the JSON column
json_schema = StructType([
StructField("hobbies", ArrayType(StringType()), True),
StructField("city", StringType(), True)
])
json_processed_df = (
    condition_df
    # Parse the JSON string column into a struct
    .withColumn("parsed_other_info", F.from_json("other_info", json_schema))
    # Pull out the city field
    .withColumn("city", F.col("parsed_other_info.city"))
    # EXPLODE: one output row per hobby
    .withColumn("hobby", F.explode(F.col("parsed_other_info.hobbies")))
)
print("\n===== JSON解析和explode结果 =====")
json_processed_df.select("user_id", "name_clean", "city", "hobby").show(truncate=False)
# Pandas equivalent: JSON parsing and explode
# pd_json_processed_df = pd_condition_df.copy()
# pd_json_processed_df['parsed_other_info'] = pd_json_processed_df['other_info'].apply(json.loads)
# pd_json_processed_df['city'] = pd_json_processed_df['parsed_other_info'].apply(lambda x: x.get('city'))
# pd_json_processed_df['hobby'] = pd_json_processed_df['parsed_other_info'].apply(lambda x: x.get('hobbies'))
# pd_json_processed_df = pd_json_processed_df.explode('hobby')
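# Schema-less alternative (sketch): get_json_object extracts single values by
# JSONPath, convenient when only one field is needed and no schema is defined.
condition_df.select(
    "user_id",
    F.get_json_object("other_info", "$.city").alias("city_str")
).show(truncate=False)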
# ======================
# 7. split + explode on a derived column
# ======================
split_explode_df = (
    json_processed_df
    # Add a comma-separated string column
    .withColumn("skills", F.lit("Python,Java,SQL"))
    # SPLIT: turn the string into an array
    .withColumn("skill_list", F.split("skills", ","))
    # EXPLODE: one output row per array element
    .withColumn("skill", F.explode("skill_list"))
)
print("\n===== split + explode结果 =====")
split_explode_df.select("user_id", "name_clean", "skills", "skill").show(truncate=False)
# Pandas equivalent: split + explode
# pd_split_explode_df = pd_json_processed_df.copy()
# pd_split_explode_df['skills'] = "Python,Java,SQL"
# pd_split_explode_df['skill_list'] = pd_split_explode_df['skills'].str.split(',')
# pd_split_explode_df = pd_split_explode_df.explode('skill_list')
# pd_split_explode_df['skill'] = pd_split_explode_df['skill_list']
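# posexplode is explode plus the element's 0-based position; a sketch that keeps
# each skill's index alongside its value (built from a fresh split to avoid the
# row duplication already introduced above).
json_processed_df.select(
    "user_id",
    F.posexplode(F.split(F.lit("Python,Java,SQL"), ",")).alias("skill_pos", "skill_name")
).show()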
# ======================
# 8. Window functions
# ======================
# Window spec: partition by salary level, order by age
window_spec = Window.partitionBy("salary_level").orderBy("age_filled")
window_df = (
    split_explode_df
    # RANK: age rank within each salary level
    # (note: the earlier explodes duplicated rows, so each user appears once per hobby/skill)
    .withColumn("age_rank", F.rank().over(window_spec))
    # Running total of salary within each salary level
    .withColumn("cumulative_salary", F.sum("salary_filled").over(
        window_spec.rowsBetween(Window.unboundedPreceding, Window.currentRow)))
)
print("\n===== 窗口函数结果 =====")
window_df.select("user_id", "name_clean", "salary_level", "age_filled", "age_rank", "cumulative_salary").show(truncate=False)
# Pandas equivalent: window functions
# pd_window_df = pd_split_explode_df.copy()
# pd_window_df['age_rank'] = pd_window_df.groupby('salary_level')['age_filled'].rank(method='min')
# pd_window_df['cumulative_salary'] = pd_window_df.groupby('salary_level')['salary_filled'].cumsum()
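# rank() leaves gaps after ties (1, 1, 3, ...); a sketch contrasting it with
# dense_rank() and row_number() over the same window spec.
window_df.select(
    "user_id", "salary_level", "age_filled",
    F.rank().over(window_spec).alias("rank"),
    F.dense_rank().over(window_spec).alias("dense_rank"),
    F.row_number().over(window_spec).alias("row_number")
).show()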
# ======================
# 9. Pivot (rows to columns) and unpivot (columns to rows)
# ======================
# Pivot: count users per hobby within each salary level
pivot_df = window_df.groupBy("salary_level").pivot("hobby").count()
print("\n===== 行转列结果 =====")
pivot_df.show(truncate=False)
# Pandas equivalent: pivot
# pd_pivot_df = pd_window_df.pivot_table(index='salary_level', columns='hobby', values='user_id', aggfunc='count')
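# pivot() normally runs an extra pass to discover the distinct hobby values;
# passing them explicitly skips that pass (sketch with a hand-picked subset).
window_df.groupBy("salary_level").pivot("hobby", ["reading", "running", "hiking"]).count().show()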
# Unpivot: turn the salary and bonus columns into key/value rows
unpivot_expr = "stack(2, 'salary', salary_filled, 'bonus', bonus_filled) as (income_type, income_amount)"
unpivot_df = window_df.select("user_id", "name_clean", F.expr(unpivot_expr))
print("\n===== 列转行结果 =====")
unpivot_df.show(truncate=False)
# Pandas equivalent: unpivot
# pd_unpivot_df = pd.melt(pd_window_df, id_vars=['user_id', 'name_clean'],
# value_vars=['salary_filled', 'bonus_filled'],
# var_name='income_type', value_name='income_amount')
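# Spark 3.4+ also ships DataFrame.unpivot (alias: melt), which mirrors pd.melt;
# a sketch assuming that version is available.
window_df.unpivot(
    ids=["user_id", "name_clean"],
    values=["salary_filled", "bonus_filled"],
    variableColumnName="income_type",
    valueColumnName="income_amount"
).show(truncate=False)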
# ======================
# 10. Aggregations (SUM, AVG, COLLECT_LIST, ...)
# ======================
# Group by salary level and aggregate
agg_df = window_df.groupBy("salary_level").agg(
    F.count("*").alias("user_count"),                 # row count
    F.sum("salary_filled").alias("total_salary"),     # sum
    F.avg("salary_filled").alias("avg_salary"),       # mean
    F.min("age_filled").alias("min_age"),             # youngest
    F.max("age_filled").alias("max_age"),             # oldest
    F.collect_list("name_clean").alias("user_names")  # gather names into a list
)
print("\n===== 聚合结果 =====")
agg_df.show(truncate=False)
# Pandas equivalent: aggregations (named aggregation keeps the column names flat)
# pd_agg_df = pd_window_df.groupby('salary_level').agg(
#     user_count=('user_id', 'count'),
#     total_salary=('salary_filled', 'sum'),
#     avg_salary=('salary_filled', 'mean'),
#     min_age=('age_filled', 'min'),
#     max_age=('age_filled', 'max'),
#     user_names=('name_clean', list)
# )
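# collect_list keeps duplicates while collect_set deduplicates; a sketch on the
# exploded hobby column, where set semantics are usually what you want.
window_df.groupBy("salary_level").agg(
    F.collect_set("hobby").alias("distinct_hobbies"),
    F.countDistinct("user_id").alias("distinct_users")
).show(truncate=False)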
spark.stop()