


nc -lk 9999


from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

spark = (SparkSession.builder
         .appName("config-streaming")
         .master("spark://ZZHPC:7077")
         .getOrCreate())

spark.sparkContext.setLogLevel("ERROR")

# Create DataFrame representing the stream of input lines from connection to localhost:9999
lines = (spark.readStream.format("socket")
         .option("host", "localhost")
         .option("port", 9999)
         .load())

# Split the lines into words
words = lines.select(explode(split(lines.value, " ")).alias("word"))
words.printSchema()
root
 |-- word: string (nullable = false)

wordCounts = words.groupBy("word").count()

# Start running the query that prints the running counts to the console
query = (wordCounts.writeStream.format("console")
         .outputMode("complete")
         .start())

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+
|word|count|
+----+-----+
+----+-----+

A new micro-batch of the streaming query is triggered whenever new lines arrive on the socket, and the running word counts printed to the console are updated accordingly.
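While the query is running, the StreamingQuery handle returned by start() can be inspected from the driver. A minimal sketch using the names defined above:

# Current status of the streaming query (isDataAvailable, isTriggerActive, ...)
print(query.status)

# Metrics for the most recently completed micro-batch, if any
print(query.lastProgress)

# Optionally block the driver until the query stops (timeout is in seconds)
# query.awaitTermination(timeout=60)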


query.stop()

spark.stop()
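The next examples read from a Kafka topic named users. Before running the producer below, the topic can be created explicitly with the standard Kafka CLI; a hedged example (partition and replication counts are illustrative):

kafka-topics.sh --create --topic users --bootstrap-server localhost:9092 --partitions 3 --replication-factor 1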


import json
import random
import time
import datetime

from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='localhost:9092')

countries = ['USA', 'UK', 'India', 'China', 'Brazil', 'Canada', 'Australia']
genders = ['M', 'F']

# Emit a random user record to the users topic on a fixed interval
while True:
    current_time = time.time()
    message = {
        'id': random.randint(1, 100),
        'name': f'user{random.randint(1, 100)}',
        'age': random.randint(18, 65),
        'gender': random.choice(genders),
        'country': random.choice(countries),
        'timestamp': datetime.datetime.fromtimestamp(current_time).strftime("%m/%d/%Y, %H:%M:%S")
    }
    producer.send('users', value=json.dumps(message).encode('utf-8'))
    print(message)
    time.sleep(30)
{'id': 48, 'name': 'user13', 'age': 34, 'gender': 'F', 'country': 'UK', 'timestamp': '02/05/2025, 19:32:31'}
{'id': 63, 'name': 'user39', 'age': 31, 'gender': 'M', 'country': 'China', 'timestamp': '02/05/2025, 19:32:41'}
{'id': 24, 'name': 'user34', 'age': 60, 'gender': 'M', 'country': 'Australia', 'timestamp': '02/05/2025, 19:32:51'}
{'id': 33, 'name': 'user23', 'age': 27, 'gender': 'M', 'country': 'Brazil', 'timestamp': '02/05/2025, 19:33:01'}
{'id': 21, 'name': 'user23', 'age': 62, 'gender': 'M', 'country': 'Brazil', 'timestamp': '02/05/2025, 19:33:11'}
zzh@ZZHPC:~$ kafka-console-consumer.sh --topic users --bootstrap-server localhost:9092 --from-beginning
{"id": 48, "name": "user13", "age": 34, "gender": "F", "country": "UK", "timestamp": "02/05/2025, 19:32:31"}
{"id": 63, "name": "user39", "age": 31, "gender": "M", "country": "China", "timestamp": "02/05/2025, 19:32:41"}
{"id": 24, "name": "user34", "age": 60, "gender": "M", "country": "Australia", "timestamp": "02/05/2025, 19:32:51"}
{"id": 33, "name": "user23", "age": 27, "gender": "M", "country": "Brazil", "timestamp": "02/05/2025, 19:33:01"}
{"id": 21, "name": "user23", "age": 62, "gender": "M", "country": "Brazil", "timestamp": "02/05/2025, 19:33:11"}
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

builder = (SparkSession.builder
           .appName("connect-kafka-streaming")
           .master("spark://ZZHPC:7077")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder, ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

df = (spark.readStream.format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "users")
      .option("startingOffsets", "earliest")
      .load())
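The Kafka source can also subscribe to several topics at once, either as a comma-separated list or by pattern. A hedged variant of the reader above (the topic names and pattern are illustrative):

# Subscribe to every topic matching a regular expression instead of a single topic
df_pattern = (spark.readStream.format("kafka")
              .option("kafka.bootstrap.servers", "localhost:9092")
              .option("subscribePattern", "user.*")
              .option("startingOffsets", "earliest")
              .load())

# A fixed list works too: .option("subscribe", "users,events")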

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
    StructField('country', StringType(), True)])

df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

df = df.select(
    col('value.id').alias('id'),
    col('value.name').alias('name'),
    col('value.age').alias('age'),
    col('value.gender').alias('gender'),
    col('value.country').alias('country'))

query = (df.writeStream.format('console')
         .outputMode('append')
         .start())

......

query.stop()
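Because the session is already configured for Delta Lake, the same parsed stream could be written to a Delta table instead of the console. A sketch, assuming the output path and checkpoint directory below are placeholders:

# Hypothetical output and checkpoint locations
delta_path = "/tmp/delta/users"
checkpoint_path = "/tmp/delta/users_checkpoint"

delta_query = (df.writeStream
               .format("delta")
               .outputMode("append")
               .option("checkpointLocation", checkpoint_path)
               .start(delta_path))

# Later: stop the query and read the table back as a batch DataFrame
# delta_query.stop()
# spark.read.format("delta").load(delta_path).show()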


from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, avg
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

builder = (SparkSession.builder
           .appName("transform-filter-streaming")
           .master("spark://ZZHPC:7077")
           .config("spark.executor.memory", "512m")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder, ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
25/02/06 10:24:41 WARN Utils: Your hostname, ZZHPC resolves to a loopback address: 127.0.1.1; using 192.168.1.16 instead (on interface wlo1)
25/02/06 10:24:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/zzh/Downloads/sfw/spark-3.4.1-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/zzh/.ivy2/cache
The jars for the packages stored in: /home/zzh/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-acdab19a-0418-4169-814d-6f0620da624d;1.0
    confs: [default]
    found io.delta#delta-core_2.12;2.4.0 in central
    found io.delta#delta-storage;2.4.0 in central
    found org.antlr#antlr4-runtime;4.9.3 in central
    found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 in central
    found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 in central
    found org.apache.kafka#kafka-clients;3.3.2 in central
    found org.lz4#lz4-java;1.8.0 in central
    found org.xerial.snappy#snappy-java;1.1.10.1 in central
    found org.slf4j#slf4j-api;2.0.6 in central
    found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
    found org.apache.hadoop#hadoop-client-api;3.3.4 in central
    found commons-logging#commons-logging;1.1.3 in central
    found com.google.code.findbugs#jsr305;3.0.0 in central
    found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 428ms :: artifacts dl 19ms
    :: modules in use:
    com.google.code.findbugs#jsr305;3.0.0 from central in [default]
    commons-logging#commons-logging;1.1.3 from central in [default]
    io.delta#delta-core_2.12;2.4.0 from central in [default]
    io.delta#delta-storage;2.4.0 from central in [default]
    org.antlr#antlr4-runtime;4.9.3 from central in [default]
    org.apache.commons#commons-pool2;2.11.1 from central in [default]
    org.apache.hadoop#hadoop-client-api;3.3.4 from central in [default]
    org.apache.hadoop#hadoop-client-runtime;3.3.4 from central in [default]
    org.apache.kafka#kafka-clients;3.3.2 from central in [default]
    org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 from central in [default]
    org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 from central in [default]
    org.lz4#lz4-java;1.8.0 from central in [default]
    org.slf4j#slf4j-api;2.0.6 from central in [default]
    org.xerial.snappy#snappy-java;1.1.10.1 from central in [default]
    ---------------------------------------------------------------------
    |                  |            modules            ||   artifacts   |
    |       conf       | number| search|dwnlded|evicted|| number|dwnlded|
    ---------------------------------------------------------------------
    |      default     |   14  |   0   |   0   |   0   ||   14  |   0   |
    ---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-acdab19a-0418-4169-814d-6f0620da624d
    confs: [default]
    0 artifacts copied, 14 already retrieved (0kB/10ms)
25/02/06 10:24:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "users")
      .option("startingOffsets", "earliest")
      .load())

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
    StructField('country', StringType(), True)])

df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

df = df.select(
    col('value.id').alias('id'),
    col('value.name').alias('name'),
    col('value.age').alias('age'),
    col('value.gender').alias('gender'),
    col('value.country').alias('country'))

df = (df.select('age', 'country', 'gender')
      .filter("age >= 21")
      .groupBy('country', 'gender')
      .agg(avg('age').alias('avg_age')))

query = (df.writeStream
         .outputMode('complete')
         .format('console')
         .start())
-------------------------------------------
Batch: 0
-------------------------------------------
+---------+------+-------+
| country|gender|avg_age|
+---------+------+-------+
| Brazil| F| 36.0|
| Brazil| M| 63.0|
|Australia| F| 40.0|
| Canada| M| 61.0|
| UK| M| 50.0|
| India| M| 52.0|
| China| M| 51.0|
| China| F| 27.5|
| Canada| F| 30.5|
|Australia| M| 49.0|
| India| F| 44.75|
| UK| F| 38.0|
+---------+------+-------+
-------------------------------------------
Batch: 1
-------------------------------------------
+---------+------+-------+
| country|gender|avg_age|
+---------+------+-------+
| Brazil| F| 36.0|
| Brazil| M| 63.0|
|Australia| F| 40.0|
| Canada| M| 61.0|
| UK| M| 50.0|
| India| M| 52.0|
| China| M| 51.0|
| China| F| 27.5|
| Canada| F| 33.0|
|Australia| M| 49.0|
| India| F| 44.75|
| UK| F| 38.0|
+---------+------+-------+
-------------------------------------------
Batch: 2
-------------------------------------------
+---------+------+-------+
| country|gender|avg_age|
+---------+------+-------+
| Brazil| F| 36.0|
| Brazil| M| 63.0|
|Australia| F| 40.0|
| Canada| M| 61.0|
| UK| M| 50.0|
| India| M| 52.0|
| China| M| 51.0|
| China| F| 27.5|
| Canada| F| 33.0|
|Australia| M| 46.0|
| India| F| 44.75|
| UK| F| 38.0|
+---------+------+-------+
query.stop()
spark.stop()
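The same filter-and-aggregate logic can also be written in SQL by registering the stream as a temporary view. A hedged sketch, run while the session is still active and assuming parsed_df refers to the selected id/name/age/gender/country stream before the groupBy above:

# Register the parsed streaming DataFrame as a temp view
parsed_df.createOrReplaceTempView("users_stream")

# The result of spark.sql over a streaming view is itself a streaming DataFrame
agg_df = spark.sql("""
    SELECT country, gender, AVG(age) AS avg_age
    FROM users_stream
    WHERE age >= 21
    GROUP BY country, gender
""")

# It can be started like any other streaming query
# query = agg_df.writeStream.outputMode("complete").format("console").start()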

from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

builder = (SparkSession.builder
           .appName("config-checkpoints")
           .master("spark://ZZHPC:7077")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder, ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
25/02/06 10:47:28 WARN Utils: Your hostname, ZZHPC resolves to a loopback address: 127.0.1.1; using 192.168.1.16 instead (on interface wlo1)
25/02/06 10:47:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/zzh/Downloads/sfw/spark-3.4.1-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/zzh/.ivy2/cache
The jars for the packages stored in: /home/zzh/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2172143e-be74-4efa-8d11-0235ebe436cb;1.0
    confs: [default]
    found io.delta#delta-core_2.12;2.4.0 in central
    found io.delta#delta-storage;2.4.0 in central
    found org.antlr#antlr4-runtime;4.9.3 in central
    found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 in central
    found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 in central
    found org.apache.kafka#kafka-clients;3.3.2 in central
    found org.lz4#lz4-java;1.8.0 in central
    found org.xerial.snappy#snappy-java;1.1.10.1 in central
    found org.slf4j#slf4j-api;2.0.6 in central
    found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
    found org.apache.hadoop#hadoop-client-api;3.3.4 in central
    found commons-logging#commons-logging;1.1.3 in central
    found com.google.code.findbugs#jsr305;3.0.0 in central
    found org.apache.commons#commons-pool2;2.11.1 in central
:: resolution report :: resolve 455ms :: artifacts dl 17ms
    :: modules in use:
    com.google.code.findbugs#jsr305;3.0.0 from central in [default]
    commons-logging#commons-logging;1.1.3 from central in [default]
    io.delta#delta-core_2.12;2.4.0 from central in [default]
    io.delta#delta-storage;2.4.0 from central in [default]
    org.antlr#antlr4-runtime;4.9.3 from central in [default]
    org.apache.commons#commons-pool2;2.11.1 from central in [default]
    org.apache.hadoop#hadoop-client-api;3.3.4 from central in [default]
    org.apache.hadoop#hadoop-client-runtime;3.3.4 from central in [default]
    org.apache.kafka#kafka-clients;3.3.2 from central in [default]
    org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 from central in [default]
    org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 from central in [default]
    org.lz4#lz4-java;1.8.0 from central in [default]
    org.slf4j#slf4j-api;2.0.6 from central in [default]
    org.xerial.snappy#snappy-java;1.1.10.1 from central in [default]
    ---------------------------------------------------------------------
    |                  |            modules            ||   artifacts   |
    |       conf       | number| search|dwnlded|evicted|| number|dwnlded|
    ---------------------------------------------------------------------
    |      default     |   14  |   0   |   0   |   0   ||   14  |   0   |
    ---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-2172143e-be74-4efa-8d11-0235ebe436cb
    confs: [default]
    0 artifacts copied, 14 already retrieved (0kB/9ms)
25/02/06 10:47:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
get_ipython().run_line_magic('load_ext', 'sparksql_magic')
get_ipython().run_line_magic('config', 'SparkSql.limit=20')
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "users")
      .option("startingOffsets", "earliest")
      .load())

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
    StructField('country', StringType(), True)])

df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

df = df.select(
    col('value.id').alias('id'),
    col('value.name').alias('name'),
    col('value.age').alias('age'),
    col('value.gender').alias('gender'),
    col('value.country').alias('country'))

query = (df.writeStream
         .format("console")
         .outputMode("append")
         .option("checkpointLocation", "/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/checkpoint")
         .start())
-------------------------------------------
Batch: 0
-------------------------------------------
+---+------+---+------+---------+
| id| name|age|gender| country|
+---+------+---+------+---------+
| 41| user4| 40| F|Australia|
| 27|user30| 39| F| Canada|
| 19|user80| 30| F| China|
| 44|user95| 51| F| USA|
| 31| user6| 22| M| China|
| 52| user5| 62| M| Brazil|
| 36|user73| 57| M| UK|
| 86|user18| 62| M| Canada|
| 22|user66| 18| M| UK|
| 42| user8| 22| F| Canada|
| 37|user38| 51| M| China|
| 98|user85| 62| M|Australia|
| 22| user8| 50| M| UK|
| 48|user64| 21| F| Brazil|
| 41|user98| 26| F| India|
| 97|user90| 38| F| UK|
| 64|user12| 38| F| Canada|
| 47|user33| 40| M|Australia|
| 9|user89| 62| F|Australia|
| 91|user42| 54| F| Brazil|
+---+------+---+------+---------+
only showing top 20 rows
-------------------------------------------
Batch: 1
-------------------------------------------
+---+------+---+------+-------+
| id| name|age|gender|country|
+---+------+---+------+-------+
| 4|user16| 21| M| India|
+---+------+---+------+-------+
-------------------------------------------
Batch: 2
-------------------------------------------
+---+------+---+------+-------+
| id| name|age|gender|country|
+---+------+---+------+-------+
| 67|user57| 40| F| Brazil|
+---+------+---+------+-------+

zzh@ZZHPC:/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/checkpoint$ tree .
.
├── commits
│   ├── 0
│   ├── 1
│   └── 2
├── metadata
├── offsets
│   ├── 0
│   ├── 1
│   └── 2
└── sources
    └── 0
        └── 0

query.stop()
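The checkpoint files are small text/JSON files, so you can look at what the query recorded for a given micro-batch directly from the shell; a hedged example using the path above (the batch number is illustrative):

cat /zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/checkpoint/offsets/2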

query = (df.writeStream
         .format("console")
         .outputMode("append")
         .option("checkpointLocation", "/zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/checkpoint")
         .start())
-------------------------------------------
Batch: 3
-------------------------------------------
+---+-------+---+------+---------+
| id| name|age|gender| country|
+---+-------+---+------+---------+
| 71| user94| 48| M| Brazil|
| 27| user91| 52| M| India|
| 30| user14| 57| F| Brazil|
| 40| user40| 20| M| China|
| 88| user84| 29| F| USA|
| 68| user35| 22| F| Brazil|
| 18| user85| 63| F| Brazil|
| 69| user15| 51| M| Canada|
| 83| user35| 47| F|Australia|
| 58| user37| 54| F| China|
| 14|user100| 53| M|Australia|
| 21| user68| 48| F| Brazil|
+---+-------+---+------+---------+
-------------------------------------------
Batch: 4
-------------------------------------------
+---+------+---+------+-------+
| id| name|age|gender|country|
+---+------+---+------+-------+
| 21|user29| 21| F| UK|
+---+------+---+------+-------+
query.stop()
spark.stop()
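If you want a restarted query to reprocess the topic from the beginning rather than resume from the committed offsets, the checkpoint directory has to be removed (or a new location chosen) before restarting; a hedged example with the path used above:

rm -r /zdata/Github/Data-Engineering-with-Databricks-Cookbook-main/data/checkpoint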

from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

builder = (SparkSession.builder
           .appName("config-triggers")
           .master("spark://ZZHPC:7077")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder, ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

get_ipython().run_line_magic('load_ext', 'sparksql_magic')
get_ipython().run_line_magic('config', 'SparkSql.limit=20')
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "users")
      .option("startingOffsets", "earliest")
      .load())

schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
    StructField('country', StringType(), True)])

df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

df = df.select(
    col('value.id').alias('id'),
    col('value.name').alias('name'),
    col('value.age').alias('age'),
    col('value.gender').alias('gender'),
    col('value.country').alias('country'))

query = (df.writeStream
         .format("console")
         .outputMode("append")
         .start())
-------------------------------------------
Batch: 0
-------------------------------------------
+----+--------------------+-----+---------+------+--------------------+-------------+
| key| value|topic|partition|offset| timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[7B 22 69 64 22 3...|users| 1| 0|2025-02-06 10:32:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 1|2025-02-06 10:35:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 2|2025-02-06 10:38:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 3|2025-02-06 10:44:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 4|2025-02-06 10:45:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 5|2025-02-06 10:46:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 6|2025-02-06 10:49:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 7|2025-02-06 10:51:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 8|2025-02-06 10:53:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 9|2025-02-06 10:54:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 10|2025-02-06 10:58:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 11|2025-02-06 10:59:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 12|2025-02-06 11:04:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 13|2025-02-06 11:05:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 14|2025-02-06 11:07:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 15|2025-02-06 11:08:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 16|2025-02-06 11:09:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 17|2025-02-06 11:10:...| 0|
|null|[7B 22 69 64 22 3...|users| 0| 0|2025-02-06 10:29:...| 0|
|null|[7B 22 69 64 22 3...|users| 0| 1|2025-02-06 10:35:...| 0|
+----+--------------------+-----+---------+------+--------------------+-------------+
only showing top 20 rows
-------------------------------------------
Batch: 1
-------------------------------------------
+----+--------------------+-----+---------+------+--------------------+-------------+
| key| value|topic|partition|offset| timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[7B 22 69 64 22 3...|users| 1| 18|2025-02-06 11:21:...| 0|
+----+--------------------+-----+---------+------+--------------------+-------------+
-------------------------------------------
Batch: 2
-------------------------------------------
+----+--------------------+-----+---------+------+--------------------+-------------+
| key| value|topic|partition|offset| timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[7B 22 69 64 22 3...|users| 1| 19|2025-02-06 11:22:...| 0|
+----+--------------------+-----+---------+------+--------------------+-------------+
-------------------------------------------
Batch: 3
-------------------------------------------
+----+--------------------+-----+---------+------+--------------------+-------------+
| key| value|topic|partition|offset| timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[7B 22 69 64 22 3...|users| 1| 20|2025-02-06 11:23:...| 0|
+----+--------------------+-----+---------+------+--------------------+-------------+

query.stop()

query = (df.writeStream
         .format("console")
         .outputMode("append")
         .trigger(processingTime='30 seconds')
         .start())
-------------------------------------------
Batch: 0
-------------------------------------------
+----+--------------------+-----+---------+------+--------------------+-------------+
| key| value|topic|partition|offset| timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[7B 22 69 64 22 3...|users| 1| 0|2025-02-06 10:32:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 1|2025-02-06 10:35:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 2|2025-02-06 10:38:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 3|2025-02-06 10:44:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 4|2025-02-06 10:45:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 5|2025-02-06 10:46:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 6|2025-02-06 10:49:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 7|2025-02-06 10:51:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 8|2025-02-06 10:53:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 9|2025-02-06 10:54:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 10|2025-02-06 10:58:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 11|2025-02-06 10:59:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 12|2025-02-06 11:04:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 13|2025-02-06 11:05:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 14|2025-02-06 11:07:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 15|2025-02-06 11:08:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 16|2025-02-06 11:09:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 17|2025-02-06 11:10:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 18|2025-02-06 11:21:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 19|2025-02-06 11:22:...| 0|
+----+--------------------+-----+---------+------+--------------------+-------------+
only showing top 20 rows
-------------------------------------------
Batch: 1
-------------------------------------------
+----+--------------------+-----+---------+------+--------------------+-------------+
| key| value|topic|partition|offset| timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[7B 22 69 64 22 3...|users| 1| 22|2025-02-06 11:29:...| 0|
|null|[7B 22 69 64 22 3...|users| 2| 19|2025-02-06 11:29:...| 0|
+----+--------------------+-----+---------+------+--------------------+-------------+
-------------------------------------------
Batch: 2
-------------------------------------------
+----+--------------------+-----+---------+------+--------------------+-------------+
| key| value|topic|partition|offset| timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[7B 22 69 64 22 3...|users| 1| 23|2025-02-06 11:29:...| 0|
|null|[7B 22 69 64 22 3...|users| 2| 20|2025-02-06 11:29:...| 0|
|null|[7B 22 69 64 22 3...|users| 2| 21|2025-02-06 11:29:...| 0|
+----+--------------------+-----+---------+------+--------------------+-------------+

query.stop()

query = (df.writeStream
         .format("console")
         .outputMode("append")
         .trigger(once=True)
         .start())
-------------------------------------------
Batch: 0
-------------------------------------------
+----+--------------------+-----+---------+------+--------------------+-------------+
| key| value|topic|partition|offset| timestamp|timestampType|
+----+--------------------+-----+---------+------+--------------------+-------------+
|null|[7B 22 69 64 22 3...|users| 1| 0|2025-02-06 10:32:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 1|2025-02-06 10:35:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 2|2025-02-06 10:38:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 3|2025-02-06 10:44:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 4|2025-02-06 10:45:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 5|2025-02-06 10:46:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 6|2025-02-06 10:49:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 7|2025-02-06 10:51:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 8|2025-02-06 10:53:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 9|2025-02-06 10:54:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 10|2025-02-06 10:58:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 11|2025-02-06 10:59:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 12|2025-02-06 11:04:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 13|2025-02-06 11:05:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 14|2025-02-06 11:07:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 15|2025-02-06 11:08:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 16|2025-02-06 11:09:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 17|2025-02-06 11:10:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 18|2025-02-06 11:21:...| 0|
|null|[7B 22 69 64 22 3...|users| 1| 19|2025-02-06 11:22:...| 0|
+----+--------------------+-----+---------+------+--------------------+-------------+
only showing top 20 rows
query.stop()
spark.stop()
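On Spark 3.3 and later, trigger(availableNow=True) is a related run-and-stop style: it processes everything available on the topic and then terminates, but splits the backlog into multiple micro-batches instead of one. A minimal sketch, assuming a session and a df defined as in this example are still active:

query = (df.writeStream
         .format("console")
         .outputMode("append")
         .trigger(availableNow=True)
         .start())

# Block until the backlog has been processed; the query stops by itself afterwards
query.awaitTermination()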


import random
import json
import time
import datetime

from kafka import KafkaProducer

# Define the bootstrap servers and the topic name
bootstrap_servers = "localhost:9092"
topic = "events"

# Create a Kafka producer (event payloads are JSON-encoded before sending)
producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

# Define a function to generate random event data; event_time lags processing_time
# by a random amount to simulate late and out-of-order events
def generate_event():
    current_time = time.time()
    user_id = random.randint(1, 100)
    event_type = random.choice(["click", "view", "purchase", "like", "share"])
    event_time = datetime.datetime.fromtimestamp(current_time - abs(random.normalvariate(0, 10))).strftime("%m/%d/%Y, %H:%M:%S")
    processing_time = datetime.datetime.fromtimestamp(current_time).strftime("%m/%d/%Y, %H:%M:%S")
    return {"user_id": user_id, "event_type": event_type,
            "event_time": event_time, "processing_time": processing_time}

while True:
    event = generate_event()
    print(event)
    producer.send(topic, value=json.dumps(event).encode('utf-8'))
    time.sleep(50)
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, window, count, to_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

builder = (SparkSession.builder
           .appName("apply-window-aggregations")
           .master("spark://ZZHPC:7077")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder, ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "events")
      .option("startingOffsets", "earliest")
      .load())

schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('event_type', StringType(), True),
    StructField('event_time', StringType(), True),
    StructField('processing_time', StringType(), True)])

df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

df = (df.select(
          col('value.user_id').alias('user_id'),
          col('value.event_type').alias('event_type'),
          col('value.event_time').alias('event_time'),
          col('value.processing_time').alias('processing_time'))
      .withColumn("event_time", to_timestamp(col("event_time"), "MM/dd/yyyy, HH:mm:ss"))
      .withColumn("processing_time", to_timestamp(col("processing_time"), "MM/dd/yyyy, HH:mm:ss")))

df = (df.groupBy(
          window(col("event_time"), "60 minute", "60 minute"),
          col("event_type"))
      .agg(count(col("user_id")).alias("NumberOfUsers")))

query = (df.writeStream
         .outputMode('complete')
         .format('console')
         .option("truncate", False)
         .start())
-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+----------+-------------+
|window |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|view |5 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|purchase |1 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|share |6 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|click |6 |
+------------------------------------------+----------+-------------+
-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+----------+-------------+
|window |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|view |5 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|like |1 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|purchase |1 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|share |6 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|click |6 |
+------------------------------------------+----------+-------------+
-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+----------+-------------+
|window |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|view |6 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|like |1 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|purchase |1 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|share |6 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|click |6 |
+------------------------------------------+----------+-------------+
query.stop()
# Update output mode
query = (df.writeStream.outputMode("update")
         .format("console")
         .option("truncate", False)
         .start())
-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+----------+-------------+
|window |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|view |6 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|like |1 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|purchase |2 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|share |7 |
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|click |7 |
+------------------------------------------+----------+-------------+
-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+----------+-------------+
|window |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2025-02-06 16:00:00, 2025-02-06 17:00:00}|share |8 |
+------------------------------------------+----------+-------------+
query.stop()
spark.stop()
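The window used above is a tumbling window (the slide equals the window length). Passing a shorter slide produces overlapping, sliding windows, so each event contributes to several windows. A hedged variant of the aggregation above, assuming events_df refers to the parsed event stream before the groupBy:

# 60-minute windows that start every 15 minutes; each event falls into four windows
sliding_df = (events_df.groupBy(
                  window(col("event_time"), "60 minutes", "15 minutes"),
                  col("event_type"))
              .agg(count(col("user_id")).alias("NumberOfUsers")))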






from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, window, count, to_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

builder = (SparkSession.builder
           .appName("handle-late-and-out-of-order-data")
           .master("spark://ZZHPC:7077")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder, ['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "events")
      .option("startingOffsets", "latest")
      .load())

schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('event_type', StringType(), True),
    StructField('event_time', StringType(), True),
    StructField('processing_time', StringType(), True)])

df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

df = (df.select(
          col('value.user_id').alias('user_id'),
          col('value.event_type').alias('event_type'),
          col('value.event_time').alias('event_time'),
          col('value.processing_time').alias('processing_time'))
      .withColumn("event_time", to_timestamp(col("event_time"), "MM/dd/yyyy, HH:mm:ss"))
      .withColumn("processing_time", to_timestamp(col("processing_time"), "MM/dd/yyyy, HH:mm:ss")))

df = df.withWatermark("event_time", "10 seconds")

df = (df.groupBy(window(col("event_time"), "1 minute", "1 minute"), col("user_id"))
      .count()
      .alias("NumberOfEvents"))

query = (df.writeStream
         .outputMode('update')
         .format('console')
         .option("truncate", False)
         .start())
-------------------------------------------
Batch: 0
-------------------------------------------
+------+-------+-----+
|window|user_id|count|
+------+-------+-----+
+------+-------+-----+
-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+-------+-----+
|window |user_id|count|
+------------------------------------------+-------+-----+
|{2025-02-06 17:17:00, 2025-02-06 17:18:00}|97 |1 |
+------------------------------------------+-------+-----+
-------------------------------------------
Batch: 2
-------------------------------------------
+------+-------+-----+
|window|user_id|count|
+------+-------+-----+
+------+-------+-----+
-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------+-------+-----+
|window |user_id|count|
+------------------------------------------+-------+-----+
|{2025-02-06 17:17:00, 2025-02-06 17:18:00}|97 |2 |
+------------------------------------------+-------+-----+
-------------------------------------------
Batch: 4
-------------------------------------------
+------+-------+-----+
|window|user_id|count|
+------+-------+-----+
+------+-------+-----+
-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------------+-------+-----+
|window |user_id|count|
+------------------------------------------+-------+-----+
|{2025-02-06 17:17:00, 2025-02-06 17:18:00}|63 |1 |
+------------------------------------------+-------+-----+
-------------------------------------------
Batch: 6
-------------------------------------------
+------+-------+-----+
|window|user_id|count|
+------+-------+-----+
+------+-------+-----+
-------------------------------------------
Batch: 7
-------------------------------------------
+------------------------------------------+-------+-----+
|window |user_id|count|
+------------------------------------------+-------+-----+
|{2025-02-06 17:18:00, 2025-02-06 17:19:00}|77 |1 |
+------------------------------------------+-------+-----+
-------------------------------------------
Batch: 8
-------------------------------------------
+------+-------+-----+
|window|user_id|count|
+------+-------+-----+
+------+-------+-----+
-------------------------------------------
Batch: 9
-------------------------------------------
+------------------------------------------+-------+-----+
|window |user_id|count|
+------------------------------------------+-------+-----+
|{2025-02-06 17:18:00, 2025-02-06 17:19:00}|90 |1 |
+------------------------------------------+-------+-----+
-------------------------------------------
Batch: 10
-------------------------------------------
+------+-------+-----+
|window|user_id|count|
+------+-------+-----+
+------+-------+-----+
query.stop()
spark.stop()
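The same watermark can also be used to drop duplicate events instead of (or in addition to) aggregating them. A minimal sketch, assuming events_df refers to the parsed stream before the groupBy above:

# Keep one row per (user_id, event_time) pair; state for event times older than
# the 10-second watermark is discarded, bounding the deduplication state
deduped_df = (events_df
              .withWatermark("event_time", "10 seconds")
              .dropDuplicates(["user_id", "event_time"]))

# deduped_query = (deduped_df.writeStream
#                  .outputMode("append")
#                  .format("console")
#                  .start())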

