Spark 将 DataFrame 按照比例分割为 2 份的方法（注意：`subtract` 是集合差运算，会去除重复行；若数据含重复行或需要随机切分，应改用 `randomSplit`）
import pyspark
# importing SparkSession from the pyspark.sql module
from pyspark.sql import SparkSession


def split2df(prod_df, ratio=0.8):
    """Split a Spark DataFrame into two parts by a row-count ratio.

    Args:
        prod_df: source Spark DataFrame.
        ratio: fraction of rows that go into the first part (default 0.8).

    Returns:
        A ``(first_df, second_df)`` tuple where ``first_df`` holds
        ``int(count * ratio)`` rows and ``second_df`` holds the rest.

    Note:
        ``limit`` picks rows in an unspecified order, and ``subtract`` is a
        set difference that also removes duplicate rows — if the input may
        contain duplicates or a random split is wanted, prefer
        ``DataFrame.randomSplit`` instead.
    """
    # count() triggers a full Spark job — compute it exactly once
    # (the original called it twice).
    total = prod_df.count()
    first_len = int(total * ratio)

    first_df = prod_df.limit(first_len)
    # Everything not taken by first_df. The original then did a second
    # limit()/subtract() pair producing an unused copy_df2 — that was
    # pure dead work, since this subtract already yields the remainder.
    second_df = prod_df.subtract(first_df)
    return first_df, second_df


# creating a SparkSession and giving an app name
spark = SparkSession.builder.appName('sparkdf').getOrCreate()

# Column names for the dataframe
columns = ["Brand", "Product"]

# Row data for the dataframe
data = [
    ("HP", "Laptop"),
    ("Lenovo", "Mouse"),
    ("Dell", "Keyboard"),
    ("Samsung", "Monitor"),
    ("MSI", "Graphics Card"),
    ("Asus", "Motherboard"),
    ("Gigabyte", "Motherboard"),
    ("Zebronics", "Cabinet"),
    ("Adata", "RAM"),
    ("Transcend", "SSD"),
    ("Kingston", "HDD"),
    ("Toshiba", "DVD Writer"),
]

# Create the dataframe using the above values
prod_df = spark.createDataFrame(data=data, schema=columns)

# View the full dataframe, then the two split parts
prod_df.show()

df1, df2 = split2df(prod_df)
df1.show(truncate=False)
df2.show(truncate=False)
分割结果:
+---------+-------------+
| Brand| Product|
+---------+-------------+
| HP| Laptop|
| Lenovo| Mouse|
| Dell| Keyboard|
| Samsung| Monitor|
| MSI|Graphics Card|
| Asus| Motherboard|
| Gigabyte| Motherboard|
|Zebronics| Cabinet|
| Adata| RAM|
|Transcend| SSD|
| Kingston| HDD|
| Toshiba| DVD Writer|
+---------+-------------+
+---------+-------------+
|Brand |Product |
+---------+-------------+
|HP |Laptop |
|Lenovo |Mouse |
|Dell |Keyboard |
|Samsung |Monitor |
|MSI |Graphics Card|
|Asus |Motherboard |
|Gigabyte |Motherboard |
|Zebronics|Cabinet |
|Adata |RAM |
+---------+-------------+
+---------+----------+
|Brand |Product |
+---------+----------+
|Transcend|SSD |
|Toshiba |DVD Writer|
|Kingston |HDD |
+---------+----------+
参考:
https://www.geeksforgeeks.org/pyspark-split-dataframe-into-equal-number-of-rows/