PySpark Replace Column Values in DataFrame (regex-based column/field value replacement)

Reprinted from: https://sparkbyexamples.com/pyspark/pyspark-replace-column-values/

1. Create DataFrame

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()
address = [(1,"14851 Jeffrey Rd","DE"),
    (2,"43421 Margarita St","NY"),
    (3,"13111 Siemon Ave","CA")]
df = spark.createDataFrame(address,["id","address","state"])
df.show()
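
For reference (the original post omits it), the freshly created DataFrame prints as:

#+---+------------------+-----+
#| id|           address|state|
#+---+------------------+-----+
#|  1|  14851 Jeffrey Rd|   DE|
#|  2|43421 Margarita St|   NY|
#|  3|  13111 Siemon Ave|   CA|
#+---+------------------+-----+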

2. Use a Regular Expression to Replace a String Column Value

#Replace part of string with another string
from pyspark.sql.functions import regexp_replace
df.withColumn('address', regexp_replace('address', 'Rd', 'Road')) \
  .show(truncate=False)
#A practical variant noted in the original post: stripping a mojibake character (U+FFFD) from a column, e.g.
#df = df.withColumn('STVINNO', regexp_replace('STVINNO', '\ufffd', ''))

#+---+------------------+-----+
#|id |address           |state|
#+---+------------------+-----+
#|1  |14851 Jeffrey Road|DE   |
#|2  |43421 Margarita St|NY   |
#|3  |13111 Siemon Ave  |CA   |
#+---+------------------+-----+
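
Note that regexp_replace matches the pattern anywhere in the string, so a bare 'Rd' would also hit an address where those letters appear mid-word. A minimal sketch of a safer call (my addition, not from the original post), anchoring the pattern with Java-regex \b and $:

#Anchor the pattern so only a trailing 'Rd' suffix is replaced
df.withColumn('address', regexp_replace('address', r'\bRd$', 'Road')) \
  .show(truncate=False)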

3. Replace Column Values Conditionally

#Replace string column value conditionally
from pyspark.sql.functions import when
df.withColumn('address', 
    when(df.address.endswith('Rd'),regexp_replace(df.address,'Rd','Road')) \
   .when(df.address.endswith('St'),regexp_replace(df.address,'St','Street')) \
   .when(df.address.endswith('Ave'),regexp_replace(df.address,'Ave','Avenue')) \
   .otherwise(df.address)) \
   .show(truncate=False)

#+---+----------------------+-----+
#|id |address               |state|
#+---+----------------------+-----+
#|1  |14851 Jeffrey Road    |DE   |
#|2  |43421 Margarita Street|NY   |
#|3  |13111 Siemon Avenue   |CA   |
#+---+----------------------+-----+ 
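
If the suffix list grows, the chained when() calls can be generated from a dictionary instead of written out by hand. A minimal sketch (the suffixes dict and the anchored patterns are my additions, not from the original post):

from functools import reduce
from pyspark.sql.functions import col, when, regexp_replace

#Illustrative suffix map; extend as needed
suffixes = {'Rd': 'Road', 'St': 'Street', 'Ave': 'Avenue'}

#Fold the dict into a nested when(...).otherwise(...) expression,
#starting from the unchanged column as the fallback
replaced = reduce(
    lambda acc, kv: when(col('address').endswith(kv[0]),
                         regexp_replace(col('address'), kv[0] + '$', kv[1]))
                    .otherwise(acc),
    suffixes.items(),
    col('address'))

df.withColumn('address', replaced).show(truncate=False)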

4. Replace Column Values with a Dictionary (map)

#Replace values from Dictionary
stateDic={'CA':'California','NY':'New York','DE':'Delaware'}
df2 = df.rdd.map(lambda x: (x.id, x.address, stateDic[x.state])) \
    .toDF(["id", "address", "state"])
df2.show()

#+---+------------------+----------+
#| id|           address|     state|
#+---+------------------+----------+
#|  1|  14851 Jeffrey Rd|  Delaware|
#|  2|43421 Margarita St|  New York|
#|  3|  13111 Siemon Ave|California|
#+---+------------------+----------+
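
Dropping to the RDD API works, but note that stateDic[x.state] raises a KeyError for any state missing from the dictionary, and it is usually simpler to stay in the DataFrame API: DataFrame.replace() accepts a dictionary directly and leaves unmatched values untouched. A minimal equivalent sketch:

#DataFrame-native alternative: replace() takes a dict and an optional column subset
df.replace(stateDic, subset=['state']).show()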

5. Replace Column Values Character by Character

#Using translate to replace character by character
from pyspark.sql.functions import translate
df.withColumn('address', translate('address', '123', 'ABC')) \
  .show(truncate=False)

#+---+------------------+-----+
#|id |address           |state|
#+---+------------------+-----+
#|1  |A485A Jeffrey Rd  |DE   |
#|2  |4C4BA Margarita St|NY   |
#|3  |ACAAA Siemon Ave  |CA   |
#+---+------------------+-----+
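
translate can also delete characters: with Hive-style semantics, any matching character that has no counterpart in the replacement string is removed (worth verifying on your Spark version). A minimal sketch that strips all digits from the address (my example, not from the original post):

#Matching characters with no counterpart in the replacement string are removed
df.withColumn('address', translate('address', '0123456789', '')) \
  .show(truncate=False)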

6. Replace Column Value with Another Column Value

#Replace column with another column
from pyspark.sql.functions import expr
df = spark.createDataFrame(
   [("ABCDE_XYZ", "XYZ","FGH")], 
    ("col1", "col2","col3")
  )
df.withColumn("new_column",
              expr("regexp_replace(col1, col2, col3)")
              .alias("replaced_value")
              ).show()

#+---------+----+----+----------+
#|     col1|col2|col3|new_column|
#+---------+----+----+----------+
#|ABCDE_XYZ| XYZ| FGH| ABCDE_FGH|
#+---------+----+----+----------+
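
One caveat with this pattern: the second argument is still parsed as a regular expression, so regex metacharacters coming in through col2 are interpreted rather than matched literally. A small sketch with illustrative data (my addition):

#The pattern column is a regex: '.' matches any character, so 'X.Z' still hits 'XYZ'
df2 = spark.createDataFrame([("ABCDE_XYZ", "X.Z", "FGH")], ("col1", "col2", "col3"))
df2.withColumn("new_column", expr("regexp_replace(col1, col2, col3)")).show()

#+---------+----+----+----------+
#|     col1|col2|col3|new_column|
#+---------+----+----+----------+
#|ABCDE_XYZ| X.Z| FGH| ABCDE_FGH|
#+---------+----+----+----------+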

7. All In One

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()

address = [(1,"14851 Jeffrey Rd","DE"),
    (2,"43421 Margarita St","NY"),
    (3,"13111 Siemon Ave","CA")]

df = spark.createDataFrame(address, ["id","address","state"])
df.show()

#Replace part of string with regexp_replace
from pyspark.sql.functions import regexp_replace
df.withColumn('address', regexp_replace('address', 'Rd', 'Road')) \
  .show(truncate=False)

#Replace string conditionally with when
from pyspark.sql.functions import when
df.withColumn('address', 
      when(df.address.endswith('Rd'),regexp_replace(df.address,'Rd','Road')) \
     .when(df.address.endswith('St'),regexp_replace(df.address,'St','Street')) \
     .when(df.address.endswith('Ave'),regexp_replace(df.address,'Ave','Avenue')) \
     .otherwise(df.address)) \
     .show(truncate=False)   


#Replace values from Dictionary
stateDic={'CA':'California','NY':'New York','DE':'Delaware'}
df2 = df.rdd.map(lambda x: (x.id, x.address, stateDic[x.state])) \
    .toDF(["id", "address", "state"])
df2.show()

#Using translate
from pyspark.sql.functions import translate
df.withColumn('address', translate('address', '123', 'ABC')) \
  .show(truncate=False)

#Replace column with another column
from pyspark.sql.functions import expr
df = spark.createDataFrame([("ABCDE_XYZ", "XYZ","FGH")], ("col1", "col2","col3"))
df.withColumn("new_column",
              expr("regexp_replace(col1, col2, col3)")
              .alias("replaced_value")
              ).show()
  
#Overlay: overwrite col1 with col2 starting at position 7 (1-based)
from pyspark.sql.functions import overlay
df = spark.createDataFrame([("ABCDE_XYZ", "FGH")], ("col1", "col2"))
df.select(overlay("col1", "col2", 7).alias("overlayed")).show()
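
overlay replaces a fixed-width slice rather than a pattern: it writes col2 over col1 starting at the given 1-based position (7 here), by default for the length of col2. Expected output:

#+---------+
#|overlayed|
#+---------+
#|ABCDE_FGH|
#+---------+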