Pandas数据处理(1): 数据的导入导出与缺失值处理

1、数据的导入与导出

import pandas as pd
import numpy as np
#导入csv文件
df=pd.read_csv("data/test.csv")
df
EmployeeID birthdate_key age city_name department job_title gender
0 1318 1/3/1954 61 Vancouver Executive CEO M
1 1319 1/3/1957 58 Vancouver Executive VP Stores F
2 1320 1/2/1955 60 Vancouver Executive Legal Counsel F
3 1321 1/2/1959 56 Vancouver Executive VP Human Resources M
4 1322 1/9/1958 57 Vancouver Executive VP Finance M
... ... ... ... ... ... ... ...
6279 8036 8/9/1992 23 New Westminister Customer Service Cashier F
6280 8181 9/26/1993 22 Prince George Customer Service Cashier M
6281 8223 2/11/1994 21 Trail Customer Service Cashier M
6282 8226 2/16/1994 21 Victoria Customer Service Cashier F
6283 8264 6/13/1994 21 Vancouver Customer Service Cashier F

6284 rows × 7 columns

print(df.tail())
print(df.age.mean())
      EmployeeID birthdate_key  age         city_name        department  \
6279        8036      8/9/1992   23  New Westminister  Customer Service   
6280        8181     9/26/1993   22     Prince George  Customer Service   
6281        8223     2/11/1994   21             Trail  Customer Service   
6282        8226     2/16/1994   21          Victoria  Customer Service   
6283        8264     6/13/1994   21         Vancouver  Customer Service   

     job_title gender  
6279   Cashier      F  
6280   Cashier      M  
6281   Cashier      M  
6282   Cashier      F  
6283   Cashier      F  
45.78341820496499
df1=df.head(10)
df1.to_csv("data/test_new.csv")
#导入json文件
#方式1
df2=pd.read_json("data/test2.json")
print(df2)
   id name  age city   salary  is_student
0   1   张三   25   北京   8500.5       False
1   2   李四   30   上海  12000.0       False
2   3   王五   22   广州   5000.0        True
#方式2
import json
with open("data/test2.json",encoding="utf-8")as f:
    data=json.load(f)
df3=pd.DataFrame(data)
df3
id name age city salary is_student
0 1 张三 25 北京 8500.5 False
1 2 李四 30 上海 12000.0 False
2 3 王五 22 广州 5000.0 True

2、缺失值的处理

2.1 查看缺失值
s=pd.Series([1,2,3,4,np.nan,None,pd.NA])
print(s)
0       1
1       2
2       3
3       4
4     NaN
5    None
6    <NA>
dtype: object
print(s.isna()) #或者使用   print(s.isnull())
0    False
1    False
2    False
3    False
4     True
5     True
6     True
dtype: bool
df=pd.DataFrame([[1,2,np.nan],[2,3,4],[np.nan,np.nan,99]],columns=["A","B","C"])
print(df)
print(df.isna())
print("缺失值数量:\n",df.isna().sum(axis=0))#axis=0时按列,axis=1时按行
     A    B     C
0  1.0  2.0   NaN
1  2.0  3.0   4.0
2  NaN  NaN  99.0
       A      B      C
0  False  False   True
1  False  False  False
2   True   True  False
缺失值数量:
 A    1
B    1
C    1
dtype: int64
2.2剔除缺失值
print(s.dropna())#直接删除nan数据
0    1
1    2
2    3
3    4
dtype: object
print(df.dropna(axis=0))#按行检索,只要该行有nan,直接删除本行
     A    B    C
1  2.0  3.0  4.0
print(df.dropna(how="all",axis=0))#仅当全nan时删除该行
     A    B     C
0  1.0  2.0   NaN
1  2.0  3.0   4.0
2  NaN  NaN  99.0
print(df.dropna(thresh=2,axis=0))#如果至少有两个!=nan就保留下来
     A    B    C
0  1.0  2.0  NaN
1  2.0  3.0  4.0
#若某列中元素有nan,则剔除对应行
print(df.dropna(subset=["A"]))
     A    B    C
0  1.0  2.0  NaN
1  2.0  3.0  4.0
2.3 填充缺失值
df=pd.read_csv("data/random_data.csv")
df
product_name quantity unit_price category rating
0 Laptop NaN 119.98 Electronics 4.5
1 Mouse 6.0 26.09 Electronics 3.8
2 Keyboard 2.0 361.70 NaN 5.0
3 Monitor 1.0 88.81 NaN 4.2
4 Headphones NaN 175.60 Electronics 3.5
5 USB Drive 23.0 406.39 NaN NaN
6 Printer 25.0 NaN Peripherals 4.9
7 Scanner 24.0 356.09 Electronics 2.8
8 Webcam 6.0 435.50 NaN NaN
9 Speaker 24.0 184.76 Peripherals 4.0
df.isna().sum(axis=0)
product_name    0
quantity        2
unit_price      1
category        4
rating          2
dtype: int64
print(df.ffill())  #直接用前面那个序号的值填充 front
  product_name  quantity  unit_price     category  rating
0       Laptop       NaN      119.98  Electronics     4.5
1        Mouse       6.0       26.09  Electronics     3.8
2     Keyboard       2.0      361.70  Electronics     5.0
3      Monitor       1.0       88.81  Electronics     4.2
4   Headphones       1.0      175.60  Electronics     3.5
5    USB Drive      23.0      406.39  Electronics     3.5
6      Printer      25.0      406.39  Peripherals     4.9
7      Scanner      24.0      356.09  Electronics     2.8
8       Webcam       6.0      435.50  Electronics     2.8
9      Speaker      24.0      184.76  Peripherals     4.0
print(df.bfill())  #直接用后边那个序号的值填充 behind
  product_name  quantity  unit_price     category  rating
0       Laptop       6.0      119.98  Electronics     4.5
1        Mouse       6.0       26.09  Electronics     3.8
2     Keyboard       2.0      361.70  Electronics     5.0
3      Monitor       1.0       88.81  Electronics     4.2
4   Headphones      23.0      175.60  Electronics     3.5
5    USB Drive      23.0      406.39  Peripherals     4.9
6      Printer      25.0      356.09  Peripherals     4.9
7      Scanner      24.0      356.09  Electronics     2.8
8       Webcam       6.0      435.50  Peripherals     4.0
9      Speaker      24.0      184.76  Peripherals     4.0
#对特定列的nan填充固定值
print(df.fillna({"quantity":0,"rating":1}))  
  product_name  quantity  unit_price     category  rating
0       Laptop       0.0      119.98  Electronics     4.5
1        Mouse       6.0       26.09  Electronics     3.8
2     Keyboard       2.0      361.70          NaN     5.0
3      Monitor       1.0       88.81          NaN     4.2
4   Headphones       0.0      175.60  Electronics     3.5
5    USB Drive      23.0      406.39          NaN     1.0
6      Printer      25.0         NaN  Peripherals     4.9
7      Scanner      24.0      356.09  Electronics     2.8
8       Webcam       6.0      435.50          NaN     1.0
9      Speaker      24.0      184.76  Peripherals     4.0
#填充列平均值
print(df.fillna(df[["quantity","rating"]].mean()))#注意是两个中括号
  product_name  quantity  unit_price     category  rating
0       Laptop    13.875      119.98  Electronics  4.5000
1        Mouse     6.000       26.09  Electronics  3.8000
2     Keyboard     2.000      361.70          NaN  5.0000
3      Monitor     1.000       88.81          NaN  4.2000
4   Headphones    13.875      175.60  Electronics  3.5000
5    USB Drive    23.000      406.39          NaN  4.0875
6      Printer    25.000         NaN  Peripherals  4.9000
7      Scanner    24.000      356.09  Electronics  2.8000
8       Webcam     6.000      435.50          NaN  4.0875
9      Speaker    24.000      184.76  Peripherals  4.0000
posted @ 2026-03-04 22:58  wangzy336  阅读(5)  评论(0)    收藏  举报