1、数据的导入与导出
import pandas as pd
import numpy as np
#导入csv文件
df=pd.read_csv("data/test.csv")
df
|
EmployeeID |
birthdate_key |
age |
city_name |
department |
job_title |
gender |
| 0 |
1318 |
1/3/1954 |
61 |
Vancouver |
Executive |
CEO |
M |
| 1 |
1319 |
1/3/1957 |
58 |
Vancouver |
Executive |
VP Stores |
F |
| 2 |
1320 |
1/2/1955 |
60 |
Vancouver |
Executive |
Legal Counsel |
F |
| 3 |
1321 |
1/2/1959 |
56 |
Vancouver |
Executive |
VP Human Resources |
M |
| 4 |
1322 |
1/9/1958 |
57 |
Vancouver |
Executive |
VP Finance |
M |
| ... |
... |
... |
... |
... |
... |
... |
... |
| 6279 |
8036 |
8/9/1992 |
23 |
New Westminister |
Customer Service |
Cashier |
F |
| 6280 |
8181 |
9/26/1993 |
22 |
Prince George |
Customer Service |
Cashier |
M |
| 6281 |
8223 |
2/11/1994 |
21 |
Trail |
Customer Service |
Cashier |
M |
| 6282 |
8226 |
2/16/1994 |
21 |
Victoria |
Customer Service |
Cashier |
F |
| 6283 |
8264 |
6/13/1994 |
21 |
Vancouver |
Customer Service |
Cashier |
F |
6284 rows × 7 columns
print(df.tail())
print(df.age.mean())
EmployeeID birthdate_key age city_name department \
6279 8036 8/9/1992 23 New Westminister Customer Service
6280 8181 9/26/1993 22 Prince George Customer Service
6281 8223 2/11/1994 21 Trail Customer Service
6282 8226 2/16/1994 21 Victoria Customer Service
6283 8264 6/13/1994 21 Vancouver Customer Service
job_title gender
6279 Cashier F
6280 Cashier M
6281 Cashier M
6282 Cashier F
6283 Cashier F
45.78341820496499
df1=df.head(10)
df1.to_csv("data/test_new.csv")
#导入json文件
#方式1
df2=pd.read_json("data/test2.json")
print(df2)
id name age city salary is_student
0 1 张三 25 北京 8500.5 False
1 2 李四 30 上海 12000.0 False
2 3 王五 22 广州 5000.0 True
#方式2
import json
with open("data/test2.json",encoding="utf-8")as f:
data=json.load(f)
df3=pd.DataFrame(data)
df3
|
id |
name |
age |
city |
salary |
is_student |
| 0 |
1 |
张三 |
25 |
北京 |
8500.5 |
False |
| 1 |
2 |
李四 |
30 |
上海 |
12000.0 |
False |
| 2 |
3 |
王五 |
22 |
广州 |
5000.0 |
True |
2、缺失值的处理
2.1 查看缺失值
s=pd.Series([1,2,3,4,np.nan,None,pd.NA])
print(s)
0 1
1 2
2 3
3 4
4 NaN
5 None
6 <NA>
dtype: object
print(s.isna()) #或者使用 print(s.isnull())
0 False
1 False
2 False
3 False
4 True
5 True
6 True
dtype: bool
df=pd.DataFrame([[1,2,np.nan],[2,3,4],[np.nan,np.nan,99]],columns=["A","B","C"])
print(df)
print(df.isna())
print("缺失值数量:\n",df.isna().sum(axis=0))#axis=0时按列,axis=1时按行
A B C
0 1.0 2.0 NaN
1 2.0 3.0 4.0
2 NaN NaN 99.0
A B C
0 False False True
1 False False False
2 True True False
缺失值数量:
A 1
B 1
C 1
dtype: int64
2.2剔除缺失值
print(s.dropna())#直接删除nan数据
0 1
1 2
2 3
3 4
dtype: object
print(df.dropna(axis=0))#按行检索,只要该行有nan,直接删除本行
A B C
1 2.0 3.0 4.0
print(df.dropna(how="all",axis=0))#仅当全nan时删除该行
A B C
0 1.0 2.0 NaN
1 2.0 3.0 4.0
2 NaN NaN 99.0
print(df.dropna(thresh=2,axis=0))#如果至少有两个!=nan就保留下来
A B C
0 1.0 2.0 NaN
1 2.0 3.0 4.0
#若某列中元素有nan,则剔除对应行
print(df.dropna(subset=["A"]))
A B C
0 1.0 2.0 NaN
1 2.0 3.0 4.0
2.3 填充缺失值
df=pd.read_csv("data/random_data.csv")
df
|
product_name |
quantity |
unit_price |
category |
rating |
| 0 |
Laptop |
NaN |
119.98 |
Electronics |
4.5 |
| 1 |
Mouse |
6.0 |
26.09 |
Electronics |
3.8 |
| 2 |
Keyboard |
2.0 |
361.70 |
NaN |
5.0 |
| 3 |
Monitor |
1.0 |
88.81 |
NaN |
4.2 |
| 4 |
Headphones |
NaN |
175.60 |
Electronics |
3.5 |
| 5 |
USB Drive |
23.0 |
406.39 |
NaN |
NaN |
| 6 |
Printer |
25.0 |
NaN |
Peripherals |
4.9 |
| 7 |
Scanner |
24.0 |
356.09 |
Electronics |
2.8 |
| 8 |
Webcam |
6.0 |
435.50 |
NaN |
NaN |
| 9 |
Speaker |
24.0 |
184.76 |
Peripherals |
4.0 |
df.isna().sum(axis=0)
product_name 0
quantity 2
unit_price 1
category 4
rating 2
dtype: int64
print(df.ffill()) #直接用前面那个序号的值填充 front
product_name quantity unit_price category rating
0 Laptop NaN 119.98 Electronics 4.5
1 Mouse 6.0 26.09 Electronics 3.8
2 Keyboard 2.0 361.70 Electronics 5.0
3 Monitor 1.0 88.81 Electronics 4.2
4 Headphones 1.0 175.60 Electronics 3.5
5 USB Drive 23.0 406.39 Electronics 3.5
6 Printer 25.0 406.39 Peripherals 4.9
7 Scanner 24.0 356.09 Electronics 2.8
8 Webcam 6.0 435.50 Electronics 2.8
9 Speaker 24.0 184.76 Peripherals 4.0
print(df.bfill()) #直接用后边那个序号的值填充 behind
product_name quantity unit_price category rating
0 Laptop 6.0 119.98 Electronics 4.5
1 Mouse 6.0 26.09 Electronics 3.8
2 Keyboard 2.0 361.70 Electronics 5.0
3 Monitor 1.0 88.81 Electronics 4.2
4 Headphones 23.0 175.60 Electronics 3.5
5 USB Drive 23.0 406.39 Peripherals 4.9
6 Printer 25.0 356.09 Peripherals 4.9
7 Scanner 24.0 356.09 Electronics 2.8
8 Webcam 6.0 435.50 Peripherals 4.0
9 Speaker 24.0 184.76 Peripherals 4.0
#对特定列的nan填充固定值
print(df.fillna({"quantity":0,"rating":1}))
product_name quantity unit_price category rating
0 Laptop 0.0 119.98 Electronics 4.5
1 Mouse 6.0 26.09 Electronics 3.8
2 Keyboard 2.0 361.70 NaN 5.0
3 Monitor 1.0 88.81 NaN 4.2
4 Headphones 0.0 175.60 Electronics 3.5
5 USB Drive 23.0 406.39 NaN 1.0
6 Printer 25.0 NaN Peripherals 4.9
7 Scanner 24.0 356.09 Electronics 2.8
8 Webcam 6.0 435.50 NaN 1.0
9 Speaker 24.0 184.76 Peripherals 4.0
#填充列平均值
print(df.fillna(df[["quantity","rating"]].mean()))#注意是两个中括号
product_name quantity unit_price category rating
0 Laptop 13.875 119.98 Electronics 4.5000
1 Mouse 6.000 26.09 Electronics 3.8000
2 Keyboard 2.000 361.70 NaN 5.0000
3 Monitor 1.000 88.81 NaN 4.2000
4 Headphones 13.875 175.60 Electronics 3.5000
5 USB Drive 23.000 406.39 NaN 4.0875
6 Printer 25.000 NaN Peripherals 4.9000
7 Scanner 24.000 356.09 Electronics 2.8000
8 Webcam 6.000 435.50 NaN 4.0875
9 Speaker 24.000 184.76 Peripherals 4.0000