Python数据分析 #002 Numpy
NumPy(Numerical Python) 是 Python 语言的一个扩展程序库,支持大量的维度数组与矩阵运算,此外也针对数组运算提供大量的数学函数库。
1. Numpy 介绍

-
NumPy 是一个运行速度非常快的数学库,主要用于数组计算,包含:线性代数、傅里叶变换、随机数生成等功能
-
python在图像处理中常常使用Numpy进行像素的运算
-
安装Numpy:
pip install numpy
2. 初识numpy
2.1 存储数据
numpy 只能存储相同类型的数据
import numpy as np
score = np.array([[80, 89, 86, 67, 79],
[78, 97, 89, 67, 81],
[90, 94, 78, 67, 74],
[91, 91, 90, 67, 69],
[76, 87, 75, 67, 86],
[70, 79, 84, 67, 84],
[94, 92, 93, 67, 64],
[86, 85, 83, 67, 80]])
print(score)
print(type(score))
[[80 89 86 67 79]
[78 97 89 67 81]
[90 94 78 67 74]
[91 91 90 67 69]
[76 87 75 67 86]
[70 79 84 67 84]
[94 92 93 67 64]
[86 85 83 67 80]]
<class 'numpy.ndarray'>
2.2 numpy与python list对比
import random
import time
list_data = []
for i in range(100000000):
list_data.append(random.random())
np_list = np.array(list_data)
t1 = time.time()
python_list = sum(list_data)
t2 = time.time()
d1 = t2 - t1
t3 = time.time()
np_ndarry = np.sum(np_list)
t4 = time.time()
d2 = t4-t3
print(d1)
print(d2)
0.8986742496490479
0.24837088584899902
3. 基本操作
3.1 ndarray的属性
import numpy as np
# score为二维数组,即列表内嵌套一层列表,[ [],[],[] ]
score = np.array([[80, 89, 86, 67, 79],
[78, 97, 89, 67, 81],
[90, 94, 78, 67, 74],
[91, 91, 90, 67, 69],
[76, 87, 75, 67, 86],
[70, 79, 84, 67, 84],
[94, 92, 93, 67, 64],
[86, 85, 83, 67, 80]])
# 数组的形状
## 8 为 内层列表的个数
## 5 为内层列表内元素个数
print(score.shape)
# 数组的维数
print(score.ndim)
# 数组的大小(总元素个数)
print(score.size)
# 数组中元素类型
print(score.dtype)
# 数组中一个元素的大小
print(score.itemsize)
(8, 5)
2
40
int32
4
3.2 ndarray的形状
import numpy as np
# 二维
a = np.array([[1,2,3],[4,5,6]])
# 一维
b = np.array([1,2,3,4])
# 三维
c = np.array( [ [[1,2,3],[4,5,6]], [[1,2,3], [4,5,6]] ])
print(a.shape) # 从外到内,二维数组内一维数组个数为2,一维数组内元素个数为3
print(b.shape) # 一维数组内元素个数为4
print(c.shape) # 从外到内,三维数组内二维数组个数为2,二维数组内一维数组个数为2,一维数组内元素个数为3
(2, 3)
(4,)
(2, 2, 3)
3.3 ndarray的数据类型
定义数组时没有指定类型则:
int 的默认类型与平台和版本有关(NumPy 1.x 在 Windows 上为 int32;Linux/macOS 以及 64 位系统上的 NumPy 2.x 为 int64), float 默认为float64
# 1.生成数组不指定类型
import numpy as np
data1 = np.array([1.1, 2.2, 3.3, 4.4, 5.5])
data2 = np.array([1, 2, 3, 4, 5])
print(data1)
print(data2)
print(data1.dtype)
print(data2.dtype)
[1.1 2.2 3.3 4.4 5.5]
[1 2 3 4 5]
float64
int32
# 2.生成数组指定类型
import numpy as np
data3 = np.array([1, 2, 3], dtype="int32")
print(data3.dtype)
data4 = np.array([1, 2, 3], dtype=np.int64)
print(data4.dtype)
int32
int64
3.4 生成数组的方法
3.4.1 生成0和1的数组
import numpy as np
# shape 可写成元组或列表
data1 = np.zeros(shape=(3, 4), dtype="int64")
print(data1)
data2 = np.ones(shape=[3, 4], dtype=np.int32)
print(data2)
[[0 0 0 0]
[0 0 0 0]
[0 0 0 0]]
[[1 1 1 1]
[1 1 1 1]
[1 1 1 1]]
3.4.2 从已有数组生成
import numpy as np
score = np.array([[80, 89, 86, 67, 79],
[78, 97, 89, 67, 81],
[90, 94, 78, 67, 74],
[91, 91, 90, 67, 69],
[76, 87, 75, 67, 86],
[70, 79, 84, 67, 84],
[94, 92, 93, 67, 64],
[86, 85, 83, 67, 80]])
data1 = np.array(score) # 拷贝:默认 copy=True,得到独立的新数组
data3 = np.copy(score) # 拷贝:得到独立的新数组
data2 = np.asarray(score) # 不拷贝:输入已是 ndarray 时直接返回原数组,修改 score 会影响 data2
3.4.3 生成固定范围的数组
import numpy as np
# [0, 10] 均匀的取5个数
data = np.linspace(0, 10, 5)
print(data)
[ 0. 2.5 5. 7.5 10. ]
3.4.4 生成随机数组
import numpy as np
# [low, high) 左闭右开,生成1000000个数
# 生成的数,出现的概率相同
data = np.random.uniform(low=-1, high=1, size=1000000)
# 验证分布是否均匀
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 8), dpi=80)
plt.hist(data, 1000)
plt.show()

3.4.5 正态分布
import numpy as np
# loc 为正态分布中的μ,即随机变量的均值(对称轴)
# scale 为正态分布的σ, 即随机变量的标准差(值越大,越胖越矮)
data = np.random.normal(loc=1.75, scale=0.1, size=1000000)
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 8), dpi=80)
plt.hist(data,1000)
plt.show()

3.5 数组的索引操作
import numpy as np
data = np.random.normal(loc=0, scale=1, size=(6, 4))
print(data)
[[-2.80765847 0.9806412 0.4627258 -0.8282472 ]
[ 0.61957529 1.10015229 3.17409706 -1.47719302]
[-0.68976519 -0.65544996 0.07485424 1.8778339 ]
[-0.87999222 0.8916781 0.32213593 -0.22866971]
[ 0.0454046 0.28607432 0.6885944 -0.94492507]
[ 0.28773556 0.86663415 0.81204776 -0.14302773]]
# 第一个参数为第几个列表(索引从0开始),第二个为列表切片
data[1,:3]
array([0.61957529, 1.10015229, 3.17409706])
data2 = np.array([ [[1,2,3],[4,5,6]], [[12,3,34],[5,6,7]]])
# 第2个二维数组中的第1个一维数组中索引为2的值
data2[1,0,2]
34
3.6 形状修改
import numpy as np
data = np.random.normal(loc=0, scale=1, size=(5, 4))
print(data)
[[ 1.68222052 -0.43455475 1.38495045 0.64855998]
[ 0.75760956 -0.72282992 -0.680814 0.33704623]
[-1.12369912 0.57891835 -1.11248383 1.58782981]
[ 0.05110219 -0.18106968 0.83644638 -0.73372486]
[-1.23898805 -1.08965331 0.86758045 -1.50162934]]
data.reshape(4, 5) # 返回新的数组,原始数组未改变
array([[ 1.68222052, -0.43455475, 1.38495045, 0.64855998, 0.75760956],
[-0.72282992, -0.680814 , 0.33704623, -1.12369912, 0.57891835],
[-1.11248383, 1.58782981, 0.05110219, -0.18106968, 0.83644638],
[-0.73372486, -1.23898805, -1.08965331, 0.86758045, -1.50162934]])
data # 原始数组未改变
array([[ 1.68222052, -0.43455475, 1.38495045, 0.64855998],
[ 0.75760956, -0.72282992, -0.680814 , 0.33704623],
[-1.12369912, 0.57891835, -1.11248383, 1.58782981],
[ 0.05110219, -0.18106968, 0.83644638, -0.73372486],
[-1.23898805, -1.08965331, 0.86758045, -1.50162934]])
data.resize(4,5) # 没有返回值,原始数组被改变
data # 原始数组被改变
array([[ 1.68222052, -0.43455475, 1.38495045, 0.64855998, 0.75760956],
[-0.72282992, -0.680814 , 0.33704623, -1.12369912, 0.57891835],
[-1.11248383, 1.58782981, 0.05110219, -0.18106968, 0.83644638],
[-0.73372486, -1.23898805, -1.08965331, 0.86758045, -1.50162934]])
data.T # 转置,行变成列, 列变成行,原始数组未改变
array([[ 1.68222052, -0.72282992, -1.11248383, -0.73372486],
[-0.43455475, -0.680814 , 1.58782981, -1.23898805],
[ 1.38495045, 0.33704623, 0.05110219, -1.08965331],
[ 0.64855998, -1.12369912, -0.18106968, 0.86758045],
[ 0.75760956, 0.57891835, 0.83644638, -1.50162934]])
data # 原始数组未改变
array([[ 1.68222052, -0.43455475, 1.38495045, 0.64855998, 0.75760956],
[-0.72282992, -0.680814 , 0.33704623, -1.12369912, 0.57891835],
[-1.11248383, 1.58782981, 0.05110219, -0.18106968, 0.83644638],
[-0.73372486, -1.23898805, -1.08965331, 0.86758045, -1.50162934]])
3.7 类型修改
import numpy as np
data = np.random.normal(loc=0, scale=1, size=(5, 4))
print(data)
[[ 0.36497627 -0.59952847 0.79296164 0.41958173]
[-0.28965997 -1.54775829 -2.00663764 0.33787286]
[-2.02337191 -0.33037293 0.16974602 0.80092937]
[ 1.26416642 -1.54476757 1.15745786 -2.20419407]
[-0.82871831 -0.75315981 0.45672733 0.25010667]]
int_data = data.astype("int32")
print(int_data)
[[ 0 0 0 0]
[ 0 -1 -2 0]
[-2 0 0 0]
[ 1 -1 1 -2]
[ 0 0 0 0]]
# Serialize the array's raw data buffer into an in-memory bytes object.
# ndarray.tostring() was a deprecated alias that NumPy 2.0 removed;
# tobytes() returns exactly the same bytes.
bytes_data = data.tobytes()
print(bytes_data)
b'\xac\xa9Op\xc5[\xd7?=@\xadTV/\xe3\xbf\xdcML\x14\xf1_\xe9?p_qQm\xda\xda?\xf4\t\xa5\xfb\xc9\x89\xd2\xbf\xf2([3\x9e\xc3\xf8\xbf\xc9\xcc/\t\x98\r\x00\xc0\xb1\xf4j\x7f\xb5\x9f\xd5?\x0b\xaf\x88\x9c\xdd/\x00\xc0z.X~\xd4$\xd5\xbf\xb5\xd3\x96\xd5<\xba\xc5?\xaa$\x19\xa16\xa1\xe9?!\xdf!\x90\x06:\xf4?x\xf5f3^\xb7\xf8\xbf\x12\x93\x89\x87\xf2\x84\xf2?)xm\x800\xa2\x01\xc0\xb8^?E\xdc\x84\xea\xbfK\xd4s\x9c\xe2\x19\xe8\xbf`$\xa5D\x05;\xdd?\xd4\xd9+f\xbf\x01\xd0?'
3.8 数组去重
import numpy as np
temp = np.array([[1, 2, 3, 4],[3, 4, 5, 6]])
# 方法一:unique方法
print(np.unique(temp))
# 方法二:flatten把数组变为一维,再转化为集合(集合的去重功能)
## set 只能操作一维数组
print(set(temp.flatten()))
[1 2 3 4 5 6]
{1, 2, 3, 4, 5, 6}
4. ndarray运算
4.1 逻辑运算
4.1.1 单条件
import numpy as np
data = np.random.normal(loc=0, scale=1, size=(6, 4))
print(data)
[[ 0.08410401 -0.37538945 0.13432796 -0.24338345]
[-0.82588345 -0.30354523 1.20103 -0.63800444]
[ 0.53934054 1.1967968 2.1809332 -0.27882138]
[ 0.43126568 -0.06625733 -0.03355664 2.07546607]
[ 3.36921258 -1.3003321 1.10167876 -0.21506415]
[ 0.38562248 0.09991896 0.09002989 0.01234904]]
data = data[:3, :]
print(data)
[[ 0.08410401 -0.37538945 0.13432796 -0.24338345]
[-0.82588345 -0.30354523 1.20103 -0.63800444]
[ 0.53934054 1.1967968 2.1809332 -0.27882138]]
data > 0.5
array([[False, False, False, False],
[False, False, True, False],
[ True, True, True, False]])
data[data > 0.5] = 1.1
print(data)
[[ 0.08410401 -0.37538945 0.13432796 -0.24338345]
[-0.82588345 -0.30354523 1.1 -0.63800444]
[ 1.1 1.1 1.1 -0.27882138]]
# data > 0.5 中全为True则返回True,否则返回False
np.all(data > 0.5)
False
# data > 0.5 中全为False则返回False,否则返回True
np.any(data > 0.5)
True
4.1.2 多条件
import numpy as np
data = np.random.normal(loc=0, scale=1, size=(6, 4))
print(data)
[[ 0.76787982 0.88737247 1.74263682 -0.49420296]
[-0.18329718 -1.19295672 -1.18810181 1.23850121]
[ 0.14925692 -0.67148871 2.52844995 -0.47884825]
[-0.86996908 1.06541647 -2.33653304 0.63750588]
[-0.42733769 0.81996285 0.8012661 -0.16436488]
[-0.04465013 -0.10945832 0.28680191 -0.33861742]]
data = data[:4, :]
print(data)
[[ 0.76787982 0.88737247 1.74263682 -0.49420296]
[-0.18329718 -1.19295672 -1.18810181 1.23850121]
[ 0.14925692 -0.67148871 2.52844995 -0.47884825]
[-0.86996908 1.06541647 -2.33653304 0.63750588]]
# 逻辑且
np.logical_and(data>0.5, data<1)
array([[ True, True, False, False],
[False, False, False, False],
[False, False, False, False],
[False, False, False, True]])
# 逻辑或
np.logical_or(data>0.5, data<-1)
array([[ True, True, True, False],
[False, True, True, True],
[False, False, True, False],
[False, True, True, True]])
4.1.3 三元运算符
import numpy as np
data = np.random.normal(loc=0, scale=1, size=(6, 4))
print(data)
[[-0.52116204 0.67736524 0.43120268 -0.5997491 ]
[ 1.70802526 -1.46056336 1.09866457 -1.748535 ]
[-0.88784022 0.0807401 -0.55049389 -0.09172021]
[-0.48830458 0.25574224 0.15369485 0.08506734]
[-0.86505423 1.19853149 -2.42415664 0.87008424]
[-0.81790143 -0.40965744 0.29866362 1.55827793]]
# Keep only rows 2-4 so the example stays small. The original slice 1:5
# selects four rows, which does not match the three rows shown in the
# recorded output of this example.
data = data[2:5, :]
print(data)
[[-0.88784022 0.0807401 -0.55049389 -0.09172021]
[-0.48830458 0.25574224 0.15369485 0.08506734]
[-0.86505423 1.19853149 -2.42415664 0.87008424]]
data > 0.1
array([[False, False, False, False],
[False, True, True, False],
[False, True, False, True]])
# np.where(布尔值, 为True时显示的值, 为False时显示的值)
np.where(data>0.1, 1, 0)
array([[0, 0, 0, 0],
[0, 1, 1, 0],
[0, 1, 0, 1]])
4.2 统计运算
import numpy as np
data = np.random.normal(loc=0, scale=1, size=(6, 4))
print(data)
[[ 0.49955014 0.37754483 0.47376036 0.93311119]
[-0.43336049 -0.39221278 -1.57935361 -0.57287375]
[-1.12843407 -0.74063168 -0.02489084 -1.62302954]
[ 1.61659902 0.16823097 -1.41860306 -0.83117873]
[ 0.04641276 0.48024601 1.48735848 1.10058279]
[-0.77769939 0.56009927 0.48404536 -1.90639525]]
# 所有元素中最大的值
print(np.max(data))
print(data.max())
1.6165990231076133
1.6165990231076133
# 每一列中最大的值
# 或写成 np.max(data, axis=0)
print(data.max(axis=0))
# 每一行中最大的值
print(data.max(axis=1))
# 每行中最大的数的位置,索引从0开始
print(data.argmax(axis=1))
[1.61659902 0.56009927 1.48735848 1.10058279]
[ 0.93311119 -0.39221278 -0.02489084 1.61659902 1.48735848 0.56009927]
[3 1 2 0 2 1]
4.3 数组运算
4.3.1 数组与数的运算
import numpy as np
arr = np.array([[1, 2, 3, 2, 1, 4], [5, 6, 1, 2, 3, 1]])
arr * 10
# 其他运算符同理
array([[10, 20, 30, 20, 10, 40],
[50, 60, 10, 20, 30, 10]])
4.3.2 数组与数组的运算
条件:满足广播机制,即从右往左逐维比较两个数组的shape,每一维的数字相同或其中有一个为1
import numpy as np
arr1 = np.array([[1, 2, 3, 2, 1, 4], [5, 6, 1, 2, 3, 1]])
arr2 = np.array([[1, 2, 3, 4], [3, 4, 5, 6]])
arr3 = np.array([[1], [3], [5]])
arr4 = np.array([[1], [3]])
# shape从右往左看
# arr1 和 arr2 shape中一个为 6,一个为 4,不相等,且没有一个数为1,则不能进行运算
# arr1 和 arr3 shape中一个为6,一个为1,满足。一个为2,一个为3不满足,不能运算
# arr1 和 arr4 shape中 6和1, 满足。2和2 满足,则能运算。
print(arr1.shape)
print(arr2.shape)
print(arr3.shape)
print(arr4.shape)
(2, 6)
(2, 4)
(3, 1)
(2, 1)
arr1 = np.array([[1, 2, 3, 2, 1, 4], [5, 6, 1, 2, 3, 1]])
arr4 = np.array([[1], [3]])
# (2,6) , (2, 1) 得到的数组shape为(2,6),取最大
arr1 + arr4
array([[2, 3, 4, 3, 2, 5],
[8, 9, 4, 5, 6, 4]])
# 不满足广播机制求数组的乘积
data = np.array([[80, 86],[82, 80],[85, 78],[90, 90],[86, 82],[82, 90],[78, 80],[92, 94]]) # (8,2)
weights = np.array([[0.3], [0.7]]) # (2, 1)
# 两种方法
print(np.matmul(data, weights))
print(np.dot(data, weights))
# 若写成 data * weights 会报错,因为不满足广播机制的数组间不能执行运算操作(加减乘除)
[[84.2]
[80.6]
[80.1]
[90. ]
[83.2]
[87.6]
[79.4]
[93.4]]
[[84.2]
[80.6]
[80.1]
[90. ]
[83.2]
[87.6]
[79.4]
[93.4]]
5. 矩阵运算
矩阵一定是二维数组,二维数组不一定是矩阵(可能是ndarray),
矩阵运算法则:(M行, N列) x (N行, L列) = (M行, L列)
import numpy as np
# Store the scores as a plain ndarray.
data_arry = np.array([[80, 86],[82, 80],[85, 78],[90, 90],[86, 82],[82, 90],[78, 80],[92, 94]])
# Store the same scores as a numpy.matrix object.
# np.mat was removed in NumPy 2.0; np.asmatrix is the supported spelling
# and builds the same numpy.matrix from a nested list.
data_mat = np.asmatrix([[80, 86],[82, 80],[85, 78],[90, 90],[86, 82],[82, 90],[78, 80],[92, 94]])
print(data_arry)
print(type(data_arry))
print(data_mat)
print(type(data_mat))
[[80 86]
[82 80]
[85 78]
[90 90]
[86 82]
[82 90]
[78 80]
[92 94]]
<class 'numpy.ndarray'>
[[80 86]
[82 80]
[85 78]
[90 90]
[86 82]
[82 90]
[78 80]
[92 94]]
<class 'numpy.matrix'>
# With numpy.matrix operands, `*` is the true matrix product, so the
# element-wise broadcasting rules do not apply.
# np.mat was removed in NumPy 2.0; np.asmatrix builds the same matrix object.
data_mat = np.asmatrix([[80, 86],[82, 80],[85, 78],[90, 90],[86, 82],[82, 90],[78, 80],[92, 94]]) # (8, 2)
weights_mat = np.asmatrix([[0.3], [0.7]]) # (2, 1)
# Matrix product rule: (8, 2) * (2, 1) = (8, 1)
data_mat * weights_mat
matrix([[84.2],
[80.6],
[80.1],
[90. ],
[83.2],
[87.6],
[79.4],
[93.4]])
6. 合并,分割
6.1 水平合并
import numpy as np
a = np.array([1, 2, 3]) # (3,)
b = np.array([4, 5, 6])
m = np.array([[1], [2], [3]]) # (3, 1)
n = np.array([[4], [5], [6]])
c = np.hstack((a, b))
d = np.hstack((m, n))
print(c)
print(d)
[1 2 3 4 5 6]
[[1 4]
[2 5]
[3 6]]
6.2 竖直合并
import numpy as np
a = np.array([1, 2, 3]) # (3,)
b = np.array([4, 5, 6])
m = np.array([[1], [2], [3]]) # (3, 1)
n = np.array([[4], [5], [6]])
c = np.vstack((a, b))
d = np.vstack((m, n))
print(c)
print(d)
[[1 2 3]
[4 5 6]]
[[1]
[2]
[3]
[4]
[5]
[6]]
6.3 concatenate
该方法不能水平合并一维数组,axis=1对一维数组来说是越界的
import numpy as np
a = np.array([[1], [2], [3]]) # (3, 1)
b = np.array([[4], [5], [6]])
c = np.concatenate((a, b), axis=0) # 竖直
d = np.concatenate((a, b), axis=1) # 水平
print(c)
print(d)
[[1]
[2]
[3]
[4]
[5]
[6]]
[[1 4]
[2 5]
[3 6]]
6.4 分割
import numpy as np
# 生成0-8的一维数组
x = np.arange(9)
print(x)
# 根据个数分割
data1 = np.split(x, 3)
print(data1)
# 根据索引分割 [0, 3), [3, 5), [5, 7), [7, 9)
data2 = np.split(x, [3, 5, 7])
print(data2)
[0 1 2 3 4 5 6 7 8]
[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8])]
[array([0, 1, 2]), array([3, 4]), array([5, 6]), array([7, 8])]
7. numpy读取文件
numpy适合打开纯数值类型(int、float)数据的文件,无法解析为数值的内容(如字符串、空字段)会变成缺失值,即出现nan
import numpy as np
# 文件中的 str 类型数据或空字段会显示为 nan(float 可以正常读取),所以一般不用numpy打开含非数值数据的文件
# 打开文件,指定分隔符为 “ ,”
data = np.genfromtxt(r"C:\Users\ASUS PC\Desktop\test.csv", delimiter=",")
print(data)
[[ nan nan nan nan]
[ 1. 123. 1.4 23. ]
[ 2. 110. nan 18. ]
[ 3. nan 2.1 19. ]]
7.1 缺失值处理
import numpy as np
data = np.genfromtxt(r"C:\Users\ASUS PC\Desktop\test.csv", delimiter=",")
print(data)
# 把数据为 nan 的修改为该列的平均值
# 若直接把 nan 的数据直接赋值为0或1,则会改变当前行或当前列的平均值大小,不推荐
def fill_nan_by_column_mean(t):
    """Replace each NaN in a 2-D array with the mean of its column.

    The column mean is taken over the non-NaN entries only. The array is
    modified in place and also returned, so callers may use either the
    argument or the return value.

    Args:
        t: NumPy array indexed as ``t[:, i]`` per column; expected to be a
            2-D floating-point array that may contain NaN.

    Returns:
        The same array object ``t`` with NaN entries filled. A column made
        up entirely of NaN has no defined mean and is left unchanged.
    """
    for i in range(t.shape[1]):
        now_col = t[:, i]  # basic slice: a view, so writes land in t
        nan_mask = np.isnan(now_col)
        # Skip columns with nothing to fill, and columns with nothing to
        # average (avoids a 0/0 division and its RuntimeWarning).
        if not nan_mask.any() or nan_mask.all():
            continue
        now_col[nan_mask] = now_col[~nan_mask].mean()
    return t
fill_nan_by_column_mean(data)
[[ nan nan nan nan]
[ 1. 123. 1.4 23. ]
[ 2. 110. nan 18. ]
[ 3. nan 2.1 19. ]]
array([[ 2. , 116.5 , 1.75, 20. ],
[ 1. , 123. , 1.4 , 23. ],
[ 2. , 110. , 1.75, 18. ],
[ 3. , 116.5 , 2.1 , 19. ]])
本文来自博客园,作者:{枫_Null},转载请注明原文链接:https://www.cnblogs.com/fengNull/articles/16663794.html

浙公网安备 33010602011771号