numpy笔记2
数组属性
# Array properties
import numpy as np  # the snippet used `np` before any import appeared in these notes

# A 5x5 integer matrix used to demonstrate the basic ndarray attributes.
a = np.array([[11, 12, 13, 14, 15],
              [16, 17, 18, 19, 20],
              [21, 22, 23, 24, 25],
              [26, 27, 28, 29, 30],
              [31, 32, 33, 34, 35]])
print(type(a))     # >>> <class 'numpy.ndarray'>
print(a.dtype)     # >>> int64 (NOTE: the default integer dtype is platform dependent)
print(a.size)      # >>> 25   total number of elements
print(a.shape)     # >>> (5, 5)
print(a.itemsize)  # >>> 8    bytes per element; follows from the dtype above
print(a.ndim)      # >>> 2    number of axes
print(a.nbytes)    # >>> 200  equals size * itemsize
整数数组索引
import numpy as np

a = np.array([[1, 2], [3, 4], [5, 6]])

# Integer array indexing: the first list holds the indices along the first
# axis (rows), the second list the indices along the second axis (columns).
# The lists are paired element-wise, so the returned array has shape (3,).
rows, cols = [0, 1, 2], [0, 1, 0]
print(a[rows, cols])  # Prints "[1 4 5]"

# The same result, built by hand one element at a time:
print(np.array([a[r, c] for r, c in zip(rows, cols)]))  # Prints "[1 4 5]"

# The same source element may be selected more than once:
repeat_rows, repeat_cols = [0, 0], [1, 1]
print(a[repeat_rows, repeat_cols])  # Prints "[2 2]"

# Hand-built equivalent of the repeated selection:
print(np.array([a[0, 1]] * 2))  # Prints "[2 2]"
布尔数组索引: 布尔数组索引允许你选择数组的任意元素。通常,这种类型的索引用于选择满足某些条件的数组元素
import numpy as np

a = np.array([[1, 2], [3, 4], [5, 6]])

# Comparing an array with a scalar yields a Boolean array of the same shape;
# each slot says whether the matching element of `a` is greater than 2.
bool_idx = a > 2
print(bool_idx)
# Prints "[[False False]
#          [ True  True]
#          [ True  True]]"

# Indexing with that mask gathers the elements at the True positions into a
# rank 1 array.
selected = a[bool_idx]
print(selected)  # Prints "[3 4 5 6]"

# The mask does not need a name; the comparison can sit inside the brackets:
print(a[a > 2])  # Prints "[3 4 5 6]"
Numpy为在数组上执行计算提供了许多有用的函数;其中最有用的函数之一是 SUM:
Numpy函数篇:https://www.numpy.org.cn/reference/routines/math.html
import numpy as np

x = np.array([[1, 2], [3, 4]])

# Without an axis the sum runs over every element; axis=0 adds down the rows
# (one total per column) and axis=1 adds across the columns (one per row).
total = x.sum()
print(total)       # prints "10"
col_totals = x.sum(axis=0)
print(col_totals)  # prints "[4 6]"
row_totals = x.sum(axis=1)
print(row_totals)  # prints "[3 7]"
tile函数:
import numpy as np

# Goal: add the vector v to every row of the matrix x and keep the result in y.
x = np.array([[1, 2, 3],
              [4, 5, 6],
              [7, 8, 9],
              [10, 11, 12]])
v = np.array([1, 0, 1])

# Repeat v once per row of x (4 times) so both operands have the same shape.
vv = np.tile(v, (len(x), 1))
print(vv)
# Prints "[[1 0 1]
#          [1 0 1]
#          [1 0 1]
#          [1 0 1]]"

# Element-wise addition of the two equally shaped matrices.
y = x + vv
print(y)
# Prints "[[ 2  2  4]
#          [ 5  5  7]
#          [ 8  8 10]
#          [11 11 13]]"
reshape():
# reshape() hands back the reshaped array and leaves the shape of `arr` alone.
arr = np.arange(10)
# A dimension given as -1 is worked out automatically (here: 5 columns).
arr.reshape((2, -1))
# > array([[0, 1, 2, 3, 4],
# >        [5, 6, 7, 8, 9]])
垂直叠加数组和水平叠加数组:
# Two 2x5 inputs: a counts 0..9, b is filled with ones.
a = np.arange(10).reshape(2, -1)
b = np.repeat(1, 10).reshape(2, -1)

# Answers: vertical stacking (rows of b appended below a)
# Method 1:
np.concatenate([a, b], axis=0)
# Method 2:
np.vstack([a, b])
# Method 3:
np.r_[a, b]
# > array([[0, 1, 2, 3, 4],
# >        [5, 6, 7, 8, 9],
# >        [1, 1, 1, 1, 1],
# >        [1, 1, 1, 1, 1]])

# Answers: horizontal stacking (columns of b appended to the right of a)
# Method 1:
np.concatenate([a, b], axis=1)
# Method 2:
np.hstack([a, b])
# Method 3:
np.c_[a, b]
# > array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
# >        [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])
获取两个numpy数组之间的公共项
a = np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 6])
b = np.array([7, 2, 10, 2, 7, 4, 9, 4, 9, 8])
# Values present in both arrays, each reported once.
np.intersect1d(a, b)
# > array([2, 4])
一个数组中删除存在于另一个数组中的项
a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 6, 7, 8, 9])
# From 'a' remove all of 'b'
np.setdiff1d(a, b)
# > array([1, 2, 3, 4])
得到两个数组元素匹配的位置
a = np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 6])
b = np.array([7, 2, 10, 2, 7, 4, 9, 4, 9, 8])
# Positions at which both arrays hold the same value.
np.where(a == b)
# > (array([1, 3, 5, 7]),)
提取给定范围内的所有数字
a = np.arange(15)

# Method 1: positions of the values inside the closed range [5, 10]
index = np.where((a >= 5) & (a <= 10))
a[index]

# Method 2:
index = np.where(np.logical_and(a >= 5, a <= 10))
a[index]
# > array([ 5,  6,  7,  8,  9, 10])

# Method 3: (thanks loganzk!) index directly with the Boolean mask
a[(a >= 5) & (a <= 10)]
找到numpy数组的百分位数
# Input: first column (sepal length) of the iris data set, read as floats
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

# Solution: 5th and 95th percentile
np.percentile(sepallength, q=[5, 95])
# > array([ 4.6  ,  7.255])
数组中的随机位置插入值
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')

# Method 1
# i, j hold the row numbers and column numbers of the truthy elements of iris_2d.
# NOTE(review): the original note said "600 elements"; with all 5 columns
# loaded the count looks like it should be higher. Confirm.
i, j = np.where(iris_2d)
np.random.seed(100)
iris_2d[np.random.choice((i), 20), np.random.choice((j), 20)] = np.nan

# Method 2: draw 20 row indices below 150 and 20 column indices below 4
np.random.seed(100)
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# Print first 10 rows
print(iris_2d[:10])
在numpy数组中找到缺失值的位置
# Input: the four numeric iris columns, with NaN written into random cells
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, usecols=[0, 1, 2, 3], dtype='float', delimiter=',')
# Row indices are drawn first, then column indices (20 of each).
nan_rows = np.random.randint(150, size=20)
nan_cols = np.random.randint(4, size=20)
iris_2d[nan_rows, nan_cols] = np.nan

# Solution: look at the first column only
first_col_is_nan = np.isnan(iris_2d[:, 0])
print("Number of missing values: \n", first_col_is_nan.sum())
print("Position of missing values: \n", np.where(first_col_is_nan))
# Sample output (no seed is set in this snippet, so the numbers vary per run):
# > Number of missing values:
# > 5
# > Position of missing values:
# > (array([ 39, 88, 99, 130, 147]),)
根据两个或多个条件过滤numpy数组
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0, 1, 2, 3])

# Solution: keep rows whose column 2 is > 1.5 and whose column 0 is < 5.0
condition = (iris_2d[:, 2] > 1.5) & (iris_2d[:, 0] < 5.0)
iris_2d[condition]
# > array([[ 4.8,  3.4,  1.6,  0.2],
# >        [ 4.8,  3.4,  1.9,  0.2],
# >        [ 4.7,  3.2,  1.6,  0.2],
# >        [ 4.8,  3.1,  1.6,  0.2],
# >        [ 4.9,  2.4,  3.3,  1. ],
# >        [ 4.9,  2.5,  4.5,  1.7]])
从numpy数组中删除包含缺失值的行
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0, 1, 2, 3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# Solution
# No direct numpy function for this.
# Method 1: despite its name, the mask is True for rows that contain NO NaN,
# which is why indexing with it keeps exactly the complete rows.
any_nan_in_row = np.array([~np.any(np.isnan(row)) for row in iris_2d])
iris_2d[any_nan_in_row][:5]

# Method 2: (By Rong) keep the rows whose NaN count is zero
iris_2d[np.sum(np.isnan(iris_2d), axis=1) == 0][:5]
# Sample output (no seed is set in this snippet, so the rows vary per run):
# > array([[ 4.9,  3. ,  1.4,  0.2],
# >        [ 4.7,  3.2,  1.3,  0.2],
# >        [ 4.6,  3.1,  1.5,  0.2],
# >        [ 5. ,  3.6,  1.4,  0.2],
# >        [ 5.4,  3.9,  1.7,  0.4]])
找到numpy数组的两列之间的相关性
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0, 1, 2, 3])

# Solution 1: np.corrcoef returns the correlation-coefficient matrix (its
# diagonal is 1); entry [0, 1] is the correlation between the two columns.
np.corrcoef(iris[:, 0], iris[:, 2])[0, 1]

# Solution 2
# Import from the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private path.
from scipy.stats import pearsonr
corr, p_value = pearsonr(iris[:, 0], iris[:, 2])
print(corr)

# Correlation coef indicates the degree of linear relationship between two numeric variables.
# It can range between -1 to +1.
# The p-value roughly indicates the probability of an uncorrelated system producing
# datasets that have a correlation at least as extreme as the one computed.
# The lower the p-value (<0.01), stronger is the significance of the relationship.
# It is not an indicator of the strength.
# > 0.871754157305
查找给定数组是否具有任何空值
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0, 1, 2, 3])

# True as soon as at least one element is NaN
np.isnan(iris_2d).any()
# > False
在numpy数组中用0替换所有缺失值
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0, 1, 2, 3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# Solution: assign 0 in place wherever the NaN mask is True
iris_2d[np.isnan(iris_2d)] = 0
iris_2d[:4]
# Sample output (no seed is set in this snippet, so the zeros vary per run):
# > array([[ 5.1,  3.5,  1.4,  0. ],
# >        [ 4.9,  3. ,  1.4,  0.2],
# >        [ 4.7,  3.2,  1.3,  0.2],
# >        [ 4.6,  3.1,  1.5,  0.2]])
在numpy数组中查找唯一值的计数
# Load iris as objects so the text column survives alongside the numbers
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, dtype='object', delimiter=',')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Solution
# Pull column 4 (species) out as a plain list and rebuild it as its own array
species = np.array(iris[:, 4].tolist())

# Distinct values together with how often each one occurs
np.unique(species, return_counts=True)
# > (array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
# >        dtype='|S15'), array([50, 50, 50]))
将数字转换为分类(文本)数组
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Bin petallength (column 2): np.digitize returns, for each value, the index
# of the bin it falls into given the edges 0, 3, 5 and 10.
petal_length_bin = np.digitize(iris[:, 2].astype('float'), [0, 3, 5, 10])

# Map each bin index to its category; bin 4 (beyond the last edge) maps to NaN
label_map = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_cat = [label_map[x] for x in petal_length_bin]

# View
petal_length_cat[:4]
# > ['small', 'small', 'small', 'small']
从numpy数组的现有列创建新列
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, dtype='object', delimiter=',')

# Solution
# Columns 0 and 2 converted to floats for the arithmetic below
sepallength = iris_2d[:, 0].astype('float')
petallength = iris_2d[:, 2].astype('float')

# Compute volume; at this point it is a 1-D vector
volume = np.pi * petallength * sepallength ** 2 / 3

# Turn it into a 2-D matrix of n rows by 1 column so it can sit beside iris_2d
volume = volume.reshape(-1, 1)

# Append it as a new last column
out = np.hstack([iris_2d, volume])

# View
out[:4]
# > array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa', 38.13265162927291],
# >        [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa', 35.200498485922445],
# >        [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa', 30.0723720777127],
# >        [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa', 33.238050274980004]], dtype=object)
在numpy中进行概率抽样
# Import iris keeping the text column intact
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution
# Get the species column
species = iris[:, 4]

# Approach 1: Generate probabilistically from the three names directly
np.random.seed(100)
a = np.array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
species_out = np.random.choice(a, 150, p=[0.5, 0.25, 0.25])

# Approach 2: Probabilistic sampling (preferred)
# probs assigns one cumulative threshold per row of `species`; searchsorted
# maps each uniform random number to the row whose threshold it falls under.
np.random.seed(100)
probs = np.r_[np.linspace(0, 0.500, num=50),
              np.linspace(0.501, .750, num=50),
              np.linspace(.751, 1.0, num=50)]
index = np.searchsorted(probs, np.random.random(150))
species_out = species[index]
print(np.unique(species_out, return_counts=True))
# > (array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'], dtype=object), array([77, 37, 36]))
在按另一个数组分组时获取数组的第二大值
# Import iris keeping the text column intact
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution
# Petal length (column 2) of the rows whose species is setosa, as floats
is_setosa = iris[:, 4] == b'Iris-setosa'
petal_len_setosa = iris[is_setosa, 2].astype('float')

# Get the second largest value: np.unique already returns the distinct values
# in sorted order, so no separate sort is needed before taking [-2].
np.unique(petal_len_setosa)[-2]
# > 1.7
按列对2D数组进行排序
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, dtype='object', delimiter=',')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Sort by column position 0: SepalLength
# argsort gives the row order that sorts column 0; indexing with it reorders
# whole rows. NOTE(review): the column holds byte strings here, so the
# comparison is presumably textual rather than numeric. Confirm.
row_order = iris[:, 0].argsort()
sorted_rows = iris[row_order]
print(sorted_rows[:8])
# > [[b'4.3' b'3.0' b'1.1' b'0.1' b'Iris-setosa']
# >  [b'4.4' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
# >  [b'4.4' b'3.0' b'1.3' b'0.2' b'Iris-setosa']
# >  [b'4.4' b'2.9' b'1.4' b'0.2' b'Iris-setosa']
# >  [b'4.5' b'2.3' b'1.3' b'0.3' b'Iris-setosa']
# >  [b'4.6' b'3.6' b'1.0' b'0.2' b'Iris-setosa']
# >  [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-setosa']
# >  [b'4.6' b'3.4' b'1.4' b'0.3' b'Iris-setosa']]
在numpy数组中找到最常见的值
# Given:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution: count each distinct value of column 2, then report the value
# with the highest count
vals, counts = np.unique(iris[:, 2], return_counts=True)
print(vals[np.argmax(counts)])
# > b'1.5'
找到第一次出现的值大于给定值的位置
# Given:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution: (edit: changed argmax to argwhere. Thanks Rong!)
# argwhere lists the positions where column 3 exceeds 1.0; [0] takes the
# first one, which comes back as a one-element array.
np.argwhere(iris[:, 3].astype(float) > 1.0)[0]
# > array([50])
将大于给定值的所有值替换为给定的截止值
# Input: 20 seeded uniform samples between 1 and 50
np.set_printoptions(precision=2)
np.random.seed(100)
a = np.random.uniform(1, 50, 20)

# Solution 1: Using np.clip (values below 10 become 10, values above 30 become 30)
np.clip(a, a_min=10, a_max=30)

# Solution 2: Using np.where
print(np.where(a < 10, 10, np.where(a > 30, 30, a)))
# > [ 27.63  14.64  21.8   30.    10.    10.    30.    30.    10.    29.18  30.
# >   11.25  10.08  10.    11.77  30.    30.    10.    30.    14.43]
从numpy数组中获取最大n值的位置
# Input: 20 seeded uniform samples between 1 and 50
np.random.seed(100)
a = np.random.uniform(1, 50, 20)

# Solution: argsort orders the positions by ascending value, so the last five
# entries are the positions of the five largest values.
print(a.argsort()[-5:])
# > [18  7  3 10 15]

# Solution 2: partition on the negated values so the five largest come first
np.argpartition(-a, 5)[:5]
# > [15 10  3  7 18]

# Below methods will get you the values.
# Method 1:
a[a.argsort()][-5:]

# Method 2:
np.sort(a)[-5:]

# Method 3:
np.partition(a, kth=-5)[-5:]

# Method 4:
a[np.argpartition(-a, 5)][:5]

浙公网安备 33010602011771号