NumPy数据分析练习

本练习题目来源于

# 1.导入并查看版本
import numpy as np
print(np.__version__)

1.15.1

# 2.创建数组
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

# 3.创建布尔数组
# 创建所有True的 3×3 numpy数组
np.full((3, 3), True, dtype=bool)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

np.ones((3, 3), dtype=bool)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

# 4.如何从一维数组中提取满足给定条件的项目？
# 问：从 arr 中提取所有奇数

# Input
arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

# Solution
arr[arr % 2 == 1]

array([1, 3, 5, 7, 9])

# 偶数
arr[arr % 2 == 0]

array([0, 2, 4, 6, 8])

# 5.如何用numpy数组中的另一个值替换满足条件的项目？
# 问：将 arr 中的所有奇数替换为 -1
arr[arr % 2 == 1] = -1
arr

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

# 6.如何替换满足条件的项目而不影响原始数组？
# 问：将 arr 中的所有奇数替换为 -1，而不更改 arr 本身
arr = np.arange(10)
out = np.where(arr % 2 == 1, -1, arr)
print(arr)
out

[0 1 2 3 4 5 6 7 8 9]





array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

# 7.如何重塑数组？
# 问：将一维数组转换为具有两行的二维数组
arr = np.arange(10)
# arr.reshape((2, 5))
arr.reshape((2, -1))

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

# 8.如何垂直堆叠两个阵列？
# 问：堆叠数组a和b垂直

a = np.arange(10).reshape(2,-1)
b = np.repeat(1, 10).reshape(2,-1) # repeat重复，两端开口

# # Method 1
# np.vstack([a, b])

# Method 2
np.concatenate([a, b], axis=0)

# Method 3
# np.r_[a, b]

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

# 9.如何水平堆叠两个阵列？
# 问：堆叠阵列a并b水平放置。
a = np.arange(10).reshape(2,-1)
b = np.repeat(1, 10).reshape(2,-1)

# np.concatenate([a, b], axis=1)
# np.hstack([a, b])
np.c_[a, b]

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

# 10.如何在numpy中生成自定义序列而不进行硬编码？
# 问：创建以下模式而不进行硬编码。仅使用numpy函数和下面的输入数组a。
a = np.array([1,2,3])
print(a.repeat(3))
print(np.tile(a, 3))
np.r_[np.repeat(a, 3), np.tile(a, 3)]

[1 1 1 2 2 2 3 3 3]
[1 2 3 1 2 3 1 2 3]





array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3])

# 11.如何获取两个python numpy数组之间的公共项？
# 问：获取a和之间的共同之处b
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])
np.intersect1d(a, b)

array([2, 4])

# 12.如何从一个数组中删除另一个数组中存在的那些项？
# 问：从数组中a删除数组中存在的所有项目b
a = np.array([1,2,3,4,5])
b = np.array([5,6,7,8,9])

np.setdiff1d(a, b)

array([1, 2, 3, 4])

# 13.如何获得两个数组的元素匹配的位置？
# 问：获取a和b匹配的位置
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.where(a == b) # 返回位置

(array([1, 3, 5, 7], dtype=int64),)

# 14.如何从numpy数组中提取给定范围内的所有数字？
# 问：从中获取5到10之间的所有项目a。
a = np.array([2, 6, 1, 9, 10, 3, 27])

# Method 1
# index = np.where((a>5) & (a<=10))
# a[index]

# Method 2
# index = np.where(np.logical_and(a>5, a<=10))
# a[index]

# Method 3
a[(a>5) & (a<=10)]

array([ 6,  9, 10])

# 15.如何制作一个处理标量的python函数以在numpy数组上工作？
# 问：将只能处理两个标量的函数 maxx 转换为可以处理两个数组的函数。

def maxx(x, y):
    """Return the larger of two scalars, preferring ``x`` when ``x >= y`` holds."""
    # Single conditional expression: ``y`` is chosen whenever ``x >= y`` is false.
    return x if x >= y else y
    
pair_max = np.vectorize(maxx, otypes=[float])
    
a = np.array([5, 7, 9, 8, 6, 4, 5])
b = np.array([6, 3, 4, 8, 9, 7, 1])
pair_max(a, b)

array([6., 7., 9., 8., 9., 7., 5.])

# 16.如何在2d numpy数组中交换两列？
# 问：交换数组中的列1和2 arr。
arr = np.arange(9).reshape(3,3)
arr[:, [1,0,2]]

array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

# 17.如何在2d numpy数组中交换两行？
# 问：交换数组中的第1行和第2行arr：
arr = np.arange(9).reshape(3,3)
arr[[1, 0 , 2], :]

array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

# 18.如何反转2D数组的行？
# 问：反转2D数组的行arr。
arr = np.arange(9).reshape(3,3)
arr[::-1]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

# 19.如何反转2D数组的列？
# 问：反转2D数组的列arr。
arr = np.arange(9).reshape(3,3)
arr[:, ::-1]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

# 20.如何创建一个包含5到10之间的随机浮点数的2D数组？
# 问：创建一个形状为5x3的2D数组，以包含5到10之间的随机十进制数

# rand_arr = np.random.randint(low=5, high=10, size=(5,3)) + np.random.random((5,3))

arr = np.random.uniform(5, 10, (5,3))
arr

array([[6.42645631, 9.80495607, 5.86564206],
       [5.72762991, 6.44257377, 7.20572599],
       [9.70168216, 7.66469088, 5.80734655],
       [9.93728715, 5.78137556, 5.9138876 ],
       [9.15506353, 8.74634588, 6.54821786]])

# 21.如何在python numpy数组中仅打印3个小数位？
# 问：打印或仅显示numpy数组的3个小数位rand_arr。
rand_arr = np.random.random((5,3))
np.set_printoptions(precision=3)
rand_arr

array([[0.385, 0.379, 0.789],
       [0.502, 0.249, 0.923],
       [0.582, 0.741, 0.849],
       [0.653, 0.362, 0.521],
       [0.533, 0.353, 0.734]])

# 22.如何通过抑制科学计数法（如1e10）来漂亮地打印一个numpy数组？
# 问rand_arr：压制科学记数法（例如1e10）可以打印出漂亮的文字

# Reset printoptions to default
np.set_printoptions(suppress=False)
# Create the random array
np.random.seed(100)
rand_arr = np.random.random([3,3])/1e3
rand_arr

array([[5.434049e-04, 2.783694e-04, 4.245176e-04],
       [8.447761e-04, 4.718856e-06, 1.215691e-04],
       [6.707491e-04, 8.258528e-04, 1.367066e-04]])

np.set_printoptions(suppress=True, precision=6)  # precision is optional
rand_arr

array([[0.000543, 0.000278, 0.000425],
       [0.000845, 0.000005, 0.000122],
       [0.000671, 0.000826, 0.000137]])

# 23.如何限制numpy数组输出中打印的项目数？
# 问：限制在python numpy数组中打印的项目数a最多为6个元素。
np.set_printoptions(threshold=6)
a = np.arange(15)
a

array([ 0,  1,  2, ..., 12, 13, 14])

# 24.如何在不截断的情况下打印完整的numpy数组
# 问：打印完整的numpy数组a而不被截断。
# Input
np.set_printoptions(threshold=6)
a = np.arange(15)

# Solution
# threshold=np.nan is rejected by current NumPy (ValueError); sys.maxsize is
# the documented way to switch summarization off.
import sys
np.set_printoptions(threshold=sys.maxsize)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

# 25.如何导入带有数字和文本的数据集，以使python numpy中的文本保持完整？
# 问：导入iris.data，使文本保持完整。

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=",", dtype="object")
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')
iris[:5]

# 由于我们想保留物种（文本）字段，所以将 dtype 设置为 object。如果设置 dtype=None，则将返回一维元组数组。

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa'],
       [b'5.0', b'3.6', b'1.4', b'0.2', b'Iris-setosa']], dtype=object)

# 26.如何从一维元组数组中提取特定列？
# 问：species从iris上一个问题中导入的一维中提取文本列。
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, delimiter=',', dtype=None, encoding="utf-8")
print(iris_1d.shape)

species = np.array([row[4] for row in iris_1d])
species[:5]

(150,)





array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa'], dtype='<U15')

# 27.如何将一维元组数组转换为二维numpy数组？
# 问：通过省略文本字段将一维转换iris为二维数组。iris_2dspecies

# Input:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_1d = np.genfromtxt(url, delimiter=',', dtype=None, encoding='utf-8')

# Solution:
# Method 1: Convert each row to a list and get the first 4 items
iris_2d = np.array([row.tolist()[:4] for row in iris_1d])
iris_2d[:4]

# Alt Method 2: Import only the first 4 columns from source url
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[:4]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])

# 28.如何计算一个numpy数组的平均值，中位数和标准偏差？
# 问：找到虹膜的平均值，中位数，标准偏差sepallength（第一列）

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

mu, med, sd = np.mean(sepallength), np.median(sepallength), np.std(sepallength)
print(mu, med, sd)

5.843333333333334 5.8 0.8253012917851409

# 29.如何规范化数组，以使值恰好在0到1之间？
# Q.创建的归一化形式iris的sepallength，其值范围恰好0与1之间，使得最小的值为0，并且最大的值为1。
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

Smax, Smin = sepallength.max(), sepallength.min()
# S = (sepallength - Smin)/(Smax - Smin)

# or
# np.ptp is the peak-to-peak range (max - min). The function form is used
# because the ndarray.ptp() method was removed in NumPy 2.0.
S = (sepallength - Smin)/np.ptp(sepallength)
print(S)

[0.22222222 0.16666667 0.11111111 0.08333333 0.19444444 0.30555556
 0.08333333 0.19444444 0.02777778 0.16666667 0.30555556 0.13888889
 0.13888889 0.         0.41666667 0.38888889 0.30555556 0.22222222
 0.38888889 0.22222222 0.30555556 0.22222222 0.08333333 0.22222222
 0.13888889 0.19444444 0.19444444 0.25       0.25       0.11111111
 0.13888889 0.30555556 0.25       0.33333333 0.16666667 0.19444444
 0.33333333 0.16666667 0.02777778 0.22222222 0.19444444 0.05555556
 0.02777778 0.19444444 0.22222222 0.13888889 0.22222222 0.08333333
 0.27777778 0.19444444 0.75       0.58333333 0.72222222 0.33333333
 0.61111111 0.38888889 0.55555556 0.16666667 0.63888889 0.25
 0.19444444 0.44444444 0.47222222 0.5        0.36111111 0.66666667
 0.36111111 0.41666667 0.52777778 0.36111111 0.44444444 0.5
 0.55555556 0.5        0.58333333 0.63888889 0.69444444 0.66666667
 0.47222222 0.38888889 0.33333333 0.33333333 0.41666667 0.47222222
 0.30555556 0.47222222 0.66666667 0.55555556 0.36111111 0.33333333
 0.33333333 0.5        0.41666667 0.19444444 0.36111111 0.38888889
 0.38888889 0.52777778 0.22222222 0.38888889 0.55555556 0.41666667
 0.77777778 0.55555556 0.61111111 0.91666667 0.16666667 0.83333333
 0.66666667 0.80555556 0.61111111 0.58333333 0.69444444 0.38888889
 0.41666667 0.58333333 0.61111111 0.94444444 0.94444444 0.47222222
 0.72222222 0.36111111 0.94444444 0.55555556 0.66666667 0.80555556
 0.52777778 0.5        0.58333333 0.80555556 0.86111111 1.
 0.58333333 0.55555556 0.5        0.94444444 0.55555556 0.58333333
 0.47222222 0.72222222 0.66666667 0.72222222 0.41666667 0.69444444
 0.66666667 0.66666667 0.55555556 0.61111111 0.52777778 0.44444444]

# 30.如何计算softmax分数？
# 问：计算的softmax得分sepallength。

# 31.如何找到一个numpy数组的百分位数？
# 问：找到虹膜虹膜的第5个百分点和第95个百分点 sepallength

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

np.percentile(sepallength, q=[5, 95])

array([4.6  , 7.255])

# 32.如何在数组的随机位置插入值？
# 问：在数据集中的20个随机位置插入值np.naniris_2d

# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')
# Method 1
i, j = np.where(iris_2d)

# i, j contain the row numbers and column numbers of 600 elements of iris_x
# np.random.seed(100)
# iris_2d[np.random.choice((i), 20), np.random.choice((j), 20)] = np.nan

# Method 2
np.random.seed(100)
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# Print first 10 rows
print(iris_2d[:10])

[[b'5.1' b'3.5' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.0' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-setosa']
 [b'5.0' b'3.6' b'1.4' b'0.2' b'Iris-setosa']
 [b'5.4' b'3.9' b'1.7' b'0.4' b'Iris-setosa']
 [b'4.6' b'3.4' b'1.4' b'0.3' b'Iris-setosa']
 [b'5.0' b'3.4' b'1.5' b'0.2' b'Iris-setosa']
 [b'4.4' nan b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']]

# 33.如何在numpy数组中查找缺失值的位置？
# 问：在iris_2d的sepallength（第1列）中找到缺失值的数量和位置

# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float')
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# Solution
print("Number of missing values: \n", np.isnan(iris_2d[:, 0]).sum())
print("Position of missing values: \n", np.where(np.isnan(iris_2d[:, 0])))

Number of missing values: 
 3
Position of missing values: 
 (array([ 82, 100, 127], dtype=int64),)

# 34.如何根据两个或多个条件过滤一个numpy数组？
# 问：筛选iris_2d具有和的行 petallength (3rd column) > 1.5 sepallength (1st column) < 5.0

# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

condition = iris_2d[(iris_2d[:, 2] > 1.5) & (iris_2d[:, 0] < 5.0)]
print(condition)

[[4.8 3.4 1.6 0.2]
 [4.8 3.4 1.9 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [4.9 2.4 3.3 1. ]
 [4.9 2.5 4.5 1.7]]

# 35.如何从numpy数组中删除包含缺失值的行？
# 问：选择没有任何nan值的iris_2d行。

# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# Solution
# No direct numpy function for this.
# Method 1:
# any_nan_in_row = np.array([~np.any(np.isnan(row)) for row in iris_2d])
# iris_2d[any_nan_in_row][:5]

# Method 2: (By Rong)
iris_2d[np.sum(np.isnan(iris_2d), axis = 1) == 0][:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

# 36.如何找到一个numpy数组的两列之间的相关性？
# 问：在iris_2d中找到SepalLength（第一列）和PetalLength（第三列）之间的相关性

# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

# Solution 1
np.corrcoef(iris[:, 0], iris[:, 2])[0, 1]

# Solution 2
# Import from the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private module path.
from scipy.stats import pearsonr
corr, p_value = pearsonr(iris[:, 0], iris[:, 2])
print(corr)

0.8717541573048712

# 37.如何查找给定数组是否具有空值？
# 问：找出是否iris_2d缺少任何值。
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])

np.isnan(iris_2d).any()

False

# 38.如何在numpy数组中将所有缺失值替换为0？
# 问：nan在numpy数组中所有出现的都替换为0

# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# Solution
iris_2d[np.isnan(iris_2d)] = 0
iris_2d[:4]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 0. , 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]])

# 39.如何在numpy数组中查找唯一值的计数？
# 问：查找虹膜的唯一值和唯一值的计数 species

# Import iris keeping the text column intact
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Solution
# Extract the species column as an array
species = np.array([row.tolist()[4] for row in iris])

# Get the unique values and the counts
np.unique(species, return_counts=True)

(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
       dtype='|S15'), array([50, 50, 50], dtype=int64))

# 40.如何将数字转换为分类（文本）数组？
# 问：对iris_2d的花瓣长度（第3列）进行装箱以形成文本数组，例如，如果花瓣长度为：
# 少于3->'小'
# 3-5->'中等'
# '> = 5->'大'

# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Bin petallength 
petal_length_bin = np.digitize(iris[:, 2].astype('float'), [0, 3, 5, 10])
print(petal_length_bin)
# Map it to respective category
label_map = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_cat = [label_map[x] for x in petal_length_bin]

# View
petal_length_cat[:4]

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 2 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 2 3 2 3 3 2 2 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3
 3 3]





['small', 'small', 'small', 'small']

# 41.如何从numpy数组的现有列创建新列？
# 问：在iris_2d中为体积（volume）创建一个新列，其中体积为 (pi x petallength x sepal_length^2)/3
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution
# Compute volume
sepallength = iris_2d[:, 0].astype('float')
petallength = iris_2d[:, 2].astype('float')
volume = (np.pi * petallength * (sepallength**2))/3

# Introduce new dimension to match iris_2d's
volume = volume[:, np.newaxis]

# Add the new column
out = np.hstack([iris_2d, volume])

# View
out[:4]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa',
        38.13265162927291],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa',
        35.200498485922445],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa', 30.0723720777127],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa',
        33.238050274980004]], dtype=object)

# 42.如何在numpy中进行概率抽样？
# 问：对 iris 的 species 进行随机抽样，使 setosa 的数量是 versicolor 和 virginica 的两倍
# Import iris keeping the text column intact
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution
# Get the species column
species = iris[:, 4]

# Approach 1: Generate Probablistically
np.random.seed(100)
a = np.array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
species_out = np.random.choice(a, 150, p=[0.5, 0.25, 0.25])

# Approach 2: Probablistic Sampling (preferred)
np.random.seed(100)
probs = np.r_[np.linspace(0, 0.500, num=50), np.linspace(0.501, .750, num=50), np.linspace(.751, 1.0, num=50)]
index = np.searchsorted(probs, np.random.random(150))
species_out = species[index]
print(np.unique(species_out, return_counts=True))

# 方法2是首选方法，因为它会创建一个索引变量，该变量可用于对二维表格数据进行采样

# 43.当被另一个数组分组时，如何获得数组的第二大值？
# 问：setosa 品种中第二长的 petallength 值是多少？

# Import iris keeping the text column intact
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution
# Get the species and petal length columns
petal_len_setosa = iris[iris[:, 4] == b'Iris-setosa', [2]].astype('float')

# Get the second last value
np.unique(np.sort(petal_len_setosa))[-2]

1.7

# 44.如何按列对二维数组排序
# 问：根据sepallength列对虹膜数据集进行排序。

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

print(iris[iris[:,0].argsort()][:20])

[[b'4.3' b'3.0' b'1.1' b'0.1' b'Iris-setosa']
 [b'4.4' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.4' b'3.0' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.4' b'2.9' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.5' b'2.3' b'1.3' b'0.3' b'Iris-setosa']
 [b'4.6' b'3.6' b'1.0' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.4' b'1.4' b'0.3' b'Iris-setosa']
 [b'4.6' b'3.2' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.0' b'1.4' b'0.1' b'Iris-setosa']
 [b'4.8' b'3.0' b'1.4' b'0.3' b'Iris-setosa']
 [b'4.8' b'3.4' b'1.9' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.4' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.8' b'3.1' b'1.6' b'0.2' b'Iris-setosa']
 [b'4.9' b'2.4' b'3.3' b'1.0' b'Iris-versicolor']
 [b'4.9' b'2.5' b'4.5' b'1.7' b'Iris-virginica']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']
 [b'4.9' b'3.1' b'1.5' b'0.1' b'Iris-setosa']]

# 45.如何在numpy数组中查找最频繁的值？
# 问：在虹膜数据集中找到最常见的花瓣长度值（第3列）。
# Input:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution:
vals, counts = np.unique(iris[:, 2], return_counts=True)
print(vals)
print(counts)
print('-------------')
print(vals[np.argmax(counts)]) # np.argmax取出数组中元素最大值所对应的索引

[b'1.0' b'1.1' b'1.2' b'1.3' b'1.4' b'1.5' b'1.6' b'1.7' b'1.9' b'3.0'
 b'3.3' b'3.5' b'3.6' b'3.7' b'3.8' b'3.9' b'4.0' b'4.1' b'4.2' b'4.3'
 b'4.4' b'4.5' b'4.6' b'4.7' b'4.8' b'4.9' b'5.0' b'5.1' b'5.2' b'5.3'
 b'5.4' b'5.5' b'5.6' b'5.7' b'5.8' b'5.9' b'6.0' b'6.1' b'6.3' b'6.4'
 b'6.6' b'6.7' b'6.9']
[ 1  1  2  7 12 14  7  4  2  1  2  2  1  1  1  3  5  3  4  2  4  8  3  5
  4  5  4  8  2  2  2  3  6  3  3  2  2  3  1  1  1  2  1]
-------------
b'1.5'

# 46.如何找到第一次出现的值大于给定值的位置？
# 问：在虹膜数据集第4列的花瓣宽度第4列中找到第一次出现的值大于1.0的位置。

# Input:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')

# Solution: (edit: changed argmax to argwhere. Thanks Rong!)
np.argwhere(iris[:, 3].astype(float) > 1.0)[0]  # np.argwhere( a )返回非0的数组元组的索引，其中a是要索引数组的条件。

array([50], dtype=int64)

# 47.如何将大于给定值的所有值替换为给定截止值？
# 问：从数组中a，替换所有大于30到30且小于10到10的值。
# Input
np.set_printoptions(precision=2)
np.random.seed(100)
a = np.random.uniform(1,50, 20)

# Solution 1: Using np.clip
np.clip(a, a_min=10, a_max=30)

# Solution 2: Using np.where
print(np.where(a < 10, 10, np.where(a > 30, 30, a)))

[27.63 14.64 21.8  30.   10.   10.   30.   30.   10.   29.18 30.   11.25
 10.08 10.   11.77 30.   30.   10.   30.   14.43]

# 48.如何获取numpy数组中前n个最大值的位置？
# 问：获取给定数组中前5个最大值的位置a。
# Input
np.random.seed(100)
a = np.random.uniform(1,50, 20)

# Solution:
# argsort orders positions by ascending value, so the last five entries are
# the positions of the five largest values (largest one last).
top5_positions = a.argsort()[-5:]
print(top5_positions)
#> [18 7 3 10 15]

# Solution 2:
# Partitioning -a moves the five largest values of a to the front; their
# relative order is not guaranteed.
np.argpartition(-a, 5)[:5]
#> [15 10  3  7 18]

# Below methods will get you the values.
# Method 1:
a[a.argsort()][-5:]

# Method 2:
np.sort(a)[-5:]

# Method 3:
np.partition(a, kth=-5)[-5:]

# Method 4:
a[np.argpartition(-a, 5)][:5]

# 49.如何计算数组中所有可能值的按行计数？
# 问：按行计算唯一值的计数。

# Input:
np.random.seed(100)
arr = np.random.randint(1,11,size=(6, 10))
arr
#> array([[ 9,  9,  4,  8,  8,  1,  5,  3,  6,  3],
#>        [ 3,  3,  2,  1,  9,  5,  1, 10,  7,  3],
#>        [ 5,  2,  6,  4,  5,  5,  4,  8,  2,  2],
#>        [ 8,  8,  1,  3, 10, 10,  4,  3,  6,  9],
#>        [ 2,  1,  8,  7,  3,  1,  9,  3,  6,  2],
#>        [ 9,  2,  6,  5,  3,  9,  4,  6,  1, 10]])

# Solution
def counts_of_all_values_rowwise(arr2d):
    """Count, for every row, how often each distinct value of the whole array occurs.

    Args:
        arr2d: 2-D NumPy array.

    Returns:
        A list with one inner list per row. Inner list position ``j`` holds the
        number of times the ``j``-th smallest distinct value of ``arr2d``
        appears in that row (0 when it does not appear).
    """
    # The set of distinct values is the same for every row, so compute it once.
    all_values = np.unique(arr2d).tolist()

    result = []
    for row in arr2d:
        row_values, row_counts = np.unique(row, return_counts=True)
        # A plain dict lookup avoids int() on a 1-element array, which newer
        # NumPy versions deprecate.
        count_of = dict(zip(row_values.tolist(), row_counts.tolist()))
        result.append([count_of.get(value, 0) for value in all_values])
    return result

# Print
print(np.arange(1,11))
counts_of_all_values_rowwise(arr)

[ 1  2  3  4  5  6  7  8  9 10]





[[1, 0, 2, 1, 1, 1, 0, 2, 2, 0],
 [2, 1, 3, 0, 1, 0, 1, 0, 1, 1],
 [0, 3, 0, 2, 3, 1, 0, 1, 0, 0],
 [1, 0, 2, 1, 0, 1, 0, 2, 1, 2],
 [2, 2, 2, 0, 0, 1, 1, 1, 1, 0],
 [1, 1, 1, 1, 1, 2, 0, 0, 2, 1]]

# 50.如何将数组数组转换为平面一维数组？
# 问：转换array_of_arrays为平面线性一维数组。

# Input:
arr1 = np.arange(3)
arr2 = np.arange(3,7)
arr3 = np.arange(7,10)

# The sub-arrays have different lengths, so dtype=object must be given
# explicitly; newer NumPy releases raise ValueError for ragged input otherwise.
array_of_arrays = np.array([arr1, arr2, arr3], dtype=object)
print('array_of_arrays: ', array_of_arrays)

# Solution 1
arr_2d = np.array([a for arr in array_of_arrays for a in arr])

# Solution 2:
# Despite its name, arr_2d holds the flattened one-dimensional result.
arr_2d = np.concatenate(array_of_arrays)
print(arr_2d)

# 51.如何在numpy中为数组生成一次性编码？
# 问：计算一次性编码（数组中每个唯一值的虚拟二进制变量）
# Input:
np.random.seed(101) 
arr = np.random.randint(1,4, size=6)
arr
#> array([2, 3, 2, 2, 2, 1])

# Solution:
def one_hot_encodings(arr):
    """Build a one-hot (dummy variable) matrix for a 1-D array.

    Args:
        arr: 1-D array-like of values to encode.

    Returns:
        A float array of shape ``(len(arr), number_of_unique_values)``. Row
        ``i`` has a 1 in the column of ``arr[i]``; columns follow the sorted
        order of the unique values.
    """
    arr = np.asarray(arr)
    # return_inverse gives, for every element, the index of its value among the
    # sorted unique values. Indexing by ``value - 1`` only works when the
    # values happen to be exactly 1..n.
    uniqs, column_of = np.unique(arr, return_inverse=True)
    out = np.zeros((arr.shape[0], uniqs.shape[0]))
    out[np.arange(arr.shape[0]), column_of] = 1
    return out

one_hot_encodings(arr)
#> array([[ 0.,  1.,  0.],
#>        [ 0.,  0.,  1.],
#>        [ 0.,  1.,  0.],
#>        [ 0.,  1.,  0.],
#>        [ 0.,  1.,  0.],
#>        [ 1.,  0.,  0.]])

# Method 2:
(arr[:, None] == np.unique(arr)).view(np.int8)

# 52.如何创建按类别变量分组的行号？
# 问：创建按类别变量分组的行号。使用以下示例iris species作为输入
# Input:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
species = np.genfromtxt(url, delimiter=',', dtype='str', usecols=4)
np.random.seed(100)
species_small = np.sort(np.random.choice(species, size=20))
species_small

print([i for val in np.unique(species_small) for i, grp in enumerate(species_small[species_small==val])])

# 53.如何基于给定的分类变量创建实体ID？
# 问：根据给定的分类变量创建组ID。使用以下示例iris species作为输入。
# Input:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
species = np.genfromtxt(url, delimiter=',', dtype='str', usecols=4)
np.random.seed(100)
species_small = np.sort(np.random.choice(species, size=20))
species_small

# 54.如何使用numpy对数组中的项目进行排名？
# 问：创建给定数值数组的等级a。
np.random.seed(10)
a = np.random.randint(20, size=10)
print('Array: ', a)

# Solution
print(a.argsort().argsort())
print('Array: ', a)

# 55.如何使用numpy对多维数组中的项目进行排名？
# 问：创建与给定数值数组形状相同的等级数组a。
# Input:
np.random.seed(10)
a = np.random.randint(20, size=[2,5])
print(a)

# Solution
print(a.ravel().argsort().argsort().reshape(a.shape))

# 56.如何在numpy数组2d的每一行中找到最大值？
# 问：计算给定数组中每一行的最大值。
# Input
np.random.seed(100)
a = np.random.randint(1,10, [5,3])
a

# Solution 1
np.amax(a, axis=1)

# Solution 2
np.apply_along_axis(np.max, arr=a, axis=1)

# 57.如何计算二维numpy数组每一行的最小值与最大值之比（min/max）？
# 问：为给定的2d numpy数组计算每行的最小值除以最大值（min/max）。

# Input
np.random.seed(100)
a = np.random.randint(1,10, [5,3])
a

# Solution
np.apply_along_axis(lambda x: np.min(x)/np.max(x), arr=a, axis=1)

# 58.如何在numpy数组中查找重复记录？
# 问：在给定的numpy数组中找到重复的条目（第二次出现），并将其标记为True。第一次出现应该是False。
# Input
np.random.seed(100)
a = np.random.randint(0, 5, 10)

## Solution
# There is no direct function to do this as of 1.13.3

# Create an all True array
out = np.full(a.shape[0], True)

# Find the index positions of unique elements
unique_positions = np.unique(a, return_index=True)[1]

# Mark those positions as False
out[unique_positions] = False

print(out)

# 59.如何找到numpy中的分组均值？
# 问：在二维numpy数组中找到按分类列分组的数字列的平均值
# Input
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')


# Solution
# No direct way to implement this. Just a version of a workaround.
numeric_column = iris[:, 1].astype('float')  # sepalwidth
grouping_column = iris[:, 4]  # species

# List comprehension version
[[group_val, numeric_column[grouping_column==group_val].mean()] for group_val in np.unique(grouping_column)]

# For Loop version
output = []
for group_val in np.unique(grouping_column):
    output.append([group_val, numeric_column[grouping_column==group_val].mean()])

output

# 60.如何将PIL图像转换为numpy数组？
# 问：从以下URL导入图像并将其转换为numpy数组。
# 网址='https://upload.wikimedia.org/wikipedia/commons/8/8b/Denali_Mt_McKinley.jpg'
from io import BytesIO
from PIL import Image
import PIL, requests

# Import image from URL
URL = 'https://upload.wikimedia.org/wikipedia/commons/8/8b/Denali_Mt_McKinley.jpg'
response = requests.get(URL)

# Read it as Image
I = Image.open(BytesIO(response.content))

# Optionally resize
I = I.resize([150,150])

# Convert to numpy array
arr = np.asarray(I)

# Optionaly Convert it back to an image and show
im = PIL.Image.fromarray(np.uint8(arr))
Image.Image.show(im)

# 61.如何从numpy数组中删除所有缺少的值？
# 问：nan从一维numpy数组中删除所有值

a = np.array([1,2,3,np.nan,5,6,7,np.nan])
a[~np.isnan(a)]

# 62.如何计算两个数组之间的欧式距离？
# Q.计算两个阵列之间的欧几里得距离a和b。
# Input
a = np.array([1,2,3,4,5])
b = np.array([4,5,6,7,8])

# Solution
dist = np.linalg.norm(a-b)
dist

# 63.如何找到一维数组中的所有局部最大值（或峰值）？
# 问：在一维numpy数组a中找到所有峰值。峰值是两边都被较小值包围的点。
a = np.array([1, 3, 7, 1, 2, 6, 0, 1])
doublediff = np.diff(np.sign(np.diff(a)))
peak_locations = np.where(doublediff == -2)[0] + 1
peak_locations

# 64.如何从2d数组中减去1d数组，其中1d数组的每一项都从相应的行中减去？
# 问：b_1d从2d数组中减去1d数组a_2d，以便b_1d从的相应行中减去的每一项a_2d。

# Input
a_2d = np.array([[3,3,3],[4,4,4],[5,5,5]])
b_1d = np.array([1,2,3])

# Solution
print(a_2d - b_1d[:,None])

[[2 2 2]
 [2 2 2]
 [2 2 2]]

# 65.如何找到数组中第n个重复项的索引
# 问：在中找到数字1的第5个重复的索引x。
x = np.array([1, 2, 1, 1, 3, 4, 3, 1, 1, 2, 1, 1, 2])
n = 5

# Solution 1: List comprehension
[i for i, v in enumerate(x) if v == 1][n-1]

# Solution 2: Numpy version
np.where(x == 1)[0][n-1]

# 66.如何将numpy的datetime64对象转换为datetime的datetime对象？
# 问：将numpy的datetime64对象转换为datetime的datetime对象

# Input: a numpy datetime64 object
dt64 = np.datetime64('2018-02-25 22:10:10')

# Solution
from datetime import datetime
dt64.tolist()

# or

dt64.astype(datetime)

# 67.如何计算一个numpy数组的移动平均值？
# 问：计算给定的一维数组的窗口大小3的移动平均值。
# Solution
# Source: https://stackoverflow.com/questions/14313510/how-to-calculate-moving-average-using-numpy
def moving_average(a, n=3):
    """Return the moving average of ``a`` over a window of ``n`` elements.

    Uses the running (cumulative) sum as a float array: each window total is
    the difference of two running totals ``n`` positions apart.
    """
    totals = np.cumsum(a, dtype=float)
    # The first full window is simply the running total at index n - 1.
    first_window = totals[n - 1:n]
    # Every later window: running total minus the total n positions earlier.
    later_windows = totals[n:] - totals[:-n]
    return np.concatenate((first_window, later_windows)) / n

np.random.seed(100)
Z = np.random.randint(10, size=10)
print('array: ', Z)
# Method 1
moving_average(Z, n=3).round(2)

# Method 2:  # Thanks AlanLRH!
# np.ones(3)/3 gives equal weights. Use np.ones(4)/4 for window size 4.
# mode='valid' keeps only positions where the window fully overlaps Z.
np.convolve(Z, np.ones(3)/3, mode='valid')

# 68.如何仅给出起点，长度和步长来创建一个numpy数组序列？
# 问：创建一个长度为10的numpy数组，从5开始，连续数字之间的步长为3
length = 10
start = 5
step = 3

def seq(start, length, step):
    """Return ``length`` evenly spaced values beginning at ``start``.

    Args:
        start: First value of the sequence.
        length: Number of elements to produce.
        step: Difference between consecutive elements.

    Returns:
        A 1-D NumPy array
        ``[start, start + step, ..., start + (length - 1) * step]``.
    """
    # Scaling an integer index keeps the element count at exactly ``length``.
    # np.arange(start, end, step) can emit one extra element for float steps
    # because of rounding in the computed end point.
    return start + step * np.arange(length)

seq(start, length, step)

# 69.如何在一系列不规则的数字日期中填写缺失的日期？
# 问：给定一个非连续日期序列的数组。通过填写缺失的日期，使其成为连续的日期序列。
# Input
dates = np.arange(np.datetime64('2018-02-01'), np.datetime64('2018-02-25'), 2)
print(dates)

# Solution ---------------
filled_in = np.array([np.arange(date, (date+d)) for date, d in zip(dates, np.diff(dates))]).reshape(-1)

# add the last day
output = np.hstack([filled_in, dates[-1]])
output

# For loop version -------
out = []
for date, d in zip(dates, np.diff(dates)):
    out.append(np.arange(date, (date+d)))

filled_in = np.array(out).reshape(-1)

# add the last day
output = np.hstack([filled_in, dates[-1]])
output

# 70.如何从给定的一维数组创建步幅？
# 难度等级：L4

# 问：从给定的一维数组 arr 出发，使用步幅生成二维矩阵，窗口长度为 4，步长为 2，例如 [[0,1,2,3], [2,3,4,5], [4,5,6,7], ...]
def gen_strides(a, stride_len=5, window_len=5):
    """Stack sliding windows of ``a`` into one array.

    Windows are ``window_len`` elements long and start every ``stride_len``
    elements; only windows that fit entirely inside ``a`` are produced.
    """
    # Number of complete windows that fit into the array.
    window_count = (a.size - window_len) // stride_len + 1
    window_starts = np.arange(0, window_count * stride_len, stride_len)

    windows = []
    for begin in window_starts:
        windows.append(a[begin:begin + window_len])
    return np.array(windows)

print(gen_strides(np.arange(15), stride_len=2, window_len=4))

posted @ 2019-11-26 10:34 ohou 阅读(866) 评论(0) 收藏举报

刷新页面返回顶部

心若在，梦不远 (・ω・)ノ- ( ゜- ゜)つロ

NumPy数据分析练习

公告