I'm so hard (=;

Time flies so fast ~ there is no time left for you to be sad.

(#~# my poor english)

understand AXIS in NUMPY ~

import numpy as np
import torch
# Replace every even entry of 0..9 with 0 and keep the odd entries.
arr = np.arange(10)
out = np.where(arr % 2, arr, 0)

#1 repeat vs. tile: repeat duplicates each element, tile duplicates the whole array
arr = np.array((1, 2, 3))
out1 = np.concatenate([np.repeat(arr, 3), np.tile(arr, 3)])

# Two 2x3 matrices to demonstrate stacking with np.r_ / np.c_.
a = np.arange(6).reshape(2, 3)
b = np.ones((2, 3), dtype=int)
# print(np.r_[a, b])  # stacked vertically -> 4x3
# print(np.c_[a, b])  # stacked horizontally -> 2x6

#2 r_: join top-to-bottom (along the first axis), c_: join side by side as columns
a = np.arange(3)
b = -np.arange(3)
# print(np.r_[a, b])  # 1-D inputs are concatenated end to end
# print(np.c_[a, b])  # 1-D inputs become the columns of a 3x2 matrix

#3 the values common to both a and b
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])
# np.intersect1d returns the sorted, unique values present in both arrays.
# print(np.intersect1d(a, b))

#4 the values in a that are not in b
a = np.array([1,2,3,4,5])
b = np.array([5,6,7,8,9])
# print(np.setdiff1d(a, b))

# Positions where a and b hold the same value.
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])
# print(np.where(a==b))
# With a single argument, np.where returns the indices of the truthy entries.
# print(np.where([True, True, False, False]))

#5 condition: keep the values between 5 and 10 (inclusive)
a = np.array([2, 6, 1, 9, 10, 3, 27])
# a[np.where( (5<=a) & (a<=10 ))]
# print(a[(5<=a) & (a<=10 )])

#6 exchange columns
# reverse rows & columns
arr = np.arange(9).reshape(3, 3)
# print(arr[:, [1, 0, 2]])  # swap the first two columns
# print(arr[::-1])          # reverse the row order
# print(arr[:,::-1])        # reverse the column order

#7 Create a 2D array of shape 5x3 to contain random decimal numbers between 5 and 10.
# (the demos below use a smaller 2x2 shape)
# print(np.random.uniform(5, 10, (2,2)))
# print(np.random.random((2,2))) # floats in the range [0, 1)
# print(np.random.randint(5, 10, (2,2))) # integers in [5, 10)

#8 set the print precision (number of decimals shown)
# np.set_printoptions(3)

#10 toggle scientific notation when printing
X = np.random.random((3, 3))/1e3
# np.set_printoptions(suppress=False)
# print(X)

#11 limit how many elements are printed before the output is summarized
a = np.arange(10)
# print(a)
# np.set_printoptions(threshold=6)
# print(a)

#12 load the iris dataset as a 2D array of raw (object-typed) fields
# NOTE: the file is fetched over the network every time this runs.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')
# print(iris[:3])
# Copy of column 4 (the 'species' position in `names`).
out = np.array(iris[:, 4])
# print(out[:5])

# Alternative: with dtype=None the result is 1D (one record per row);
# rebuild a 2D array from the first four fields of every record.
# iris_1D = np.genfromtxt(url, delimiter=',', dtype=None) # shape, 1D
# out = np.array([ row.tolist()[:4] for row in iris_1D])  # 1d -> 2d
# print(out[:3])

# 13 AXIS: reducing along an axis removes that axis from the shape
A = np.arange(12).reshape((2, 2, 3))
# print(A)

# axis=0 averages the two 2x3 slices element-wise.
mean = A.mean(axis=0, dtype='int')
# shape 2x3: [ [3, 4, 5], [6, 7, 8] ]
# print(mean)

# axis=1 averages the two rows inside each slice (result truncated to int).
mean = A.mean(axis=1, dtype='int')
# shape 2x3: [ [1, 2, 3], [7, 8, 9] ]
# print(mean)

# axis=2 averages the three entries of every innermost row.
mean = A.mean(axis=2, dtype='int')
# shape 2x2: [ [1, 4], [7, 10] ]
# print(mean)

#14 normalize: min-max scale column 0 (sepallength) so it spans [0, 1]
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])
Smax, Smin = sepallength.max(), sepallength.min()
out = (sepallength - Smin)/(Smax - Smin)
# print(out)

#15 softmax
iris = np.genfromtxt(url, delimiter=',', dtype='object')
# Column 0 is read as raw fields here, so convert each one to float.
sepallength = np.array([float(row[0]) for row in iris])
# Reference result computed by torch.
out = torch.softmax(torch.from_numpy(sepallength), dim=0)
# print(out[:5])
# Same computation by hand: exp(x) / sum(exp(x)).
# NOTE(review): exp() is applied to the raw values without subtracting the
# maximum first; fine for small inputs, but it can overflow for large ones.
x = np.exp(sepallength)
out = x/np.sum(x)
# print(out[:5])

#15 percentile: 5th and 95th percentiles of sepallength
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])
out = np.percentile(sepallength, q=[5, 95])
# print(out)

#16 np.random
np.random.seed(233)
x = np.arange(10)
# Draw 5 values from x (np.random.choice samples with replacement by default).
y = np.random.choice(x, 5)
# print(y)
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
##################################
# With a single argument, np.where returns the (row, column) indices of the
# non-zero entries. Only the column indices j are used below, as the pool of
# column numbers to sample from; i is not used.
i, j = np.where(iris_2d)
##################################
# For each of rows 0..149, overwrite one randomly chosen column with NaN.
iris_2d[np.arange(150), np.random.choice(j, 150)] = np.nan
x = np.where(np.isnan(iris_2d))
# `iris_2d == np.nan` cannot be used to find NaNs (NaN never compares equal);
# np.isnan is required.
x = np.c_[x] # merge the row and column index arrays into (row, col) pairs
# print(x)

#17 condition of where: rows with column 2 (petallength) > 1.5 and column 0 (sepallength) < 5.0
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
condition = (iris_2d[:, 2] > 1.5) & (iris_2d[:, 0] < 5.0)
# print(iris_2d[condition])

#18  drop rows that contain a missing value
# First inject NaN at 10 random (row, column) positions (positions may repeat).
iris_2d[np.random.randint(150, size=10), np.random.randint(4, size=10)] = np.nan
# Method 1: build the keep-mask row by row (True where the row has no NaN)
pos = np.array([~np.any(np.isnan(row)) for row in iris_2d])
# print(iris_2d[pos].shape)
# Method 2: the same mask, computed in one vectorized call along axis=1
pos = ~np.any(np.isnan(iris_2d), axis=1)
# print(iris_2d[pos].shape)

#19 correlation coefficient between column 0 and column 2
iris = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
# np.corrcoef returns a 2x2 matrix; the off-diagonal entry is the coefficient.
out = np.corrcoef(iris[:, 0], iris[:, 2])[1, 0]
# print(out)
# Method 2: by hand, mean of the centered products divided by the product
# of the two standard deviations
l1, l2 = iris[:, 0], iris[:, 2]
out = np.mean((l1-np.mean(l1)) * (l2-np.mean(l2))) / (np.std(l1)*np.std(l2))
# print(out)

#20 replace: fill missing values with 0
iris_2d = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0,1,2,3])
# Inject NaN at 20 random (row, column) positions (positions may repeat).
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# print(iris_2d[:30])
# Boolean-mask assignment overwrites every NaN in place.
iris_2d[np.isnan(iris_2d)] = 0
# print(iris_2d[:30])

#21 np.unique (very useful): count how many rows belong to each category
iris = np.genfromtxt(url, delimiter=',', dtype='object')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Column 4 is the species field of every row.
species = np.array([row.tolist()[4] for row in iris])
# With return_counts=True, np.unique returns (distinct values, their counts).
out = np.unique(species, return_counts=True)
# print(out)

# 22 mapping: turn the numeric petal length into a text category
# np.digitize returns the bin number of every value for the edges
# [0, 3, 5, 10]: 1 -> [0, 3), 2 -> [3, 5), 3 -> [5, 10), 4 -> >= 10.
petal_length_bin = np.digitize(iris[:, 2].astype('float'), [0, 3, 5, 10])
# print(petal_length_bin)
# Bin number -> label. The label for bin 2 was misspelled 'meidum'.
mp = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_bin_cat = [mp[x] for x in petal_length_bin]
# print(petal_length_bin_cat)

#23 add a new column: pi * (row[2]^2) * row[0] / 3
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')
# Columns 0 and 2 are read as raw fields, so convert them to float first.
r0 = iris_2d[:, 0].astype('float')
r2 = iris_2d[:, 2].astype('float')
x = np.pi * r0 * (r2**2) / 3
# Keep the first four columns and append x as a fifth one
# (the original fifth column is not carried over).
iris_2d = np.c_[iris_2d[:, 0:4], x]
# print(iris_2d)

#24 probabilistic sampling: draw 0 / 1 / 2 with probabilities 0.1 / 0.3 / 0.6
np.random.seed(None)  # drop any fixed seed so every run differs
x = np.arange(3)
y = np.random.choice(x, size=2000, p=(0.1, 0.3, 0.6))
# (distinct values, how often each was drawn)
count = np.unique(y, return_counts=True)
# print(count[1]/2000)  # empirical frequencies, close to p

# 25 searchsorted: index at which a value can be inserted to keep x sorted
x = np.linspace(0, 1, 11)
y = x.searchsorted(0.21)
# print(x, y)

# 26 What is the value of second longest petallength of species setosa
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')
# Keep the rows whose column 4 equals b'Iris-setosa' (the field is compared
# as bytes), then take column 2 as floats.
setosa_r2 = iris_2d[iris_2d[:, 4] == b'Iris-setosa'][:, 2].astype('float')
# np.unique returns the distinct values in ascending order, so [-2] is the
# second-largest distinct value.
x = np.unique(np.sort(setosa_r2))[-2]
# print(x)

# 27 Find the most frequent value of petal length (3rd column) in iris dataset.
# x[0] holds the distinct values, x[1] the matching counts.
x = np.unique(iris_2d[:, 2], return_counts=True)
val = x[0][np.argmax(x[1])]
# print(val)

#28 Find the position of the first occurrence of a value greater than 1.0 in petalwidth 4th column
# np.argwhere lists the index of every match in order; the first entry is
# therefore the first occurrence.
x = np.argwhere(iris_2d[:, 3].astype('float')>1.0)
# print(x[0])

# 29 np.clip: limit every value of a to the range [10, 30]
np.random.seed(100)
a = np.random.uniform(1,50, 20)
# Method 1: overwrite in place
# a[a>30] = 30
# a[a<10] = 10
# Method 2: nested np.where. It returns a NEW array, so the result has to be
# assigned; the bare call used before computed the answer and threw it away.
a_where = np.where(a < 10, 10, np.where(a > 30, 30, a))
# Method 3: np.clip also returns a new array and leaves a untouched.
a_clipped = np.clip(a, 10, 30)
# print(a_clipped)

# 30 np.argsort(): positions of the five largest values (in ascending value order)
np.random.seed(100)
a = np.random.randint(low=1, high=50, size=20)
pos = np.argsort(a)[-5:]
# print(pos)

# 31 partition
x = np.array((1, 4, 2, 7, 5, 3))
# Works like the partition step of quicksort: the (k+1)-th smallest value is
# placed at index k, smaller values before it and larger values after it.
# print(np.partition(x, kth=1))
# >>>[1, 2, 4, 7, 5, 3] # split around the value 2

# 32 count how often each value occurs in every row (input for f below)
np.random.seed(100)
arr = np.random.randint(low=1, high=11, size=(6, 10))
def f(arr):
    """Count, per row, how often each distinct value of ``arr`` occurs.

    Args:
        arr: 2-D array-like; it is iterated row by row.

    Returns:
        A list with one entry per row. Each entry is a list of Python ints
        aligned with ``np.unique(arr)`` (the sorted distinct values of the
        whole input), giving the number of occurrences of that value in the
        row, or 0 when the row does not contain it.
    """
    # The distinct values of the whole input are identical for every row,
    # so compute them once instead of once per row.
    labels = np.unique(arr).tolist()
    table = []
    for row in arr:
        values, counts = np.unique(row, return_counts=True)
        # A dict lookup replaces the linear `i in a` scan per label and avoids
        # int() on a one-element array (deprecated since NumPy 1.25).
        seen = dict(zip(values.tolist(), counts.tolist()))
        table.append([seen.get(label, 0) for label in labels])
    return table
# print(f(arr))

# 33 Convert array_of_arrays into a flat linear 1d array.
arr1 = np.arange(3)
arr2 = np.arange(3,7)
arr3 = np.arange(7,10)
# The sub-arrays have different lengths (3, 4, 3). NumPy >= 1.24 raises
# ValueError when asked to build such a ragged array implicitly, so the
# object dtype has to be requested explicitly.
array_of_arrays = np.array([arr1, arr2, arr3], dtype=object)
# x = array_of_arrays.flatten()
# np.concatenate joins the three inner arrays end to end.
x = np.concatenate(array_of_arrays)
# Method 2
# x = np.array([a for arr in array_of_arrays for a in arr])
print(x)