1 import numpy as np
2 import pandas as pd
3
def compute_bin_edges(values, split_points):
    """Return sorted bin edges that cut ``values`` only at ``split_points``.

    The outermost edges are widened just enough to cover the data: the
    lowest edge is the smaller of the data minimum and the first split
    point, the highest edge is the larger of ``max + 1`` and the last split
    point.  A data minimum (or ``max + 1``) that falls *between* two split
    points is therefore not turned into an extra, unrequested edge.

    Args:
        values: Array-like of integers to be discretized (must be non-empty;
            ``min``/``max`` of an empty array raise ``ValueError``).
        split_points: Iterable of values at which to split; may be empty.

    Returns:
        A sorted list of unique edges, suitable for
        ``pd.cut(values, bins=edges, right=False)``.
    """
    data = np.asarray(values)
    # .item() yields plain Python numbers, so the edges print as
    # [0, 1, 3, 9] rather than [np.int64(0), 1, 3, np.int64(9)] on NumPy >= 2.
    data_low = data.min().item()
    # +1 because the bins are half-open ([left, right)): the maximum value
    # has to be strictly below the last edge to land inside the last bin.
    data_high = data.max().item() + 1

    inner = set(split_points)
    lower = min(inner | {data_low})
    upper = max(inner | {data_high})
    return sorted(inner | {lower, upper})


a = np.random.randint(0, 10, 10)
print(a, '\n')
# Example (the array is random, so every run differs):
# [0 0 1 8 4 4 2 8 2 6]

# say you want to split at 1 and 3
boundaries = [1, 3]
# extend the split points with outer edges that cover the whole data range
boundaries = compute_bin_edges(a, boundaries)
print(boundaries)
# [0, 1, 3, 9]

# Four flavours of the same half-open ([left, right)) binning:
#   0 -> interval categories
#   1 -> plain integer bin indices (labels=False)
#   2 -> categorical with integer labels
#   3 -> the integer labels converted to a float array
a_discretized_0 = pd.cut(a, bins=boundaries, right=False)
a_discretized_1 = pd.cut(a, bins=boundaries, right=False, labels=False)
a_discretized_2 = pd.cut(a, bins=boundaries, labels=range(len(boundaries) - 1), right=False)
a_discretized_3 = pd.cut(a, bins=boundaries, labels=range(len(boundaries) - 1), right=False).astype(float)

# The sample outputs below belong to the example array shown above; the exact
# repr of the categories line can differ between pandas versions.
print(a_discretized_0, '\n', a_discretized_0.dtype, '\n')
# [[0, 1), [0, 1), [1, 3), [3, 9), [3, 9), [3, 9), [1, 3), [3, 9), [1, 3), [3, 9)]
# Categories (3, interval[int64]): [[0, 1) < [1, 3) < [3, 9)]
# category

print(a_discretized_1, '\n', a_discretized_1.dtype, '\n')
# [0 0 1 2 2 2 1 2 1 2]
# int64

print(a_discretized_2, '\n', a_discretized_2.dtype, '\n')
# [0, 0, 1, 2, 2, 2, 1, 2, 1, 2]
# Categories (3, int64): [0 < 1 < 2]
# category

print(a_discretized_3, '\n', a_discretized_3.dtype, '\n')
# [0. 0. 1. 2. 2. 2. 1. 2. 1. 2.]
# float64