分桶离散

 1 import numpy as np
 2 import pandas as pd
 3 
 4 a = np.random.randint(0, 10, 10)
 5 print(a, '\n')
 6 # [0 0 1 8 4 4 2 8 2 6] 
 7 
 8 # say you want to split at 1 and 3
 9 boundaries = [1, 3]
10 # add min and max values of your data
11 boundaries = sorted({a.min(), a.max() + 1} | set(boundaries))
12 print(boundaries)
13 # [0, 1, 3, 9]
14 
15 a_discretized_0 = pd.cut(a, bins=boundaries, right=False)
16 a_discretized_1 = pd.cut(a, bins=boundaries, right=False, labels=False)
17 a_discretized_2 = pd.cut(a, bins=boundaries, labels=range(len(boundaries) - 1), right=False)
18 a_discretized_3 = pd.cut(a, bins=boundaries, labels=range(len(boundaries) - 1), right=False).astype(float)
19 
20 print(a_discretized_0, '\n', a_discretized_0.dtype, '\n')
21 # [[0, 1), [0, 1), [1, 3), [3, 9), [3, 9), [3, 9), [1, 3), [3, 9), [1, 3), [3, 9)]
22 # Categories (3, interval[int64]): [[0, 1) < [1, 3) < [3, 9)] 
23 #  category 
24 
25 print(a_discretized_1, '\n', a_discretized_1.dtype, '\n')
26 # [0 0 1 2 2 2 1 2 1 2] 
27 #  int64 
28 
29 print(a_discretized_2, '\n', a_discretized_2.dtype, '\n')
30 # [0, 0, 1, 2, 2, 2, 1, 2, 1, 2]
31 # Categories (3, int64): [0 < 1 < 2] 
32 #  category 
33 
34 print(a_discretized_3, '\n', a_discretized_3.dtype, '\n')
35 # [0. 0. 1. 2. 2. 2. 1. 2. 1. 2.] 
36 #  float64 

 

posted @ 2020-05-01 10:24  hzn2003  阅读(120)  评论(0)    收藏  举报