当前位置 : 主页 > 编程语言 > python >

python_范围切分cut

来源:互联网 收集:自由互联 发布时间:2022-07-19
python_范围切分cut Discretization and Binning¶ ages = [ 20 , 22 , 25 , 27 , 21 , 23 , 37 , 31 , 61 , 45 , 41 , 32 ] bins = [ 18 , 25 , 35 , 60 , 100 ] # 离散化和⾯元划分 范围切分 cats = pd . cut ( ages , bins ) cats [( 18 ,


python_范围切分cut

Discretization and Binning
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

# 离散化和面元划分(范围切分)
cats = pd.cut(ages, bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.categories
pd.value_counts(cats)
# pandas返回的是一个特殊的Categorical对象。结果展示了pandas.cut划分的面元。
# 你可以将其看做一组表示面元名称的字符串。
# 它的底层含有一个表示不同分类名称的类型数组,
# 以及一个codes属性中的年龄数据的标签:
cats.codes
cats.categories
pd.value_counts(cats)
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
# 划分区间:right=False 使区间变为左闭右开(默认为左开右闭)
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
# 设置区间名称
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
data = np.random.rand(20)
pd.cut(data, 4, precision=2)
data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 4) # Cut into quartiles
cats
pd.value_counts(cats)
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
Detecting and Filtering Outliers
检测和过滤异常值
# 过滤或变换异常值(outlier)在很大程度上就是运用数组运算。
# 来看一个含有正态分布数据的DataFrame:
# 检测和过滤异常值
# 过滤或变换异常值(outlier)在很大程度上就是运用数组运算。
# 来看一个含有正态分布数据的DataFrame:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.042219 -0.021445 -0.007653 -0.007220
std 0.949976 0.987228 1.030498 0.997743
min -3.194414 -3.108915 -3.645860 -3.481593
25% -0.702411 -0.699059 -0.714481 -0.689237
50% -0.048305 0.001385 0.029337 0.022671
75% 0.617693 0.619308 0.685055 0.674289
max 3.023720 2.859053 3.189940 3.525865
# 假设你想要找出某列中绝对值大小超过3的值
col = data[2]
col[np.abs(col) > 3]
30 -3.645860
334 -3.018842
489 -3.183867
536 -3.140963
929 3.082067
957 3.189940
Name: 2, dtype: float64
# 要选出所有含有“大于3或小于-3的值”的行,可以在布尔型DataFrame上使用any方法
# (注意:pandas 2.0 起 any 的参数仅支持关键字形式,需写作 any(axis=1)):
data[(np.abs(data) > 3).any(1)]
0 1 2 3
9 0.582317 -0.658090 -0.207434 3.525865
30 -0.080332 0.599947 -3.645860 0.255475
252 -1.528975 -1.559625 0.336788 -3.333767
334 0.581893 -1.116332 -3.018842 -0.298748
359 -0.048478 -3.108915 1.117755 -0.152780
489 -0.274138 1.188742 -3.183867 1.050471
536 1.741426 -2.214074 -3.140963 -1.509976
702 -3.194414 0.077839 -1.733549 0.235425
732 3.023720 -1.105312 0.105141 0.995257
760 0.062528 2.368010 0.452649 -3.481593
929 -0.071320 0.164293 3.082067 -0.516982
957 0.617599 -0.843849 3.189940 0.070978
# 可以将值限制在区间-3到3以内:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.042048 -0.021336 -0.006936 -0.006930
std 0.949274 0.986893 1.026570 0.993388
min -3.000000 -3.000000 -3.000000 -3.000000
25% -0.702411 -0.699059 -0.714481 -0.689237
50% -0.048305 0.001385 0.029337 0.022671
75% 0.617693 0.619308 0.685055 0.674289
max 3.000000 2.859053 3.000000 3.000000
# 根据数据的值是正还是负,np.sign(data)可以生成1和-1:
np.sign(data).head()
0 1 2 3
0 1.0 -1.0 1.0 -1.0
1 -1.0 1.0 -1.0 -1.0
2 1.0 -1.0 -1.0 1.0
3 -1.0 -1.0 1.0 -1.0
4 -1.0 1.0 1.0 1.0


网友评论