python_汇总和计算描述统计
Axis Indexes with Duplicate Labels # 带有重复标签的轴索引
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj
a 0
a 1
b 2
b 3
c 4
dtype: int64
obj.index.is_unique
False
obj['a']
obj['c']
4
# 生成随机矩阵
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
# df.loc['b']
0 1 2
a -1.442180 -2.276836 0.316662
a -0.672861 0.644555 -0.593982
b 3.645069 -0.690898 -1.010551
b 0.590857 -0.285636 1.329229
Summarizing and Computing Descriptive Statistics
# 5.3 汇总和计算描述统计
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
df
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
df.sum()
one 9.25
two -5.80
dtype: float64
# 传⼊axis='columns'或axis=1将会按⾏进⾏求和运算:
df.sum(axis='columns')
a 1.40
b 2.60
c 0.00
d -0.55
dtype: float64
# NA值会⾃动被排除,除⾮整个切⽚(这⾥指的是⾏或列)都是NA。通过skipna选项可以禁⽤该功能
df.mean(axis='columns', skipna=False)
a NaN
b 1.300
c NaN
d -0.275
dtype: float64
df.idxmax()
one b
two d
dtype: object
# python_排名
df.cumsum()
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8
# 还有⼀种⽅法,它既不是约简型也不是累计型。describe就是⼀
# 个例⼦,它⽤于⼀次性产⽣多个汇总统计:
df.describe()
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()
count 16
unique 3
top a
freq 8
dtype: object
Correlation and Covariance
# (shell 命令, 非 Python) conda install pandas-datareader
# 相关系数与协⽅差
# 加载数据
price = pd.read_pickle('C:/file/code-sample-ok/python_practise/pydata-book-2nd-edition/examples/yahoo_price.pkl')
volume = pd.read_pickle('C:/file/code-sample-ok/python_practise/pydata-book-2nd-edition/examples/yahoo_volume.pkl')
import pandas_datareader.data as web
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                       for ticker, data in all_data.items()})
price.head()
AAPL GOOG IBM MSFT
Date
2010-01-04 27.990226 313.062468 113.304536 25.884104
2010-01-05 28.038618 311.683844 111.935822 25.892466
2010-01-06 27.592626 303.826685 111.208683 25.733566
2010-01-07 27.541619 296.753749 110.823732 25.465944
2010-01-08 27.724725 300.709808 111.935822 25.641571
# 现在计算价格的百分数变化
returns = price.pct_change()
returns.tail()
AAPL GOOG IBM MSFT
Date
2016-10-17 -0.000680 0.001837 0.002072 -0.003483
2016-10-18 -0.000681 0.019616 -0.026168 0.007690
2016-10-19 -0.002979 0.007846 0.003583 -0.002255
2016-10-20 -0.000512 -0.005652 0.001719 -0.004867
2016-10-21 -0.003930 0.003011 -0.012474 0.042096
returns['MSFT'].corr(returns['IBM'])
returns['MSFT'].cov(returns['IBM'])
8.870655479703546e-05
returns.MSFT.corr(returns.IBM)
0.4997636114415114
# 另⼀⽅⾯,DataFrame的corr和cov⽅法将以DataFrame的形式
# 分别返回完整的相关系数或协⽅差矩阵:
returns.corr()
returns.cov()
AAPL GOOG IBM MSFT
AAPL 0.000277 0.000107 0.000078 0.000095
GOOG 0.000107 0.000251 0.000078 0.000108
IBM 0.000078 0.000078 0.000146 0.000089
MSFT 0.000095 0.000108 0.000089 0.000215
# 利⽤DataFrame的corrwith⽅法,你可以计算其列或⾏跟另⼀个
# Series或DataFrame之间的相关系数。传⼊⼀个Series将会返回
# ⼀个相关系数值Series(针对各列进⾏计算):
returns.corrwith(returns.IBM)
AAPL 0.386817
GOOG 0.405099
IBM 1.000000
MSFT 0.499764
dtype: float64
# 传⼊⼀个DataFrame则会计算按列名配对的相关系数。这⾥,我
# 计算百分⽐变化与成交量的相关系数:
returns.corrwith(volume)
AAPL -0.075565
GOOG -0.007067
IBM -0.204849
MSFT -0.092950
dtype: float64
Unique Values, Value Counts, and Membership # 唯⼀值、值计数以及成员资格
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques
array(['c', 'a', 'd', 'b'], dtype=object)
# value_counts⽤于计算⼀个
# Series中各值出现的频率:
obj.value_counts()
a 3
c 3
b 2
d 1
dtype: int64
# isin⽤于判断⽮量化集合的成员资格,可⽤于过滤Series中或
# DataFrame列中数据的⼦集:
pd.value_counts(obj.values, sort=False)
a 3
c 3
d 1
b 2
dtype: int64
obj
mask = obj.isin(['b', 'c'])
mask
obj[mask]
0 c
5 b
6 b
7 c
8 c
dtype: object
# 与isin类似的是Index.get_indexer⽅法,它可以给你⼀个索引数
# 组,从可能包含重复值的数组到另⼀个不同值的数组:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)
array([0, 2, 1, 1, 0, 2], dtype=int64)
# 到DataFrame中多个相关列的⼀张柱状图。
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
'Qu2': [2, 3, 1, 2, 3],
'Qu3': [1, 5, 2, 4, 4]})
data
Qu1 Qu2 Qu3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4
# 将pandas.value_counts传给该DataFrame的apply函数
# result = data.apply(pd.value_counts).fillna(0)
result = data.apply(pd.value_counts)
result
Qu1 Qu2 Qu3
1 1.0 1.0 1.0
2 NaN 2.0 1.0
3 2.0 2.0 NaN
4 2.0 NaN 2.0
5 NaN NaN 1.0