# from http://pandas.pydata.org/pandas-docs/stable/indexing.html
# pandas 0.22.0
import pandas as pd
import numpy as np
"""
Prefer the dedicated pandas indexers over generic Python slicing.
Three main indexing methods (indexers):
.loc    label based: accepts label names, boolean arrays, or a one-argument callable
.iloc   integer position based (from 0 to length-1 of the axis), similar to Python slicing
[]      the basic [] operator

Object Type     Selection           Return Value Type
------------------------------------------------------------
Series          series[label]       scalar value
DataFrame       frame[colname]      Series corresponding to colname
Panel           panel[itemname]     DataFrame corresponding to the itemname
"""
"""
Object Type     Indexers
Series          s.loc[indexer]
DataFrame       df.loc[row_indexer, column_indexer]
Panel           p.loc[item_indexer, major_indexer, minor_indexer]
"""
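# A minimal sketch (added here for orientation, not part of the original notebook)
# contrasting the three indexers on a tiny toy frame:
toy = pd.DataFrame({'x': [10, 20, 30]}, index=['a', 'b', 'c'])
toy.loc['b', 'x']   # label based        -> 20
toy.iloc[1, 0]      # position based     -> 20
toy['x']            # [] selects column 'x' as a Series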
# Here we construct a simple time series data set to use for illustrating the indexing functionality
dates = pd.date_range('1/1/2000', periods=8)
dates
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'], dtype='datetime64[ns]', freq='D')
| A | B | C | D |
2000-01-01 | 0.136628 | 0.324097 | 0.856313 | -0.145259 |
2000-01-02 | 0.113819 | -0.718630 | 0.016217 | -1.571263 |
2000-01-03 | -0.603598 | -0.143643 | -1.143063 | -0.425266 |
2000-01-04 | -0.486200 | -0.136663 | -2.016020 | -0.815514 |
2000-01-05 | 0.835318 | 1.036607 | -0.502919 | 0.878680 |
2000-01-06 | -2.376205 | 0.362577 | -0.484754 | -0.478711 |
2000-01-07 | 0.193371 | 1.330468 | 0.544160 | 1.030900 |
2000-01-08 | 0.476533 | -0.476653 | -0.434356 | 0.744500 |
panel = pd.Panel({'one': df, 'two': df - df.mean()})   # a multi-dimensional (3D) container
panel
panel['one']
panel['two']
Dimensions: 2 (items) x 8 (major_axis) x 4 (minor_axis)
Items axis: one to two
Major_axis axis: 2000-01-01 00:00:00 to 2000-01-08 00:00:00
Minor_axis axis: A to D
| A | B | C | D |
2000-01-01 | 0.136628 | 0.324097 | 0.856313 | -0.145259 |
2000-01-02 | 0.113819 | -0.718630 | 0.016217 | -1.571263 |
2000-01-03 | -0.603598 | -0.143643 | -1.143063 | -0.425266 |
2000-01-04 | -0.486200 | -0.136663 | -2.016020 | -0.815514 |
2000-01-05 | 0.835318 | 1.036607 | -0.502919 | 0.878680 |
2000-01-06 | -2.376205 | 0.362577 | -0.484754 | -0.478711 |
2000-01-07 | 0.193371 | 1.330468 | 0.544160 | 1.030900 |
2000-01-08 | 0.476533 | -0.476653 | -0.434356 | 0.744500 |
| A | B | C | D |
2000-01-01 | 0.350419 | 0.126827 | 1.251866 | -0.047518 |
2000-01-02 | 0.327611 | -0.915900 | 0.411770 | -1.473522 |
2000-01-03 | -0.389806 | -0.340913 | -0.747511 | -0.327524 |
2000-01-04 | -0.272408 | -0.333933 | -1.620467 | -0.717772 |
2000-01-05 | 1.049110 | 0.839337 | -0.107366 | 0.976422 |
2000-01-06 | -2.162413 | 0.165307 | -0.089201 | -0.380969 |
2000-01-07 | 0.407163 | 1.133198 | 0.939713 | 1.128641 |
2000-01-08 | 0.690325 | -0.673923 | -0.038803 | 0.842241 |
# Thus, as per above, we have the most basic indexing using []:
s = df['A']
s
dates[5]       # note: positions count from 0
s[dates[5]]
2000-01-01    0.136628
2000-01-02    0.113819
2000-01-03   -0.603598
2000-01-04   -0.486200
2000-01-05    0.835318
2000-01-06   -2.376205
2000-01-07    0.193371
2000-01-08    0.476533
Freq: D, Name: A, dtype: float64
Timestamp('2000-01-06 00:00:00', freq='D')
-2.376204948581219
# Pass a list of column names to [], e.g. ['A', 'B']
columns_l = ['A', 'B']
df[columns_l]
df[['A', 'B']]   # same as above; note the double brackets
df
| A | B |
2000-01-01 | 0.324097 | 0.136628 |
2000-01-02 | -0.718630 | 0.113819 |
2000-01-03 | -0.143643 | -0.603598 |
2000-01-04 | -0.136663 | -0.486200 |
2000-01-05 | 1.036607 | 0.835318 |
2000-01-06 | 0.362577 | -2.376205 |
2000-01-07 | 1.330468 | 0.193371 |
2000-01-08 | -0.476653 | 0.476533 |
| A | B |
2000-01-01 | 0.324097 | 0.136628 |
2000-01-02 | -0.718630 | 0.113819 |
2000-01-03 | -0.143643 | -0.603598 |
2000-01-04 | -0.136663 | -0.486200 |
2000-01-05 | 1.036607 | 0.835318 |
2000-01-06 | 0.362577 | -2.376205 |
2000-01-07 | 1.330468 | 0.193371 |
2000-01-08 | -0.476653 | 0.476533 |
| A | B | C | D |
2000-01-01 | 0.324097 | 0.136628 | 0.856313 | -0.145259 |
2000-01-02 | -0.718630 | 0.113819 | 0.016217 | -1.571263 |
2000-01-03 | -0.143643 | -0.603598 | -1.143063 | -0.425266 |
2000-01-04 | -0.136663 | -0.486200 | -2.016020 | -0.815514 |
2000-01-05 | 1.036607 | 0.835318 | -0.502919 | 0.878680 |
2000-01-06 | 0.362577 | -2.376205 | -0.484754 | -0.478711 |
2000-01-07 | 1.330468 | 0.193371 | 0.544160 | 1.030900 |
2000-01-08 | -0.476653 | 0.476533 | -0.434356 | 0.744500 |
df[['A', 'B']] = df[['B', 'A']]   # swaps the values of the two columns
df
df.loc[:, ['B', 'A']]
# The following does NOT swap the two columns:
# .loc aligns the right-hand side on column labels before assigning values,
# so column 'A' of the source is written back into column 'A' of the target
# and nothing appears to change. (A short sketch after these outputs makes this explicit.)
df.loc[:, ['B', 'A']] = df[['A', 'B']]
df
# df.loc[:, ['B', 'A']] = df[:, ['A', 'B']]   # invalid: a DataFrame cannot be sliced with df[:, ...]
# df
# The correct way: strip the labels by taking the underlying array
df.loc[:, ['B', 'A']] = df[['A', 'B']].values
df
| A | B | C | D |
2000-01-01 | 0.136628 | 0.324097 | 0.856313 | -0.145259 |
2000-01-02 | 0.113819 | -0.718630 | 0.016217 | -1.571263 |
2000-01-03 | -0.603598 | -0.143643 | -1.143063 | -0.425266 |
2000-01-04 | -0.486200 | -0.136663 | -2.016020 | -0.815514 |
2000-01-05 | 0.835318 | 1.036607 | -0.502919 | 0.878680 |
2000-01-06 | -2.376205 | 0.362577 | -0.484754 | -0.478711 |
2000-01-07 | 0.193371 | 1.330468 | 0.544160 | 1.030900 |
2000-01-08 | 0.476533 | -0.476653 | -0.434356 | 0.744500 |
| B | A |
2000-01-01 | 0.324097 | 0.136628 |
2000-01-02 | -0.718630 | 0.113819 |
2000-01-03 | -0.143643 | -0.603598 |
2000-01-04 | -0.136663 | -0.486200 |
2000-01-05 | 1.036607 | 0.835318 |
2000-01-06 | 0.362577 | -2.376205 |
2000-01-07 | 1.330468 | 0.193371 |
2000-01-08 | -0.476653 | 0.476533 |
| A | B | C | D |
2000-01-01 | 0.136628 | 0.324097 | 0.856313 | -0.145259 |
2000-01-02 | 0.113819 | -0.718630 | 0.016217 | -1.571263 |
2000-01-03 | -0.603598 | -0.143643 | -1.143063 | -0.425266 |
2000-01-04 | -0.486200 | -0.136663 | -2.016020 | -0.815514 |
2000-01-05 | 0.835318 | 1.036607 | -0.502919 | 0.878680 |
2000-01-06 | -2.376205 | 0.362577 | -0.484754 | -0.478711 |
2000-01-07 | 0.193371 | 1.330468 | 0.544160 | 1.030900 |
2000-01-08 | 0.476533 | -0.476653 | -0.434356 | 0.744500 |
| A | B | C | D |
2000-01-01 | 0.324097 | 0.136628 | 0.856313 | -0.145259 |
2000-01-02 | -0.718630 | 0.113819 | 0.016217 | -1.571263 |
2000-01-03 | -0.143643 | -0.603598 | -1.143063 | -0.425266 |
2000-01-04 | -0.136663 | -0.486200 | -2.016020 | -0.815514 |
2000-01-05 | 1.036607 | 0.835318 | -0.502919 | 0.878680 |
2000-01-06 | 0.362577 | -2.376205 | -0.484754 | -0.478711 |
2000-01-07 | 1.330468 | 0.193371 | 0.544160 | 1.030900 |
2000-01-08 | -0.476653 | 0.476533 | -0.434356 | 0.744500 |
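# A minimal sketch (added, not from the original notebook) making the label alignment
# behaviour explicit on a hypothetical two-row frame:
tiny = pd.DataFrame({'A': [1, 2], 'B': [10, 20]})
tiny.loc[:, ['B', 'A']] = tiny[['A', 'B']]          # aligned on labels: no visible change
tiny.loc[:, ['B', 'A']] = tiny[['A', 'B']].values   # raw array: the values actually swap
tiny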
# Attribute Access
# You may access an index on a Series, column on a DataFrame,
# and an item on a Panel directly as an attribute:
sa = pd.Series([1, 2, 3], index=list('abc'))
sa
dfa = df.copy()
dfa
a    1
b    2
c    3
dtype: int64
| A | B | C | D |
2000-01-01 | 0.324097 | 0.136628 | 0.856313 | -0.145259 |
2000-01-02 | -0.718630 | 0.113819 | 0.016217 | -1.571263 |
2000-01-03 | -0.143643 | -0.603598 | -1.143063 | -0.425266 |
2000-01-04 | -0.136663 | -0.486200 | -2.016020 | -0.815514 |
2000-01-05 | 1.036607 | 0.835318 | -0.502919 | 0.878680 |
2000-01-06 | 0.362577 | -2.376205 | -0.484754 | -0.478711 |
2000-01-07 | 1.330468 | 0.193371 | 0.544160 | 1.030900 |
2000-01-08 | -0.476653 | 0.476533 | -0.434356 | 0.744500 |
# Assigning to an item / column via attribute access.
# It works like ordinary attribute access, but note:
# 1. the name must be a valid Python identifier (letters, digits and underscores)
# 2. it must not clash with an existing method name, e.g. min
# 3. it must not clash with a pandas "keyword", e.g. index, axis, items, labels
# In those cases, fall back to [""] indexing.
sa.a = 5
sa
dfa.index
dfa.A = list(range(len(dfa.index)))
dfa
a 5b 2c 3dtype: int64DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08'], dtype='datetime64[ns]', freq='D')
| A | B | C | D |
2000-01-01 | 0 | 0.136628 | 0.856313 | -0.145259 |
2000-01-02 | 1 | 0.113819 | 0.016217 | -1.571263 |
2000-01-03 | 2 | -0.603598 | -1.143063 | -0.425266 |
2000-01-04 | 3 | -0.486200 | -2.016020 | -0.815514 |
2000-01-05 | 4 | 0.835318 | -0.502919 | 0.878680 |
2000-01-06 | 5 | -2.376205 | -0.484754 | -0.478711 |
2000-01-07 | 6 | 0.193371 | 0.544160 | 1.030900 |
2000-01-08 | 7 | 0.476533 | -0.434356 | 0.744500 |
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})   # the dict keys become the column names
x
dict(x=9, y=99)
x.iloc[1]
x.iloc[1] = dict(x=9, y=99)
x
{'x': 9, 'y': 99}
x    2
y    4
Name: 1, dtype: int64
df = pd.DataFrame({'one': [1., 2., 3.]})
df
df.two = [4, 5, 6]     # wrong: attribute assignment cannot create a new column
df.two                 # ...but a plain attribute was created, and it is still readable
df
df['two'] = [4, 5, 6]  # this does add a column
df
UserWarning: Pandas doesn't allow columns to be created via a new attribute name -
see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
[4, 5, 6]
| one | two |
0 | 1.0 | 4 |
1 | 2.0 | 5 |
2 | 3.0 | 6 |
# Slicing ranges
# .iloc is the most robust and consistent way to slice,
# but the [] operator also slices, as shown below.
# On a Series:
# getting values
s
s[:5]
s[::2]
s[::-1]
2000-01-01 0.3240972000-01-02 -0.7186302000-01-03 -0.1436432000-01-04 -0.1366632000-01-05 1.0366072000-01-06 0.3625772000-01-07 1.3304682000-01-08 -0.476653Freq: D, Name: A, dtype: float642000-01-01 0.3240972000-01-02 -0.7186302000-01-03 -0.1436432000-01-04 -0.1366632000-01-05 1.036607Freq: D, Name: A, dtype: float642000-01-01 0.3240972000-01-03 -0.1436432000-01-05 1.0366072000-01-07 1.330468Freq: 2D, Name: A, dtype: float642000-01-08 -0.4766532000-01-07 1.3304682000-01-06 0.3625772000-01-05 1.0366072000-01-04 -0.1366632000-01-03 -0.1436432000-01-02 -0.7186302000-01-01 0.324097Freq: -1D, Name: A, dtype: float64
# setting values
s2 = s.copy()
s2
s2[:5] = 0
s2
2000-01-01 0.3240972000-01-02 -0.7186302000-01-03 -0.1436432000-01-04 -0.1366632000-01-05 1.0366072000-01-06 0.3625772000-01-07 1.3304682000-01-08 -0.476653Freq: D, Name: A, dtype: float642000-01-01 0.0000002000-01-02 0.0000002000-01-03 0.0000002000-01-04 0.0000002000-01-05 0.0000002000-01-06 0.3625772000-01-07 1.3304682000-01-08 -0.476653Freq: D, Name: A, dtype: float64
# On a DataFrame:
# the [] slice operator selects rows
df
df[:3]
df[::-1]
| one | two |
0 | 1.0 | 4 |
1 | 2.0 | 5 |
2 | 3.0 | 6 |
| one | two |
0 | 1.0 | 4 |
1 | 2.0 | 5 |
2 | 3.0 | 6 |
| one | two |
2 | 3.0 | 6 |
1 | 2.0 | 5 |
0 | 1.0 | 4 |
# Selection By Label
# This style of selection depends on context and can trigger chained assignment,
# which should be avoided. See:
# Returning a view versus a copy
# When setting values in a pandas object,
# care must be taken to avoid what is called chained indexing.
# Here is an example.
[list('abcd'), list('efgh'), list('ijkl'), list('mnop')]
[['one', 'two'], ['first', 'second']]
dfmi = pd.DataFrame([list('abcd'), list('efgh'), list('ijkl'), list('mnop')],   # four rows of values
                    columns=pd.MultiIndex.from_product(
                        [['one', 'two'], ['first', 'second']]))   # MultiIndex columns: the first list is level 0, and so on
dfmi
[['a', 'b', 'c', 'd'], ['e', 'f', 'g', 'h'], ['i', 'j', 'k', 'l'], ['m', 'n', 'o', 'p']]
[['one', 'two'], ['first', 'second']]
| one | two |
| first | second | first | second |
0 | a | b | c | d |
1 | e | f | g | h |
2 | i | j | k | l |
3 | m | n | o | p |
# Compare these two access methods:
dfmi['one']['second']            # chained
dfmi.loc[:, ('one', 'second')]
0 b1 f2 j3 nName: second, dtype: object0 b1 f2 j3 nName: (one, second), dtype: object
# Comparing the two approaches
# First approach: chained, using two [] operators
dfmi['one']             # the first [] produces an intermediate DataFrame
dfmi['one']['second']   # pandas treats the two [] as separate events: two separate calls to __getitem__
| first | second |
0 | a | b |
1 | e | f |
2 | i | j |
3 | m | n |
# Second approach: .loc
# slicing with a nested tuple
df_s = (slice(None), ('one', 'second'))
dfmi.loc[df_s]   # a single call to __getitem__
0 b1 f2 j3 nName: (one, second), dtype: object
# Selection By Label
# .loc requires that the indexer match the dtype of the index.
# With a DatetimeIndex, for example, an integer slice such as loc[2:3] raises a TypeError.
dfl = pd.DataFrame(np.random.randn(5, 4),
                   columns=list('ABCD'),
                   index=pd.date_range('20130101', periods=5))
dfl
# dfl.loc[2:3]                   # wrong dtype for this index -> TypeError
dfl.loc['20130102':'20130104']   # strings convertible to datetime are fine
dfl.loc['20130202':'20130204']   # no error: returns an empty DataFrame
# dfl.loc['20130202']            # KeyError: the index has no such value
dfl.loc['20130104':'20130114']   # returns only the rows that exist
| A | B | C | D |
2013-01-01 | -0.887358 | 1.068362 | 0.551961 | -0.378400 |
2013-01-02 | 1.239840 | -0.986149 | -0.880655 | 2.112731 |
2013-01-03 | -0.785526 | 1.583703 | -0.871005 | -0.659880 |
2013-01-04 | -1.267462 | 2.500886 | -0.980569 | 1.308624 |
2013-01-05 | -0.842107 | -0.921086 | 1.020196 | -0.055930 |
| A | B | C | D |
2013-01-02 | 1.239840 | -0.986149 | -0.880655 | 2.112731 |
2013-01-03 | -0.785526 | 1.583703 | -0.871005 | -0.659880 |
2013-01-04 | -1.267462 | 2.500886 | -0.980569 | 1.308624 |
| A | B | C | D |
2013-01-04 | -1.267462 | 2.500886 | -0.980569 | 1.308624 |
2013-01-05 | -0.842107 | -0.921086 | 1.020196 | -0.055930 |
# .loc can take integers, but they are interpreted as labels, not positions.
# .loc is the primary label-based access method.
# The input inside [] can be:
# 1. a single label, e.g. 5 or 'a'
# 2. a list of labels, ['a', 'b', 'c']
# 3. a slice object, 'a':'f'  -- NOTE: unlike Python slicing, label slices include BOTH endpoints
# 4. a boolean array
# 5. a callable, e.g. [lambda df: df.A > 0, :]
# Series
s1 = pd.Series(np.random.randn(6), index=list('abcdef'))
s1
s1.loc['c':]
s1.loc['b']
s1.loc['c':] = 0
s1
a 0.796911b -1.341250c 0.008152d -0.745881e 0.674385f 1.108411dtype: float64c 0.008152d -0.745881e 0.674385f 1.108411dtype: float64-1.3412499335785426a 0.796911b -1.341250c 0.000000d 0.000000e 0.000000f 0.000000dtype: float64
# DataFrame
df1 = pd.DataFrame(np.random.randn(6, 4),   # 6x4 array
                   index=list('abcdef'),    # row labels
                   columns=list('ABCD'))    # column labels
df1
| A | B | C | D |
a | -1.912141 | -0.835589 | -0.188341 | -1.024797 |
b | -0.977498 | -2.050214 | 0.355172 | -0.291794 |
c | -0.183401 | -0.376330 | -0.188848 | -2.116438 |
d | -1.008359 | 0.230593 | -0.099235 | -0.426229 |
e | -0.027338 | 2.125459 | 0.066200 | -0.247813 |
f | -1.132103 | 1.945235 | 1.891179 | 1.549750 |
df1.loc[['a', 'b', 'd'], :]   # row label selection first, then column label selection
df1.loc['d':, 'A':'C']
# Getting a cross section with a single label returns a Series.
# The following three are equivalent:
df1.loc['a']
df1.loc['a', :]
df1.xs('a')
type(df1.loc['a'])
| A | B | C | D |
a | -1.912141 | -0.835589 | -0.188341 | -1.024797 |
b | -0.977498 | -2.050214 | 0.355172 | -0.291794 |
d | -1.008359 | 0.230593 | -0.099235 | -0.426229 |
| A | B | C |
d | -1.008359 | 0.230593 | -0.099235 |
e | -0.027338 | 2.125459 | 0.066200 |
f | -1.132103 | 1.945235 | 1.891179 |
A -1.912141B -0.835589C -0.188341D -1.024797Name: a, dtype: float64A -1.912141B -0.835589C -0.188341D -1.024797Name: a, dtype: float64A -1.912141B -0.835589C -0.188341D -1.024797Name: a, dtype: float64pandas.core.series.Series
# selecting with a boolean array
df1.loc['a'] > 0
df1.loc[:, df1.loc['a'] > 0]
A    False
B    False
C    False
D    False
Name: a, dtype: bool
# getting a single value
# this is also equivalent to ``df1.at['a','A']``
df1.loc['a', 'A']
df1.at['a', 'A']
-1.9121410098540752
-1.9121410098540752
# Slicing with labels
s = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4])
s
s.loc[3:5]   # includes 5; note this is not labels 3, 4, 5 but everything from label 3 up to label 5
0 a3 b2 c5 d4 edtype: object3 b2 c5 ddtype: object
# s.loc[3:6]   # two errors are possible: ValueError (index must be monotonic increasing or decreasing) or KeyError: 6
# After sorting, the slice bounds may fall outside the index
s.sort_index()            # does not modify s
s
s.sort_index().loc[1:6]   # out-of-range bounds are fine
s.sort_index().loc[6:8]   # even if nothing falls inside them
# s.sort_index().loc[8]   # but not for a single label: only slices tolerate missing bounds
0 a2 c3 b4 e5 ddtype: object0 a3 b2 c5 d4 edtype: object2 c3 b4 e5 ddtype: objectSeries([], dtype: object)
# Selection By Position
# Purely integer-position based, like Python and numpy: 0-based and excluding the end point.
# .iloc accepts:
# 1. an integer
# 2. a list of integers
# 3. an integer slice
# 4. a boolean array
# 5. a callable
# Series
s1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2)))
s1
s1.iloc[:3]
s1.iloc[3]
s1.iloc[:3] = 0
s1
0 -0.3127162 1.4259364 1.7165756 2.0996668 0.262365dtype: float640 -0.3127162 1.4259364 1.716575dtype: float642.0996656798699750 0.0000002 0.0000004 0.0000006 2.0996668 0.262365dtype: float64
# DataFrame
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list(range(0, 12, 2)),
                   columns=list(range(0, 8, 2)))
df1
| 0 | 2 | 4 | 6 |
0 | 1.635993 | -0.512450 | 1.786760 | -0.002533 |
2 | 0.173188 | 0.275977 | -0.044987 | -1.077772 |
4 | 1.985020 | 1.604020 | 0.127853 | -1.003384 |
6 | 0.250428 | -0.102090 | 1.566787 | -1.708521 |
8 | -2.111103 | -1.232141 | 0.863753 | -0.545229 |
10 | -1.762999 | 1.009840 | 0.274013 | 0.786940 |
df1.iloc[:3]
df1.iloc[1:5, 2:4]
df1.iloc[[1, 3, 5], [1, 3]]
df1.iloc[1:3, :]
df1.iloc[:, 1:3]
| 0 | 2 | 4 | 6 |
0 | 1.635993 | -0.512450 | 1.786760 | -0.002533 |
2 | 0.173188 | 0.275977 | -0.044987 | -1.077772 |
4 | 1.985020 | 1.604020 | 0.127853 | -1.003384 |
| 4 | 6 |
2 | -0.044987 | -1.077772 |
4 | 0.127853 | -1.003384 |
6 | 1.566787 | -1.708521 |
8 | 0.863753 | -0.545229 |
| 2 | 6 |
2 | 0.275977 | -1.077772 |
6 | -0.102090 | -1.708521 |
10 | 1.009840 | 0.786940 |
| 0 | 2 | 4 | 6 |
2 | 0.173188 | 0.275977 | -0.044987 | -1.077772 |
4 | 1.985020 | 1.604020 | 0.127853 | -1.003384 |
| 2 | 4 |
0 | -0.512450 | 1.786760 |
2 | 0.275977 | -0.044987 |
4 | 1.604020 | 0.127853 |
6 | -0.102090 | 1.566787 |
8 | -1.232141 | 0.863753 |
10 | 1.009840 | 0.274013 |
# getting a single value
# this is also equivalent to ``df1.iat[1,1]``
df1.iloc[1, 1]
df1.iat[1, 1]
0.2759774784621013
0.2759774784621013
# For getting a cross section using an integer position (equiv to df.xs(1)),
# which returns a Series
df1.iloc[1]
df1.iloc[:, 1]
0 0.1731882 0.2759774 -0.0449876 -1.077772Name: 2, dtype: float640 -0.5124502 0.2759774 1.6040206 -0.1020908 -1.23214110 1.009840Name: 2, dtype: float64
# Out of range slice indexes are handled gracefully just as in Python/Numpy.
# Note: only slices (with a colon) tolerate out-of-bounds values;
# a single out-of-bounds index, or a list containing one, raises an error.
x = list('abcdef')
x[4:10]
x[8:10]
s = pd.Series(x)
s.iloc[4:10]
s.iloc[8:10]   # entirely out of bounds: returns empty
['e', 'f']
[]
4    e
5    f
dtype: object
Series([], dtype: object)
dfl = pd.DataFrame(np.random.randn(5, 2), columns=list('AB'))
dfl
dfl.iloc[:, 2:3]
dfl.iloc[:, 1:3]
dfl.iloc[4:6]
| A | B |
0 | -0.069941 | 1.124145 |
1 | -0.025781 | 0.940736 |
2 | -0.117417 | 0.503736 |
3 | 0.882286 | 0.302845 |
4 | -0.136374 | 0.276822 |
| B |
0 | 1.124145 |
1 | 0.940736 |
2 | 0.503736 |
3 | 0.302845 |
4 | 0.276822 |
# Selection By Callable
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list("abcdef"),
                   columns=list("ABCD"))
df1
df1.A
| A | B | C | D |
a | -1.947578 | 0.874286 | 1.139484 | -3.004564 |
b | 0.565255 | 0.028440 | 0.685688 | 0.973264 |
c | -1.275992 | 0.732339 | -0.324490 | 1.116887 |
d | 0.433325 | 0.002567 | -1.310127 | 0.844756 |
e | 0.341412 | -0.606646 | 0.034623 | 0.772968 |
f | 1.518936 | -0.590351 | 0.604839 | -1.461750 |
a -1.947578b 0.565255c -1.275992d 0.433325e 0.341412f 1.518936Name: A, dtype: float64
df1.loc[lambda df: df.A > 0, :]
df1.loc[:, lambda df: ['A', 'B']]
df1.iloc[:, lambda df: [0, 1]]
df1[lambda df: df.columns[0]]
| A | B | C | D |
b | 0.565255 | 0.028440 | 0.685688 | 0.973264 |
d | 0.433325 | 0.002567 | -1.310127 | 0.844756 |
e | 0.341412 | -0.606646 | 0.034623 | 0.772968 |
f | 1.518936 | -0.590351 | 0.604839 | -1.461750 |
| A | B |
a | -1.947578 | 0.874286 |
b | 0.565255 | 0.028440 |
c | -1.275992 | 0.732339 |
d | 0.433325 | 0.002567 |
e | 0.341412 | -0.606646 |
f | 1.518936 | -0.590351 |
| A | B |
a | -1.947578 | 0.874286 |
b | 0.565255 | 0.028440 |
c | -1.275992 | 0.732339 |
d | 0.433325 | 0.002567 |
e | 0.341412 | -0.606646 |
f | 1.518936 | -0.590351 |
a -1.947578b 0.565255c -1.275992d 0.433325e 0.341412f 1.518936Name: A, dtype: float64
df1.A
df1.A.loc[lambda s: s > 0]
df1.A.loc[df1.A > 0]
a -1.947578b 0.565255c -1.275992d 0.433325e 0.341412f 1.518936Name: A, dtype: float64b 0.565255d 0.433325e 0.341412f 1.518936Name: A, dtype: float64b 0.565255d 0.433325e 0.341412f 1.518936Name: A, dtype: float64
# With these methods you can chain selections without intermediate temporary variables
# (method chaining, not chained []).
bb = pd.read_csv('data/baseball.csv', index_col='id')
bb.groupby(['year', 'team']).sum().loc[lambda df: df.r > 100]
# IX Indexer is Deprecated
# in favor of the more strict .iloc and .loc indexers.
dfd = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=list('abc'))
dfd
dfd.ix[[0, 2], 'A']
DeprecationWarning: .ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
a    1
c    3
Name: A, dtype: int64
dfd.index[[0, 2]]                           # get the index labels
dfd.loc[dfd.index[[0, 2]], 'A']
dfd.columns.get_loc('A')                    # get the integer position of the column
dfd.iloc[[0, 2], dfd.columns.get_loc('A')]
Index(['a', 'c'], dtype='object')
a    1
c    3
Name: A, dtype: int64
0
a    1
c    3
Name: A, dtype: int64
# Indexing with list with missing labels is Deprecated
# Using .loc or [] with a list containing one or more missing labels is deprecated,
# in favor of .reindex.
s = pd.Series([1, 2, 3])
s
0 11 22 3dtype: int64
s.loc[[1, 2]]   # all keys in the list exist, so nothing special happens
1 22 3dtype: int64
s.loc[[1, 2, 3]]   # missing labels come back as NaN
FutureWarning: Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
1    2.0
2    3.0
3    NaN
dtype: float64
# Reindexing
s.reindex([1, 2, 3])
1 2.02 3.03 NaNdtype: float64
# Keeping only the valid keys
labels = [1, 2, 3]
s.index.intersection(labels)   # intersection of the index and the labels
s.loc[s.index.intersection(labels)]
Int64Index([1, 2], dtype='int64')
1    2
2    3
dtype: int64
# reindex does not accept a duplicated index
s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])
s
labels = ['c', 'd']
# s.reindex(labels)   # fails: cannot reindex from a duplicate axis
a    0
a    1
b    2
c    3
dtype: int32
# You can slice out the intersection first and then reindex,
# as long as the intersection itself has no duplicated labels.
s.index.intersection(labels)
s.loc[s.index.intersection(labels)]
s.loc[s.index.intersection(labels)].reindex(labels)
Index(['c'], dtype='object')
c    3
dtype: int32
c    3.0
d    NaN
dtype: float64
# Selecting Random Samples
# By default sampling is over the rows.
s = pd.Series([0, 1, 2, 3, 4, 5])
s
0 01 12 23 34 45 5dtype: int64
s.sample()           # one row by default
s.sample(n=3)        # three rows
s.sample(frac=0.5)   # a fraction: the percentage of rows to draw
s.sample(frac=0.8)
s
1 1dtype: int643 34 45 5dtype: int642 23 34 4dtype: int642 21 13 30 05 5dtype: int640 01 12 23 34 45 5dtype: int64
s.sample(n=6, replace=False)   # the default
s.sample(n=6, replace=True)    # sample with replacement; s itself is unchanged
s
5 54 41 10 02 23 3dtype: int640 04 40 02 20 00 0dtype: int640 01 12 23 34 45 5dtype: int64
# By default every row has the same probability of being drawn;
# you can also give each row a weight.
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=example_weights)
example_weights2 = [0.5, 0, 0, 0, 0, 0]   # weights are re-normalized automatically
s.sample(n=1, weights=example_weights2)
2 23 35 5dtype: int640 0dtype: int64
# A DataFrame column can be used as the weights
df2 = pd.DataFrame({'col1': [9, 8, 7, 6],
                    'weight_column': [0.5, 0.4, 0.1, 0]})
df2.sample(n=3, weights='weight_column')
| col1 | weight_column |
1 | 8 | 0.4 |
2 | 7 | 0.1 |
0 | 9 | 0.5 |
# Sampling columns
df3 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4], 'col3': [3, 4, 5]})
df3
df3.sample(n=2, axis=1)   # axis=1 samples columns, i.e. a combination of columns
df3.sample(n=2, axis=0)
| col1 | col2 | col3 |
0 | 1 | 2 | 3 |
1 | 2 | 3 | 4 |
2 | 3 | 4 | 5 |
| col3 | col2 |
0 | 3 | 2 |
1 | 4 | 3 |
2 | 5 | 4 |
| col1 | col2 | col3 |
2 | 3 | 4 | 5 |
0 | 1 | 2 | 3 |
# A random seed (or a numpy RandomState object) can be passed to sample's random number generator.
# With a fixed seed the sample is reproducible.
df3.sample(n=2, random_state=2)
df3.sample(n=2, random_state=2)
df3.sample(n=2, random_state=200)
df3.sample(n=2, random_state=200)
| col1 | col2 | col3 |
2 | 3 | 4 | 5 |
1 | 2 | 3 | 4 |
| col1 | col2 | col3 |
2 | 3 | 4 | 5 |
1 | 2 | 3 | 4 |
| col1 | col2 | col3 |
0 | 1 | 2 | 3 |
1 | 2 | 3 | 4 |
| col1 | col2 | col3 |
0 | 1 | 2 | 3 |
1 | 2 | 3 | 4 |
# Setting With Enlargement
# .loc and [] can enlarge a Series or DataFrame by assigning to a key that does not exist.
# Series
se = pd.Series([1, 2, 3])
se
se[5] = 5   # appends a new entry
se
0 11 22 3dtype: int640 11 22 35 5dtype: int64
# DataFrame
dfi = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['A', 'B'])
dfi
dfi.loc[:, 'C'] = dfi.loc[:, 'A']   # enlarge: adds a column
dfi
| A | B | C |
0 | 0 | 1 | 0 |
1 | 2 | 3 | 2 |
2 | 4 | 5 | 4 |
dfi.loc[3] = 5   # append: adds a row
dfi
| A | B | C |
0 | 0 | 1 | 0 |
1 | 2 | 3 | 2 |
2 | 4 | 5 | 4 |
3 | 5 | 5 | 5 |
# Fast scalar value getting and setting
# [] can do many things, so figuring out which operation you want adds a little overhead.
# The fastest scalar access is via the at and iat methods, available on all data structures:
# at  is like loc   (label based)
# iat is like iloc  (integer position based)
s = pd.Series([0, 1, 2, 3, 4, 5])
s
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
0 01 12 23 34 45 5dtype: int64
| A | B | C | D |
2000-01-01 | 0.845046 | -0.119758 | 1.354224 | -0.134199 |
2000-01-02 | -1.935579 | -1.469185 | -2.581439 | 0.355347 |
2000-01-03 | -0.038740 | -1.524056 | 1.376257 | 1.572331 |
2000-01-04 | -0.846971 | 0.189231 | -0.287885 | -0.561706 |
2000-01-05 | -0.127290 | -0.043918 | 0.103347 | -1.055387 |
2000-01-06 | 0.406437 | 1.917624 | 0.810463 | 0.367583 |
2000-01-07 | 0.438904 | -0.230190 | 0.593607 | -0.438856 |
2000-01-08 | -1.955122 | 1.531260 | 0.889124 | -0.014259 |
# get value
s.iat[5]
df.at[dates[5], 'A']
df.iat[3, 0]
5
0.40643702489386246
-0.8469710793801154
# set valuedf.at[dates[5], 'E'] = 7 # 没有列就增加一列,没有值的默认赋值为nandf
| A | B | C | D | E |
2000-01-01 | 0.845046 | -0.119758 | 1.354224 | -0.134199 | NaN |
2000-01-02 | -1.935579 | -1.469185 | -2.581439 | 0.355347 | NaN |
2000-01-03 | -0.038740 | -1.524056 | 1.376257 | 1.572331 | NaN |
2000-01-04 | -0.846971 | 0.189231 | -0.287885 | -0.561706 | NaN |
2000-01-05 | -0.127290 | -0.043918 | 0.103347 | -1.055387 | NaN |
2000-01-06 | 0.406437 | 1.917624 | 0.810463 | 0.367583 | 7.0 |
2000-01-07 | 0.438904 | -0.230190 | 0.593607 | -0.438856 | NaN |
2000-01-08 | -1.955122 | 1.531260 | 0.889124 | -0.014259 | NaN |
df.iat[3, 0] = 7
df
| A | B | C | D | E |
2000-01-01 | 0.845046 | -0.119758 | 1.354224 | -0.134199 | NaN |
2000-01-02 | -1.935579 | -1.469185 | -2.581439 | 0.355347 | NaN |
2000-01-03 | -0.038740 | -1.524056 | 1.376257 | 1.572331 | NaN |
2000-01-04 | 7.000000 | 0.189231 | -0.287885 | -0.561706 | NaN |
2000-01-05 | -0.127290 | -0.043918 | 0.103347 | -1.055387 | NaN |
2000-01-06 | 0.406437 | 1.917624 | 0.810463 | 0.367583 | 7.0 |
2000-01-07 | 0.438904 | -0.230190 | 0.593607 | -0.438856 | NaN |
2000-01-08 | -1.955122 | 1.531260 | 0.889124 | -0.014259 | NaN |
df.at[dates[-1] + 1, 0] = 7   # enlarges both the rows and the columns
df
| A | B | C | D | E | 0 |
2000-01-01 | 0.845046 | -0.119758 | 1.354224 | -0.134199 | NaN | NaN |
2000-01-02 | -1.935579 | -1.469185 | -2.581439 | 0.355347 | NaN | NaN |
2000-01-03 | -0.038740 | -1.524056 | 1.376257 | 1.572331 | NaN | NaN |
2000-01-04 | 7.000000 | 0.189231 | -0.287885 | -0.561706 | NaN | NaN |
2000-01-05 | -0.127290 | -0.043918 | 0.103347 | -1.055387 | NaN | NaN |
2000-01-06 | 0.406437 | 1.917624 | 0.810463 | 0.367583 | 7.0 | NaN |
2000-01-07 | 0.438904 | -0.230190 | 0.593607 | -0.438856 | NaN | NaN |
2000-01-08 | -1.955122 | 1.531260 | 0.889124 | -0.014259 | NaN | NaN |
2000-01-09 | NaN | NaN | NaN | NaN | NaN | 7.0 |
# Boolean indexing
# Filter data with boolean vectors.
# The operators are | for or, & for and, and ~ for not; they must be grouped with parentheses.
s = pd.Series(range(-3, 4))   # a range can be passed directly, no need for list() first
s
0 -31 -22 -13 04 15 26 3dtype: int64
s[s >= 0]   # the Series itself is used as the mask, no need to extract its values
s[~(s < 0)]
s[(s < -1) | (s > 0.5)]
3 04 15 26 3dtype: int643 04 15 26 3dtype: int640 -31 -24 15 26 3dtype: int64
df[df['A'] > 0]
| A | B | C | D | E | 0 |
2000-01-01 | 0.845046 | -0.119758 | 1.354224 | -0.134199 | NaN | NaN |
2000-01-04 | 7.000000 | 0.189231 | -0.287885 | -0.561706 | NaN | NaN |
2000-01-06 | 0.406437 | 1.917624 | 0.810463 | 0.367583 | 7.0 | NaN |
2000-01-07 | 0.438904 | -0.230190 | 0.593607 | -0.438856 | NaN | NaN |
# List comprehensions and map method of Series can also be used to produce more complex criteria:
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                    'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                    'c': np.random.randn(7)})
df2
| a | b | c |
0 | one | x | 1.706402 |
1 | one | y | 0.491977 |
2 | two | y | 1.357986 |
3 | three | x | -1.023513 |
4 | two | y | -0.653028 |
5 | one | x | 0.041052 |
6 | six | x | 1.021882 |
# the criterion
criterion = df2['a'].map(lambda x: x.startswith('t'))   # selects 'two' and 'three'
criterion
0 False1 False2 True3 True4 True5 False6 FalseName: a, dtype: bool
df2[criterion]   # select the rows matching the criterion on column a; all columns are kept
| a | b | c |
2 | two | y | 1.357986 |
3 | three | x | -1.023513 |
4 | two | y | -0.653028 |
# equivalent, but a bit slower
df2[[x.startswith('t') for x in df2['a']]]   # a list comprehension instead of map
| a | b | c |
2 | two | y | 1.357986 |
3 | three | x | -1.023513 |
4 | two | y | -0.653028 |
# compound criteria
df2[criterion & (df2['b'] == 'x')]   # both column a and column b must match
| a | b | c |
3 | three | x | -1.023513 |
# Boolean vectors can be combined with other indexing expressions
df2.loc[criterion & (df2['b'] == 'x'), 'b':'c']   # keep only columns b to c (column a is dropped)
# Indexing with isin
# Series
# testing the values of the Series
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s
4 03 12 21 30 4dtype: int64
s.isin([2, 4, 6])      # which values of the Series appear in the list; returns booleans
s[s.isin([2, 4, 6])]   # the boolean result can be used to select
4 False3 False2 True1 False0 Truedtype: bool2 20 4dtype: int64
# The Index object has its own isin as well
s[s.index.isin([2, 4, 6])]
s.reindex([2, 4, 6])   # different: labels missing from s come back as NaN, and int64 becomes float64
4 02 2dtype: int642 2.04 0.06 NaNdtype: float64
# With a MultiIndex you can select against a single index level
s_mi = pd.Series(np.arange(6),
                 index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi
0 a 0 b 1 c 21 a 3 b 4 c 5dtype: int32
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]   # level=1 matches against the second index level
0 c 21 a 3dtype: int320 a 0 c 21 a 3 c 5dtype: int32
# DataFrame
df = pd.DataFrame({'vals': [1, 2, 3, 4],
                   'ids': ['a', 'b', 'f', 'n'],
                   'ids2': ['a', 'n', 'c', 'n']})
df
| ids | ids2 | vals |
0 | a | a | 1 |
1 | b | n | 2 |
2 | f | c | 3 |
3 | n | n | 4 |
values = ['a', 'b', 1, 3]
df.isin(values)   # every column is matched against the whole list
| ids | ids2 | vals |
0 | True | True | True |
1 | True | False | False |
2 | False | False | True |
3 | False | False | False |
values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}
row_mask = df.isin(values)   # with a dict, each column is matched against its own list
row_mask
# row_mask.all: return whether all elements are True over the requested axis
row_mask = row_mask.all(1)
row_mask
df[row_mask]   # keep only the rows that are True everywhere
| ids | ids2 | vals |
0 | True | True | True |
1 | True | False | False |
2 | False | True | True |
3 | False | False | False |
0 True1 False2 False3 Falsedtype: bool
# The where() Method and Masking
# To guarantee that selection output has the same shape as the original data
# Series
s
s[s > 0]        # returns only the matching entries
s.where(s > 0)  # returns everything, with NaN where the condition is False
4 03 12 21 30 4dtype: int643 12 21 30 4dtype: int644 NaN3 1.02 2.01 3.00 4.0dtype: float64
# With a DataFrame, boolean selection already preserves the original shape
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df
| A | B | C | D |
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
df[df < 0]
df.where(df < 0)        # equivalent to the line above
df.where(df < 0, -df)   # where accepts an "other" argument used where the condition is False;
                        # it returns a copy and does not modify df
df
| A | B | C | D |
2000-01-01 | NaN | -0.500453 | NaN | NaN |
2000-01-02 | -1.048619 | NaN | -0.295425 | NaN |
2000-01-03 | -0.214918 | -1.989228 | NaN | NaN |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | NaN |
2000-01-05 | NaN | NaN | NaN | -0.089382 |
2000-01-06 | -1.041194 | NaN | -1.084442 | NaN |
2000-01-07 | -1.040706 | -2.336161 | NaN | NaN |
2000-01-08 | NaN | -0.075381 | -0.951126 | -0.347865 |
| A | B | C | D |
2000-01-01 | NaN | -0.500453 | NaN | NaN |
2000-01-02 | -1.048619 | NaN | -0.295425 | NaN |
2000-01-03 | -0.214918 | -1.989228 | NaN | NaN |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | NaN |
2000-01-05 | NaN | NaN | NaN | -0.089382 |
2000-01-06 | -1.041194 | NaN | -1.084442 | NaN |
2000-01-07 | -1.040706 | -2.336161 | NaN | NaN |
2000-01-08 | NaN | -0.075381 | -0.951126 | -0.347865 |
| A | B | C | D |
2000-01-01 | -0.383691 | -0.500453 | -0.101632 | -0.848213 |
2000-01-02 | -1.048619 | -0.856605 | -0.295425 | -1.060710 |
2000-01-03 | -0.214918 | -1.989228 | -0.278514 | -1.088771 |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | -1.457230 |
2000-01-05 | -0.656919 | -1.081810 | -1.148303 | -0.089382 |
2000-01-06 | -1.041194 | -0.533706 | -1.084442 | -1.824709 |
2000-01-07 | -1.040706 | -2.336161 | -0.565496 | -0.269414 |
2000-01-08 | -0.166739 | -0.075381 | -0.951126 | -0.347865 |
| A | B | C | D |
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
# You may wish to set values based on some boolean criteria. This can be done intuitively like so:
s2 = s.copy()
s2
s2[s2 < 3] = 0
s2
df2 = df.copy()
df2
df2[df2 < 0] = 0
df2
4 03 12 21 30 4dtype: int644 03 02 01 30 4dtype: int64
| A | B | C | D |
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
| A | B | C | D |
2000-01-01 | 0.383691 | 0.000000 | 0.101632 | 0.848213 |
2000-01-02 | 0.000000 | 0.856605 | 0.000000 | 1.060710 |
2000-01-03 | 0.000000 | 0.000000 | 0.278514 | 1.088771 |
2000-01-04 | 0.000000 | 0.000000 | 0.000000 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | 0.000000 |
2000-01-06 | 0.000000 | 0.533706 | 0.000000 | 1.824709 |
2000-01-07 | 0.000000 | 0.000000 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | 0.000000 | 0.000000 | 0.000000 |
# By default where returns a modified copy and leaves the original untouched;
# with inplace=True it modifies the original instead of creating a copy.
df_orig = df.copy()
df_orig.where(df > 0, -df, inplace=True)
df_orig
| A | B | C | D |
2000-01-01 | 0.383691 | 0.500453 | 0.101632 | 0.848213 |
2000-01-02 | 1.048619 | 0.856605 | 0.295425 | 1.060710 |
2000-01-03 | 0.214918 | 1.989228 | 0.278514 | 1.088771 |
2000-01-04 | 2.326385 | 0.225754 | 1.331598 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | 0.089382 |
2000-01-06 | 1.041194 | 0.533706 | 1.084442 | 1.824709 |
2000-01-07 | 1.040706 | 2.336161 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | 0.075381 | 0.951126 | 0.347865 |
# Note: pandas' where and numpy's where take their arguments differently.
# In general, df1.where(m, df2) is equivalent to np.where(m, df1, df2).
df.where(df < 0, -df) == np.where(df < 0, df, -df)
| A | B | C | D |
2000-01-01 | True | True | True | True |
2000-01-02 | True | True | True | True |
2000-01-03 | True | True | True | True |
2000-01-04 | True | True | True | True |
2000-01-05 | True | True | True | True |
2000-01-06 | True | True | True | True |
2000-01-07 | True | True | True | True |
2000-01-08 | True | True | True | True |
# Alignment
# where aligns the boolean condition with the frame, so the condition can cover only part of the frame.
# This is analogous to partial setting via .loc (but on the contents rather than the axis labels)
df2 = df.copy()
df2
df2[1:4]                     # a row slice
df2.where(df2[1:4] > 0, 3)   # cells where the aligned condition is not True (including every row outside 1:4) become 3
df2[df2[1:4] > 0] = 3        # partial setting: only the cells in rows 1:4 that satisfy the condition become 3
df2
| A | B | C | D |
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
| A | B | C | D |
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
| A | B | C | D |
2000-01-01 | 3.0 | 3.000000 | 3.000000 | 3.000000 |
2000-01-02 | 3.0 | 0.856605 | 3.000000 | 1.060710 |
2000-01-03 | 3.0 | 3.000000 | 0.278514 | 1.088771 |
2000-01-04 | 3.0 | 3.000000 | 3.000000 | 1.457230 |
2000-01-05 | 3.0 | 3.000000 | 3.000000 | 3.000000 |
2000-01-06 | 3.0 | 3.000000 | 3.000000 | 3.000000 |
2000-01-07 | 3.0 | 3.000000 | 3.000000 | 3.000000 |
2000-01-08 | 3.0 | 3.000000 | 3.000000 | 3.000000 |
| A | B | C | D |
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
2000-01-02 | -1.048619 | 3.000000 | -0.295425 | 3.000000 |
2000-01-03 | -0.214918 | -1.989228 | 3.000000 | 3.000000 |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 3.000000 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
# Where can also accept axis and level parameters to align the input when performing the where.
df2 = df.copy()
df2
df2.where(df2 > 0, df2['A'], axis='index')   # cells <= 0 are replaced by the value of column A in the same row
df2
df2.apply(lambda x, y: x.where(x > 0, y), y=df['A'])   # equivalent but slower; also returns a copy, df2 is unchanged
df2
| A | B | C | D |
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
| A | B | C | D |
2000-01-01 | 0.383691 | 0.383691 | 0.101632 | 0.848213 |
2000-01-02 | -1.048619 | 0.856605 | -1.048619 | 1.060710 |
2000-01-03 | -0.214918 | -0.214918 | 0.278514 | 1.088771 |
2000-01-04 | -2.326385 | -2.326385 | -2.326385 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | 0.656919 |
2000-01-06 | -1.041194 | 0.533706 | -1.041194 | 1.824709 |
2000-01-07 | -1.040706 | -1.040706 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | 0.166739 | 0.166739 | 0.166739 |
| A | B | C | D |
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
| A | B | C | D |
2000-01-01 | 0.383691 | 0.383691 | 0.101632 | 0.848213 |
2000-01-02 | -1.048619 | 0.856605 | -1.048619 | 1.060710 |
2000-01-03 | -0.214918 | -0.214918 | 0.278514 | 1.088771 |
2000-01-04 | -2.326385 | -2.326385 | -2.326385 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | 0.656919 |
2000-01-06 | -1.041194 | 0.533706 | -1.041194 | 1.824709 |
2000-01-07 | -1.040706 | -1.040706 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | 0.166739 | 0.166739 | 0.166739 |
| A | B | C | D |
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
# where also accepts a callable; it must take a single argument and return valid boolean output.
df3 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
df3
df3.where(lambda x: x > 4, lambda x: x + 10)   # values <= 4 are replaced by x + 10
| A | B | C |
0 | 1 | 4 | 7 |
1 | 2 | 5 | 8 |
2 | 3 | 6 | 9 |
| A | B | C |
0 | 11 | 14 | 7 |
1 | 12 | 5 | 8 |
2 | 13 | 6 | 9 |
# mask is the inverse boolean operation of where:
# entries where the condition is True are set to NaN.
s
s.mask(s >= 0)
df
df.mask(df >= 0)
4 03 12 21 30 4dtype: int644 NaN3 NaN2 NaN1 NaN0 NaNdtype: float64
| A | B | C | D |
2000-01-01 | 0.383691 | -0.500453 | 0.101632 | 0.848213 |
2000-01-02 | -1.048619 | 0.856605 | -0.295425 | 1.060710 |
2000-01-03 | -0.214918 | -1.989228 | 0.278514 | 1.088771 |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | 1.457230 |
2000-01-05 | 0.656919 | 1.081810 | 1.148303 | -0.089382 |
2000-01-06 | -1.041194 | 0.533706 | -1.084442 | 1.824709 |
2000-01-07 | -1.040706 | -2.336161 | 0.565496 | 0.269414 |
2000-01-08 | 0.166739 | -0.075381 | -0.951126 | -0.347865 |
| A | B | C | D |
2000-01-01 | NaN | -0.500453 | NaN | NaN |
2000-01-02 | -1.048619 | NaN | -0.295425 | NaN |
2000-01-03 | -0.214918 | -1.989228 | NaN | NaN |
2000-01-04 | -2.326385 | -0.225754 | -1.331598 | NaN |
2000-01-05 | NaN | NaN | NaN | -0.089382 |
2000-01-06 | -1.041194 | NaN | -1.084442 | NaN |
2000-01-07 | -1.040706 | -2.336161 | NaN | NaN |
2000-01-08 | NaN | -0.075381 | -0.951126 | -0.347865 |
# The query() Method (Experimental): select rows of a DataFrame with a string expression.
# Example: select the rows where the value in column b lies between the values in columns a and c.
n = 10
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df
| a | b | c |
0 | 0.283094 | 0.051807 | 0.126487 |
1 | 0.020097 | 0.373023 | 0.147193 |
2 | 0.091921 | 0.830956 | 0.143214 |
3 | 0.340304 | 0.527246 | 0.709769 |
4 | 0.651722 | 0.344524 | 0.151233 |
5 | 0.396685 | 0.524376 | 0.540237 |
6 | 0.502751 | 0.627708 | 0.708038 |
7 | 0.472338 | 0.269770 | 0.586165 |
8 | 0.937522 | 0.239560 | 0.861873 |
9 | 0.661879 | 0.465536 | 0.271580 |
# pure python
df[(df.a < df.b) & (df.b < df.c)]
| a | b | c |
3 | 0.340304 | 0.527246 | 0.709769 |
5 | 0.396685 | 0.524376 | 0.540237 |
6 | 0.502751 | 0.627708 | 0.708038 |
# query takes a string expression
df.query('(a < b) & (b < c)')   # on a frame this small it is actually slower than the pure-Python version
| a | b | c |
3 | 0.340304 | 0.527246 | 0.709769 |
5 | 0.396685 | 0.524376 | 0.540237 |
6 | 0.502751 | 0.627708 | 0.708038 |
# A named index can be used in the expression like a column name
# (though plain column names are the more common choice);
# if the index name collides with a column name, the column name takes precedence.
df = pd.DataFrame(np.random.randint(n / 2, size=(n, 2)), columns=list('bc'))
# df
df.index.name = "a"
df
| b | c |
a | | |
0 | 0 | 4 |
1 | 1 | 2 |
2 | 3 | 0 |
3 | 0 | 3 |
4 | 4 | 3 |
5 | 0 | 0 |
6 | 0 | 4 |
7 | 1 | 4 |
8 | 3 | 0 |
9 | 1 | 4 |
df.query('a <= b and b <= c')
# Instead of the index name you can simply write 'index',
# which also avoids any clash with a column of the same name.
df.query('index <= b <= c')
# MultiIndex query() Syntax
n = 10
colors = np.random.choice(['red', 'green'], size=n)
colors
foods = np.random.choice(['eggs', 'ham'], size=n)
foods
array(['green', 'green', 'green', 'green', 'green', 'green', 'green', 'red', 'green', 'green'], dtype='
index = pd.MultiIndex.from_arrays([colors, foods], names=['color', 'food'])
index
df = pd.DataFrame(np.random.randn(n, 2), index=index)
df
MultiIndex(levels=[['green', 'red'], ['eggs', 'ham']], labels=[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 1, 0, 0, 1, 1]], names=['color', 'food'])
| | 0 | 1 |
color | food | | |
green | ham | -0.295761 | 1.399719 |
eggs | 1.807185 | 0.498136 |
eggs | -0.119640 | 2.279162 |
eggs | -0.238709 | -0.650418 |
eggs | 2.235827 | 1.066954 |
ham | 1.156794 | 1.694717 |
eggs | -0.037158 | -0.529213 |
red | eggs | 0.046799 | 0.763592 |
green | ham | 0.488240 | -0.455112 |
ham | -0.169486 | -0.646891 |
df.query('color == "red"')
| | 0 | 1 |
color | food | | |
red | eggs | 0.046799 | 0.763592 |
# If the levels of the MultiIndex are unnamed, special names are available
df.index.names = [None, None]
df
df.query('ilevel_0 == "red"')   # the convention is ilevel_0, which means "index level 0" for the 0th level of the index
| | 0 | 1 |
green | ham | -0.295761 | 1.399719 |
eggs | 1.807185 | 0.498136 |
eggs | -0.119640 | 2.279162 |
eggs | -0.238709 | -0.650418 |
eggs | 2.235827 | 1.066954 |
ham | 1.156794 | 1.694717 |
eggs | -0.037158 | -0.529213 |
red | eggs | 0.046799 | 0.763592 |
green | ham | 0.488240 | -0.455112 |
ham | -0.169486 | -0.646891 |
| | 0 | 1 |
red | eggs | 0.046799 | 0.763592 |
# query() Use Cases
# A use case for query() is when you have a collection of DataFrame objects
# that have a subset of column names (or index levels/names) in common.
# You can pass the same query to all of the frames
# without having to specify which frame you're interested in querying.
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df2 = pd.DataFrame(np.random.rand(n + 2, 3), columns=df.columns)
df
df2
| a | b | c |
0 | 0.096087 | 0.799309 | 0.112070 |
1 | 0.083101 | 0.505151 | 0.830588 |
2 | 0.501472 | 0.692792 | 0.663570 |
3 | 0.872821 | 0.442541 | 0.904903 |
4 | 0.513651 | 0.375617 | 0.786898 |
5 | 0.184368 | 0.285290 | 0.055147 |
6 | 0.284951 | 0.243556 | 0.732871 |
7 | 0.431525 | 0.676385 | 0.458296 |
8 | 0.743057 | 0.326079 | 0.434655 |
9 | 0.610921 | 0.717639 | 0.580765 |
| a | b | c |
0 | 0.804932 | 0.533058 | 0.713190 |
1 | 0.305893 | 0.460048 | 0.879321 |
2 | 0.344438 | 0.970870 | 0.685098 |
3 | 0.348010 | 0.839449 | 0.799309 |
4 | 0.844713 | 0.267562 | 0.771202 |
5 | 0.403534 | 0.088786 | 0.950782 |
6 | 0.181616 | 0.567118 | 0.989711 |
7 | 0.363736 | 0.852080 | 0.140771 |
8 | 0.866127 | 0.285365 | 0.025491 |
9 | 0.329751 | 0.121716 | 0.782729 |
10 | 0.029253 | 0.419409 | 0.051255 |
11 | 0.057406 | 0.106595 | 0.559687 |
expr = '0.0 <= a <= c <= 0.5'
mp = map(lambda frame: frame.query(expr), [df, df2])   # the same expression applied to several frames sharing the column names
for i in mp: print(i)
          a         b         c
0  0.096087  0.799309  0.112070
7  0.431525  0.676385  0.458296
           a         b         c
10  0.029253  0.419409  0.051255
# query() Python versus pandas Syntax Comparison
# Full numpy-like syntax
df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc'))
df
| a | b | c |
0 | 8 | 4 | 5 |
1 | 7 | 4 | 7 |
2 | 9 | 3 | 2 |
3 | 4 | 8 | 7 |
4 | 4 | 5 | 8 |
5 | 7 | 8 | 8 |
6 | 1 | 7 | 6 |
7 | 5 | 9 | 1 |
8 | 3 | 3 | 8 |
9 | 7 | 1 | 5 |
df.query('(a < b) & (b < c)')
df[(df.a < df.b) & (df.b < df.c)]
# several other ways to write the same thing
df.query('a < b & b < c')     # drop the parentheses
df.query('a < b and b < c')   # use the English 'and'
df.query('a < b < c')         # a chained comparison: the most elegant form
# The in and not in operators
# get all rows where columns "a" and "b" have overlapping values,
# i.e. where the value in column a also appears in column b
df = pd.DataFrame({'a': list('aabbccddeeff'),
                   'b': list('aaaabbbbcccc'),
                   'c': np.random.randint(5, size=12),
                   'd': np.random.randint(9, size=12)})
df
| a | b | c | d |
0 | a | a | 0 | 2 |
1 | a | a | 0 | 6 |
2 | b | a | 3 | 3 |
3 | b | a | 1 | 2 |
4 | c | b | 1 | 2 |
5 | c | b | 1 | 2 |
6 | d | b | 3 | 8 |
7 | d | b | 1 | 0 |
8 | e | c | 2 | 7 |
9 | e | c | 0 | 1 |
10 | f | c | 0 | 1 |
11 | f | c | 3 | 1 |
df.query('a in b')   # first run ~113 ms, noticeably faster on the second run (~25 ms)
| a | b | c | d |
0 | a | a | 0 | 2 |
1 | a | a | 0 | 6 |
2 | b | a | 3 | 3 |
3 | b | a | 1 | 2 |
4 | c | b | 1 | 2 |
5 | c | b | 1 | 2 |
df[df.a.isin(df.b)]   # how you'd do it in pure Python; still faster than query here (~35 ms)
| a | b | c | d |
0 | a | a | 0 | 2 |
1 | a | a | 0 | 6 |
2 | b | a | 3 | 3 |
3 | b | a | 1 | 2 |
4 | c | b | 1 | 2 |
5 | c | b | 1 | 2 |
df[~df.a.isin(df.b)]
| a | b | c | d |
6 | d | b | 3 | 8 |
7 | d | b | 1 | 0 |
8 | e | c | 2 | 7 |
9 | e | c | 0 | 1 |
10 | f | c | 0 | 1 |
11 | f | c | 3 | 1 |
df.query('a not in b')
| a | b | c | d |
6 | d | b | 3 | 8 |
7 | d | b | 1 | 0 |
8 | e | c | 2 | 7 |
9 | e | c | 0 | 1 |
10 | f | c | 0 | 1 |
11 | f | c | 3 | 1 |
# rows where cols a and b have overlapping values and col c's values are less than col d's
df.query('a in b and c < d')
df[df.a.isin(df.b) & (df.c < df.d)]
| a | b | c | d |
0 | a | a | 0 | 2 |
1 | a | a | 0 | 6 |
3 | b | a | 1 | 2 |
4 | c | b | 1 | 2 |
5 | c | b | 1 | 2 |
| a | b | c | d |
0 | a | a | 0 | 2 |
1 | a | a | 0 | 6 |
3 | b | a | 1 | 2 |
4 | c | b | 1 | 2 |
5 | c | b | 1 | 2 |
# Note that in and not in are evaluated in Python, since numexpr has no equivalent of this operation.
# However, only the in/not in expression itself is evaluated in vanilla Python.
# For example, in the expression
df.query('a in b + c + d')
# (b + c + d) is evaluated by numexpr and then the in operation is evaluated in plain Python.
# In general, any operations that can be evaluated using numexpr will be.
# Special use of the == operator with list objects
# Comparing a list of values to a column using ==/!= works similarly to in/not in
df
| a | b | c | d |
0 | a | a | 0 | 2 |
1 | a | a | 0 | 6 |
2 | b | a | 3 | 3 |
3 | b | a | 1 | 2 |
4 | c | b | 1 | 2 |
5 | c | b | 1 | 2 |
6 | d | b | 3 | 8 |
7 | d | b | 1 | 0 |
8 | e | c | 2 | 7 |
9 | e | c | 0 | 1 |
10 | f | c | 0 | 1 |
11 | f | c | 3 | 1 |
df.query('b == ["a", "b", "c"]')   # rows whose value in column b appears in the list
| a | b | c | d |
0 | a | a | 0 | 2 |
1 | a | a | 0 | 6 |
2 | b | a | 3 | 3 |
3 | b | a | 1 | 2 |
4 | c | b | 1 | 2 |
5 | c | b | 1 | 2 |
6 | d | b | 3 | 8 |
7 | d | b | 1 | 0 |
8 | e | c | 2 | 7 |
9 | e | c | 0 | 1 |
10 | f | c | 0 | 1 |
11 | f | c | 3 | 1 |
df.query('c == [1, 2]')
df.query('[1, 2] in c')
df[df.c.isin([1, 2])]
| a | b | c | d |
3 | b | a | 1 | 2 |
4 | c | b | 1 | 2 |
5 | c | b | 1 | 2 |
7 | d | b | 1 | 0 |
8 | e | c | 2 | 7 |
| a | b | c | d |
3 | b | a | 1 | 2 |
4 | c | b | 1 | 2 |
5 | c | b | 1 | 2 |
7 | d | b | 1 | 0 |
8 | e | c | 2 | 7 |
| a | b | c | d |
3 | b | a | 1 | 2 |
4 | c | b | 1 | 2 |
5 | c | b | 1 | 2 |
7 | d | b | 1 | 0 |
8 | e | c | 2 | 7 |
# Boolean Operators
# negate with ~ or not
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))
df
df['bools'] = np.random.rand(len(df)) > 0.5
df
| a | b | c |
0 | 0.528671 | 0.265870 | 0.932892 |
1 | 0.520069 | 0.047895 | 0.478818 |
2 | 0.648595 | 0.180744 | 0.838445 |
3 | 0.145614 | 0.906190 | 0.762163 |
4 | 0.612637 | 0.027232 | 0.778020 |
5 | 0.640565 | 0.535538 | 0.018280 |
6 | 0.633664 | 0.124654 | 0.185709 |
7 | 0.635088 | 0.377300 | 0.914968 |
8 | 0.702684 | 0.504459 | 0.107014 |
9 | 0.635757 | 0.261144 | 0.665611 |
| a | b | c | bools |
0 | 0.528671 | 0.265870 | 0.932892 | False |
1 | 0.520069 | 0.047895 | 0.478818 | True |
2 | 0.648595 | 0.180744 | 0.838445 | True |
3 | 0.145614 | 0.906190 | 0.762163 | False |
4 | 0.612637 | 0.027232 | 0.778020 | True |
5 | 0.640565 | 0.535538 | 0.018280 | True |
6 | 0.633664 | 0.124654 | 0.185709 | True |
7 | 0.635088 | 0.377300 | 0.914968 | True |
8 | 0.702684 | 0.504459 | 0.107014 | False |
9 | 0.635757 | 0.261144 | 0.665611 | False |
df.query('~bools')
df.query('not bools')
| a | b | c | bools |
0 | 0.528671 | 0.265870 | 0.932892 | False |
3 | 0.145614 | 0.906190 | 0.762163 | False |
8 | 0.702684 | 0.504459 | 0.107014 | False |
9 | 0.635757 | 0.261144 | 0.665611 | False |
| a | b | c | bools |
0 | 0.528671 | 0.265870 | 0.932892 | False |
3 | 0.145614 | 0.906190 | 0.762163 | False |
8 | 0.702684 | 0.504459 | 0.107014 | False |
9 | 0.635757 | 0.261144 | 0.665611 | False |
# arbitrarily complex boolean expressions
shorter = df.query('a < b < c and (not bools) or bools > 2')               # short query syntax
longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)]  # equivalent in pure Python
shorter
longer
# Performance of query()
# DataFrame.query() using numexpr is slightly faster than Python for large frames.
# Note: you will only see the performance benefit of the numexpr engine with DataFrame.query()
# if your frame has more than approximately 200,000 rows.
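# A rough way to check this claim on your own machine (added, not from the original notebook;
# the row count and the timeit calls are illustrative, and the speedup assumes numexpr is installed):
import timeit
big = pd.DataFrame(np.random.rand(500000, 3), columns=list('abc'))   # hypothetical large frame
t_query = timeit.timeit(lambda: big.query('a < b < c'), number=10)
t_python = timeit.timeit(lambda: big[(big.a < big.b) & (big.b < big.c)], number=10)
print(t_query, t_python)   # on large frames query() should be competitive or faster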
# Duplicate Data
# Identify and remove duplicated rows with two methods, duplicated and drop_duplicates,
# both of which take column name(s) as their argument:
# duplicated      returns a boolean vector marking the duplicated rows
# drop_duplicates removes the duplicated rows
# By default the first occurrence is kept; the keep parameter controls this:
# keep='first' (default): mark / drop duplicates except for the first occurrence
# keep='last':            mark / drop duplicates except for the last occurrence
# keep=False:             mark / drop all duplicates
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
                    'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
                    'c': np.random.randn(7)})
df2
| a | b | c |
0 | one | x | 2.206877 |
1 | one | y | -1.852209 |
2 | two | x | -0.706555 |
3 | two | y | -1.007860 |
4 | two | x | -0.185356 |
5 | three | x | -0.687592 |
6 | four | x | -2.052029 |
df2.duplicated('a')
df2.duplicated('a', keep='last')
df2.duplicated('a', keep=False)
0 False1 True2 False3 True4 True5 False6 Falsedtype: bool0 True1 False2 True3 True4 False5 False6 Falsedtype: bool0 True1 True2 True3 True4 True5 False6 Falsedtype: bool
df2.drop_duplicates('a')
df2.drop_duplicates('a', keep='last')
df2.drop_duplicates('a', keep=False)
| a | b | c |
0 | one | x | 2.206877 |
2 | two | x | -0.706555 |
5 | three | x | -0.687592 |
6 | four | x | -2.052029 |
| a | b | c |
1 | one | y | -1.852209 |
4 | two | x | -0.185356 |
5 | three | x | -0.687592 |
6 | four | x | -2.052029 |
| a | b | c |
5 | three | x | -0.687592 |
6 | four | x | -2.052029 |
# A list of columns can be passed as well:
# columns a and b are then treated together when marking duplicates
df2.duplicated(['a', 'b'])
df2.drop_duplicates(['a', 'b'])
0 False1 False2 False3 False4 True5 False6 Falsedtype: bool
| a | b | c |
0 | one | x | 2.206877 |
1 | one | y | -1.852209 |
2 | two | x | -0.706555 |
3 | two | y | -1.007860 |
5 | three | x | -0.687592 |
6 | four | x | -2.052029 |
# Deduplicating an index with index.duplicated
df3 = pd.DataFrame({'a': np.arange(6), 'b': np.random.randn(6)},
                   index=['a', 'a', 'b', 'c', 'b', 'a'])
df3
| a | b |
a | 0 | 1.446937 |
a | 1 | -1.740284 |
b | 2 | -0.604590 |
c | 3 | 0.096239 |
b | 4 | 0.823314 |
a | 5 | 1.990803 |
df3.index.duplicated()
df3[~df3.index.duplicated()]
df3[~df3.index.duplicated(keep='last')]
df3[~df3.index.duplicated(keep=False)]
array([False, True, False, False, True, True])
| a | b |
a | 0 | 1.446937 |
b | 2 | -0.604590 |
c | 3 | 0.096239 |
| a | b |
c | 3 | 0.096239 |
b | 4 | 0.823314 |
a | 5 | 1.990803 |
# Dictionary-like get() method
# Each of Series, DataFrame, and Panel have a get method which can return a default value.
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s.get('a')               # equivalent to s['a']
s.get('x', default=-1)   # a default can be returned for a missing label
1
-1
# The lookup() Method
# Extract a set of values given a sequence of row labels and column labels.
dflookup = pd.DataFrame(np.random.rand(20, 4), columns=['A', 'B', 'C', 'D'])
dflookup
| A | B | C | D |
0 | 0.942679 | 0.727316 | 0.658345 | 0.465770 |
1 | 0.437278 | 0.225158 | 0.436522 | 0.164805 |
2 | 0.270947 | 0.280223 | 0.309800 | 0.015967 |
3 | 0.559836 | 0.630962 | 0.673678 | 0.712503 |
4 | 0.535372 | 0.989887 | 0.661567 | 0.361962 |
5 | 0.726322 | 0.601192 | 0.547858 | 0.477509 |
6 | 0.829411 | 0.583613 | 0.871647 | 0.460966 |
7 | 0.365722 | 0.563660 | 0.164954 | 0.134314 |
8 | 0.817334 | 0.523003 | 0.684492 | 0.369386 |
9 | 0.710906 | 0.693633 | 0.618877 | 0.888263 |
10 | 0.026953 | 0.185217 | 0.746235 | 0.568846 |
11 | 0.192765 | 0.927200 | 0.431736 | 0.084300 |
12 | 0.988460 | 0.784320 | 0.180145 | 0.039405 |
13 | 0.233349 | 0.148678 | 0.888210 | 0.004917 |
14 | 0.105130 | 0.146724 | 0.261370 | 0.936558 |
15 | 0.701224 | 0.791860 | 0.420083 | 0.391538 |
16 | 0.290186 | 0.307993 | 0.139429 | 0.618879 |
17 | 0.182132 | 0.174420 | 0.845501 | 0.647986 |
18 | 0.732009 | 0.919950 | 0.197361 | 0.582814 |
19 | 0.568096 | 0.539125 | 0.269016 | 0.537584 |
list(range(0, 10, 2))
dflookup.lookup(list(range(0, 10, 2)), ['B', 'C', 'A', 'B', 'D'])
[0, 2, 4, 6, 8]
array([0.72731646, 0.30979986, 0.53537223, 0.58361289, 0.36938598])
# Index objects
# An Index can be created directly from a list or other sequence
index = pd.Index(['e', 'd', 'a', 'b'])
index
# it can also be given a name
index = pd.Index(['e', 'd', 'a', 'b'], name='something')
index.name
index
Index(['e', 'd', 'a', 'b'], dtype='object')
'something'
Index(['e', 'd', 'a', 'b'], dtype='object', name='something')
index = pd.Index(list(range(5)), name='rows')
index
columns = pd.Index(['A', 'B', 'C'], name='cols')
columns
Int64Index([0, 1, 2, 3, 4], dtype='int64', name='rows')
Index(['A', 'B', 'C'], dtype='object', name='cols')
df = pd.DataFrame(np.random.randn(5, 3), index=index, columns=columns)   # the column names are themselves an Index object
df
cols | A | B | C |
rows | | | |
0 | 1.868165 | 0.704614 | -2.049472 |
1 | -0.878810 | 0.706876 | -0.741121 |
2 | -2.649523 | -0.952211 | 0.806387 |
3 | 0.369280 | -0.052788 | -0.995775 |
4 | 0.218246 | -0.034493 | -0.198815 |
# Setting metadata
# Indexes are "mostly immutable", but their metadata can be set and changed:
# the index name, or for a MultiIndex the levels and labels.
# Use rename, set_names, set_levels, and set_labels to set these attributes.
# They return a copy by default (the original is untouched), but can also modify the index in place.
ind = pd.Index([1, 2, 3])
ind
Int64Index([1, 2, 3], dtype='int64')
ind.name = "ind"
ind
ind.rename("apple")
ind
ind.name = "apple"
ind
ind.set_names(["bob"], inplace=True)
ind
Int64Index([1, 2, 3], dtype='int64', name='ind')
Int64Index([1, 2, 3], dtype='int64', name='apple')
Int64Index([1, 2, 3], dtype='int64', name='ind')
Int64Index([1, 2, 3], dtype='int64', name='apple')
Int64Index([1, 2, 3], dtype='int64', name='bob')
# set_names, set_levels, and set_labels also take an optional level argument
# (compare df.query('ilevel_0 == "red"') above, which uses ilevel_0 in a query)
index = pd.MultiIndex.from_product([range(3), ['one', 'two']], names=['first', 'second'])
index
MultiIndex(levels=[[0, 1, 2], ['one', 'two']], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], names=['first', 'second'])
index.levels[1]
index.levels[0]
index.set_levels(["a", "b"], level=1)
Index(['one', 'two'], dtype='object', name='second')Int64Index([0, 1, 2], dtype='int64', name='first')MultiIndex(levels=[[0, 1, 2], ['a', 'b']], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], names=['first', 'second'])
# Set operations on Index objects
# Note: the resulting index from a set operation is sorted in ascending order.
a = pd.Index(['c', 'b', 'a'])
b = pd.Index(['c', 'e', 'd'])
a | b                       # union
a & b                       # intersection
# a - b                     # not supported
a.difference(b)             # difference (not symmetric)
a.symmetric_difference(b)   # symmetric difference, equivalent to idx1.difference(idx2).union(idx2.difference(idx1))
a ^ b                       # symmetric difference
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
Index(['c'], dtype='object')
Index(['a', 'b'], dtype='object')
Index(['a', 'b', 'd', 'e'], dtype='object')
Index(['a', 'b', 'd', 'e'], dtype='object')
# Missing values
# Note: an Index can hold missing values, but this should be avoided.
# It can lead to surprising results, since some operations implicitly exclude missing values.
idx1 = pd.Index([1, np.nan, 3, 4])
idx1
Float64Index([1.0, nan, 3.0, 4.0], dtype='float64')
idx1.fillna(2)   # the missing label (nan) is replaced by 2
Float64Index([1.0, 2.0, 3.0, 4.0], dtype='float64')
idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'),
                         pd.NaT,
                         pd.Timestamp('2011-01-03')])   # the datetime missing value is NaT, unlike numpy's nan
idx2
DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03'], dtype='datetime64[ns]', freq=None)
idx2.fillna(pd.Timestamp('2011-01-02'))
DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], dtype='datetime64[ns]', freq=None)
# Set / Reset Index
# DataFrame
data = pd.DataFrame({"a": ["bar", "bar", "foo", "foo"],
                     "b": ["one", "two", "one", "two"],
                     "c": ["z", "y", "x", "w"],
                     "d": range(1, 5)})   # a range works directly, no list() needed
data
| a | b | c | d |
0 | bar | one | z | 1 |
1 | bar | two | y | 2 |
2 | foo | one | x | 3 |
3 | foo | two | w | 4 |
indexed1 = data.set_index("c")   # turn a column of the DataFrame into the index
indexed1
| a | b | d |
c | | | |
z | bar | one | 1 |
y | bar | two | 2 |
x | foo | one | 3 |
w | foo | two | 4 |
indexed2 = data.set_index(['a', 'b'])   # several columns become a MultiIndex
indexed2
| | c | d |
a | b | | |
bar | one | z | 1 |
two | y | 2 |
foo | one | x | 3 |
two | w | 4 |
frame = data.set_index('c', drop=False)            # drop=False keeps the column; the default removes it
frame
frame = frame.set_index(['a', 'b'], append=True)   # append=True adds to the existing index, creating a compound index
frame
| a | b | c | d |
c | | | | |
z | bar | one | z | 1 |
y | bar | two | y | 2 |
x | foo | one | x | 3 |
w | foo | two | w | 4 |
| | | c | d |
c | a | b | | |
z | bar | one | z | 1 |
y | bar | two | y | 2 |
x | foo | one | x | 3 |
w | foo | two | w | 4 |
data.set_index('c', drop=False)            # set a column as the index without removing it
data
data.set_index(['a', 'b'], inplace=True)   # modify in place instead of returning a copy
data
| a | b | c | d |
c | | | | |
z | bar | one | z | 1 |
y | bar | two | y | 2 |
x | foo | one | x | 3 |
w | foo | two | w | 4 |
| a | b | c | d |
0 | bar | one | z | 1 |
1 | bar | two | y | 2 |
2 | foo | one | x | 3 |
3 | foo | two | w | 4 |
| | c | d |
a | b | | |
bar | one | z | 1 |
two | y | 2 |
foo | one | x | 3 |
two | w | 4 |
# Reset the index
# Note: the reset_index method used to be called delevel, which is now deprecated.
data
data.index.name    # displays nothing here
data.index.names   # the names of the index levels
data.reset_index() # the index levels become ordinary columns, named after the index levels
| | c | d |
a | b | | |
bar | one | z | 1 |
two | y | 2 |
foo | one | x | 3 |
two | w | 4 |
FrozenList(['a', 'b'])
| a | b | c | d |
0 | bar | one | z | 1 |
1 | bar | two | y | 2 |
2 | foo | one | x | 3 |
3 | foo | two | w | 4 |
frame
frame.reset_index(level=1)   # with a MultiIndex you can reset a single level instead of all of them
| | | c | d |
c | a | b | | |
z | bar | one | z | 1 |
y | bar | two | y | 2 |
x | foo | one | x | 3 |
w | foo | two | w | 4 |
| | a | c | d |
c | b | | | |
z | one | bar | z | 1 |
y | two | bar | y | 2 |
x | one | foo | x | 3 |
w | two | foo | w | 4 |
# reset_index also takes a drop parameter; if True, the index level is simply discarded
# instead of being turned into a DataFrame column.
frame
frame.reset_index(level=2, drop=True)
| | | c | d |
c | a | b | | |
z | bar | one | z | 1 |
y | bar | two | y | 2 |
x | foo | one | x | 3 |
w | foo | two | w | 4 |
| | c | d |
c | a | | |
z | bar | z | 1 |
y | bar | y | 2 |
x | foo | x | 3 |
w | foo | w | 4 |
# Adding an ad hoc index
# If you create an index yourself, you can just assign it to the index field
index = pd.MultiIndex.from_product([range(2), ['one', 'two']], names=['first', 'second'])
index
data
data.index = index   # note: the length of the index must match the DataFrame
data
MultiIndex(levels=[[0, 1], ['one', 'two']], labels=[[0, 0, 1, 1], [0, 1, 0, 1]], names=['first', 'second'])
| a | b | c | d |
0 | bar | one | z | 1 |
1 | bar | two | y | 2 |
2 | foo | one | x | 3 |
3 | foo | two | w | 4 |
| | a | b | c | d |
first | second | | | | |
0 | one | bar | one | z | 1 |
two | bar | two | y | 2 |
1 | one | foo | one | x | 3 |
two | foo | two | w | 4 |
# Returning a view versus a copy
# Avoid chained operations, i.e. several [] in a row (see the earlier discussion).
# Why does assignment fail when using chained indexing?
# The Python interpreter treats the two forms differently.
# First form, not chained:
dfmi.loc[:, ('one', 'second')] = value
# becomes, to the interpreter:
dfmi.loc.__setitem__((slice(None), ('one', 'second')), value)   # a direct set
# Of course, dfmi.loc.__getitem__(idx) may be a view or a copy of dfmi.
# Second form, chained:
dfmi['one']['second'] = value   # pandas may raise SettingWithCopy: the chained form is unpredictable
# becomes, to the interpreter:
dfmi.__getitem__('one').__setitem__('second', value)   # a get followed by a set; the get may return a copy
# SettingWithCopy can sometimes appear without an obvious chained operation, for example:
def do_something(df):
    foo = df[['bar', 'baz']]  # Is foo a view? A copy? Nobody knows!
    # ... many lines here ...
    foo['quux'] = value       # We don't know whether this will modify df or not! (an "implicit" chain)
    return foo
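# A minimal sketch of the two unambiguous alternatives (added, not from the original notebook;
# the column names 'bar', 'baz', 'quux' are just placeholders):
def do_something_safely(df, value):
    # If you want an independent result, take an explicit copy first:
    foo = df[['bar', 'baz']].copy()
    foo['quux'] = value          # clearly modifies only the copy
    return foo

def modify_original(df, value):
    # If you actually want to modify df, do it with a single .loc call:
    df.loc[:, 'quux'] = value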
# Evaluation order matters
# The SettingWithCopyWarning raised by chained assignment can be silenced or escalated
# via the mode.chained_assignment option.
dfb = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                    'c': np.arange(7)})
dfb
| a | c |
0 | one | 0 |
1 | one | 1 |
2 | two | 2 |
3 | three | 3 |
4 | two | 4 |
5 | one | 5 |
6 | six | 6 |
# This will show the SettingWithCopyWarning
# but the frame values will be set
dfb['c'][dfb.a.str.startswith('o')] = 42
dfb
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
| a | c |
0 | one | 42 |
1 | one | 42 |
2 | two | 2 |
3 | three | 3 |
4 | two | 4 |
5 | one | 42 |
6 | six | 6 |
# The message level is controlled by the mode.chained_assignment option
# ('warn' is the default; 'raise' and None are also accepted).
pd.set_option('mode.chained_assignment', 'warn')
# This however is operating on a copy and will not work
# (i.e. the chained assignment on a copy silently does nothing).
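# For reference (an addition, not from the original notebook): pd.option_context
# changes an option temporarily, so the check can be silenced for a single block only.
with pd.option_context('mode.chained_assignment', None):
    dfb['c'][dfb.a.str.startswith('o')] = 0   # no warning is emitted inside this block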
# A chained assignment can also crop up when setting in a mixed dtype frame.
# Note: these setting rules apply to all of .loc/.iloc.
# This is the correct access method:
dfc = pd.DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})
dfc
dfc.loc[0, 'A'] = 11
dfc
| A | B |
0 | aaa | 1 |
1 | bbb | 2 |
2 | ccc | 3 |
| A | B |
0 | 11 | 1 |
1 | bbb | 2 |
2 | ccc | 3 |
dfc = dfc.copy()
dfc
dfc['A'][0] = 111   # this can work at times, but is not guaranteed, and so should be avoided
dfc
| A | B |
0 | 11 | 1 |
1 | bbb | 2 |
2 | ccc | 3 |
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
| A | B |
0 | 111 | 1 |
1 | bbb | 2 |
2 | ccc | 3 |
pd.set_option('mode.chained_assignment', 'raise')   # chained assignment now raises an exception instead of warning
# http://pandas.pydata.org/pandas-docs/stable/indexing.html -- end of notes
# 2018-02-19