import numpy as np
import pandas as pd
import matplotlib.pyplot as plt# 创建S: 索引index是一个数组组成的列表
data = pd.Series(np.random.randn(9),
index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
[1, 2, 3, 1, 3, 1, 2, 2, 3]])
dataa 1 1.832067
2 -0.501033
3 -0.602755
b 1 -0.731398
3 -0.707528
c 1 -0.382131
2 -0.177199
d 2 -0.826364
3 -1.874992
dtype: float64data.indexMultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])data['b']1 -0.731398
3 -0.707528
dtype: float64# 部分索引选取数据子集
# 切片形式
data['b':'c']b 1 -0.731398
3 -0.707528
c 1 -0.382131
2 -0.177199
dtype: float64# 列表形式
data.loc[['b', 'c']]b 1 -0.731398
3 -0.707528
c 1 -0.382131
2 -0.177199
dtype: float64data.loc[['b', 'd']]b 1 -0.731398
3 -0.707528
d 2 -0.826364
3 -1.874992
dtype: float64data.loc[:, 2]a -0.501033
c -0.177199
d -0.826364
dtype: float64# 2表示含有索引是2
data.loc[:, 2]a -0.501033
c -0.177199
d -0.826364
dtype: float64dataa 1 1.832067
2 -0.501033
3 -0.602755
b 1 -0.731398
3 -0.707528
c 1 -0.382131
2 -0.177199
d 2 -0.826364
3 -1.874992
dtype: float64data.unstack() # 将层次化索引的数据变成DF形式.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
1 | 2 | 3 | |
|---|---|---|---|
a | 1.832067 | -0.501033 | -0.602755 |
b | -0.731398 | NaN | -0.707528 |
c | -0.382131 | -0.177199 | NaN |
d | NaN | -0.826364 | -1.874992 |
data.unstack().stack()a 1 1.832067
2 -0.501033
3 -0.602755
b 1 -0.731398
3 -0.707528
c 1 -0.382131
2 -0.177199
d 2 -0.826364
3 -1.874992
dtype: float64# 对于DF类型数据
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns=[['Ohio', 'Ohio', 'Colorado'],
['Green', 'Red', 'Green']])
frame.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } </code></pre>
Ohio | Colorado | |||
|---|---|---|---|---|
Green | Red | Green | ||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 | |
# 索引设置名字
frame.index.names = ['key1', 'key2']
# 属性设置名字
frame.columns.names = ['state', 'color']
frame.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>
state | Ohio | Colorado | ||
|---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 | |
frame['Ohio'].dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
color | Green | Red | |
|---|---|---|---|
key1 | key2 | ||
a | 1 | 0 | 1 |
2 | 3 | 4 | |
b | 1 | 6 | 7 |
2 | 9 | 10 |
from pandas import MultiIndex
MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
names=['state', 'color'])MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
codes=[[1, 1, 0], [0, 1, 0]],
names=['state', 'color'])# 交换位置
frame.swaplevel('key1', 'key2').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>
state | Ohio | Colorado | ||
|---|---|---|---|---|
color | Green | Red | Green | |
key2 | key1 | |||
1 | a | 0 | 1 | 2 |
2 | a | 3 | 4 | 5 |
1 | b | 6 | 7 | 8 |
2 | b | 9 | 10 | 11 |
frame.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>
state | Ohio | Colorado | ||
|---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 | |
# level=0 通过第一层索引key1排序
frame.sort_index(level=0).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>
state | Ohio | Colorado | ||
|---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 | |
# level=1 通过第一层索引key2排序
frame.sort_index(level=1).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>
state | Ohio | Colorado | ||
|---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
b | 1 | 6 | 7 | 8 |
a | 2 | 3 | 4 | 5 |
b | 2 | 9 | 10 | 11 |
# swaplevel 不仅可以交换两个索引值,还可以交换它们的索引数值
frame.swaplevel(0, 1).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>
state | Ohio | Colorado | ||
|---|---|---|---|---|
color | Green | Red | Green | |
key2 | key1 | |||
1 | a | 0 | 1 | 2 |
2 | a | 3 | 4 | 5 |
1 | b | 6 | 7 | 8 |
2 | b | 9 | 10 | 11 |
frame.swaplevel(0, 1).sort_index(level=0).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>
state | Ohio | Colorado | ||
|---|---|---|---|---|
color | Green | Red | Green | |
key2 | key1 | |||
1 | a | 0 | 1 | 2 |
b | 6 | 7 | 8 | |
2 | a | 3 | 4 | 5 |
b | 9 | 10 | 11 | |
frame.sum(level='key2').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>
state | Ohio | Colorado | |
|---|---|---|---|
color | Green | Red | Green |
key2 | |||
1 | 6 | 8 | 10 |
2 | 12 | 14 | 16 |
# axis=1表示列
frame.sum(level='color', axis=1).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
color | Green | Red | |
|---|---|---|---|
key1 | key2 | ||
a | 1 | 2 | 1 |
2 | 8 | 4 | |
b | 1 | 14 | 7 |
2 | 20 | 10 |
frame.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>
state | Ohio | Colorado | ||
|---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 | |
frame.sum(level='key2',axis=0).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>
state | Ohio | Colorado | |
|---|---|---|---|
color | Green | Red | Green |
key2 | |||
1 | 6 | 8 | 10 |
2 | 12 | 14 | 16 |
# axis=0指定列
frame.sum(level='key1',axis=0).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; } </code></pre>
state | Ohio | Colorado | |
|---|---|---|---|
color | Green | Red | Green |
key1 | |||
a | 3 | 5 | 7 |
b | 15 | 17 | 19 |
# 使⽤DataFrame的列进⾏索引
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
'd': [0, 1, 2, 0, 1, 2, 3]})
frame.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
a | b | c | d | |
|---|---|---|---|---|
0 | 0 | 7 | one | 0 |
1 | 1 | 6 | one | 1 |
2 | 2 | 5 | one | 2 |
3 | 3 | 4 | two | 0 |
4 | 4 | 3 | two | 1 |
5 | 5 | 2 | two | 2 |
6 | 6 | 1 | two | 3 |
# set_index函数将列转换成行索引,默认删除
frame2 = frame.set_index(['c', 'd'])
frame2.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
a | b | ||
|---|---|---|---|
c | d | ||
one | 0 | 0 | 7 |
1 | 1 | 6 | |
2 | 2 | 5 | |
two | 0 | 3 | 4 |
1 | 4 | 3 | |
2 | 5 | 2 | |
3 | 6 | 1 |
# 将原来的索引保留
frame.set_index(['c', 'd'], drop=False).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
a | b | c | d | ||
|---|---|---|---|---|---|
c | d | ||||
one | 0 | 0 | 7 | one | 0 |
1 | 1 | 6 | one | 1 | |
2 | 2 | 5 | one | 2 | |
two | 0 | 3 | 4 | two | 0 |
1 | 4 | 3 | two | 1 | |
2 | 5 | 2 | two | 2 | |
3 | 6 | 1 | two | 3 |
frame2.reset_index().dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
c | d | a | b | |
|---|---|---|---|---|
0 | one | 0 | 0 | 7 |
1 | one | 1 | 1 | 6 |
2 | one | 2 | 2 | 5 |
3 | two | 0 | 3 | 4 |
4 | two | 1 | 4 | 3 |
5 | two | 2 | 5 | 2 |
6 | two | 3 | 6 | 1 |
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],'data2': range(3)})df1.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key | data1 | |
|---|---|---|
0 | b | 0 |
1 | b | 1 |
2 | a | 2 |
3 | c | 3 |
4 | a | 4 |
5 | a | 5 |
6 | b | 6 |
df2.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key | data2 | |
|---|---|---|
0 | a | 0 |
1 | b | 1 |
2 | d | 2 |
# 默认根据重叠列名key根据进行合并
pd.merge(df1,df2).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key | data1 | data2 | |
|---|---|---|---|
0 | b | 0 | 1 |
1 | b | 1 | 1 |
2 | b | 6 | 1 |
3 | a | 2 | 0 |
4 | a | 4 | 0 |
5 | a | 5 | 0 |
pd.merge(df1, df2, on='key').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key | data1 | data2 | |
|---|---|---|---|
0 | b | 0 | 1 |
1 | b | 1 | 1 |
2 | b | 6 | 1 |
3 | a | 2 | 0 |
4 | a | 4 | 0 |
5 | a | 5 | 0 |
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})pd.merge(df3, df4, left_on='lkey', right_on='rkey').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
lkey | data1 | rkey | data2 | |
|---|---|---|---|---|
0 | b | 0 | b | 1 |
1 | b | 1 | b | 1 |
2 | b | 6 | b | 1 |
3 | a | 2 | a | 0 |
4 | a | 4 | a | 0 |
5 | a | 5 | a | 0 |
df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
'value': [1, 2, 3, 5]})
df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
'value': [5, 6, 7, 8]})df1.merge(df2, left_on='lkey', right_on='rkey').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
lkey | value_x | rkey | value_y | |
|---|---|---|---|---|
0 | foo | 1 | foo | 5 |
1 | foo | 1 | foo | 8 |
2 | foo | 5 | foo | 5 |
3 | foo | 5 | foo | 8 |
4 | bar | 2 | bar | 6 |
5 | baz | 3 | baz | 7 |
# suffixes解决两个DF中重复列名的问题
df1.merge(df2, left_on='lkey', right_on='rkey',
suffixes=('_left', '_right')).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
lkey | value_left | rkey | value_right | |
|---|---|---|---|---|
0 | foo | 1 | foo | 5 |
1 | foo | 1 | foo | 8 |
2 | foo | 5 | foo | 5 |
3 | foo | 5 | foo | 8 |
4 | bar | 2 | bar | 6 |
5 | baz | 3 | baz | 7 |
# 相同的values进行合并
df1.merge(df2, how='inner').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
lkey | value | rkey | |
|---|---|---|---|
0 | foo | 5 | foo |
# 右边的值为标准,左边如果有,直接显示;没有则显示NaN
df1.merge(df2, how='right').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
lkey | value | rkey | |
|---|---|---|---|
0 | foo | 5 | foo |
1 | NaN | 6 | bar |
2 | NaN | 7 | baz |
3 | NaN | 8 | foo |
pd.merge(df1, df2, how='left').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
lkey | value | rkey | |
|---|---|---|---|
0 | foo | 1 | NaN |
1 | bar | 2 | NaN |
2 | baz | 3 | NaN |
3 | foo | 5 | foo |
# outer相当于是right和left一起作用
pd.merge(df1, df2, how='outer').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
lkey | value | rkey | |
|---|---|---|---|
0 | foo | 1 | NaN |
1 | bar | 2 | NaN |
2 | baz | 3 | NaN |
3 | foo | 5 | foo |
4 | NaN | 6 | bar |
5 | NaN | 7 | baz |
6 | NaN | 8 | foo |
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
'data2': range(5)})# 以right为标准:如果右有左无,标记为NaN;左有右无,直接丢弃
pd.merge(df1, df2, on='key', how='right').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key | data1 | data2 | |
|---|---|---|---|
0 | b | 0.0 | 1 |
1 | b | 1.0 | 1 |
2 | b | 5.0 | 1 |
3 | b | 0.0 | 3 |
4 | b | 1.0 | 3 |
5 | b | 5.0 | 3 |
6 | a | 2.0 | 0 |
7 | a | 4.0 | 0 |
8 | a | 2.0 | 2 |
9 | a | 4.0 | 2 |
10 | d | NaN | 4 |
pd.merge(df1, df2, on='key', how='left').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key | data1 | data2 | |
|---|---|---|---|
0 | b | 0 | 1.0 |
1 | b | 0 | 3.0 |
2 | b | 1 | 1.0 |
3 | b | 1 | 3.0 |
4 | a | 2 | 0.0 |
5 | a | 2 | 2.0 |
6 | c | 3 | NaN |
7 | a | 4 | 0.0 |
8 | a | 4 | 2.0 |
9 | b | 5 | 1.0 |
10 | b | 5 | 3.0 |
pd.merge(df1, df2, how='inner').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key | data1 | data2 | |
|---|---|---|---|
0 | b | 0 | 1 |
1 | b | 0 | 3 |
2 | b | 1 | 1 |
3 | b | 1 | 3 |
4 | b | 5 | 1 |
5 | b | 5 | 3 |
6 | a | 2 | 0 |
7 | a | 2 | 2 |
8 | a | 4 | 0 |
9 | a | 4 | 2 |
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
'key2': ['one', 'two', 'one'],
'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
'key2': ['one', 'one', 'one', 'two'],
'rval': [4, 5, 6, 7]})
right.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key1 | key2 | rval | |
|---|---|---|---|
0 | foo | one | 4 |
1 | foo | one | 5 |
2 | bar | one | 6 |
3 | bar | two | 7 |
pd.merge(left, right, on=['key1', 'key2'], how='outer').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key1 | key2 | lval | rval | |
|---|---|---|---|---|
0 | foo | one | 1.0 | 4.0 |
1 | foo | one | 1.0 | 5.0 |
2 | foo | two | 2.0 | NaN |
3 | bar | one | 3.0 | 6.0 |
4 | bar | two | NaN | 7.0 |
pd.merge(left, right, on=['key1', 'key2'], how='inner').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key1 | key2 | lval | rval | |
|---|---|---|---|---|
0 | foo | one | 1 | 4 |
1 | foo | one | 1 | 5 |
2 | bar | one | 3 | 6 |
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])pd.merge(left1, right1, left_on='key', right_index=True).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key | value | group_val | |
|---|---|---|---|
0 | a | 0 | 3.5 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
1 | b | 1 | 7.0 |
4 | b | 4 | 7.0 |
pd.merge(left1, right1, left_on='key', right_index=True, how='outer').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key | value | group_val | |
|---|---|---|---|
0 | a | 0 | 3.5 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
1 | b | 1 | 7.0 |
4 | b | 4 | 7.0 |
5 | c | 5 | NaN |
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
index=['a', 'c', 'e'],
columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
index=['b', 'c', 'd', 'e'],
columns=['Missouri', 'Alabama'])pd.merge(left2, right2, how='outer', left_index=True, right_index=True).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
Ohio | Nevada | Missouri | Alabama | |
|---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN |
b | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 |
left2.join(right2, how='outer').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
Ohio | Nevada | Missouri | Alabama | |
|---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN |
b | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 |
left1.join(right1, on='key').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
key | value | group_val | |
|---|---|---|---|
0 | a | 0 | 3.5 |
1 | b | 1 | 7.0 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
4 | b | 4 | 7.0 |
5 | c | 5 | NaN |
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
index=['a', 'c', 'e', 'f'],
columns=['New York', 'Oregon'])
another.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
New York | Oregon | |
|---|---|---|
a | 7.0 | 8.0 |
c | 9.0 | 10.0 |
e | 11.0 | 12.0 |
f | 16.0 | 17.0 |
left2.join([right2, another]).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
Ohio | Nevada | Missouri | Alabama | New York | Oregon | |
|---|---|---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 | 9.0 | 10.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 | 11.0 | 12.0 |
left2.join([right2, another], how='outer', sort='True')c:\users\admin\venv\lib\site-packages\pandas\core\frame.py:6848: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
verify_integrity=True).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
Ohio | Nevada | Missouri | Alabama | New York | Oregon | |
|---|---|---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN | 7.0 | 8.0 |
b | NaN | NaN | 7.0 | 8.0 | NaN | NaN |
c | 3.0 | 4.0 | 9.0 | 10.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 | NaN | NaN |
e | 5.0 | 6.0 | 13.0 | 14.0 | 11.0 | 12.0 |
f | NaN | NaN | NaN | NaN | 16.0 | 17.0 |
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])# 传入的是列表形式
# concat是在axis=0上⼯作的
pd.concat([s1, s2, s3])a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64s4 = pd.concat([s1, s2, s3], axis=1,sort=True)
s4.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
0 | 1 | 2 | |
|---|---|---|---|
a | 0.0 | NaN | NaN |
b | 1.0 | NaN | NaN |
c | NaN | 2.0 | NaN |
d | NaN | 3.0 | NaN |
e | NaN | 4.0 | NaN |
f | NaN | NaN | 5.0 |
g | NaN | NaN | 6.0 |
pd.concat([s1, s4]).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
0 | 1 | 2 | |
|---|---|---|---|
a | 0.0 | NaN | NaN |
b | 1.0 | NaN | NaN |
a | 0.0 | NaN | NaN |
b | 1.0 | NaN | NaN |
c | NaN | 2.0 | NaN |
d | NaN | 3.0 | NaN |
e | NaN | 4.0 | NaN |
f | NaN | NaN | 5.0 |
g | NaN | NaN | 6.0 |
pd.concat([s1, s4], axis=1, sort=True).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
0 | 0 | 1 | 2 | |
|---|---|---|---|---|
a | 0.0 | 0.0 | NaN | NaN |
b | 1.0 | 1.0 | NaN | NaN |
c | NaN | NaN | 2.0 | NaN |
d | NaN | NaN | 3.0 | NaN |
e | NaN | NaN | 4.0 | NaN |
f | NaN | NaN | NaN | 5.0 |
g | NaN | NaN | NaN | 6.0 |
# 传入join求交集
# join_axes指定要在其它轴上使⽤的索引
pd.concat([s1, s4], axis=1, join='inner').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
0 | 0 | 1 | 2 | |
|---|---|---|---|---|
a | 0 | 0.0 | NaN | NaN |
b | 1 | 1.0 | NaN | NaN |
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
resultone a 0
b 1
two a 0
b 1
three f 5
g 6
dtype: int64result.unstack().dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
a | b | f | g | |
|---|---|---|---|---|
one | 0.0 | 1.0 | NaN | NaN |
two | 0.0 | 1.0 | NaN | NaN |
three | NaN | NaN | 5.0 | 6.0 |
pd.concat([s1, s2, s3], axis=1, sort=True, keys=['one', 'two', 'three']).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
one | two | three | |
|---|---|---|---|
a | 0.0 | NaN | NaN |
b | 1.0 | NaN | NaN |
c | NaN | 2.0 | NaN |
d | NaN | 3.0 | NaN |
e | NaN | 4.0 | NaN |
f | NaN | NaN | 5.0 |
g | NaN | NaN | 6.0 |
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
df1.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
a | b | c | d | |
|---|---|---|---|---|
0 | -1.017946 | -0.127939 | 0.556561 | 3.037912 |
1 | -0.282973 | -1.015500 | -0.846612 | -0.393027 |
2 | -0.022687 | -1.878993 | 0.607246 | 0.365823 |
pd.concat([df1, df2], ignore_index=True,sort=True).dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
a | b | c | d | |
|---|---|---|---|---|
0 | -1.017946 | -0.127939 | 0.556561 | 3.037912 |
1 | -0.282973 | -1.015500 | -0.846612 | -0.393027 |
2 | -0.022687 | -1.878993 | 0.607246 | 0.365823 |
3 | -0.212958 | 0.053333 | NaN | -0.371492 |
4 | -0.237029 | -0.239806 | NaN | 0.623274 |
索引全部或者部分重叠的两个数据
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64),
index=['f', 'e', 'd', 'c', 'b', 'a'])
af NaN
e 2.5
d NaN
c 3.5
b 4.5
a NaN
dtype: float64b[-1] = np.nan
bf 0.0
e 1.0
d 2.0
c 3.0
b 4.0
a NaN
dtype: float64np.where(pd.isnull(a), b, a)array([0. , 2.5, 2. , 3.5, 4.5, nan])pd.isnull(a)f True
e False
d True
c False
b False
a True
dtype: bool# Series有⼀个combine_first⽅法
b[:-2].combine_first(a[2:])a NaN
b 4.5
c 3.0
d 2.0
e 1.0
f 0.0
dtype: float64# 创建一个DF,同时指定行列索引的name
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
index=pd.Index(['Ohio', 'Colorado'], name='state'),
columns=pd.Index(['one', 'two', 'three'],
name='number'))
data.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
number | one | two | three |
|---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
# 将列转换为行,得到一个Series
result = data.stack()
resultstate number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int32# 将S变成DF数据
result.unstack().dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
number | one | two | three |
|---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
# unstack():默认操作的是最内层
result.unstack('state').dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
state | Ohio | Colorado |
|---|---|---|
number | ||
one | 0 | 3 |
two | 1 | 4 |
three | 2 | 5 |
df = pd.DataFrame(np.random.rand(6, 4),
index=['one', 'two', 'three', 'four', 'five', 'six'],
columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))
df.dataframe tbody tr th:only-of-type { vertical-align: middle; } <pre><code>.dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </code></pre>
Genus | A | B | C | D |
|---|---|---|---|---|
one | 0.862586 | 0.440328 | 0.172111 | 0.852715 |
two | 0.303660 | 0.322660 | 0.232518 | 0.327529 |
three | 0.128695 | 0.309510 | 0.224989 | 0.750412 |
four | 0.658148 | 0.768613 | 0.234445 | 0.184719 |
five | 0.502111 | 0.406510 | 0.310178 | 0.355971 |
six | 0.002387 | 0.996109 | 0.207558 | 0.033299 |
# Genus自动用作图例
df.plot.bar()<matplotlib.axes._subplots.AxesSubplot at 0x1ad366bafd0>