import pandas as pd
import numpy as np
import os
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据表处理'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df1 = pd.read_csv('sam_tianchi_mum_baby.csv', encoding='utf-8', dtype=str)
df1.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>birthday</th>
<th>gender</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2757</td>
<td>20130311</td>
<td>1</td>
</tr>
<tr>
<th>1</th>
<td>415971</td>
<td>20121111</td>
<td>0</td>
</tr>
<tr>
<th>2</th>
<td>1372572</td>
<td>20120130</td>
<td>1</td>
</tr>
<tr>
<th>3</th>
<td>10339332</td>
<td>20110910</td>
<td>0</td>
</tr>
<tr>
<th>4</th>
<td>10642245</td>
<td>20130213</td>
<td>0</td>
</tr>
</tbody>
</table>
</div>
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953 entries, 0 to 952
Data columns (total 3 columns):
user_id 953 non-null object
birthday 953 non-null object
gender 953 non-null object
dtypes: object(3)
memory usage: 22.4+ KB
# 修改0为女性
df1.loc[df1['gender']=='0', 'gender'] = '女性'
df1.loc[df1['gender']=='1', 'gender'] = '男性'
df1.loc[df1['gender']=='2', 'gender'] = '未知'
df1.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>birthday</th>
<th>gender</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2757</td>
<td>20130311</td>
<td>男性</td>
</tr>
<tr>
<th>1</th>
<td>415971</td>
<td>20121111</td>
<td>女性</td>
</tr>
<tr>
<th>2</th>
<td>1372572</td>
<td>20120130</td>
<td>男性</td>
</tr>
<tr>
<th>3</th>
<td>10339332</td>
<td>20110910</td>
<td>女性</td>
</tr>
<tr>
<th>4</th>
<td>10642245</td>
<td>20130213</td>
<td>女性</td>
</tr>
</tbody>
</table>
</div>
# 对列标签进行修改,传入参数为字典形式
# 加 inplace=True,会对原数据进行修改
df1.rename(columns={'user_id': '用户ID', 'birthday': '出生日期', 'gender': '性别'}, inplace=True)
df1.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>用户ID</th>
<th>出生日期</th>
<th>性别</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2757</td>
<td>20130311</td>
<td>男性</td>
</tr>
<tr>
<th>1</th>
<td>415971</td>
<td>20121111</td>
<td>女性</td>
</tr>
<tr>
<th>2</th>
<td>1372572</td>
<td>20120130</td>
<td>男性</td>
</tr>
<tr>
<th>3</th>
<td>10339332</td>
<td>20110910</td>
<td>女性</td>
</tr>
<tr>
<th>4</th>
<td>10642245</td>
<td>20130213</td>
<td>女性</td>
</tr>
</tbody>
</table>
</div>
# 修改行标签
df1.rename(index={3: 333, 4:444}, inplace=True)
df1.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>用户ID</th>
<th>出生日期</th>
<th>性别</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2757</td>
<td>20130311</td>
<td>男性</td>
</tr>
<tr>
<th>1</th>
<td>415971</td>
<td>20121111</td>
<td>女性</td>
</tr>
<tr>
<th>2</th>
<td>1372572</td>
<td>20120130</td>
<td>男性</td>
</tr>
<tr>
<th>333</th>
<td>10339332</td>
<td>20110910</td>
<td>女性</td>
</tr>
<tr>
<th>444</th>
<td>10642245</td>
<td>20130213</td>
<td>女性</td>
</tr>
</tbody>
</table>
</div>
# iloc按照位置取值,与标签无关
df1.iloc[:5]
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>用户ID</th>
<th>出生日期</th>
<th>性别</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2757</td>
<td>20130311</td>
<td>男性</td>
</tr>
<tr>
<th>1</th>
<td>415971</td>
<td>20121111</td>
<td>女性</td>
</tr>
<tr>
<th>2</th>
<td>1372572</td>
<td>20120130</td>
<td>男性</td>
</tr>
<tr>
<th>333</th>
<td>10339332</td>
<td>20110910</td>
<td>女性</td>
</tr>
<tr>
<th>444</th>
<td>10642245</td>
<td>20130213</td>
<td>女性</td>
</tr>
</tbody>
</table>
</div>
# 重置索引
df1.reset_index(drop=True, inplace=True)
df1.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>用户ID</th>
<th>出生日期</th>
<th>性别</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2757</td>
<td>20130311</td>
<td>男性</td>
</tr>
<tr>
<th>1</th>
<td>415971</td>
<td>20121111</td>
<td>女性</td>
</tr>
<tr>
<th>2</th>
<td>1372572</td>
<td>20120130</td>
<td>男性</td>
</tr>
<tr>
<th>3</th>
<td>10339332</td>
<td>20110910</td>
<td>女性</td>
</tr>
<tr>
<th>4</th>
<td>10642245</td>
<td>20130213</td>
<td>女性</td>
</tr>
</tbody>
</table>
</div>
df = pd.read_csv('baby_trade_history.csv', encoding='utf-8', dtype={'user_id':str})
df.head(2)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>auction_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>786295544</td>
<td>41098319944</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
</tr>
<tr>
<th>1</th>
<td>532110457</td>
<td>17916191097</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
</tr>
</tbody>
</table>
</div>
# 选择buy_mount 大于10 的数据
# 返回的是布尔索引值
# 放入pandas dataframe 中,会自动筛选
# 数据太多,使用分片打印前5个
df[df['buy_mount'] > 10][:5]
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>auction_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
</tr>
</thead>
<tbody>
<tr>
<th>49</th>
<td>103125167</td>
<td>18426669796</td>
<td>50018438</td>
<td>50014815</td>
<td>21458:46896;1628665:3233941;1628665:3233942;21...</td>
<td>16</td>
<td>20140220</td>
</tr>
<tr>
<th>65</th>
<td>605724983</td>
<td>19747694834</td>
<td>50006520</td>
<td>50014815</td>
<td>21458:30992</td>
<td>12</td>
<td>20141017</td>
</tr>
<tr>
<th>89</th>
<td>277279277</td>
<td>18024521052</td>
<td>211122</td>
<td>38</td>
<td>21458:33516;33480:3238774;2653417:7353464;3359...</td>
<td>12</td>
<td>20130513</td>
</tr>
<tr>
<th>247</th>
<td>392530596</td>
<td>17001611735</td>
<td>50011993</td>
<td>28</td>
<td>122218042:50276;21475:135183931;1628861:45151;...</td>
<td>101</td>
<td>20140301</td>
</tr>
<tr>
<th>409</th>
<td>1968453717</td>
<td>12567034563</td>
<td>122616024</td>
<td>28</td>
<td>135925585:42825;138052423:142000990;18822961:2...</td>
<td>14</td>
<td>20150127</td>
</tr>
</tbody>
</table>
</div>
# 选择buy_mount 小于等于10 的数据
df[~(df['buy_mount'] > 10)][:5]
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>auction_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>786295544</td>
<td>41098319944</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
</tr>
<tr>
<th>1</th>
<td>532110457</td>
<td>17916191097</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
</tr>
<tr>
<th>2</th>
<td>249013725</td>
<td>21896936223</td>
<td>50012461</td>
<td>50014815</td>
<td>21458:30992;1628665:92012;1628665:3233938;1628...</td>
<td>1</td>
<td>20131011</td>
</tr>
<tr>
<th>3</th>
<td>917056007</td>
<td>12515996043</td>
<td>50018831</td>
<td>50014815</td>
<td>21458:15841995;21956:3494076;27000458:59723383...</td>
<td>2</td>
<td>20141023</td>
</tr>
<tr>
<th>4</th>
<td>444069173</td>
<td>20487688075</td>
<td>50013636</td>
<td>50008168</td>
<td>21458:30992;13658074:3323064;1628665:3233941;1...</td>
<td>1</td>
<td>20141103</td>
</tr>
</tbody>
</table>
</div>
# 多条件查询
# 每个条件用括号括起来
df[(df['buy_mount'] > 10) & (df['day'] > 20140101)][:5]
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>auction_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
</tr>
</thead>
<tbody>
<tr>
<th>49</th>
<td>103125167</td>
<td>18426669796</td>
<td>50018438</td>
<td>50014815</td>
<td>21458:46896;1628665:3233941;1628665:3233942;21...</td>
<td>16</td>
<td>20140220</td>
</tr>
<tr>
<th>65</th>
<td>605724983</td>
<td>19747694834</td>
<td>50006520</td>
<td>50014815</td>
<td>21458:30992</td>
<td>12</td>
<td>20141017</td>
</tr>
<tr>
<th>247</th>
<td>392530596</td>
<td>17001611735</td>
<td>50011993</td>
<td>28</td>
<td>122218042:50276;21475:135183931;1628861:45151;...</td>
<td>101</td>
<td>20140301</td>
</tr>
<tr>
<th>409</th>
<td>1968453717</td>
<td>12567034563</td>
<td>122616024</td>
<td>28</td>
<td>135925585:42825;138052423:142000990;18822961:2...</td>
<td>14</td>
<td>20150127</td>
</tr>
<tr>
<th>462</th>
<td>1802549062</td>
<td>17383345857</td>
<td>50006520</td>
<td>50014815</td>
<td>22277:6262384;21458:30992;1628665:61550;162866...</td>
<td>11</td>
<td>20141120</td>
</tr>
</tbody>
</table>
</div>
# between方法不适用于字符串数据
df[df['buy_mount'].between(4, 10, inclusive=True)][:5]
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>auction_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
</tr>
</thead>
<tbody>
<tr>
<th>22</th>
<td>469517728</td>
<td>8232924597</td>
<td>211122</td>
<td>38</td>
<td>21458:21782;36786:42781029;13023102:6999219;22...</td>
<td>6</td>
<td>20140502</td>
</tr>
<tr>
<th>52</th>
<td>55544814</td>
<td>4917672059</td>
<td>50015727</td>
<td>50014815</td>
<td>21458:4540492;1633959:58840623;7107736:3227806...</td>
<td>4</td>
<td>20131106</td>
</tr>
<tr>
<th>117</th>
<td>296448405</td>
<td>18524578446</td>
<td>50016030</td>
<td>50008168</td>
<td>21458:247918101;1628665:29782;1628665:29784;16...</td>
<td>7</td>
<td>20131202</td>
</tr>
<tr>
<th>134</th>
<td>97481514</td>
<td>41161316434</td>
<td>211122</td>
<td>38</td>
<td>6940834:29865;21458:4331527;1804977:606613769;...</td>
<td>6</td>
<td>20141126</td>
</tr>
<tr>
<th>148</th>
<td>662134541</td>
<td>7594318922</td>
<td>211122</td>
<td>38</td>
<td>21458:21776;36780:15333590;2675455:19653564;69...</td>
<td>6</td>
<td>20131211</td>
</tr>
</tbody>
</table>
</div>
df.dtypes
user_id object
auction_id int64
cat_id int64
cat1 int64
property object
buy_mount int64
day int64
dtype: object
# 多个值放入列表中
df[df['cat1'].isin([38,28])][:5]
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>auction_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
</tr>
</thead>
<tbody>
<tr>
<th>1</th>
<td>532110457</td>
<td>17916191097</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
</tr>
<tr>
<th>8</th>
<td>82830661</td>
<td>19948600790</td>
<td>50013874</td>
<td>28</td>
<td>21458:11580;21475:137325</td>
<td>1</td>
<td>20121101</td>
</tr>
<tr>
<th>9</th>
<td>475046636</td>
<td>10368360710</td>
<td>203527</td>
<td>28</td>
<td>22724:40168;22729:40278;21458:21817;2770200:24...</td>
<td>1</td>
<td>20121101</td>
</tr>
<tr>
<th>10</th>
<td>734147966</td>
<td>15307958346</td>
<td>50018202</td>
<td>38</td>
<td>21458:3270827;7361532:28710594;7397093:7536994...</td>
<td>2</td>
<td>20121101</td>
</tr>
<tr>
<th>13</th>
<td>377550424</td>
<td>15771663914</td>
<td>50015841</td>
<td>28</td>
<td>1628665:3233941;1628665:3233942;3914866:11580;...</td>
<td>1</td>
<td>20121123</td>
</tr>
</tbody>
</table>
</div>
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。