import pandas as pd
import numpy as np
import os
# Show the current working directory.
os.getcwd()
# The bare string below appears to be the notebook's echo of the call above;
# as a statement on its own it has no effect.
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据转换'
# Switch into the data directory so the CSV can be opened by bare file name.
# NOTE(review): hard-coded absolute Windows path; presumably only valid on
# the author's machine.
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
# dtype=str keeps every column as text, which is why the gender codes are
# compared against '0' / '1' / '2' as strings further down.
df = pd.read_csv('sam_tianchi_mum_baby.csv', dtype=str, encoding='utf-8')
# Preview the first five rows.
df.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>birthday</th>
<th>gender</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2757</td>
<td>20130311</td>
<td>1</td>
</tr>
<tr>
<th>1</th>
<td>415971</td>
<td>20121111</td>
<td>0</td>
</tr>
<tr>
<th>2</th>
<td>1372572</td>
<td>20120130</td>
<td>1</td>
</tr>
<tr>
<th>3</th>
<td>10339332</td>
<td>20110910</td>
<td>0</td>
</tr>
<tr>
<th>4</th>
<td>10642245</td>
<td>20130213</td>
<td>0</td>
</tr>
</tbody>
</table>
</div>
def f(x):
    """Translate a gender code into its Chinese label.

    The code is compared by exact numeric value rather than by substring,
    so values that merely contain a '0' or '1' (such as '10', or the
    float 1.0 whose text form is '1.0') are no longer mislabelled.

    Args:
        x: Gender code in any form, e.g. '0', '1', 1, 1.0, NaN or None.

    Returns:
        '女' for code 0, '男' for code 1, and '未知' for anything else,
        including missing or non-numeric values.
    """
    labels = {0.0: '女', 1.0: '男'}
    try:
        code = float(str(x).strip())
    except ValueError:
        # Non-numeric text such as '', 'None' or 'abc'.
        return '未知'
    # NaN and every other number fall through to the default label.
    return labels.get(code, '未知')
# apply() runs a Python function on every element, so it can be used for
# many other kinds of processing as well.
# Here it stores f()'s label for each gender code in a new '性别' column.
df['性别'] = df['gender'].apply(f)
df.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>birthday</th>
<th>gender</th>
<th>性别</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2757</td>
<td>20130311</td>
<td>1</td>
<td>男</td>
</tr>
<tr>
<th>1</th>
<td>415971</td>
<td>20121111</td>
<td>0</td>
<td>女</td>
</tr>
<tr>
<th>2</th>
<td>1372572</td>
<td>20120130</td>
<td>1</td>
<td>男</td>
</tr>
<tr>
<th>3</th>
<td>10339332</td>
<td>20110910</td>
<td>0</td>
<td>女</td>
</tr>
<tr>
<th>4</th>
<td>10642245</td>
<td>20130213</td>
<td>0</td>
<td>女</td>
</tr>
</tbody>
</table>
</div>
# Inspect the first rows whose gender code is '2' (the "unknown" label).
df.loc[df['gender'].eq('2')].head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>birthday</th>
<th>gender</th>
<th>性别</th>
</tr>
</thead>
<tbody>
<tr>
<th>46</th>
<td>49167150</td>
<td>20130818</td>
<td>2</td>
<td>未知</td>
</tr>
<tr>
<th>47</th>
<td>49983255</td>
<td>20140206</td>
<td>2</td>
<td>未知</td>
</tr>
<tr>
<th>51</th>
<td>52529655</td>
<td>20130611</td>
<td>2</td>
<td>未知</td>
</tr>
<tr>
<th>58</th>
<td>57711375</td>
<td>20130420</td>
<td>2</td>
<td>未知</td>
</tr>
<tr>
<th>106</th>
<td>99665637</td>
<td>20130926</td>
<td>2</td>
<td>未知</td>
</tr>
</tbody>
</table>
</div>
# Drop the '性别' column built earlier so it can be recreated with map().
del df['性别']
# map() is mainly used for value-to-value mapping.
# NOTE(review): with a dict, a code that is not one of the keys presumably
# becomes NaN rather than '未知'; confirm that is acceptable for this data.
df['性别'] = df['gender'].map({'0': '女性', '1':'男性', '2': '未知'})
df.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>birthday</th>
<th>gender</th>
<th>性别</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2757</td>
<td>20130311</td>
<td>1</td>
<td>男性</td>
</tr>
<tr>
<th>1</th>
<td>415971</td>
<td>20121111</td>
<td>0</td>
<td>女性</td>
</tr>
<tr>
<th>2</th>
<td>1372572</td>
<td>20120130</td>
<td>1</td>
<td>男性</td>
</tr>
<tr>
<th>3</th>
<td>10339332</td>
<td>20110910</td>
<td>0</td>
<td>女性</td>
</tr>
<tr>
<th>4</th>
<td>10642245</td>
<td>20130213</td>
<td>0</td>
<td>女性</td>
</tr>
</tbody>
</table>
</div>
# Drop the dict-mapped '性别' column before rebuilding it once more.
del df['性别']
# map() also accepts a user-defined function instead of a dict.
df['性别'] = df['gender'].map(f)
df.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>birthday</th>
<th>gender</th>
<th>性别</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>2757</td>
<td>20130311</td>
<td>1</td>
<td>男</td>
</tr>
<tr>
<th>1</th>
<td>415971</td>
<td>20121111</td>
<td>0</td>
<td>女</td>
</tr>
<tr>
<th>2</th>
<td>1372572</td>
<td>20120130</td>
<td>1</td>
<td>男</td>
</tr>
<tr>
<th>3</th>
<td>10339332</td>
<td>20110910</td>
<td>0</td>
<td>女</td>
</tr>
<tr>
<th>4</th>
<td>10642245</td>
<td>20130213</td>
<td>0</td>
<td>女</td>
</tr>
</tbody>
</table>
</div>
# Data masking (de-identification).
# Mask the 2nd and 3rd characters by position. Replacing by substring value
# (str.replace(x[1:3], '**')) would also hit any later repeat of the same two
# characters (e.g. '1111' -> '****'), and pads every gap when the id has a
# single character. The vectorized accessor also passes missing values
# through as NaN; a lambda given to apply() works too but must guard NaN.
df['user_id'].str.slice_replace(1, 3, '**').head(5)
0 2**7
1 4**971
2 1**2572
3 1**39332
4 1**42245
Name: user_id, dtype: object
# Keep only the year, i.e. the first four characters of the YYYYMMDD text.
# The vectorized .str slice leaves missing values as NaN, whereas an
# element-wise x[0:4] raises TypeError when x is a NaN float.
df['birthday'].str[:4].head(5)
0 2013
1 2012
2 2012
3 2011
4 2013
Name: birthday, dtype: object
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。