import pandas as pd
import numpy as np
import os
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据转换'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('baby_trade_history.csv', encoding='utf-8', dtype={'user_id':str})
df.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>auction_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>786295544</td>
<td>41098319944</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
</tr>
<tr>
<th>1</th>
<td>532110457</td>
<td>17916191097</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
</tr>
<tr>
<th>2</th>
<td>249013725</td>
<td>21896936223</td>
<td>50012461</td>
<td>50014815</td>
<td>21458:30992;1628665:92012;1628665:3233938;1628...</td>
<td>1</td>
<td>20131011</td>
</tr>
<tr>
<th>3</th>
<td>917056007</td>
<td>12515996043</td>
<td>50018831</td>
<td>50014815</td>
<td>21458:15841995;21956:3494076;27000458:59723383...</td>
<td>2</td>
<td>20141023</td>
</tr>
<tr>
<th>4</th>
<td>444069173</td>
<td>20487688075</td>
<td>50013636</td>
<td>50008168</td>
<td>21458:30992;13658074:3323064;1628665:3233941;1...</td>
<td>1</td>
<td>20141103</td>
</tr>
</tbody>
</table>
</div>
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29971 entries, 0 to 29970
Data columns (total 7 columns):
user_id 29971 non-null object
auction_id 29971 non-null int64
cat_id 29971 non-null int64
cat1 29971 non-null int64
property 29827 non-null object
buy_mount 29971 non-null int64
day 29971 non-null int64
dtypes: int64(5), object(2)
memory usage: 1.6+ MB
df['buy_date'] = pd.to_datetime(df['day'], format='%Y%m%d', errors='coerce')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29971 entries, 0 to 29970
Data columns (total 8 columns):
user_id 29971 non-null object
auction_id 29971 non-null int64
cat_id 29971 non-null int64
cat1 29971 non-null int64
property 29827 non-null object
buy_mount 29971 non-null int64
day 29971 non-null int64
buy_date 29971 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(5), object(2)
memory usage: 1.8+ MB
df.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>auction_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
<th>buy_date</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>786295544</td>
<td>41098319944</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
<td>2014-09-19</td>
</tr>
<tr>
<th>1</th>
<td>532110457</td>
<td>17916191097</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
<td>2013-10-11</td>
</tr>
<tr>
<th>2</th>
<td>249013725</td>
<td>21896936223</td>
<td>50012461</td>
<td>50014815</td>
<td>21458:30992;1628665:92012;1628665:3233938;1628...</td>
<td>1</td>
<td>20131011</td>
<td>2013-10-11</td>
</tr>
<tr>
<th>3</th>
<td>917056007</td>
<td>12515996043</td>
<td>50018831</td>
<td>50014815</td>
<td>21458:15841995;21956:3494076;27000458:59723383...</td>
<td>2</td>
<td>20141023</td>
<td>2014-10-23</td>
</tr>
<tr>
<th>4</th>
<td>444069173</td>
<td>20487688075</td>
<td>50013636</td>
<td>50008168</td>
<td>21458:30992;13658074:3323064;1628665:3233941;1...</td>
<td>1</td>
<td>20141103</td>
<td>2014-11-03</td>
</tr>
</tbody>
</table>
</div>
# 使用dt方法提取属性
# df['buy_date'].dt.year # 提取年
# df['buy_date'].dt.month # 提取月
# df['buy_date'].dt.day # 提取天
df['diff_day'] = pd.datetime.now() - df['buy_date'] # 时间差格式
df.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;">
<th></th>
<th>user_id</th>
<th>auction_id</th>
<th>cat_id</th>
<th>cat1</th>
<th>property</th>
<th>buy_mount</th>
<th>day</th>
<th>buy_date</th>
<th>diff_day</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>786295544</td>
<td>41098319944</td>
<td>50014866</td>
<td>50022520</td>
<td>21458:86755362;13023209:3593274;10984217:21985...</td>
<td>2</td>
<td>20140919</td>
<td>2014-09-19</td>
<td>2034 days 22:32:35.614788</td>
</tr>
<tr>
<th>1</th>
<td>532110457</td>
<td>17916191097</td>
<td>50011993</td>
<td>28</td>
<td>21458:11399317;1628862:3251296;21475:137325;16...</td>
<td>1</td>
<td>20131011</td>
<td>2013-10-11</td>
<td>2377 days 22:32:35.614788</td>
</tr>
<tr>
<th>2</th>
<td>249013725</td>
<td>21896936223</td>
<td>50012461</td>
<td>50014815</td>
<td>21458:30992;1628665:92012;1628665:3233938;1628...</td>
<td>1</td>
<td>20131011</td>
<td>2013-10-11</td>
<td>2377 days 22:32:35.614788</td>
</tr>
<tr>
<th>3</th>
<td>917056007</td>
<td>12515996043</td>
<td>50018831</td>
<td>50014815</td>
<td>21458:15841995;21956:3494076;27000458:59723383...</td>
<td>2</td>
<td>20141023</td>
<td>2014-10-23</td>
<td>2000 days 22:32:35.614788</td>
</tr>
<tr>
<th>4</th>
<td>444069173</td>
<td>20487688075</td>
<td>50013636</td>
<td>50008168</td>
<td>21458:30992;13658074:3323064;1628665:3233941;1...</td>
<td>1</td>
<td>20141103</td>
<td>2014-11-03</td>
<td>1989 days 22:32:35.614788</td>
</tr>
</tbody>
</table>
</div>
df.dtypes
user_id object
auction_id int64
cat_id int64
cat1 int64
property object
buy_mount int64
day int64
buy_date datetime64[ns]
diff_day timedelta64[ns]
dtype: object
# 使用dt方法提取属性
# df['diff_day'].dt.days # 提取天数
# df['diff_day'].dt.seconds # 提取秒
# df['diff_day'].dt.microseconds # 提取纳秒
# 将时间差转换为规定的格式
df['时间差'] = df['diff_day']/pd.Timedelta('1 D') # 转换为天数
df['时间差'].head(5)
0 2034.939301
1 2377.939301
2 2377.939301
3 2000.939301
4 1989.939301
Name: 时间差, dtype: float64
df['时间差'] = df['diff_day']/pd.Timedelta('1 H') # 转换为小时
df['时间差'].head(5)
0 48838.543226
1 57070.543226
2 57070.543226
3 48022.543226
4 47758.543226
Name: 时间差, dtype: float64
df['时间差'] = df['diff_day']/pd.Timedelta('1 M') # 转换为分钟数
df['时间差'].head(5)
0 2.930313e+06
1 3.424233e+06
2 3.424233e+06
3 2.881353e+06
4 2.865513e+06
Name: 时间差, dtype: float64
# 将科学计数转换为小数
df['时间差'].head(5).round(decimals=3)
0 2930312.594
1 3424232.594
2 3424232.594
3 2881352.594
4 2865512.594
Name: 时间差, dtype: float64
# D: 天 M: 月 Y: 年
# 转换为天数
df['diff_day'].astype('timedelta64[D]').head(5)
0 2034.0
1 2377.0
2 2377.0
3 2000.0
4 1989.0
Name: diff_day, dtype: float64
# 转换为月数
df['diff_day'].astype('timedelta64[M]').head(5)
0 66.0
1 78.0
2 78.0
3 65.0
4 65.0
Name: diff_day, dtype: float64
# 转换为年数
df['diff_day'].astype('timedelta64[Y]').head(5)
0 5.0
1 6.0
2 6.0
3 5.0
4 5.0
Name: diff_day, dtype: float64
# 转换为小时
# 还可转换为分钟、秒等等
df['diff_day'].astype('timedelta64[h]').head(5)
0 48838.0
1 57070.0
2 57070.0
3 48022.0
4 47758.0
Name: diff_day, dtype: float64
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。