import pandas as pd
import numpy as np
import osos.getcwd()'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据转换'os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')df = pd.read_csv('baby_trade_history.csv', encoding='utf-8', dtype={'user_id':str})df.head(5)<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>user_id</th> <th>auction_id</th> <th>cat_id</th> <th>cat1</th> <th>property</th> <th>buy_mount</th> <th>day</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>786295544</td> <td>41098319944</td> <td>50014866</td> <td>50022520</td> <td>21458:86755362;13023209:3593274;10984217:21985...</td> <td>2</td> <td>20140919</td></tr><tr> <th>1</th> <td>532110457</td> <td>17916191097</td> <td>50011993</td> <td>28</td> <td>21458:11399317;1628862:3251296;21475:137325;16...</td> <td>1</td> <td>20131011</td></tr><tr> <th>2</th> <td>249013725</td> <td>21896936223</td> <td>50012461</td> <td>50014815</td> <td>21458:30992;1628665:92012;1628665:3233938;1628...</td> <td>1</td> <td>20131011</td></tr><tr> <th>3</th> <td>917056007</td> <td>12515996043</td> <td>50018831</td> <td>50014815</td> <td>21458:15841995;21956:3494076;27000458:59723383...</td> <td>2</td> <td>20141023</td></tr><tr> <th>4</th> <td>444069173</td> <td>20487688075</td> <td>50013636</td> <td>50008168</td> <td>21458:30992;13658074:3323064;1628665:3233941;1...</td> <td>1</td> <td>20141103</td></tr></tbody>
</table>
</div>
df.info()<class 'pandas.core.frame.DataFrame'>RangeIndex: 29971 entries, 0 to 29970Data columns (total 7 columns):user_id 29971 non-null objectauction_id 29971 non-null int64cat_id 29971 non-null int64cat1 29971 non-null int64property 29827 non-null objectbuy_mount 29971 non-null int64day 29971 non-null int64dtypes: int64(5), object(2)memory usage: 1.6+ MBdf['buy_date'] = pd.to_datetime(df['day'], format='%Y%m%d', errors='coerce')df.info()<class 'pandas.core.frame.DataFrame'>RangeIndex: 29971 entries, 0 to 29970Data columns (total 8 columns):user_id 29971 non-null objectauction_id 29971 non-null int64cat_id 29971 non-null int64cat1 29971 non-null int64property 29827 non-null objectbuy_mount 29971 non-null int64day 29971 non-null int64buy_date 29971 non-null datetime64[ns]dtypes: datetime64[ns](1), int64(5), object(2)memory usage: 1.8+ MBdf.head(5)<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>user_id</th> <th>auction_id</th> <th>cat_id</th> <th>cat1</th> <th>property</th> <th>buy_mount</th> <th>day</th> <th>buy_date</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>786295544</td> <td>41098319944</td> <td>50014866</td> <td>50022520</td> <td>21458:86755362;13023209:3593274;10984217:21985...</td> <td>2</td> <td>20140919</td> <td>2014-09-19</td></tr><tr> <th>1</th> <td>532110457</td> <td>17916191097</td> <td>50011993</td> <td>28</td> <td>21458:11399317;1628862:3251296;21475:137325;16...</td> <td>1</td> <td>20131011</td> <td>2013-10-11</td></tr><tr> <th>2</th> <td>249013725</td> <td>21896936223</td> <td>50012461</td> <td>50014815</td> <td>21458:30992;1628665:92012;1628665:3233938;1628...</td> <td>1</td> <td>20131011</td> <td>2013-10-11</td></tr><tr> <th>3</th> <td>917056007</td> <td>12515996043</td> <td>50018831</td> <td>50014815</td> <td>21458:15841995;21956:3494076;27000458:59723383...</td> <td>2</td> <td>20141023</td> <td>2014-10-23</td></tr><tr> <th>4</th> <td>444069173</td> <td>20487688075</td> <td>50013636</td> <td>50008168</td> <td>21458:30992;13658074:3323064;1628665:3233941;1...</td> <td>1</td> <td>20141103</td> <td>2014-11-03</td></tr></tbody>
</table>
</div>
# 使用dt方法提取属性
# df['buy_date'].dt.year # 提取年
# df['buy_date'].dt.month # 提取月
# df['buy_date'].dt.day # 提取天df['diff_day'] = pd.datetime.now() - df['buy_date'] # 时间差格式df.head(5)<div>
<style scoped>
.dataframe tbody tr th:only-of-type { vertical-align: middle;}.dataframe tbody tr th { vertical-align: top;}.dataframe thead th { text-align: right;}</style>
<table border="1" class="dataframe">
<thead>
<tr style="text-align: right;"> <th></th> <th>user_id</th> <th>auction_id</th> <th>cat_id</th> <th>cat1</th> <th>property</th> <th>buy_mount</th> <th>day</th> <th>buy_date</th> <th>diff_day</th></tr></thead>
<tbody>
<tr> <th>0</th> <td>786295544</td> <td>41098319944</td> <td>50014866</td> <td>50022520</td> <td>21458:86755362;13023209:3593274;10984217:21985...</td> <td>2</td> <td>20140919</td> <td>2014-09-19</td> <td>2034 days 22:32:35.614788</td></tr><tr> <th>1</th> <td>532110457</td> <td>17916191097</td> <td>50011993</td> <td>28</td> <td>21458:11399317;1628862:3251296;21475:137325;16...</td> <td>1</td> <td>20131011</td> <td>2013-10-11</td> <td>2377 days 22:32:35.614788</td></tr><tr> <th>2</th> <td>249013725</td> <td>21896936223</td> <td>50012461</td> <td>50014815</td> <td>21458:30992;1628665:92012;1628665:3233938;1628...</td> <td>1</td> <td>20131011</td> <td>2013-10-11</td> <td>2377 days 22:32:35.614788</td></tr><tr> <th>3</th> <td>917056007</td> <td>12515996043</td> <td>50018831</td> <td>50014815</td> <td>21458:15841995;21956:3494076;27000458:59723383...</td> <td>2</td> <td>20141023</td> <td>2014-10-23</td> <td>2000 days 22:32:35.614788</td></tr><tr> <th>4</th> <td>444069173</td> <td>20487688075</td> <td>50013636</td> <td>50008168</td> <td>21458:30992;13658074:3323064;1628665:3233941;1...</td> <td>1</td> <td>20141103</td> <td>2014-11-03</td> <td>1989 days 22:32:35.614788</td></tr></tbody>
</table>
</div>
df.dtypesuser_id objectauction_id int64cat_id int64cat1 int64property objectbuy_mount int64day int64buy_date datetime64[ns]diff_day timedelta64[ns]dtype: object# 使用dt方法提取属性
# df['diff_day'].dt.days # 提取天数
# df['diff_day'].dt.seconds # 提取秒
# df['diff_day'].dt.microseconds # 提取纳秒# 将时间差转换为规定的格式
df['时间差'] = df['diff_day']/pd.Timedelta('1 D') # 转换为天数df['时间差'].head(5)0 2034.9393011 2377.9393012 2377.9393013 2000.9393014 1989.939301Name: 时间差, dtype: float64df['时间差'] = df['diff_day']/pd.Timedelta('1 H') # 转换为小时df['时间差'].head(5)0 48838.5432261 57070.5432262 57070.5432263 48022.5432264 47758.543226Name: 时间差, dtype: float64df['时间差'] = df['diff_day']/pd.Timedelta('1 M') # 转换为分钟数df['时间差'].head(5)0 2.930313e+061 3.424233e+062 3.424233e+063 2.881353e+064 2.865513e+06Name: 时间差, dtype: float64# 将科学计数转换为小数
df['时间差'].head(5).round(decimals=3)0 2930312.5941 3424232.5942 3424232.5943 2881352.5944 2865512.594Name: 时间差, dtype: float64# D: 天 M: 月 Y: 年
# 转换为天数
df['diff_day'].astype('timedelta64[D]').head(5)0 2034.01 2377.02 2377.03 2000.04 1989.0Name: diff_day, dtype: float64# 转换为月数
df['diff_day'].astype('timedelta64[M]').head(5)0 66.01 78.02 78.03 65.04 65.0Name: diff_day, dtype: float64# 转换为年数
df['diff_day'].astype('timedelta64[Y]').head(5)0 5.01 6.02 6.03 5.04 5.0Name: diff_day, dtype: float64# 转换为小时
# 还可转换为分钟、秒等等
df['diff_day'].astype('timedelta64[h]').head(5)0 48838.01 57070.02 57070.03 48022.04 47758.0Name: diff_day, dtype: float64原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。