临时需要处理一个 NumPy 的二进制文件(.npy),分析后得知里面存的是 dict 类型的数据,这里简单记录一下处理过程。如果对 NumPy 和 Python 基础不熟悉,可以先看我之前写的文章(这里贴一下 NumPy 的那篇)。
%%time
import numpy as np
Wall time: 135 ms
%%time
import pandas as pd
Wall time: 351 ms
%%time
# Build a DataFrame from the ndarray stored in data.npy.
# The array elements are Python dicts (object dtype), which NumPy serialises
# with pickle; np.load refuses such files unless allow_pickle=True
# (the default has been False since NumPy 1.16.3).
# NOTE(security): unpickling can execute arbitrary code -- only load .npy
# files that come from a trusted source.
df = pd.DataFrame(np.load("data.npy", allow_pickle=True))
Wall time: 910 ms
%%time
df.head(10) # quick preview of the first 10 rows
Wall time: 1 ms
 | 0 |
---|---|
0 | {'email': 'liurh@csdn.net', 'pwd': '9755DD0556... |
1 | {'email': 'fw19@sina.com', 'pwd': '6BB518D1A42... |
2 | {'email': 'whcheng@126.com', 'pwd': '0079ABBA6... |
3 | {'email': 'zh4ang@163.com', 'pwd': 'E23E561F02... |
4 | {'email': 'johnzhou8888@yahoo.com.cn', 'pwd': ... |
5 | {'email': 'zaza902@hotmail.com', 'pwd': '9B084... |
6 | {'email': 'yuping_zhong@163.com', 'pwd': '7D07... |
7 | {'email': 'annnntning@sina.com', 'pwd': '448A2... |
8 | {'email': 'sunnydinasun@sohu.com', 'pwd': 'DBF... |
9 | {'email': 'ysmrose@sohu.com', 'pwd': '22DDD26D... |
%%time
# Column 0 holds one dict per row with the keys "email" and "pwd".
# Index each dict directly: wrapping it in dict(...) first would only make a
# throwaway copy of every row, twice.
records = df[0]
# Extract the "email" field into its own column.
df['Email'] = records.map(lambda rec: rec["email"])
# Extract the "pwd" field into its own column (stored under the name "MD5").
df['MD5'] = records.map(lambda rec: rec["pwd"])
# Drop the original column of raw dicts now that both fields are extracted.
del df[0]
Wall time: 1.05 s
%%time
df.size # total number of cells (rows x columns), not the number of rows
Wall time: 0 ns
2097148
%%time
df.shape  # (number of rows, number of columns)
Wall time: 0 ns
(1048574, 2)
%%time
df.head(10)  # preview the first 10 rows again, now with the Email and MD5 columns
Wall time: 0 ns
 | Email | MD5 |
---|---|---|
0 | liurh@csdn.net | 9755DD05564EAD9EADCACE40B5A02711 |
1 | fw19@sina.com | 6BB518D1A42F22DA5CA62D5EE41C5D4F |
2 | whcheng@126.com | 0079ABBA66856DAFDF2B9A6E0DB23A09 |
3 | zh4ang@163.com | E23E561F0202ACECA30B8F07A48AB8E9 |
4 | johnzhou8888@yahoo.com.cn | 0EB1A2DB91A2BF3FB6275DE659A25805 |
5 | zaza902@hotmail.com | 9B08473C992C07E98389ED1C280A634A |
6 | yuping_zhong@163.com | 7D0710824FF191F6A0086A7E3891641E |
7 | annnntning@sina.com | 448A2BCEE09A3B14C22DC000351216B7 |
8 | sunnydinasun@sohu.com | DBFBA02E366BAB58DF605D6475189A51 |
9 | ysmrose@sohu.com | 22DDD26D62AF8B1C4A216BE18FDFF5B2 |
%%time
# Save as JSON keyed by row label: {"0": {"Email": ..., "MD5": ...}, ...}.
# orient="index" writes exactly the layout that transposing and then using
# the default orient would, but without materialising a transposed frame
# that has one column per row.
df.to_json("user.json", orient="index")
Wall time: 2.85 s