本篇文章请结合下文给出的 Grafana 安装与 InfluxDB 安装两篇文章一起阅读。
这个是监控服务器网速的最后成果,显示的是下载与上传的网速,单位为M。爬虫的原理都是一样的,只不过将数据存到InfluxDB的方式不一样而已。
InfluxDb,是目前比较流行的时间序列数据库;
Grafana,一个可视化面板(Dashboard),有着非常漂亮的图表和布局展示,功能齐全的度量仪表盘和图形编辑器,支持Graphite、zabbix、InfluxDB、Prometheus和OpenTSDB作为数据源
安装 InfluxDB 的 Python 客户端库(注意:这里安装的是 Python 操作库,不是 InfluxDB 数据库本身):
pip install influxdb
获取要展示的数据,包含当前的时间数据,存到InfluxDb里面,然后再到Grafana里面进行相应的配置即可展示;
Grafana安装:https://blog.csdn.net/xc_zhou/article/details/88936662
InfluxDB安装:https://blog.csdn.net/xc_zhou/article/details/89478279
InfluxDb数据库的数据有一定的格式,因为我都是利用python库进行相关操作,所以下面将在python中的格式展示一下:
# Example payload for InfluxDB: a list that holds one dict per data point.
# The names used as values (current_time, collection_name, ...) are defined elsewhere.
json_body = [
{
"measurement": "crawler",  # measurement name (plays the role of a table name)
"time": current_time,  # timestamp of this point
"tags": {
"spider_name": collection_name
},
"fields": {
"count": current_count,
"increase_count": increase_amount,
"size": co_size,
"increase_size": increase_co_size
}
}
]
其中:
measurement:表名;time:时间;tags:标签;fields:字段。
可以看到,数据就是一个列表,里面嵌套了一个字典(每个字典代表一个数据点)。其中,时间字段有特殊要求:需要使用 UTC 时间,并采用 RFC3339 格式(例如 2019-04-25T08:30:00Z)。下面是 Python 的实现方法:
from datetime import datetime, timezone

# Build the point timestamp as a UTC string such as 2019-04-25T08:30:00Z.
# datetime.utcnow() is deprecated since Python 3.12, so request an aware UTC
# time explicitly; the formatted result is the same.
current_time = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
完整代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time :
# @Author :
# @File :
# @Software: PyCharm
import ast,time,pymongo,traceback
import os,configparser
from configparser import SafeConfigParser,ConfigParser
from influxdb import InfluxDBClient
from datetime import datetime
from os.path import getmtime
"""
爬虫可视化存数据到InfluxDb 数据库脚本
采用热更新配置文件的方式
若修改配置的过程中,报错,则会使用上一次的配置信息(非首次,如果首次就报错,就会退出程序)
"""
class spider():
    """Poll MongoDB collection statistics and write them to InfluxDB.

    Settings (MongoDB connection, polling interval, and the database ->
    collection mapping to monitor) are read from an INI file that is
    re-read whenever its modification time changes. A reload that fails
    leaves the previously loaded settings and connection in place.
    """

    def __init__(self, host='192.168.10.56', port=8086, database='test',
                 config_name='settings.conf'):
        """Connect to InfluxDB and remember the config file to watch.

        Args:
            host: InfluxDB host (use 'localhost' for a local instance).
            port: InfluxDB HTTP port.
            database: InfluxDB database to create (if needed) and write to.
            config_name: Path of the INI settings file.

        Raises:
            OSError: If ``config_name`` does not exist (from ``getmtime``).
        """
        self.infl_client = InfluxDBClient(host=host, port=port)
        # Create the target database, then make it the default for writes.
        self.infl_client.create_database(database)
        self.infl_client.switch_database(database)
        self.config_name = config_name
        watched_files = [self.config_name]
        # (path, last modification time) pairs; os.path.getmtime returns the
        # time of the last content modification of the file.
        self.WATCHED_FILES_MTIMES = [(f, getmtime(f)) for f in watched_files]
        # Previous count / size per (database, collection), used for deltas.
        self._count_dict = {}
        self._size_dict = {}

    def parse_config(self, file_name):
        """Load settings from ``file_name`` and (re)connect to MongoDB.

        Everything is parsed and the new connection is checked before any
        attribute is assigned, so a failure leaves the current settings
        untouched.

        Args:
            file_name: Path of the INI file with the ``Mongo_25``, ``time``
                and ``db`` sections.

        Raises:
            configparser.Error: If a section or option is missing.
            ValueError: If ``db_co_dict`` is malformed or not a dict, or a
                numeric option is not an integer.
            Exception: Whatever the MongoDB driver raises when the server
                cannot be reached or rejects the credentials.
        """
        parser = ConfigParser()
        parser.read(file_name, encoding='utf-8')

        section = "Mongo_25"
        replicaset_mongo = parser.get(section, 'replicaset')
        host_mongo = parser.get(section, 'host')
        host1_mongo = parser.get(section, 'host1')
        host2_mongo = parser.get(section, 'host2')
        port_mongo = int(parser.get(section, 'port'))
        user_mongo = parser.get(section, 'user')
        passwd_mongo = parser.get(section, 'password')
        authenticate = parser.get(section, 'authenticate')
        interval = parser.getint('time', 'interval')
        dbs_and_cos = ast.literal_eval(parser.get('db', 'db_co_dict'))
        if not isinstance(dbs_and_cos, dict):
            raise ValueError('db_co_dict must be a dict of database -> collection')

        new_client = None
        first_db = first_collection = None
        if dbs_and_cos:
            # One client serves every database; the first entry only decides
            # the default authentication database in single-host mode.
            first_db, first_collection = next(iter(dbs_and_cos.items()))
            new_client = self._connect(
                replicaset_mongo, host_mongo, [host1_mongo, host2_mongo],
                port_mongo, user_mongo, passwd_mongo,
                authenticate or first_db,
            )

        # Commit only after everything above succeeded.
        self.interval = interval
        self.dbs_and_cos = dbs_and_cos
        if new_client is not None:
            old_client = getattr(self, 'client', None)
            self.client = new_client
            self.db = new_client[first_db]
            self.collection = self.db[first_collection]
            if old_client is not None:
                # Release the sockets held by the replaced connection.
                old_client.close()

    @staticmethod
    def _connect(replicaset, host, replica_hosts, port, user, password,
                 auth_source):
        """Return a MongoClient for a replica set or a single host.

        Credentials are handed to ``MongoClient`` because
        ``Database.authenticate`` no longer exists in PyMongo 4. A ``ping``
        forces a round trip so connection problems surface here rather than
        on the first query.
        """
        credentials = {}
        if replicaset:
            if user:
                # Replica-set mode authenticates against the admin database.
                credentials = dict(username=user, password=password,
                                   authSource='admin')
            client = pymongo.MongoClient(replica_hosts, replicaSet=replicaset,
                                         port=port, **credentials)
        else:
            if user:
                credentials = dict(username=user, password=password,
                                   authSource=auth_source)
            client = pymongo.MongoClient(host, port, **credentials)
        try:
            client.admin.command('ping')
        except Exception:
            client.close()
            raise
        return client

    @staticmethod
    def _build_point(collection_name, current_time, current_count,
                     increase_amount, co_size, increase_co_size):
        """Return the single-point payload written to the ``crawler`` measurement."""
        return [
            {
                "measurement": "crawler",
                "time": current_time,
                "tags": {
                    "spider_name": collection_name,
                },
                "fields": {
                    "count": current_count,
                    "increase_count": increase_amount,
                    "size": co_size,
                    "increase_size": increase_co_size,
                },
            }
        ]

    def insert_data(self):
        """Write one InfluxDB point per configured collection.

        Each point holds the document count, the collection size (the
        ``collstats`` ``size`` value divided by 1024 twice, rounded to two
        decimals), and how much both changed since the previous call. On the
        first call for a collection both increases are zero.
        """
        for db_name, collection_name in self.dbs_and_cos.items():
            self.db = self.client[db_name]
            self.collection = self.db[collection_name]

            stats = self.db.command("collstats", collection_name)
            co_size = round(float(stats.get('size') or 0) / 1024 / 1024, 2)
            # Collection.count() was removed in PyMongo 4; the estimated
            # count is the metadata-based replacement for an unfiltered count.
            current_count = self.collection.estimated_document_count()

            # Key by database as well, so equally named collections in
            # different databases do not share a baseline.
            key = (db_name, collection_name)
            init_count = self._count_dict.get(key, current_count)
            init_size = self._size_dict.get(key, co_size)
            increase_amount = current_count - init_count
            # Round to hide binary floating-point noise in the difference.
            increase_co_size = round(co_size - init_size, 2)
            self._count_dict[key] = current_count
            self._size_dict[key] = co_size

            current_time = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
            json_body = self._build_point(
                collection_name, current_time, current_count,
                increase_amount, co_size, increase_co_size,
            )
            print(json_body)
            self.infl_client.write_points(json_body)

    def _reload_if_changed(self, mtimes):
        """Re-parse the config when a watched file's mtime differs from ``mtimes``.

        On failure the error is printed and the stored mtime is left alone,
        so the reload is retried on the next cycle.
        """
        for path, seen_mtime in list(mtimes.items()):
            try:
                current_mtime = getmtime(path)
                if current_mtime == seen_mtime:
                    continue
                self.parse_config(self.config_name)
                mtimes[path] = current_mtime
            except Exception:
                # Exception (not a bare except) keeps Ctrl-C working.
                print('setting load error')
                traceback.print_exc()

    def auto_get_new_file(self):
        """Load the config, then poll forever with hot reload of the settings.

        Each cycle reloads the config if a watched file changed, writes the
        current statistics, and sleeps ``self.interval`` seconds. An error
        during the initial load propagates to the caller.
        """
        self.parse_config(self.config_name)
        mtimes = dict(self.WATCHED_FILES_MTIMES)
        while True:
            self._reload_if_changed(mtimes)
            self.insert_data()
            time.sleep(self.interval)
if __name__ == '__main__':
    # Script entry point: build the monitor and run its polling loop.
    # Any error is reported as a formatted traceback instead of propagating.
    try:
        spider().auto_get_new_file()
    except Exception:
        import traceback
        print('{exception}'.format(exception=traceback.format_exc()))
# MongoDb 相关配置
[Mongo_Uri]
host = ''
user =''
passwd = ''
[Mongo_25]
replicaset =
host = 192.168.118.3
host1 =''
host2 =''
port = 27017
user =
password =
authenticate =
# [需要展示的数据的数据库,集合名]
[db]
db_co_dict = {
'dm_bond': 'bond_sentiment_news',
}
# [循环间隔时间]
[time]
interval = 7
参考:https://www.jianshu.com/p/0792053aa134
http://www.sohu.com/a/283857025_671965
http://api.mongodb.com/python/current/api/pymongo/database.html