采集 CSDN 热度排行榜
F12 启动开发者模式,分析网络传输,捕获传输请求URL如下
综合 https://blog.csdn.net/phoenix/web/blog/hot-rank?page=0&pageSize=25
悬赏 https://bizapi.csdn.net/mp/ask/v1/questions/getRewardRankList?pageNo=1&pageSize=20
C/C++ https://blog.csdn.net/phoenix/web/blog/hot-rank?page=0&pageSize=25&child_channel=c%2Fc%2B%2B
java https://blog.csdn.net/phoenix/web/blog/hot-rank?page=0&pageSize=25&child_channel=java
javascript https://blog.csdn.net/phoenix/web/blog/hot-rank?page=0&pageSize=25&child_channel=javascript
python https://blog.csdn.net/phoenix/web/blog/hot-rank?page=0&pageSize=25&child_channel=python
人工智能 https://blog.csdn.net/phoenix/web/blog/hot-rank?page=0&pageSize=25&child_channel=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
猜测其他领域榜单变化的只有 child_channel 参数,经验证属实。随机改变 pageSize 和 page 并观察响应:每次最多只响应 50 条数据,且只有两页(page=0 和 page=1)。滚动排行榜,如下图所示,综合榜单只有 100 条,初步证实猜测;领域排行榜页面只显示前 50 条,但可以请求到前 100 条数据。
具体爬虫过程中的处理思路写在后面完整代码的注释里了
import requests
import pandas as pd
from fake_useragent import UserAgent
import json
import urllib.parse
import time
def parse_data(response):
    """Parse a hot-rank API response into a DataFrame of selected fields.

    Args:
        response: Object exposing the raw body as bytes through ``.content``
            (for example a ``requests.Response``). The body is decoded as
            UTF-8 JSON and the records are read from its ``"data"`` key.

    Returns:
        pandas.DataFrame holding, in order, the columns 作者, 文章名称, 热度,
        文章链接, 评论数, 收藏数 and 浏览量. When ``"data"`` is empty or null
        an empty frame with those same columns is returned, so callers can
        still concatenate the pages.

    Raises:
        KeyError: If the payload has no ``"data"`` key, or a non-empty
            record set lacks one of the expected fields.
    """
    wanted = ['作者', '文章名称', '热度', '文章链接', '评论数', '收藏数', '浏览量']

    payload = json.loads(response.content.decode('utf-8'))
    records = payload['data']
    if not records:
        # An empty frame has no columns at all, so selecting `wanted` from it
        # would raise KeyError; build the expected empty shape explicitly.
        return pd.DataFrame(columns=wanted)

    # Map the API field names onto the output column names. Fields that are
    # not kept in the result do not need to be renamed.
    col = {
        "nickName": "作者",
        "articleTitle": "文章名称",
        "hotRankScore": "热度",
        "articleDetailUrl": "文章链接",
        "commentCount": "评论数",
        "favorCount": "收藏数",
        "viewCount": "浏览量",
    }
    df = pd.DataFrame(records).rename(columns=col)
    # Keep only the fields of interest, in a fixed order.
    return df[wanted]
def get_rank_all():
    """Fetch the site-wide hot-rank list and save it as a CSV file.

    Requests pages 0 and 1 of the general ranking with 50 entries per page,
    parses each response with ``parse_data``, concatenates the pages, adds a
    1-based ``热榜排名`` column that follows the concatenated order, and
    writes the result to a timestamped CSV file at a relative path.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        requests.Timeout: If a page request does not answer in time.
    """
    # One generator is enough; a fresh random User-Agent is drawn per request.
    ua = UserAgent()

    result = []
    for page in range(2):
        url = f"https://blog.csdn.net/phoenix/web/blog/hot-rank?page={page}&pageSize=50"
        headers = {'User-Agent': ua.random}
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Fail here on an HTTP error instead of later while parsing the body.
        response.raise_for_status()
        result.append(parse_data(response))

    # Merge the pages and derive the rank from the combined row order.
    all_result = pd.concat(result).reset_index(drop=True)
    all_result["热榜排名"] = all_result.index + 1

    # Timestamp for the file name; "_" is used because ":" is not allowed in
    # file names on every platform.
    now = time.strftime("%Y-%m-%d %H_%M_%S", time.localtime())
    # utf-8-sig instead of gbk: titles may contain characters (emoji, rare
    # symbols) that gbk cannot encode, which would abort the export. The BOM
    # keeps the file readable in Excel.
    all_result.to_csv(f"{now} C站综合热榜.csv", index=False, encoding="utf-8-sig")
def get_rank_child_channel(channel):
    """Fetch the hot-rank list of one channel and save it as a CSV file.

    Requests pages 0 and 1 of the channel ranking with 50 entries per page,
    parses each response with ``parse_data``, concatenates the pages, adds a
    1-based ``热榜排名`` column that follows the concatenated order, and
    writes the result to a timestamped CSV file at a relative path.

    Args:
        channel: Channel name as passed in the ``child_channel`` query
            parameter, e.g. ``"人工智能"`` or ``"C/C++"``.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        requests.Timeout: If a page request does not answer in time.
    """
    # Percent-encode the channel for the query string. This matters for
    # "C/C++": an unescaped "+" in a query string stands for a space.
    encoded_channel = urllib.parse.quote(channel, safe="")
    # One generator is enough; a fresh random User-Agent is drawn per request.
    ua = UserAgent()

    result = []
    for page in range(2):
        # The page number must vary; a fixed page=0 fetches the same page
        # twice and yields duplicated rows.
        url = (
            "https://blog.csdn.net/phoenix/web/blog/hot-rank"
            f"?page={page}&pageSize=50&child_channel={encoded_channel}"
        )
        headers = {'User-Agent': ua.random}
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Fail here on an HTTP error instead of later while parsing the body.
        response.raise_for_status()
        result.append(parse_data(response))

    # Merge the pages and derive the rank from the combined row order.
    all_result = pd.concat(result).reset_index(drop=True)
    all_result["热榜排名"] = all_result.index + 1

    # Timestamp for the file name; "_" is used because ":" is not allowed in
    # file names on every platform.
    now = time.strftime("%Y-%m-%d %H_%M_%S", time.localtime())
    # Path separators in the channel name (as in "C/C++") would be read as
    # directories and make the write fail, so replace them.
    file_channel = channel.replace("/", "_").replace("\\", "_")
    all_result.to_csv(f"{now} C站{file_channel}领域热榜.csv", index=False, encoding="utf-8-sig")
def main():
    """Export the site-wide hot-rank list, then one list per channel."""
    # Site-wide ranking first.
    get_rank_all()

    # Per-channel rankings. Only a sample of channels is listed here; the
    # website's recommendations show the rest.
    channels = (
        "人工智能",
        "C/C++",
        "Java",
        "区块链",
        "大数据",
        "移动开发",
        "嵌入式",
    )
    for name in channels:
        get_rank_child_channel(name)


# Run the export only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
扫码关注腾讯云开发者
领取腾讯云代金券
Copyright © 2013 - 2025 Tencent Cloud. All Rights Reserved. 腾讯云 版权所有
深圳市腾讯计算机系统有限公司 ICP备案/许可证号:粤B2-20090059 深公网安备号 44030502008569
腾讯云计算(北京)有限责任公司 京ICP证150476号 | 京ICP备11018762号 | 京公网安备号11010802020287
Copyright © 2013 - 2025 Tencent Cloud.
All Rights Reserved. 腾讯云 版权所有