前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >搞笑视频爬虫

搞笑视频爬虫

作者头像
用户2458545
发布2022-09-07 11:40:08
1950
发布2022-09-07 11:40:08
举报
文章被收录于专栏:阿牛的牙
代码语言:python
复制
import json
import hashlib
import time

import requests
import re
import threading
from pymongo import MongoClient
from qiniu import Auth, put_data

import multiprocessing
import sys

sys.path.insert(0, '/data/MyBlog')
from MyBlog.settings import MEDIA_URL
from MyBlog.utils import redis_client
from constants import MONGO_URI, MONGODB_NAME, QINIU_ACCESS_KEY, QINIU_SECRET_KEY, QINIU_BUCKET_NAME



class QiNiu(object):
    """Small wrapper around the Qiniu SDK bound to a single bucket."""

    def __init__(self):
        # Bucket name and credentials are taken from the imported constants.
        self.bucket_name = QINIU_BUCKET_NAME
        self.user = Auth(QINIU_ACCESS_KEY, QINIU_SECRET_KEY)

    def up_stream(self, stream, key):
        """Upload ``stream`` to the configured bucket under ``key``.

        A fresh upload token is requested for every call; 3600 is passed as
        the third argument (presumably the token lifetime in seconds --
        TODO confirm). The return value of ``put_data`` is passed through
        unchanged.
        """
        upload_token = self.user.upload_token(self.bucket_name, key, 3600)
        # NOTE(review): progress_handler receives True rather than a
        # callable -- confirm this is what the SDK expects.
        outcome = put_data(upload_token, key, stream, progress_handler=True)
        return outcome

def get_mongo():
    """Return the ``MONGODB_NAME`` database from a client on ``MONGO_URI``.

    The client is constructed with ``connect=False``.
    """
    client = MongoClient(MONGO_URI, connect=False)
    database = client[MONGODB_NAME]
    return database


# Module-level handles created once at import time.
# `qiniu_client` is used by VideoSpider.down_and_upload_media; `mongodb` is
# not referenced anywhere else in the visible code (the spider uses
# `self.mongodb` instead).
mongodb = get_mongo()
qiniu_client = QiNiu()


class VideoSpider(DuanziSpider):
    """Crawl gaoxiao.52op.net list pages, mirror each MP4 to Qiniu and record it in MongoDB.

    ``self.headers`` and ``self.mongodb`` are not assigned in this class;
    they are presumably provided by ``DuanziSpider`` -- TODO confirm.
    """

    def __init__(self, *args, **kwargs):
        DuanziSpider.__init__(self, *args, **kwargs)
        # Other list pages on the same site that were tried as entry points:
        #   http://gaoxiao.52op.net/egao/index.htm
        #   http://gaoxiao.52op.net/fangyan/
        self.base_url = 'http://gaoxiao.52op.net/egao/'
        self.json_url = 'http://gaoxiao.52op.net/flvData/d.aspx?id={}'

    def get_media_list(self):
        """Collect the JSON metadata URLs of all videos linked from the list pages.

        Fetches ``self.base_url``, discovers the pager links on it, then
        scans every page for anchors that open in a new window and point at
        an ``htm`` document.

        Returns:
            set: ``self.json_url`` formatted with each discovered video id.
        """
        res = requests.get(self.base_url, headers=self.headers)
        res.encoding = 'utf-8'
        # Raw string: ``\S``/``\s`` must reach the regex engine untouched.
        pager_links = re.findall(
            r'<a class="linkPage" href="(.*?)">([\S\s]*?)</a>', res.text)

        # Keep the page list in a local so self.base_url stays a string and
        # this method can be called more than once. Repeated pager links are
        # dropped so that no page is downloaded twice.
        page_urls = [self.base_url]
        for href, _ in pager_links:
            if href not in page_urls:
                page_urls.append(href)

        all_url = set()
        for page_url in page_urls:
            ret = requests.get(url=page_url, headers=self.headers)
            ret.encoding = 'utf-8'
            # The greedy ``(.*)`` captures up to the last ``">`` on the line,
            # so the group can hold the href plus trailing attributes; the
            # checks below rely on that.
            anchors = re.findall(r'<a href="(.*)">([\S\s]*?)</a>', ret.text)
            for attrs, _ in anchors:
                if attrs.endswith('target="_blank') and 'htm' in attrs:
                    # First space-separated token is the href; its last path
                    # segment without the extension is the video id.
                    _id = attrs.split(' ')[0].split('/')[-1].split('.')[0]
                    all_url.add(self.json_url.format(_id))
        return all_url

    def down_and_upload_media(self, url):
        """Download one video described by ``url`` and store it.

        ``url`` is a JSON metadata endpoint. The MP4 it points to is
        downloaded, de-duplicated by the MD5 of its content, uploaded to
        Qiniu as ``video/<md5>.mp4`` and, on success, recorded in the
        ``video`` collection. Any step that fails makes the method return
        ``None`` without writing anything.
        """
        print(url)
        res = requests.get(url=url, headers=self.headers)
        res.encoding = 'utf-8'
        if res.status_code not in (200, 201):
            return
        try:
            data = json.loads(res.text)
        except ValueError:
            # One malformed response must not abort the whole crawl.
            print('invalid json from {}'.format(url))
            return
        payload = data.get('Data')
        if not payload:
            return
        mp4_url = payload.get('MP4')
        if not mp4_url:
            return

        # headers must be passed by keyword: the second positional argument
        # of requests.get is ``params`` (the query string).
        res = requests.get(mp4_url, headers=self.headers)
        if res.status_code not in (200, 201):
            return

        uid = hashlib.md5(res.content).hexdigest()
        existing = self.mongodb['video'].find_one({'uid': uid})
        if existing:
            # Same content already stored; skip the upload.
            print(existing)
            return

        r, info = qiniu_client.up_stream(res.content, 'video/' + uid + '.mp4')
        # The upload result may be None on failure, so guard before .get().
        stored_key = r.get('key') if r else None
        if not stored_key:
            return

        # 'Name' is read from the top-level object; tolerate it being absent.
        title_desc = data.get('Name') or ''
        if ' ' in title_desc:
            parts = title_desc.split(' ')
            title = parts[0]
            desc = parts[1]
        else:
            title = desc = title_desc

        to_write = {
            'id': self.get_inc_id(),
            'uid': uid,
            'src': MEDIA_URL + stored_key,
            'title': title,
            'describe': desc
        }
        print('write data {}'.format(to_write))
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        self.mongodb['video'].insert_one(to_write)

    @staticmethod
    def get_inc_id():
        """Return the result of incrementing the ``video_id`` Redis counter by 1."""
        return redis_client.incrby('video_id', 1)

    def run(self):
        """Discover all video metadata URLs and process them one after another."""
        urls = self.get_media_list()
        print(len(urls))
        # Processed sequentially; the process pool that used to be created
        # here was never given any work and was never closed.
        for media_url in urls:
            self.down_and_upload_media(media_url)

# Script entry: the crawl starts as soon as this module is executed or
# imported (there is no ``__main__`` guard).
video_spider = VideoSpider()
video_spider.run()

瞎玩玩,简单爬

源网站:http://gaoxiao.52op.net/

links: https://www.mongona.com/?c=6

本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。
原始发表:2019年10月20日,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档