前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >python爬虫实例

python爬虫实例

作者头像
cuijianzhe
发布2022-06-14 17:36:24
5310
发布2022-06-14 17:36:24
举报
文章被收录于专栏:cuijianzhe

****# 虽然没什么用,学习阶段,练手。

一、 爬取表情包

1、爬取斗图啦:
代码语言:javascript
复制
import requests
from bs4 import BeautifulSoup
import threading
glock = threading.Lock()
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Referer': 'https://hacpai.com/login'}

class get_doutu():
    def all_htmls(self):
        htmls = []
        for i in range(1,10):
            url = "https://www.doutula.com/photo/list/?page=%s"%i
            res = requests.get(url,headers=headers)
            htmls.append(res.text)
        return htmls
    def html_parser(self):
        datas = []
        for html in self.all_htmls():
            soup = BeautifulSoup(html,'lxml')
            img_data = soup.find_all('img',attrs={'class' : 'img-responsive lazy image_dta'})
            for data in img_data:
                datas.append((data['alt'],data['data-original']))
        return datas
    def Download_img(self):
        for idx, (title, img_url) in enumerate(self.html_parser()):
            post_fix = img_url[-3:]
            filename = f"./image/{title}.{post_fix}"
            print(idx, filename)
            img_info = requests.get(img_url)
            with open(filename, "wb")as f:
                f.write(img_info.content)

if __name__ ==  "__main__":
    doutu = get_doutu()
    doutu.Download_img()

参考:

https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/#find-all

2 、爬取发表情网站
代码语言:javascript
复制
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time  : 2020/5/1 16:11
# @Author : cuijianzhe
# @File  : biaoqingbao.py
# @Software: PyCharm

import threading
import requests
from bs4 import BeautifulSoup
import re
class Download_picture():
    def download_all_htmls(self):
        htmls = []
        for num in range(1,5):
            url = "https://fabiaoqing.com/biaoqing/lists/page/%s.html"%(num)
            r = requests.get(url)
            htmls.append(r.text)
        return htmls
    def html_parser(self):
        """
            解析单个HTML,得到数据
            @return list((img_title, img_url))
        """
        img_divs = []
        for html in self.download_all_htmls():
            soup = BeautifulSoup(html, 'html.parser')
            img_divs.extend(soup.find_all("div", class_="tagbqppdiv"))
        datas = []
        for img_div in img_divs:
            img_node = img_div.find("img")
            if not img_node: continue
            datas.append((img_node["title"], img_node["data-original"]))
        return datas

    def get_picture(self):
        for idx,(title,img_url) in enumerate(self.html_parser()):
            # 移除标点符号,只保留中文、大小写字母和阿拉伯数字
            reg = "[^0-9A-Za-z\u4e00-\u9fa5]"
            title = re.sub(reg, '', title)
            # 发现了超长的图片标题,做截断
            if len(title) > 10: title = title[:10]
            # 得到jpg还是gif后缀
            post_fix = img_url[-3:]
            filename = f"./output/{title}.{post_fix}"
            print(idx, filename)
            img_data = requests.get(img_url)
            with open(filename, "wb")as f:
                f.write(img_data.content)
if __name__ == "__main__":
    get_biaoqingbao = Download_picture()
    for m in range(3):  #创建3个线程
        thread = threading.Thread(target=get_biaoqingbao.get_picture())
        thread.start()

爬取黑客派排行榜

代码语言:javascript
复制
import requests
import re
html =  requests.get('https://hacpai.com/top/general').text
result = re.findall('class="fn-flex-1".*?aria-name="(.*?)".*?href="(.*?)".*?',html,flags=re.S)
#paihang_str = str(result)
for value in result:
    name, url = value
    tup1 = name.replace("/"," "),url
    tup2 = ' '.join(tup1)
    print(tup2)
    with open('paihangbang.txt','a',encoding='utf-8') as f:
        f.write(tup2)

**效果如下:**

爬取豆瓣电影排行榜

代码语言:javascript
复制
import requests

import re
content = requests.get('https://movie.douban.com/chart').text
# 豆瓣电影排行榜
pattern = re.compile('class="pl2".*?<.*?="(.*?)".*?>(.*?)<span.*?>(.*?)</span>.*?"rating_nums">(.*?)</span>.*?"pl">(.*?)</span>', re.S)
# compile可以在多次使用中提高效率,这里影响不大
results = re.findall(pattern, content)
for result in results:
    url, name1, name2, nums, pl = result
    print(url, name1.replace("/","").strip(), name2.replace("/","").strip(), nums, pl)

python 带 cookie 登录黑客派

代码语言:javascript
复制
import requests
from fake_useragent import UserAgent

headers = {
    'Referer': 'https://hacpai.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}

data = {
    "nameOrEmail":"cuijianzhe",
    "userPassword":"54949cdcbe4d252ba3400897883df589811",
    "captcha":""
}

request = requests.session()

base_url = "https://hacpai.com/login"
res = requests.post(base_url,headers=headers,json=data)
res.raise_for_status()

print(res.text)

二、 爬取网易云音乐

  • 第一步:先把网络请求 get 下来
代码语言:javascript
复制
#http://music.163.com/song/media/outer/url?id=1377519494  #对外开放的下载接口

import requests
import urllib.request  #进行网络数据下载到本地
from fake_useragent import UserAgent
import re

def getResponse(url,headers):
    '''
    :return: html信息
    '''
    try:
        response = requests.get(url=url,headers=headers)
        if response.status_code == 200:
            return response
        return None
    except:
        return None

if __name__ == '__main__':
    url = 'https://music.163.com/song?id=1377519494'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    res = getResponse(url,headers)
    res
    print(res.text)

歌曲名信息:

  • 第二步:
代码语言:javascript
复制
……
def getSongName(songid):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    url = 'https://music.163.com/song?id={}'.format(songid)
    Text = getResponse(url,headers=headers).text
    titile = re.findall('<title>(.*?)</title>',Text,re.S)
    name = titile[0].split('-')[0]
    return name.strip()
……

if __name__ == '__main__':
    name = getSongName(1377519494)
    print(name)

获取歌曲名称:

  • 第三步下载单曲:
代码语言:javascript
复制
……
if __name__ == '__main__':
    songid = input('请输入要下载的歌曲id:')
    url = 'http://music.163.com/song/media/outer/url?id={}.mp3'.format(songid)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    down_url = getResponse(url,headers).url
    SongName = getSongName(songid)
    urllib.request.urlretrieve(down_url,SongName+'.mp3')
……

下载成功:

  • 完整代码:
代码语言:javascript
复制
import requests
import urllib.request  #进行网络数据下载到本地
from fake_useragent import UserAgent
import re
def getResponse(url,headers):
    '''
    :return: html信息
    '''
    try:
        response = requests.get(url=url,headers=headers)
        if response.status_code == 200:
            return response
        return None
    except:
        return None

def getSongName(songid):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    url = 'https://music.163.com/song?id={}'.format(songid)
    Text = getResponse(url,headers=headers).text
    titile = re.findall('<title>(.*?)</title>',Text,re.S)
    name = titile[0].split('-')[0]
    return name.strip()

if __name__ == '__main__':
    songid = input('请输入要下载的歌曲id:')
    url = 'http://music.163.com/song/media/outer/url?id={}.mp3'.format(songid)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    down_url = getResponse(url,headers).url
    SongName = getSongName(songid)
    urllib.request.urlretrieve(down_url,SongName+'.mp3') #网上歌曲映射到本地

爬取热门歌单并批量下载

代码语言:javascript
复制
import requests
import re
from multiprocessing import Pool

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}
#todo:下载整个歌单网站信息
def get_page(url):
    res = requests.get(url,headers=headers)
    data = re.findall('<a title="(.*?)" href="/playlist\?id=(\d+)" .*?</a>',res.text)
    print(data)
    pool = Pool(4)
    pool.map(get_songs,data)
#todo:获取整个歌单歌曲并进行下载
def get_songs(data):
    playlist_url = 'https://music.163.com/playlist?id=%s'%data[1]
    res = requests.get(playlist_url,headers=headers)
    for i in re.findall('<a href="/song\?id=(\d+)">(.*?)</a>',res.text):
        down_url = 'http://music.163.com/song/media/outer/url?id=%s'%i[0]
        print(down_url)
        try:
            with open('./music/'+i[1]+'.mp3','wb') as f:
                f.write(requests.get(down_url,headers=headers).content)
        except:
            pass

if __name__ == '__main__':
    playlist_url = 'https://music.163.com/discover/playlist/?order=hot'
    get_page(playlist_url)

爬取网易云推荐歌单

代码语言:javascript
复制
import requests
import re
from multiprocessing import Pool
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}
music_list = []
def get_page(url):
    res = (requests.get(url,headers=headers)).text
    data = re.findall('<a title="(.*?)" class="tit s-fc0" href="/playlist\?id=(\d+)"',res)
    print(data)
    pool = Pool(4)
    pool.map(get_song,data)

def get_song(data):
    gedan_url = 'https://music.163.com//playlist?id=%s'%data[1]
    res = requests.get(gedan_url,headers=headers)
    for i in re.findall('<a href="/song\?id=(\d+)">(.*?)</a>',res.text):
        down_url = 'http://music.163.com/song/media/outer/url?id=%s'%i[0]
        print(down_url)
        try:
            with open("./music/"+i[1]+".mp3","wb") as f:
                f.write(requests.get(down_url,headers=headers).content)
        except:
            pass

if __name__ == '__main__':
    my_url = 'https://music.163.com/discover'
    get_page(my_url)

标题:python爬虫实例

作者:cuijianzhe

地址:https://cloud.tencent.com/developer/article/2022756

本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。
如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 一、 爬取表情包
    • 爬取黑客派排行榜
      • 爬取豆瓣电影排行榜
        • python 带 cookie 登录黑客派
        • 二、 爬取网易云音乐
          • 爬取热门歌单并批量下载
            • 爬取网易云推荐歌单
        领券
        问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档