前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >Python爬虫实践

Python爬虫实践

作者头像
bering
发布2020-05-26 22:36:10
4110
发布2020-05-26 22:36:10
举报
文章被收录于专栏:游戏开发之旅

爬取博客信息

代码语言:javascript
复制
import requests
from bs4 import BeautifulSoup
# import pandas

def GetBlogByPage(pageNum):
	"""Scrape one listing page of the target CSDN blog and append each
	article's metadata to blog.txt.

	pageNum: 1-based index of the blog article-list page.

	Each record is a dict with title, source (link), sendTime, ReadNum
	and writeNum, written as one line of blog.txt.
	"""
	headers = {
		"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
	}
	targetUrl = "https://blog.csdn.net/CJB_King/article/list/{}?"
	response = requests.get(targetUrl.format(pageNum), headers=headers)
	response.encoding = 'utf-8'
	soup = BeautifulSoup(response.text, "html.parser")

	getTargetInfo = []
	articles = soup.select('.article-item-box')
	for article in articles:
		info = {
			"title": article.a.text.strip(),
			"source": article.a['href'].strip(),
			"sendTime": article.div.p.span.text.strip(),
			"ReadNum": article.div.select('span')[1].text,
			"writeNum": article.div.select('span')[3].text,
		}
		getTargetInfo.append(info)
	# Bug fix: mode 'w' truncated blog.txt on every call, so after the
	# page loop only the LAST page survived; append mode accumulates all
	# pages.  An explicit encoding avoids UnicodeEncodeError on Windows
	# default codecs, and a newline separates the records.
	with open("blog.txt", 'a', encoding='utf-8') as f:
		for info in getTargetInfo:
			print(info)
			f.write(str(info) + "\n")


# Crawl listing pages 1 through 8, one request per page.
for pageIndex in range(1, 9):
	GetBlogByPage(pageIndex)

爬取租房信息

代码语言:javascript
复制
from bs4 import BeautifulSoup
import requests
import csv
import time
import lxml

# Listing URL template; the minprice filter keeps 2000-4000 CNY/month rooms.
url = "https://bj.58.com/pinpaigongyu/pn/{page}/?minprice=2000_4000"

# Number of pages already fetched; starts at 0.
page = 0

# Bug fix: the csv file was opened without a context manager (left open if
# any request/parse raised) and without newline=''/encoding — on Windows
# the csv module then emits a blank row after every record and non-ASCII
# text can fail to encode.  `with` + newline='' + utf-8 fixes all three.
with open("rent1.csv", "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',')

    while True:
        page += 1
        print("fetch: ", url.format(page=page))
        # The site rate-limits aggressively; sleep 1 second between pages
        # to avoid getting blocked.
        time.sleep(1)
        response = requests.get(url.format(page=page))
        html = BeautifulSoup(response.text, features="lxml")
        house_list = html.select(".list > li")

        # Stop when a page yields no more listings.
        if not house_list:
            break

        for house in house_list:
            house_title = house.select("h2")[0].string
            house_url = house.select("a")[0]["href"]
            house_info_list = house_title.split()
            print(house_info_list)
            # If the second column is an apartment/community brand name,
            # the first column holds the address instead.
            if "公寓" in house_info_list[1] or "青年社区" in house_info_list[1]:
                house_location = house_info_list[0]
            else:
                house_location = house_info_list[1]

            house_money = house.select(".money")[0].select("b")[0].string
            csv_writer.writerow([house_title.strip(), house_location.strip(), house_money.strip(), house_url.strip()])

爬取慕课网课程信息

代码语言:javascript
复制
import requests
from pyquery import PyQuery as pq
from urllib.parse import urljoin
import pandas

# Accumulates one metadata dict per course across all scraped pages.
totalInfo = []


def GetTargetPageInfo(pageNum):
	"""Scrape one page of the imooc course list and append each course's
	metadata to the module-level totalInfo list.

	pageNum: page index substituted into the course-list URL.
	"""
	with requests.Session() as s:
		# Bug fix: the original hard-coded .format(1), so every call
		# fetched page 1 and the pageNum argument was silently ignored.
		res = s.get("https://www.imooc.com/course/list?page={}".format(pageNum))
		d = pq(res.text)
		courses = d.items(".course-card-container")  # all course cards on the page

		for course in courses:  # collect the fields of each course
			title = course.find('.course-card-name').text()            # course name
			des = course.find('.course-card-desc').text()              # course description
			level = course.find('.course-card-info>span:eq(0)').text() # difficulty level
			users = course.find('.course-card-info>span:eq(1)').text() # viewer count
			prices = course.find('.price').text()                      # course price
			labels = course.find('.course-label').text().split(' ')    # tag labels
			url = urljoin("https://www.imooc.com/learn/", course.find("a").attr("href"))        # absolute course URL
			img_url = urljoin("https://img3.mukewang.com/", course.find("img").attr("src"))     # absolute image URL
			totalInfo.append({
				"title": title,
				"des": des,
				"level": level,
				"users": users,
				"prices": prices,
				"labels": labels,
				"url": url,
				"img_url": img_url,
			})
# Fetch course-list pages 0 through 6.
# NOTE(review): this loop starts at page 0 while the blog crawler earlier
# in the file starts at 1 — confirm whether page=0 is a valid imooc page.
for i in range(7):
	GetTargetPageInfo(i)


# Dump everything collected into an Excel workbook.
df=pandas.DataFrame(totalInfo)  
df.to_excel('videoInfo.xlsx')     # convert to xlsx
检查内容截图

urllib.request爬取链接

代码语言:javascript
复制
import urllib.request

from urllib.error import URLError,HTTPError,ContentTooShortError

import re

"""
def download(url,user_agent='wswp',num_retries=2):
	request=urllib.request.Request(url)
	request.add_header('User-agent',user_agent)
	try:
		html=urllib.request.urlopen(url).read()
	except (URLError,HTTPError,ContentTooShortError) as e:
		print('download error:',e.reason)
		html=None
		if num_retries>0:
			return download(url,num_retries-1)
	return html

download("http://httpstat.us/500")
"""

def download(url, user_agent='wswp', num_retries=2, charset='utf-8'):
	"""Fetch url and return its body decoded to str, or None on failure.

	user_agent: value sent in the User-agent request header.
	num_retries: how many additional attempts to make after an error.
	charset: fallback encoding when the response declares no charset.
	"""
	print("downloading:", url)
	request = urllib.request.Request(url)
	request.add_header("User-agent", user_agent)
	try:
		resp = urllib.request.urlopen(request)
		cs = resp.headers.get_content_charset()
		if not cs:
			cs = charset
		html = resp.read().decode(cs)
	except (URLError, HTTPError, ContentTooShortError) as e:
		print('download error:', e.reason)
		html = None
		if num_retries > 0:
			# Bug fix: the original retried with download(url, num_retries-1),
			# which bound the decremented count to the *user_agent* parameter
			# positionally — num_retries stayed at its default forever
			# (unbounded recursion on a persistently failing URL) and the
			# User-agent header was replaced by an integer.
			return download(url, user_agent, num_retries - 1, charset)
	return html

def crawl_sitemap(url):
	"""Download the sitemap at url and fetch every <loc> link it lists."""
	sitemap = download(url)
	# Bug fix: download() returns None when all retries fail; the original
	# passed that straight to re.findall, raising TypeError.
	if sitemap is None:
		return
	for link in re.findall('<loc>(.*?)</loc>', sitemap):
		download(link)


crawl_sitemap("http://example.python-scraping.com/sitemap.xml")
本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。
原始发表:2020/05/19 ,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 爬取博客信息
  • 爬取租房信息
  • 爬取慕课网课程信息
    • 检查内容截图
    • urllib.request爬取链接
    领券
    问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档