from urllib import request, parse
import chardet

if __name__ == '__main__':
    url = 'https://blog.csdn.net/m0_37355951/article/details/80457159'
    rsp = request.urlopen(url)
    html = rsp.read()
    # Detect the page encoding from the raw response bytes.
    cs = chardet.detect(html)
    print(cs)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}
    # Decode with the detected encoding, falling back to utf-8.
    # NOTE: chardet may return {'encoding': None, ...}, in which case the key
    # EXISTS with value None and dict.get's default would not apply — use
    # `or` so None also falls back to utf-8.
    html = html.decode(cs.get("encoding") or 'utf-8')
    # Dump the response metadata.
    print(rsp)
    print(rsp.geturl())
    print(rsp.info())
    print(rsp.getcode())  # 200 on success
    # Page content.
    print(html)
from urllib import request, parse

if __name__ == '__main__':
    url = 'http://www.baidu.com/s?'
    wd = input('Input your keyword')
    # Query parameters to append to the URL.
    qs = {
        "wd": wd
    }
    # Percent-encode the query string (handles spaces, CJK characters, etc.).
    qs = parse.urlencode(qs)
    rsp = request.urlopen(url + qs)
    html = rsp.read().decode()
    print(html)
'''
Simulate a POST request with the parse module.
1. Open browser dev tools (F12).
2. Type a single letter "g" into the translator input.
3. Under Network -> All -> Headers, observe the form data is kw: g.
'''
from urllib import request, parse
import json

'''
Build the body with data, then open the URL with urlopen.
The response is JSON; it should contain the translations of "girl".
'''
baseurl = 'https://fanyi.baidu.com/sug'
# Form data as a dict.
data = {
    'kw': 'girl'
}
# urlencode to a query string, then encode to bytes — urlopen requires a
# bytes body to issue a POST request.
data = parse.urlencode(data).encode()
rsp = request.urlopen(baseurl, data=data)
# Read and decode the response (utf-8 by default).
json_data = rsp.read().decode()
print(json_data)
# Parse the JSON string into a dict.
json_data = json.loads(json_data)
print(json_data)
# Each suggestion entry has 'k' (keyword) and 'v' (translation).
for item in json_data['data']:
    print(item['k'], '---', item['v'])
'''
Using URLError to inspect request failures.
'''
from urllib import request, error

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    try:
        req = request.Request(url)
        rsp = request.urlopen(req)
        html = rsp.read().decode()
        print(html)
    except error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be caught first.
        print(e)
    except error.URLError as e:
        print(e)
    except Exception as e:
        print(e)
'''
Common User-Agent strings: https://blog.csdn.net/rookie_is_me/article/details/81634048
Two ways to set one:
1. headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
   req = request.Request(url=url, headers=headers)
2. req = request.Request(url)
   req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
'''
'''
Visit a URL with a custom User-Agent header.
'''
from urllib import request, error

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
        # Alternative: pass headers at construction time.
        # req = request.Request(url= url,headers=headers)
        req = request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
        rsp = request.urlopen(req)
        html = rsp.read().decode()
        print(html)
    except error.URLError as e:
        print(e)
    except Exception as e:
        print(e)
'''
Proxy servers
www.xicidaili.com
www.goubanjia.com
Steps:
1. Set the proxy address.
2. Create a ProxyHandler.
3. Build an Opener.
4. Install the Opener.
'''
from urllib import request, error, parse

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    # Proxy address, keyed by URL scheme.
    proxy = {'http': '117.169.104.102:80'}
    # Create the ProxyHandler.
    proxy_handler = request.ProxyHandler(proxy)
    # Build an opener that routes through the proxy.
    opener = request.build_opener(proxy_handler)
    # Install globally so plain urlopen() also uses the proxy.
    request.install_opener(opener)
    try:
        rsp = request.urlopen(url)
        html = rsp.read().decode()
        print(html)
    except error.URLError as e:
        print(e)
'''
Log in to renren.com by copying the Cookie header from an
already-logged-in browser session.
'''
from urllib import request

if __name__ == '__main__':
    url = 'http://www.renren.com/894245278/profile'
    # Replace the placeholder with your own cookie string.
    headers = {'Cookie': ' 自己的cookie '}
    req = request.Request(url=url, headers=headers)
    rsp = request.urlopen(req)
    html = rsp.read().decode()
    print(html)
'''
Automatic cookie handling for crawling.
CookieJar manages stored cookies and attaches them to outgoing HTTP requests.
Cookies live in memory; they vanish when the CookieJar instance is collected.
FileCookieJar persists cookies to a file.
MozillaCookieJar produces a FileCookieJar compatible with Mozilla's cookie.txt.
LwpCookieJar.
'''
# Use a CookieJar-backed opener to access renren.com:
# open the login page, sign in with username/password, then reuse the
# captured cookie to reach a private page.
from urllib import request, error, parse
from http import cookiejar

# Plain HTTP request handler.
http_handler = request.HTTPHandler()
# HTTPS request handler.
https_handler = request.HTTPSHandler()
# In-memory cookie store.
cookie = cookiejar.CookieJar()
# Processor that reads/writes cookies on every request through the opener.
cookie_handler = request.HTTPCookieProcessor(cookie)
# Opener shared by login() and getHomePage() so the session cookie persists.
opener = request.build_opener(http_handler, https_handler, cookie_handler)
def login():
    """POST the login form; the shared opener captures the session cookie."""
    url = 'http://www.renren.com/PLogin.do'
    # Login form data (fill in real credentials).
    data = {
        'email': '账号',
        'password': '密码'
    }
    # Encode to bytes so the request is sent as a POST.
    data = parse.urlencode(data).encode()
    req = request.Request(url, data=data)
    # The response body is not needed; only the Set-Cookie side effect matters.
    opener.open(req)
def getHomePage():
    """Fetch the private profile page and print it.

    If login() ran first, the shared opener automatically sends the
    session cookie it captured.
    """
    url = 'http://www.renren.com/894245278/profile'
    rsp = opener.open(url)
    html = rsp.read().decode()
    print(html)
if __name__ == '__main__':
    # Log in first so the profile request carries the session cookie.
    login()
    getHomePage()