from lxml import etree
import requests,urllib
from requests.models import Response
import xlsxwriter
import datetime
from openpyxl import load_workbook
import re
from selenium import webdriver


# Browser-like User-Agent sent with most requests made by this script.
_DEFAULT_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# Seconds to wait on a server before giving up. Without a timeout a single
# unresponsive site would block the whole run indefinitely.
_REQUEST_TIMEOUT = 30

# Tracking workbook that drinks888() appends the scraped links to.
_KEYWORD_WORKBOOK = r'C:\Users\Administrator\Desktop\唐富\唐富\5月工作表\各类长尾词统计.xlsx'


def _fix_mojibake(text):
    """Re-encode *text* as ISO-8859-1 and decode the bytes as UTF-8.

    NOTE(review): presumably needed because these sites send no charset and
    the response text is decoded as ISO-8859-1 -- confirm against the sites.
    """
    return text.encode('ISO-8859-1').decode('UTF-8')


def _pair_slice(links, titles, count, start=0):
    """Return ``count`` (link, title) tuples beginning at index ``start``.

    Raises:
        IndexError: if either list is too short. The exception type matches
            what plain index access would raise; the message says how many
            items were actually found.
    """
    stop = start + count
    if len(links) < stop or len(titles) < stop:
        raise IndexError(
            'expected at least {} links and titles, found {} links and {} titles'.format(
                stop, len(links), len(titles)))
    return list(zip(links[start:stop], titles[start:stop]))


def _first_article(url, separator, title_xpath, href_xpath):
    """Scrape the first article listed on *url*.

    Args:
        url: listing page to fetch.
        separator: substring of *url*; everything before its first occurrence
            is used as the prefix for the scraped href.
        title_xpath: XPath selecting the title text nodes.
        href_xpath: XPath selecting the matching href attributes.

    Returns:
        ``[absolute_link, title]`` for the first match of each XPath.

    Raises:
        IndexError: if either XPath matches nothing.
    """
    html = requ_html(url)
    base = url.split(separator)[0]
    title = _fix_mojibake(html.xpath(title_xpath)[0])
    link = base + html.xpath(href_xpath)[0]
    return [link, title.strip()]


def drinks888(url):
    """Scrape five articles from drinks888 and append them to the tracking workbook.

    Three entries come from *url* (page one) and two from the hard-coded
    second page. The link/title pairs are written to columns D/E of the
    '所有文章更新链接' sheet and the workbook is saved in place.

    Returns:
        A flat list ``[link, title, link, title, ...]`` with five pairs.

    Raises:
        IndexError: if a page lists fewer entries than expected.
    """
    html = requ_html(url)
    html2 = requ_html("http://www.drinks888.com/news/53/2.html")
    base = url.split('/news')[0]

    # Three entries from the first page; the XPath skips the first <dl>.
    page_one = _pair_slice(
        html.xpath('//div[@class="news_main2"]/dl[position()>1]/dd/h3/a/@href'),
        html.xpath('//div[@class="news_main2"]/dl[position()>1]/dd/h3/a/text()'),
        3,
    )
    # Two entries from the second page.
    page_two = _pair_slice(
        html2.xpath('//div[@class="news_main2"]/dl/dd/h3/a/@href'),
        html2.xpath('//div[@class="news_main2"]/dl/dd/h3/a/text()'),
        2,
    )

    datas = []
    for href, title in page_one + page_two:
        datas.append(base + href)
        datas.append(title.strip())

    wb = load_workbook(_KEYWORD_WORKBOOK)
    wbs = wb['所有文章更新链接']
    # Start two rows below the sheet's current max_row.
    first_row = wbs.max_row + 2
    pairs = zip(datas[0::2], datas[1::2])
    for row, (link, title) in enumerate(pairs, start=first_row):
        wbs['D{}'.format(row)] = link
        wbs['E{}'.format(row)] = title
    wb.save(_KEYWORD_WORKBOOK)
    print('链接已写入各类关键词表中')
    return datas


def drinks999(url):
    """Return the first five ``link, title`` pairs from the drinks999 news list.

    Returns:
        A flat list ``[link, title, ...]`` with five pairs.

    Raises:
        IndexError: if the page lists fewer than five entries.
    """
    html = requ_html(url)
    base = url.split('/news')[0]
    titles = html.xpath('//dl[@class="dl_220"]/dd/a/text()')
    hrefs = html.xpath('//dl[@class="dl_220"]/dd/a/@href')

    datas = []
    for href, title in _pair_slice(hrefs, titles, 5):
        datas.append(base + href)
        datas.append(_fix_mojibake(title).strip())
    return datas


def drinksaaa(url):
    """Return ``[link, title]`` of the newest article on a drinksaaa-style page."""
    return _first_article(
        url, '/a',
        '//ul[@class="fc_pro"]/li/a/h3/text()',
        '//ul[@class="fc_pro"]/li/a/@href',
    )


def drinksbcdef(url):
    """Return ``[link, title]`` of the newest article on the sites sharing the ``div.txt`` layout."""
    return _first_article(
        url, '/a',
        '//div[@class="txt"]/h2/a/text()',
        '//div[@class="txt"]/h2/a/@href',
    )


def drinksqqq(url):
    """Return ``[link, title]`` of the newest article on a drinksqqq-style page."""
    return _first_article(
        url, '/a',
        "//div[@class='news_con']/dl/dt/a/text()",
        "//div[@class='news_con']/dl/dt/a/@href",
    )


def drinksrrr(url):
    """Return ``[link, title]`` of the newest article on a drinksrrr-style page."""
    return _first_article(
        url, '/a',
        '//ul[@class="cpshow"]/li/h4/a/text()',
        '//ul[@class="cpshow"]/li/h4/a/@href',
    )


def drinkssss(url):
    """Return ``[link, title]`` of the newest article on a drinkssss-style page."""
    return _first_article(
        url, '/a',
        '//div[@class="agent_con"]/dl/dd/h4/a/text()',
        '//div[@class="agent_con"]/dl/dd/h4/a/@href',
    )


def drinkshhh(url):
    """Return ``[link, title]`` of the newest article on a drinkshhh-style page."""
    return _first_article(
        url, '/a',
        '//div[@class="VieList"]/h3/a/text()',
        '//div[@class="VieList"]/h3/a/@href',
    )


def drinksjjj(url):
    """Return ``[link, title]`` of the newest article on a drinksjjj-style page."""
    return _first_article(
        url, '/a',
        '//div[@class="pro_main"]/dl/dd/a/text()',
        '//div[@class="pro_main"]/dl/dd/a/@href',
    )


def drinkslll(url):
    """Return ``[link, title]`` of the newest article on a drinkslll-style page."""
    return _first_article(
        url, '/a',
        '//div[@class="agent_con"]/dl/dd/a/text()',
        '//div[@class="agent_con"]/dl/dd/a/@href',
    )


def drinksooo(url):
    """Return ``[link, title]`` of the newest article on a drinksooo-style page."""
    return _first_article(
        url, '/a',
        "//div[@class='news_con']/dl/dd/span/a/text()",
        "//div[@class='news_con']/dl/dd/span/a/@href",
    )


def drinksppp(url):
    """Return ``[link, title]`` of the newest article on a drinksppp-style page."""
    return _first_article(
        url, '/a',
        "//dl[@class='pd_list_dl']/dd/a/text()",
        "//dl[@class='pd_list_dl']/dd/a/@href",
    )


def requ_html(url):
    """Fetch *url* with a browser User-Agent and return it parsed as an lxml HTML tree."""
    res = requests.get(url, headers=_DEFAULT_HEADERS, timeout=_REQUEST_TIMEOUT)
    return etree.HTML(res.text)


def souhu(url):
    """Return the first five ``link, title`` pairs from a Sohu account page.

    The scraped hrefs are prefixed with ``https:``.

    Raises:
        IndexError: if fewer than five entries are found.
    """
    html = requ_html(url)
    titles = html.xpath('//ul/li[position()<6]/article/div/h4/a/text()')
    hrefs = html.xpath('//ul/li[position()<6]/article/div/h4/a/@href')

    datas = []
    for href, title in _pair_slice(hrefs, titles, 5):
        datas.append('https:' + href)
        datas.append(title.strip())
    return datas


def boke(url):
    """Return the first five ``link, title`` pairs from a Sina blog article list.

    Raises:
        IndexError: if fewer than five entries are found.
    """
    html = requ_html(url)
    titles = html.xpath('//div[@class="articleList"]/div[position()<6]/p/span/a/text()')
    hrefs = html.xpath('//div[@class="articleList"]/div[position()<6]/p/span/a/@href')

    datas = []
    for href, title in _pair_slice(hrefs, titles, 5):
        datas.append(href)
        datas.append(_fix_mojibake(title))
    return datas


def chuangtout(url):
    """Return the first five ``link, title`` pairs from a ctoutiao.com posts endpoint.

    Backslashes and double quotes are stripped from the hrefs, and
    ``\\uXXXX``-style escapes in the titles are decoded.
    NOTE(review): presumably the endpoint returns an escaped HTML fragment --
    confirm against a live response.

    Raises:
        IndexError: if fewer than five entries are found.
    """
    html = requ_html(url)
    hrefs = html.xpath('//h2/a/@href')
    titles = html.xpath('//h2/a/text()')

    datas = []
    for href, raw_title in _pair_slice(hrefs, titles, 5):
        datas.append("http://www.ctoutiao.com" + href.replace('\\', '').replace('"', ''))
        # Drop literal "\r\n" text before decoding the unicode escapes.
        title = raw_title.replace(r'\r\n', '').encode('utf-8').decode('unicode_escape')
        datas.append(title.strip())
    return datas


def kuaichuang():
    """Return the first ten ``link, title`` pairs from the 360kuai JSONP feed.

    Links are built from each entry's ``gnid``; titles have their unicode
    escapes decoded.

    Raises:
        IndexError: if fewer than ten entries are found.
    """
    url = 'https://www.360kuai.com/mob/api/getgzh?callback=jQuery1910931088931784106_1622702058309&scheme=https&pg=1&sign=360_57c3bbd1&u=0f0d2fb283622d320adb4f56433f6c5e&n=10&sqid=&gzh=155938358&djsource=&tmprtp=&tj_url=&refer_scene=so_1&scene=61&f=jsonp&_=1622702058310'
    res = requests.get(url, headers=_DEFAULT_HEADERS, timeout=_REQUEST_TIMEOUT)
    titles = re.findall('"t":"(.*?)","c"', res.text)
    gnids = re.findall('"gnid":"(.*?)"}', res.text)

    datas = []
    for gnid, title in _pair_slice(gnids, titles, 10):
        datas.append('https://www.360kuai.com/pc/' + gnid)
        datas.append(title.encode('utf-8').decode('unicode_escape'))
    return datas


def get_818(url):
    """Return the first five ``link, title`` pairs from the 818u.com profile page.

    NOTE: the *url* argument is ignored. The function always fetches the
    hard-coded profile URL below; the parameter is kept so existing callers
    keep working.

    Raises:
        IndexError: if fewer than five entries are found.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    url = "http://www.818u.com/s447889/"
    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    html = etree.HTML(response.text)
    titles = html.xpath('/html/body/div/div[4]/div/div[4]/div[2]/div[1]/ul/li/a/text()')
    hrefs = html.xpath('//html/body/div/div[4]/div/div[4]/div[2]/div[1]/ul/li/a/@href')

    datas = []
    for href, title in _pair_slice(hrefs, titles, 5):
        datas.append('http://www.818u.com' + href)
        datas.append(title)
    return datas


def get_baixing():
    """Return the first five ``link, title`` pairs from the Baixing user page.

    The literal ``?from=`` is removed from every link.

    Raises:
        IndexError: if fewer than five entries are found.
    """
    url = 'https://www.baixing.com/u/92147482/?src=vad_listing_7'
    res = requests.get(url, headers=_DEFAULT_HEADERS, timeout=_REQUEST_TIMEOUT)
    html = etree.HTML(res.text)
    titles = html.xpath('//ul[@class="list-ad-items"]/li[position()<6]/div/div[1]/a/text()')
    hrefs = html.xpath('//ul[@class="list-ad-items"]/li[position()<6]/div/div[1]/a/@href')

    datas = []
    for href, title in _pair_slice(hrefs, titles, 5):
        datas.append(href.replace('?from=', ''))
        datas.append(title)
    return datas


def zhihu():
    """Return ``link, title`` pairs for entries 5-9 of the Zhihu posts page.

    The data is pulled with regular expressions from the page's
    ``js-initialData`` script tag.

    Raises:
        TypeError: if the ``js-initialData`` script tag is not found
            (``re.search`` returns ``None``).
        IndexError: if fewer than ten entries are found.
    """
    url = 'https://www.zhihu.com/people/ding-ni-ge-fei-68-18/posts'
    res = requests.get(url, headers=_DEFAULT_HEADERS, timeout=_REQUEST_TIMEOUT)
    initial_data = re.search(
        '<script id="js-initialData" type="text/json">(.*?)</script>', res.text)[0]
    links = re.findall('"author".*?},"url":"(.*?)","commentPermission"', initial_data)
    titles = re.findall('"linkbox":{.*?},"title":"(.*?)","voting', initial_data)

    datas = []
    # Only indices 5..9 are used, exactly as the index loop range(5, 10) did.
    for link, title in _pair_slice(links, titles, 5, start=5):
        datas.append(link.encode('utf-8').decode('unicode-escape'))
        datas.append(title)
    return datas


def baijiahao2():
    """Collect ``url, title`` attribute pairs from the Baijiahao author page via headless Chrome.

    Returns:
        A flat list ``[url, title, ...]`` with one pair per matching element.
    """
    option = webdriver.ChromeOptions()
    option.add_argument("headless")
    url = 'https://author.baidu.com/home?from=bjh_article&app_id=1646805710008842'
    # NOTE(review): the positional driver path and find_elements_by_xpath are
    # the older Selenium API -- confirm the installed Selenium version.
    driver = webdriver.Chrome(
        r'C:\Program Files\Google\Chrome\Application\chromedriver.exe', options=option)
    try:
        print('打开浏览器,打开登录')
        datas = []
        driver.get(url)
        cards = driver.find_elements_by_xpath(
            '//div[starts-with(@url,"https://baijiahao.baidu.com/s")]')
        for card in cards:
            datas.append(card.get_attribute("url"))
            datas.append(card.get_attribute('title'))
        print('抓取完毕')
        return datas
    finally:
        # Always shut the browser down, otherwise each run leaves a Chrome
        # process behind.
        driver.quit()


def baijiahao():
    """Download the Baijiahao JSONP feed, save it to ``baidu.txt`` and print it.

    The raw response bytes are written unchanged. Nothing is parsed and the
    function returns ``None``.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    urls = 'https://mbd.baidu.com/webpage?tab=main&num=10&uk=zY-_SEXGkAnbinLaoiX9uw&source=pc&type=newhome&action=dynamic&format=jsonp&otherext=h5_20210512150247&Tenger-Mhor=3729859865&callback=__jsonp01621232675252'
    cont = urllib.request.urlopen(urls, timeout=_REQUEST_TIMEOUT).read()
    # Written in binary mode because `cont` is bytes, not str.
    with open('baidu.txt', "wb") as f:
        f.write(cont)
    print(cont)


# Bilibili column article scraping
def bili_cont():
    """Return the first five ``link, title`` pairs from the Bilibili article API.

    Titles and article ids are pulled from the raw response text with
    regular expressions.

    Raises:
        IndexError: if fewer than five ids or titles are matched.
    """
    url = 'https://api.bilibili.com/x/space/article?mid=702710400&pn=1&ps=12&sort=publish_time&jsonp=jsonp'
    # The timeout keeps one unresponsive server from blocking the whole run.
    res = requests.get(url, timeout=30)
    html = res.text
    title = re.findall(r'日常.*?"title":"(\S+?)"', html)
    url_id = re.findall(r'{"id":(\d+),"category', html)

    datas = []
    for i in range(5):
        datas.append('https://www.bilibili.com/read/cv{}'.format(url_id[i]))
        datas.append(title[i])
    return datas


def meipian(url):
    """Return the first ten ``link, title`` pairs found under ``<h3><a>`` on *url*.

    Raises:
        IndexError: if fewer than ten entries are found.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    res = requests.get(url, headers=headers, timeout=30)
    html = etree.HTML(res.text)
    title = html.xpath('//h3/a/text()')
    links = html.xpath('//h3/a/@href')

    datas = []
    for i in range(10):
        datas.append(links[i])
        datas.append(title[i])
    return datas


def _write_pairs(worksheet, datas, first_row, first_col, count):
    """Write ``count`` link/title pairs from the flat list *datas* into *worksheet*.

    Pair ``k`` goes to row ``first_row + k``: the link in ``first_col`` and
    the title in the next column. Indices are zero-based, as in xlsxwriter.

    Raises:
        IndexError: if *datas* holds fewer than ``count`` pairs.
    """
    for k in range(count):
        worksheet.write(first_row + k, first_col, datas[2 * k])
        worksheet.write(first_row + k, first_col + 1, datas[2 * k + 1])


def main():
    """Build today's report workbook from every scraped site.

    Creates ``<date>唐富工作日报.xlsx``, writes the section headings, then runs
    each scraper in turn and writes its link/title pairs to fixed rows and
    columns. The workbook is only written to disk by ``workbook.close()`` at
    the end.
    """
    now_time = datetime.datetime.now().strftime('%Y-%m-%d')
    workbook = xlsxwriter.Workbook(
        "C:/Users/Administrator/Desktop/唐富/唐富/5月工作表/{}唐富工作日报.xlsx".format(now_time))
    worksheet = workbook.add_worksheet("first_sheet")
    merge_format = workbook.add_format({
        'bold': True,
        "align": "center",      # horizontal alignment
        "valign": "vcenter",    # vertical alignment
        "font_color": "red",
    })
    style = workbook.add_format({"font_color": "red"})

    # Section headings.
    worksheet.merge_range('B3:C3', '网站文章更新', merge_format)
    worksheet.write(3, 1, '网站888文章更新', style)
    worksheet.write(3, 3, '百家号', style)
    worksheet.write(9, 1, '网站999文章更新', style)
    worksheet.write(9, 3, '创头条', style)
    # Column widths.
    worksheet.set_column(1, 4, 45)
    worksheet.write(15, 1, '其他网站文章更新', style)
    worksheet.write(34, 1, '818美篇博客', style)

    print('网站888文章开始')
    datas = drinks888("http://www.drinks888.com/news/53/1.html")
    _write_pairs(worksheet, datas, 4, 1, 5)

    print('网站999文章开始')
    datas = drinks999('http://www.drinks999.com/news')
    _write_pairs(worksheet, datas, 10, 1, 5)

    # Sites that contribute one article each: (progress message, scraper,
    # listing URL, target row). Built inside main() so the scraper names are
    # only looked up when the report actually runs.
    single_article_sites = [
        ('网站aaa文章开始', drinksaaa, "http://www.drinksaaa.com/a/xinwenzixun", 16),
        ('网站bbb更新开始', drinksbcdef, "http://www.drinksbbb.com/a/xinwenzixun/", 17),
        ('网站ccc更新开始', drinksbcdef, "http://www.drinksccc.com/a/xingyezixun", 18),
        ('网站ddd更新开始', drinksbcdef, "http://www.drinksddd.com/a/xingyezixun", 19),
        ('网站eee更新开始', drinksbcdef, "http://www.drinkseee.com/a/xinwenzixun", 20),
        ('网站fff更新开始', drinksbcdef, "http://www.drinksfff.com/a/xingyezixun", 21),
        ('网站ggg更新开始', drinkslll, "http://www.drinksggg.com/a/xinwenzixun", 22),
        ('网站hhh更新开始', drinkshhh, "http://www.drinkshhh.com/a/xinwenzixun", 23),
        ('网站jjj更新开始', drinksjjj, "http://www.drinksjjj.com/news", 24),
        ('网站kkk更新开始', drinksbcdef, "http://www.drinkskkk.com/a/xinwenzixun", 25),
        ('网站mmm更新开始', drinksbcdef, "http://www.drinksmmm.com/a/xinwenzixun", 26),
        ('网站nnn更新开始', drinkshhh, "http://www.drinksnnn.com/a/xinwenzixun", 27),
        ('网站ooo更新开始', drinksooo, "http://www.drinksooo.com/a/xinwenzixun", 28),
        ('网站ppp更新开始', drinksppp, "http://www.drinksppp.com/a/xinwenzixun", 29),
        ('网站qqq更新开始', drinksqqq, "http://www.drinksqqq.com/news", 30),
        ('网站rrr更新开始', drinksrrr, "http://www.drinksrrr.com/a/xinwenzixun", 31),
        ('网站sss更新开始', drinkssss, "http://www.drinkssss.com/a/xinwenzhongxin", 32),
        ('网站lll更新开始', drinkslll, "http://www.drinkslll.com/a/xinwenzixun", 33),
    ]
    for message, scraper, site_url, row in single_article_sites:
        print(message)
        _write_pairs(worksheet, scraper(site_url), row, 1, 1)

    print('818替代搜狐开始')
    datas = get_818('http://www.818u.com/s447889/')
    _write_pairs(worksheet, datas, 35, 1, 5)

    # Meipian
    print('每篇文章抓取开始')
    datas = meipian('https://www.meipian.cn/c/283140641')
    _write_pairs(worksheet, datas, 40, 1, 10)

    # Sina blog
    print('boke开始')
    datas = boke('http://blog.sina.com.cn/s/articlelist_5226711056_0_1.html')
    _write_pairs(worksheet, datas, 50, 1, 5)

    # Chuangtoutiao
    print('创头条开始')
    datas = chuangtout("http://www.ctoutiao.com/ajax_new/ajax_data.php?page=newCompany&act=getPosts&uid=1729092&type=getPosts&pageno=1")
    _write_pairs(worksheet, datas, 10, 3, 5)

    # Baijiahao
    print('百家号开始')
    datas = baijiahao2()
    _write_pairs(worksheet, datas, 4, 3, 5)

    workbook.close()  # flushes the workbook to disk
    print('日报已经写完了')


if __name__ == "__main__":
    main()
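# Original-content notice: this article was published on the Tencent Cloud
# Developer Community with the author's authorization and may not be
# reproduced without permission.
# In case of infringement, contact cloudcommunity@tencent.com for removal.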