import os
import re
import threading
from queue import Empty, Queue
from urllib import request
from urllib.parse import urljoin, urlsplit

import requests
from lxml import etree
class produce(threading.Thread):
    """Producer thread: turn listing-page URLs into image download jobs.

    Each thread repeatedly takes a page URL from ``page_queue``, downloads and
    parses that page, and puts one ``(image_url, filename)`` tuple on
    ``image_queue`` for every usable ``<img>`` element it finds. The thread
    exits as soon as ``page_queue`` has no URL left to hand out.
    """

    # Origin used to resolve relative image paths found in the page.
    BASE_DOMAIN = 'https://dou.yuanmazg.com'

    # Seconds to wait for the listing page; without a timeout a stalled
    # connection would block the thread forever.
    REQUEST_TIMEOUT = 10

    # The header name must be spelled exactly "User-Agent"; the previous
    # 'user - agent' key (with embedded spaces) was sent as an unrelated
    # header, so the browser identification never took effect.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 '
            'Core/1.94.188.400 QQBrowser/11.4.5226.400'
        )
    }

    # Characters removed from the image title before it is used as a file
    # name: the punctuation the original pattern stripped, the full-width
    # counterparts (presumably what the duplicated characters in the original
    # pattern were meant to be), and characters that are path separators or
    # reserved in file names.
    _UNSAFE_NAME_CHARS = re.compile(r'[??..。,,**!!\\/:"<>|]')

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``threading.Thread``.

        Args:
            page_queue: queue of listing-page URLs to consume.
            image_queue: queue receiving ``(image_url, filename)`` tuples.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue

    def run(self):
        """Parse pages until ``page_queue`` is exhausted."""
        while True:
            # A non-blocking get avoids the check-then-act race of
            # ``empty()`` followed by a blocking ``get()``: another thread
            # could take the last URL in between and leave this one blocked.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                return
            self.parse_page(url)

    def parse_page(self, url):
        """Download one listing page and queue every image found on it.

        Args:
            url: address of the listing page to fetch.

        A failed request is reported and the page is skipped so that one bad
        page does not terminate the thread.
        """
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as exc:
            print('failed to fetch %s: %s' % (url, exc))
            return

        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        for img in html.xpath('//div[@class="col-sm-9"]//img'):
            # ``Element.get`` returns the attribute value, or None if absent.
            src = img.get('data-original')
            if not src:
                # Nothing to download for images without this attribute.
                continue
            # urljoin handles site-relative and already-absolute sources.
            detail_url = urljoin(self.BASE_DOMAIN, src)
            filename = self._build_filename(img.get('alt', ''), detail_url)
            self.image_queue.put((detail_url, filename))

    @classmethod
    def _build_filename(cls, alt, image_url):
        """Return a safe file name made from the image title and URL suffix.

        Args:
            alt: the image's ``alt`` text (may be empty or None).
            image_url: absolute image URL; its path supplies the extension.

        Returns:
            The cleaned title plus the URL's extension. When the cleaned
            title is empty, the URL's own base name is used instead so the
            result is never just an extension.
        """
        # Only the URL path is inspected so a query string cannot leak into
        # the extension.
        url_path = urlsplit(image_url).path
        url_stem, suffix = os.path.splitext(os.path.basename(url_path))
        stem = cls._UNSAFE_NAME_CHARS.sub('', alt or '').strip()
        return (stem or url_stem) + suffix
class customer(threading.Thread):
    """Consumer thread: download the images queued on ``image_queue``.

    Each thread repeatedly takes an ``(image_url, filename)`` tuple from
    ``image_queue`` and saves the image under ``IMAGE_DIR``. It exits once it
    has waited ``POLL_TIMEOUT`` seconds without receiving a job while
    ``page_queue`` is empty.
    """

    # Directory the downloaded files are written to.
    IMAGE_DIR = 'image'

    # Seconds to wait for a new job before re-checking the exit condition.
    # NOTE(review): an empty page_queue does not prove that every page has
    # finished parsing; the wait only makes a premature exit less likely.
    POLL_TIMEOUT = 3

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``threading.Thread``.

        Args:
            page_queue: queue of listing-page URLs (only checked for emptiness).
            image_queue: queue of ``(image_url, filename)`` tuples to download.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue

    def run(self):
        """Download queued images until no more work is expected."""
        # urlretrieve does not create missing directories.
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        while True:
            # A timed get replaces ``empty()`` followed by a blocking
            # ``get()``, which could block forever when another thread took
            # the last item between the two calls.
            try:
                detail_url, filename = self.image_queue.get(
                    timeout=self.POLL_TIMEOUT
                )
            except Empty:
                if self.page_queue.empty():
                    return
                continue

            # basename() keeps a queued name from escaping IMAGE_DIR.
            target = os.path.join(self.IMAGE_DIR, os.path.basename(filename))
            try:
                request.urlretrieve(detail_url, target)
            except OSError as exc:
                # URLError/HTTPError are OSError subclasses; report the
                # failure and keep the thread alive for the remaining jobs.
                print('failed to download %s: %s' % (detail_url, exc))
                continue
            print(filename + "已经下载完成")
def main():
    """Queue listing pages 1-4 and run five producers and five consumers.

    The function returns only after every worker thread has finished.
    """
    page_queue = Queue(100)
    image_queue = Queue(500)
    for page in range(1, 5):
        page_queue.put('https://dou.yuanmazg.com/doutu?page=%d' % page)

    # ``Thread.start()`` returns None, so the thread objects are created
    # first and kept in a list; that makes it possible to join them below.
    workers = [produce(page_queue, image_queue) for _ in range(5)]
    workers += [customer(page_queue, image_queue) for _ in range(5)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()
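# Reported error, raised at the call `self.parse_page(url)` in `produce.run`:
# TypeError: produce.parse_page() takes 1 positional argument but 2 were given
# This error occurs when `parse_page` is declared without a `self` parameter:
# the instance is passed implicitly as the first argument of a method call.
# 相似问题 (similar questions)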