from html.parser import HTMLParser
import http.client
import os
import sys
import urllib.request, urllib.parse, urllib.error

class MyHTMLParser(HTMLParser):
    'Collect the href value of every <a> tag fed to the parser'

    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (attr, value) in attrs:
                if attr == 'href':
                    self.links.append(value)
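
# A minimal standalone sketch of MyHTMLParser (the HTML snippet below is an
# illustrative assumption, not part of the crawler itself):
#
#   p = MyHTMLParser()
#   p.feed('<p><a href="http://example.com/a.html">a</a></p>')
#   p.close()
#   p.links        # -> ['http://example.com/a.html']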

class Retriever(object):
    __slots__ = ('url', 'file')

    def __init__(self, url):
        self.url, self.file = self.get_file(url)
    def get_file(self, url, default='index.html', erase=True):
        'Create usable local filename from URL'
        parsed = urllib.parse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        filepath = '%s%s' % (host, parsed.path)
        if not os.path.splitext(parsed.path)[1]:
            filepath = os.path.join(filepath, default)
        linkdir = os.path.dirname(filepath)
        if not os.path.isdir(linkdir):
            if erase and os.path.exists(linkdir):
                os.unlink(linkdir)
            os.makedirs(linkdir)
        return url, filepath
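    # For example (hypothetical URL): get_file('http://example.com/docs/')
    # returns ('http://example.com/docs/', 'example.com/docs/index.html') and
    # creates the local directory example.com/docs/ if it does not exist yet.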
    def download(self):
        'Download URL to specific named file'
        try:
            retval = urllib.request.urlretrieve(self.url, self.file)
        except (IOError, http.client.InvalidURL) as e:
            retval = (('*** ERROR: bad URL "%s": %s' % (self.url, e)),)
        return retval
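    # On success urlretrieve() returns (local_filename, headers); on failure the
    # method returns a one-element tuple whose string begins with '***', which
    # is what Crawler.get_page() checks for before attempting to parse.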
    def parse_links(self):
        'Parse out the links found in downloaded HTML file'
        with open(self.file, 'r') as f:
            data = f.read()
        parser = MyHTMLParser()
        parser.feed(data)
        parser.close()
        return parser.links

class Crawler(object):
    count = 0   # class-level counter of pages downloaded so far

    def __init__(self, url):
        self.q = [url]
        self.seen = set()
        parsed = urllib.parse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])
        # equivalent one-liner:
        # self.dom = '.'.join(urllib.parse.urlparse(url).netloc.split('@')[-1].split(':')[0].split('.')[-2:])
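    # For example (hypothetical URL): for 'http://user@www.example.com:8080/index.html'
    # the netloc is 'user@www.example.com:8080', the host becomes 'www.example.com',
    # and self.dom ends up as 'example.com', the domain the crawler stays inside.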
    def get_page(self, url, media=False):
        'Download page & parse links, add to queue if nec'
        r = Retriever(url)
        fname = r.download()[0]
        if fname[0] == '*':
            print(fname, '... skipping parse')
            return
        Crawler.count += 1
        print('\n(', Crawler.count, ')')
        print('URL:', url)
        print('FILE:', fname)
        self.seen.add(url)
        ftype = os.path.splitext(fname)[1]
        if ftype not in ('.htm', '.html'):
            return

        for link in r.parse_links():
            if link.startswith('mailto:'):
                print('... discarded, mailto link')
                continue
            if not media:
                ftype = os.path.splitext(link)[1]
                if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                    print('... discarded, media file')
                    continue
            if not link.startswith('http://'):
                link = urllib.parse.urljoin(url, link)
            print('*', link, end=' ')
            if link not in self.seen:
                if self.dom not in link:
                    print('... discarded, not in domain')
                else:
                    if link not in self.q:
                        self.q.append(link)
                        print('... new, added to Q')
                    else:
                        print('... discarded, already in Q')
            else:
                print('... discarded, already processed')
    def go(self, media=False):
        'Process next page in queue (if any)'
        while self.q:
            url = self.q.pop()
            self.get_page(url, media)
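    # Note: pop() takes URLs from the tail of the list, so the crawl proceeds
    # roughly depth-first; a hypothetical variant using self.q.pop(0) (or a
    # collections.deque with popleft()) would visit pages breadth-first instead.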

def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    if not url.startswith('http://') and not url.startswith('ftp://'):
        url = 'http://%s/' % url
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
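
# Example invocation (hypothetical URL), assuming this script is saved as crawl.py:
#
#   $ python3 crawl.py http://www.example.com/
#
# With no argument the script prompts for a starting URL instead; a bare host
# such as www.example.com is automatically prefixed with http:// by main().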