前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
圈层
工具
发布
首页
学习
活动
专区
圈层
工具
MCP广场
社区首页 > 专栏 > Python 爬虫脚本 crawl.py

python 爬虫脚本crawl.py

作者头像
用户5760343
发布于 2022-05-14 06:47:21
发布于 2022-05-14 06:47:21
34000
代码可运行
举报
文章被收录于专栏:sktjsktj
运行总次数:0
代码可运行

#!/usr/bin/env python
"""Simple same-domain web crawler: download pages, parse them, follow links."""

# NOTE(review): the original also did `import formatter`; that name is never
# used in this file and the `formatter` module was removed from the standard
# library in Python 3.10, so the import would crash on modern interpreters.
import io
import http.client
import os
import sys
import urllib.error
import urllib.parse
import urllib.request

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """HTML parser that accumulates the ``href`` target of every ``<a>`` tag.

    After feeding HTML to an instance, ``self.links`` holds the href
    attribute values in document order.
    """

    def __init__(self):
        # The scraped text had lost the dunder underscores; HTMLParser's
        # initializer must run so its internal state is set up.
        HTMLParser.__init__(self)
        self.links = []  # href values collected from <a> tags, in order

    def handle_starttag(self, tag, attrs):
        # Only anchors contribute links; every other tag is ignored.
        # (The original's `if len(attrs) == 0: pass` branch was a no-op:
        # the for-loop below already handles an empty attribute list.)
        if tag == "a":
            for (attr_name, value) in attrs:
                if attr_name == "href":
                    self.links.append(value)

class Retriever(object):
    """Download one URL into a local file whose path mirrors the URL."""

    # Restrict instances to exactly these attributes (the scraped text had
    # lost the dunder underscores on __slots__ and __init__).
    __slots__ = ('url', 'file')

    def __init__(self, url):
        self.url, self.file = self.get_file(url)

    def get_file(self, url, default='index.html', erase=True):
        """Create usable local filename from URL.

        Returns ``(url, filepath)`` where filepath is ``<host><path>``;
        ``default`` is appended when the URL path has no file extension.
        Creates the target directory if needed; when ``erase`` is true, a
        plain file squatting on the directory name is removed first.
        """
        parsed = urllib.parse.urlparse(url)
        # Strip credentials ('user@') and port (':8080') from the netloc.
        host = parsed.netloc.split('@')[-1].split(':')[0]
        filepath = '%s%s' % (host, parsed.path)
        if not os.path.splitext(parsed.path)[1]:
            filepath = os.path.join(filepath, default)
        linkdir = os.path.dirname(filepath)
        # Guard against linkdir == '' -- os.makedirs('') raises.
        if linkdir and not os.path.isdir(linkdir):
            if erase and os.path.exists(linkdir):
                os.unlink(linkdir)
            os.makedirs(linkdir)
        return url, filepath

    def download(self):
        """Download URL to the file computed by get_file().

        On failure, returns a 1-tuple whose sole element starts with
        '*** ERROR' -- callers key off the leading '*' to skip parsing.
        """
        try:
            retval = urllib.request.urlretrieve(self.url, self.file)
        except (IOError, http.client.InvalidURL) as e:
            retval = (('*** ERROR: bad URL "%s": %s' % (self.url, e)),)
        return retval

    def parse_links(self):
        """Parse out the links found in the downloaded HTML file."""
        # `with` guarantees the handle closes even if feed() raises
        # (the original closed it manually and could leak on error).
        with open(self.file, 'r') as f:
            data = f.read()
        parser = MyHTMLParser()
        parser.feed(data)
        parser.close()
        return parser.links

class Crawler(object):
    """Crawl from a seed URL, downloading pages and queueing same-domain links."""

    count = 0  # class-wide tally of successfully downloaded pages

    def __init__(self, url):
        self.q = [url]      # URLs still to process (used as a LIFO stack)
        self.seen = set()   # URLs already downloaded
        parsed = urllib.parse.urlparse(url)
        # Strip credentials and port, then keep the last two labels of the
        # hostname as the crawl domain (e.g. 'www.example.com' -> 'example.com').
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])

    def get_page(self, url, media=False):
        """Download page & parse links, adding new in-domain ones to the queue."""
        r = Retriever(url)
        fname = r.download()[0]
        # Retriever.download() signals failure with a leading '*'.
        if fname[0] == '*':
            print(fname, '... skipping parse')
            return
        Crawler.count += 1
        print('\n(', Crawler.count, ')')
        print('URL:', url)
        print('FILE:', fname)
        self.seen.add(url)
        ftype = os.path.splitext(fname)[1]
        # Only HTML files are worth parsing for further links.
        if ftype not in ('.htm', '.html'):
            return

        for link in r.parse_links():
            if link.startswith('mailto:'):
                print('... discarded, mailto link')
                continue
            if not media:
                ftype = os.path.splitext(link)[1]
                if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                    print('... discarded, media file')
                    continue
            if not link.startswith('http://'):
                # Resolve relative links against the current page.
                # (urljoin leaves other absolute URLs untouched.)
                link = urllib.parse.urljoin(url, link)
            print('*', link, end=' ')
            # (The original's final else branch was mis-indented; the
            # intended new/seen/in-queue decision tree is restored here.)
            if link in self.seen:
                print('... discarded, already processed')
            elif self.dom not in link:
                print('... discarded, not in domain')
            elif link in self.q:
                print('... discarded, already in Q')
            else:
                self.q.append(link)
                print('... new, added to Q')

    def go(self, media=False):
        """Process queued pages until the queue is empty (LIFO order)."""
        while self.q:
            url = self.q.pop()
            self.get_page(url, media)

def main():
    """Read a start URL (argv[1] or prompt), normalize it, and crawl from it."""
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    # Assume http:// when the user omitted the scheme.
    if not url.startswith('http://') and not url.startswith('ftp://'):
        url = 'http://%s/' % url
    robot = Crawler(url)
    robot.go()


# The scraped text had lost the dunders: `if name == 'main'` could never be
# true (and `name` is undefined); restore the standard entry-point guard.
if __name__ == '__main__':
    main()

本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。
原始发表:2022-05-13,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
暂无评论
推荐阅读
领券
💥开发者 MCP广场重磅上线!
精选全网热门MCP server,让你的AI更好用 🚀
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档