I'm using Scrapy to crawl a site with 47,000+ pages. The crawl is clearly not finished, yet every run stops after two or three hours and the log reports "finished". It definitely has not crawled everything.

The spider's source code is attached below.

import json

import scrapy

from openhub.items import OpenhubItem

class ProjectSpider(scrapy.Spider):
    name = 'project'
    # allowed_domains = []
    start_urls = ['https://www.openhub.net/p?page=1&query=']
    url = 'https://www.openhub.net/p?page={}&query='
    page = 1
    def parse(self, response):
        div_list = response.xpath("//div[@class='well searchable']")
        for index, odiv in enumerate(div_list):
            # Create one item per project entry on the listing page
            item = OpenhubItem()
            # There are 10 results per page: derive a sequential id from the
            # page number and the position. The original accumulated pid across
            # the loop, which produced wrong ids (11, 22, 33, ...) on every
            # page after the first.
            item['pid'] = (self.page - 1) * 10 + index + 1
            post = odiv.xpath(".//div/a[@class='pull-left logo']/img/@src").extract_first()
            item['post'] = post or ''
            licenses = odiv.xpath(".//div[@class='licenses pull-right']").xpath('string(.)').extract()
            # str.strip('Licenses:') removes any of the characters L,i,c,e,n,s,:
            # from both ends, not the literal label, so drop the label explicitly.
            lic = ','.join(licenses).replace('\n', '').replace('Licenses:', '', 1).strip()
            item['licenses'] = lic or 'No declared licenses'
            href = odiv.xpath(".//div/a[@class='pull-left logo']/@href").extract_first()
            if not href:
                # Without a project link there is no detail page to follow;
                # the original concatenated None here and crashed.
                continue
            item['openhub_url'] = 'https://www.openhub.net' + href
            time = odiv.xpath(".//div[@id='inner_content']/div/p/i/abbr/@title").extract_first()
            item['time'] = time or ''
            detail_url = item['openhub_url']
            # Deduplicate against earlier runs: only re-crawl a project if it
            # is new or its timestamp changed. Note this re-reads the whole
            # history file for every item, which gets slow as the file grows.
            is_existed = False
            need_update = False
            try:
                with open('time222.txt', 'r', encoding='utf-8') as f:
                    for lines in f:
                        line = json.loads(lines.strip('\n'))
                        if item['openhub_url'] == line.get('openhub_url'):
                            is_existed = True
                            if item['time'] != line.get('time'):
                                need_update = True
            except FileNotFoundError:
                # First run: no history file yet, so crawl everything.
                pass
            if not is_existed or need_update:
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
        # The next page is only requested after the current one parses, so a
        # single failed or dropped page request silently ends the whole crawl.
        if self.page <= 47517:
            self.page += 1
            yield scrapy.Request(url=self.url.format(self.page), callback=self.parse)
    # Parse a project's detail page
    def parse_detail(self, response):
        # Retrieve the partially filled item passed via the request meta
        item = response.meta['item']
        item['title'] = response.xpath('//div[@id="project_header"]/div[1]/h1/a/text()').extract_first()
        # extract_first(default='') avoids an AttributeError when a node is
        # missing; the original called .strip() on a possible None.
        summary = response.xpath("//section[@id='project_summary']").xpath('string(.)').extract_first(default='')
        item['summary'] = summary.strip(' \n').replace('\n', '')
        score = response.xpath('//*[@id="i_use_this_container"]/div/a/text()').extract_first()
        item['score'] = score or ''
        tags = response.xpath("//p[@class='tags']").xpath('string(.)').extract_first(default='')
        item['tags'] = tags.strip('\n')
        info = response.xpath("//div[@class='well']/ul").xpath('string(.)').extract_first(default='')
        item['info'] = info.strip('\n').replace('\n', '')
        organiz_text = response.xpath("//div[@class='col-xs-12']/div[@class='col-xs-7'][1]/a/text()").extract_first()
        item['organiz_text'] = organiz_text or ''
        organiz_url = response.xpath("//div[@class='col-xs-12']/div[@class='col-xs-7'][1]/a/@href").extract_first()
        item['organiz_url'] = 'https://www.openhub.net' + organiz_url if organiz_url else ''
        language = response.xpath("//table[@class='table table-striped unstyled pull-left language_table']//tr").xpath('string(.)').extract()
        item['language'] = ' '.join(language).replace('\n', '') if language else ''

        user_rate = response.xpath('//div[@id="community_rating"]/div[1]/div[1]/text()').extract_first()
        if user_rate:
            # Remove the label text explicitly; strip(' users rate this project:')
            # would strip a character set from both ends, not the literal suffix.
            item['user_rate'] = user_rate.replace('users rate this project:', '').strip()
        else:
            item['user_rate'] = ''

        rating = response.xpath('//div[@id="community_rating"]/div[1]/div[2]/div/div/div/text()').extract_first()
        item['rating'] = rating.strip('\n') if rating else ''

        # Either link may be absent; the original concatenated None and crashed.
        link = response.xpath("//div[@class='col-xs-12']/div[@class='col-xs-5 text-right text-right']/a/@href").extract_first()

        a_list = response.xpath("//div[@class='well']/div[@class='col-xs-12']/div[@class='col-xs-5 text-right']/a/@href").extract_first()
        if a_list:
            code_link = 'https://www.openhub.net' + a_list
            yield scrapy.Request(url=code_link, callback=self.parse_detail_two2, meta={'item': item})
        if link:
            link_url = 'https://www.openhub.net' + link
            yield scrapy.Request(url=link_url, callback=self.parse_detail_two, meta={'item': item})

    def parse_detail_two2(self, response):
        item = response.meta['item']
        item['a'] = 0
        tr_list = response.xpath("//table[@class='table table-striped']/tbody/tr")
        for otr in tr_list:
            # Yield a copy per row: yielding the same mutable item repeatedly
            # lets later rows overwrite fields of earlier, already-yielded ones.
            row = item.copy()
            row['code_url'] = (otr.xpath(".//td[@class='col-md-4']/text()").extract_first() or '').strip('\n')
            row['scmtype'] = (otr.xpath(".//td[@class='col-md-2']/text()").extract_first() or '').strip('\n')
            row['update_status'] = (otr.xpath(".//td[@class='col-md-3']/text()").extract_first() or '').strip('\n')
            yield row

    # Parse the second-level detail page (related projects)
    def parse_detail_two(self, response):
        item = response.meta['item']
        well_list = response.xpath("//div[@class='well']")
        for owell in well_list:
            # Query relative to each well; no need to round-trip through lxml.
            if owell.xpath(".//div[@class='col-md-3 margin_bottom_25']"):
                rel_type = owell.xpath(".//h3[@class='strong pull-left margin_bottom_15 margin_top_0']/text()").extract()
                name = owell.xpath(".//div[@class='col-md-3 margin_bottom_25']/text()").extract()
                url = owell.xpath(".//div[@class='col-md-6']/h5/a/@href").extract()
                related = item.copy()
                related['type'] = ''.join(rel_type).strip('\n')
                related['name'] = ''.join(name).strip('\n')
                related['url'] = ''.join(url).strip('\n')
                related['a'] = 1
                yield related
        item['a'] = 2
        yield item
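
One structural detail in the code above is worth noting: each listing page is requested only from the `parse` of the previous one, so if any single page request is lost (retries exhausted, a non-200 response, or the dupe filter), the chain breaks, the spider goes idle, and Scrapy shuts down with reason "finished" even though most pages were never fetched. A minimal sketch of the alternative, scheduling every page up front in `start_requests` (page count 47,517 taken from the code above; everything else is illustrative):

import scrapy

class ProjectSpider(scrapy.Spider):
    # Drop-in replacement for the chained pagination in the spider above.
    name = 'project'

    def start_requests(self):
        # Every listing page enters the scheduler independently, so one
        # failed or filtered page cannot cut off all the pages behind it.
        for page in range(1, 47518):  # pages 1..47517, as in the original
            yield scrapy.Request(
                'https://www.openhub.net/p?page={}&query='.format(page),
                callback=self.parse,
            )

    def parse(self, response):
        # ... same listing-page parsing as above. With pages arriving out of
        # order, a shared self.page counter is no longer valid; derive the
        # page number from response.url instead.
        pass

With this layout the crawl ends only after every scheduled page has been fetched or definitively failed, which also makes the final stats easier to interpret.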
1 Answer

Post the statistics from the end of the run; check whether the site blocked you.
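
Scrapy already dumps those statistics at the end of every run: the "Dumping Scrapy stats" block in the log contains `finish_reason`, per-status response counts, and error totals, which will show whether the site started returning 403/429s. If the log is gone, the close reason and stats can be captured explicitly via the standard signals API. A minimal sketch, independent of this particular spider:

import scrapy
from scrapy import signals

class ProjectSpider(scrapy.Spider):
    name = 'project'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # Hook spider_closed so the close reason and stats get logged.
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider, reason):
        # 'reason' is e.g. 'finished', 'shutdown', or 'closespider_timeout'.
        self.logger.info('close reason: %s', reason)
        self.logger.info('stats: %r', self.crawler.stats.get_stats())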
