Scrapy recursive crawler problem

The spider is as follows:

#-"coding:utf-8 -"-
import sys, os
reload(sys)

from scrapy.spiders import Spider  
from scrapy.spiders import Request
from scrapy.selector import Selector
from network.items import WandoujiawangyeItem


base = "D:/python_workspace/datasets/"  

class WandoujiawangyeSpider(Spider):  
    name = "wandoujiawangye"
    download_delay = 1
    allowed_domains = ["wandoujia.com"]  
    start_urls = [  
       "http://www.wandoujia.com/category/app" ,
       "http://www.wandoujia.com/category/game"          
    ]  
  
    def parse(self, response):
        items =[]
        sel = Selector(response)
        big_urls = sel.xpath('//li[@class=\"parent-cate\"]/a/@href').extract()  
        big_titles = sel.xpath('//li[@class=\"parent-cate\"]/a/text()').extract()

        second_urls = sel.xpath('//li[@class=\"child-cate\"]/a/@href').extract()
        second_titles = sel.xpath('//li[@class=\"child-cate\"]/a/text()').extract()
          
        for i in range(0,len(big_titles)-1):
            file_name = base +big_titles[i]
            # create the directory for the top-level category
            if(not os.path.exists(file_name)):
                os.makedirs(file_name)
            for j in range(0,len(second_titles)):
                item =WandoujiawangyeItem()                
                item['parent_url'] = big_urls[i]
                item['parent_title'] = big_titles[i]
                if_belong = second_urls[j].startswith(item['parent_url'])
                if(if_belong):
                    second_file_name =file_name + '/' +second_titles[j]
                    if(not os.path.extists(second_file_name)):
                        os.makedirs(second_file_name)
                    item['second_url'] = second_urls[j]
                    item['second_title'] = second_titles[j]
                    item['path'] = second_file_name
                    items.append(item)
        for item in items:
            yield Request(url=item['second_url'],meta={'item_1':item},callbck=self.second_parse,dont_filter=True)
    # recursively request each of the returned sub-category URLs
    def second_parse(self,response):
        sel = Selector(response)
        item_1 = response.meta['item_1']
        items = []
        bigUrls = sel.xpath('//a/@href').extract()   

        for i in range(0,len(bigUrls)):
            if_belong = bigUrls[i].endswith('.shtml') and bigUrls[i].startswith(item_1['parent_url'])
            if(if_belong):
                item = WandoujiawangyeItem()
                item['parent_title']=item_1['parent_title']
                item['parent_url']=item_1['parent_url']
                item['second_url']=item_1['second_url']
                item['second_title']=item_1['second_title']
                item['path']=item_1['path']
                item['link_url']=bigUrls[i]
                items.append(item)
        for item in items:
            yield Request(url=item['link_url'],meta={'item_2':item},callback=self.detail_parse,dont_filter=True)
    def detail_parse(self,response):
        sel=Selector(response)
        item=response.mata['item_2']
        content=""                  
        head=sel.xpath('//span[@class=\"title\"]/text()')         
        content_list=sel.xpath('//div[@class=\"desc-info\"]/div/text()') 
        for content_one in content_list:
            content+=content_one   
        item['head']=head              
        item['content']=content     
        yield item

Why do I only get big_titles out? The spider never seems to connect to the second level: second_titles and everything below it never come out. Where is the problem? Waiting online, please help!

1 Answer

My thoughts: instead of collecting the items into a list and looping over it afterwards, yield each Request as soon as its item is built:

# recursively request each of the returned sub-category URLs
def second_parse(self,response):
    sel = Selector(response)
    item_1 = response.meta['item_1']
    bigUrls = sel.xpath('//a/@href').extract()   

    for i in range(0,len(bigUrls)):
        if_belong = bigUrls[i].endswith('.shtml') and bigUrls[i].startswith(item_1['parent_url'])
        if(if_belong):
            item = WandoujiawangyeItem()
            item['parent_title']=item_1['parent_title']
            item['parent_url']=item_1['parent_url']
            item['second_url']=item_1['second_url']
            item['second_title']=item_1['second_title']
            item['path']=item_1['path']
            item['link_url']=bigUrls[i]
            yield Request(url=item['link_url'],meta={'item_2':item},callback=self.detail_parse,dont_filter=True)
"""  
            items.append(item)
    for item in items:
        yield Request(url=item['link_url'],meta={'item_2':item},callback=self.detail_parse,dont_filter=True)
"""