spider如下:
# -*- coding: utf-8 -*-
import os
import sys

from scrapy.spiders import Spider
# BUG FIX: Request is not exported by scrapy.spiders; it lives in scrapy.http
# (the original `from scrapy.spiders import Request` raises ImportError, so the
# spider never even loads).
from scrapy.http import Request
from scrapy.selector import Selector

from network.items import WandoujiawangyeItem

# NOTE(review): the original had `reload(sys)` with no sys.setdefaultencoding()
# call after it — a no-op on Python 2 and a NameError on Python 3 — so it was
# removed.

# Root directory under which one folder per category / sub-category is created.
base = "D:/python_workspace/datasets/"
class WandoujiawangyeSpider(Spider):
    """Crawl wandoujia.com category pages.

    Flow: category index (``parse``) -> each sub-category page
    (``second_parse``) -> each article/detail page (``detail_parse``),
    yielding one ``WandoujiawangyeItem`` per detail page.  A directory tree
    mirroring parent/child categories is created under ``base``.
    """

    name = "wandoujiawangye"
    download_delay = 1
    allowed_domains = ["wandoujia.com"]
    start_urls = [
        "http://www.wandoujia.com/category/app",
        "http://www.wandoujia.com/category/game",
    ]

    def parse(self, response):
        """Parse the category index and schedule one request per sub-category."""
        items = []
        sel = Selector(response)
        big_urls = sel.xpath('//li[@class="parent-cate"]/a/@href').extract()
        big_titles = sel.xpath('//li[@class="parent-cate"]/a/text()').extract()
        second_urls = sel.xpath('//li[@class="child-cate"]/a/@href').extract()
        second_titles = sel.xpath('//li[@class="child-cate"]/a/text()').extract()
        # BUG FIX: was range(0, len(big_titles) - 1), which silently skipped
        # the last top-level category.
        for i in range(len(big_titles)):
            file_name = base + big_titles[i]
            # Create the per-category output directory.
            if not os.path.exists(file_name):
                os.makedirs(file_name)
            for j in range(len(second_titles)):
                item = WandoujiawangyeItem()
                item['parent_url'] = big_urls[i]
                item['parent_title'] = big_titles[i]
                # A child category belongs to this parent when its URL extends
                # the parent's URL.  NOTE(review): this assumes @href values are
                # absolute URLs — if the site serves relative hrefs, no child
                # ever matches; verify in the page source.
                if second_urls[j].startswith(item['parent_url']):
                    second_file_name = file_name + '/' + second_titles[j]
                    # BUG FIX: was os.path.extists (AttributeError).  That
                    # exception killed this generator before any second-level
                    # Request was yielded — the reason only big_titles showed up.
                    if not os.path.exists(second_file_name):
                        os.makedirs(second_file_name)
                    item['second_url'] = second_urls[j]
                    item['second_title'] = second_titles[j]
                    item['path'] = second_file_name
                    items.append(item)
        for item in items:
            # BUG FIX: keyword was misspelled 'callbck', so Request() would
            # raise TypeError even after parse() survived.
            yield Request(url=item['second_url'], meta={'item_1': item},
                          callback=self.second_parse, dont_filter=True)

    def second_parse(self, response):
        """Parse a sub-category page; request every article link (*.shtml)."""
        sel = Selector(response)
        item_1 = response.meta['item_1']
        items = []
        big_urls = sel.xpath('//a/@href').extract()
        for url in big_urls:
            # Keep only article links that live under this parent category.
            if url.endswith('.shtml') and url.startswith(item_1['parent_url']):
                item = WandoujiawangyeItem()
                item['parent_title'] = item_1['parent_title']
                item['parent_url'] = item_1['parent_url']
                item['second_url'] = item_1['second_url']
                item['second_title'] = item_1['second_title']
                item['path'] = item_1['path']
                item['link_url'] = url
                items.append(item)
        for item in items:
            yield Request(url=item['link_url'], meta={'item_2': item},
                          callback=self.detail_parse, dont_filter=True)

    def detail_parse(self, response):
        """Extract the title and body text from an article detail page."""
        sel = Selector(response)
        # BUG FIX: was response.mata (AttributeError).
        item = response.meta['item_2']
        # BUG FIX: both xpath results must be .extract()ed — the original
        # stored a SelectorList in item['head'] and tried to concatenate
        # Selector objects onto a str (TypeError).
        head = sel.xpath('//span[@class="title"]/text()').extract()
        content = "".join(
            sel.xpath('//div[@class="desc-info"]/div/text()').extract())
        item['head'] = head
        item['content'] = content
        yield item
为什么我只爬得出big_titles出来?好像不能和第二级建立连接,second_titles等都出不来,哪里有问题吗?在线等,求救!
`
我的想法