carapy爬虫,要爬取一个页面上的文章信息,这个信息需要登录才能看见,是ajax请求,源代码中找不到,运行代码时出现InvalidDocument: Cannot encode object:
完整信息如下:
InvalidDocument: Cannot encode object: <GET http://www.yyyy.com/extra/ajax_content.php?id=100&act=ajaxcon>
源代码: parseContent函数ajax请求部分有问题,前面代码运行没有啥问题。
#-*-coding:utf8-*-
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy.selector import Selector
from scrapy.http import Request
from novelspider.items import NovelspiderItem
import re
import os
import commands
import subprocess
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
#模拟登录
os.system("casperjs D:/novelspider/novelspider/spiders/login.js")
class novSpider(RedisSpider):
name = "novspider"
redis_key = 'nvospider:start_urls'
start_urls = ['http://www.yyyy.com/articles/articles-list.php']
def start_requests(self):
pages=[]
for i in range(1,101):
eachurl='http://www.yyyy.com/articles/articles-list.php?page=%s'%i
newpage=scrapy.Request(eachurl)
pages.append(newpage)
return pages
def parse(self,response):
selector = Selector(response)
url_list = selector.xpath('//div[@class="title"]/a/@href')
for url in url_list:
fullurl=response.urljoin(url.extract())
yield Request(fullurl, callback=self.parseContent)
def parseContent(self, response):
for sel in response.xpath('//div[@class="content"]'):
item = NovelspiderItem()
item['contentUrl']=response.url
item['title']=sel.xpath('div[1]/div[1]/h3/text()').extract()
item['updateTime']=sel.xpath('div[1]/div[4]/span[1]/text()').extract()
#content需要登录才能显示,是ajax请求,从item['contentUrl']中正则匹配出参数id
contentId= re.search(u'id=\d{1,4}',item['contentUrl']).group(0)
#拼接ajax请求地址
contentAjaxUrl="http://www.yyyy.com/extra/ajax_content.php?"+contentId+"&act=ajaxcon"
#发送ajax请求,取回内容
contentAjax=Request(contentAjaxUrl)
#对取回内容进行正则匹配,找出所需内容
item['content']= re.search('"content_extra">(.*?)</div',contentAjax,re.S)
yield item