from scrapy.spiders import CrawlSpider, Rule, Request
from scrapy.linkextractors import LinkExtractor
from haoduofuli.items import HaoduofuliItem
from scrapy import FormRequest
account = '你的帐号'
password = '你的密码'
class myspider(CrawlSpider):
name = 'haoduofuli'
allowed_domains = ['haoduofuli.wang']
start_urls = ['http://www.haoduofuli.wang/wp-login.php']
def parse_start_url(self, response):
formdate = {
'log': account,
'pwd': password,
'rememberme': "forever",
'wp-submit': "登录",
'redirect_to': "http://www.haoduofuli.wang/wp-admin/",
'testcookie': "1"
}
return [FormRequest.from_response(response, formdata=formdate, callback=self.after_login)]
def after_login(self, response):
lnk = 'http://www.haoduofuli.wang'
return Request(lnk)
rules = (
Rule(LinkExtractor(allow=('\.html',)), callback='parse_item', follow=True),
)
def parse_item(self, response):
item = HaoduofuliItem()
try:
item['category'] = response.xpath('//*[@id="content"]/div[1]/div[1]/span[2]/a/text()').extract()[0]
item['title'] = response.xpath('//*[@id="content"]/div[1]/h1/text()').extract()[0]
item['imgurl'] = response.xpath('//*[@id="post_content"]/p/img/@src').extract()
item['yunlink'] = response.xpath('//*[@id="post_content"]/blockquote/a/@href').extract()[0]
item['password'] = response.xpath('//*[@id="post_content"]/blockquote/font/text()').extract()[0]
return item
except:
item['category'] = response.xpath('//*[@id="content"]/div[1]/div[1]/span[2]/a/text()').extract()[0]
item['title'] = response.xpath('//*[@id="content"]/div[1]/h1/text()').extract()[0]
item['imgurl'] = response.xpath('//*[@id="post_content"]/p/img/@src').extract()
item['yunlink'] = response.xpath('//*[@id="post_content"]/blockquote/p/a/@href').extract()[0]
item['password'] = response.xpath('//*[@id="post_content"]/blockquote/p/span/text()').extract()[0]
return item
按照教程的说法:return Request(lnk)
这一个请求也算作 初始URL 只不过 不是 start_urls
的返回response 所以不会调用 parse_start_url
函数哦!
但实际运行中,发现还是调用了 parse_start_url
, 然后到:
return [FormRequest.from_response(response, formdata=formdate, callback=self.after_login)]
就出现了:
raise ValueError("No <form> element found in %s" % response)
问题。求怎么解决。