使用scrapy框架爬虫,但是遇到各种问题以至于一次都没有成功,想知道自己错在哪里了?
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class BnuzpjtItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
#公告通知标题
title = scrapy.Field()
#公告通知链接
url = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
import codecs
import json
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class BnuzpjtPipeline(object):
def __init__(self):
self.file = codecs.open(r"C:/Users/j/bnuzpjt","wb",encoding = "utf-8")
def process_item(self, item, spider):
i = json.dumps(dict(item),ensure_ascii=False)
line = i+'\n'
#数据写入到jsou文件中
self.file.write(line)
return item
def close_spider(self,spider):
#关闭mydata文件
self.file.close()
settings
# -*- coding: utf-8 -*-
# Scrapy settings for bnuzpjt project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'bnuzpjt'
SPIDER_MODULES = ['bnuzpjt.spiders']
NEWSPIDER_MODULE = 'bnuzpjt.spiders'
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'bnuzpjt.pipelines.BnuzpjtPipeline': 300,
}
爬虫代码:bnuzspd.py
# -*- coding: utf-8 -*-
import scrapy
from bnuzpjt.items import BnuzpjtItem
from scrapy.http import Request
class BnuzspdSpider(scrapy.Spider):
name = 'bnuzspd'
allowed_domains = ['bnuz.edu.cn']
start_urls = ['http://bnuz.edu.cn/']
def parse(self, response):
item=BnuzpjdItem()
item["title"] = response.xpath("//span[@class='lefttitle']/text()").extract()
item["url"] = response.xpath("//ul[@class='leftclick']/li/a/@href").extract()
yield item
运行之后会出现一个错误,如图:
请问到底是出了什么问题?
学习调试,和看报错信息,可以执行下断点看程序在哪里挂的