items.py
import scrapy
from scrapy.item import Item, Field
class MytestItem(scrapy.Item):
    """Empty placeholder item; no fields are declared.

    Fields are added as class attributes, for example
    ``name = scrapy.Field()``.
    """
class DoubanmoiveItem(Item):
    """Fields collected for a single movie."""

    # Movie title
    name = Field()
    # Release year
    year = Field()
    # Douban rating
    score = Field()
    # Director
    director = Field()
    # Genre / category
    classification = Field()
    # Cast
    actor = Field()
movie_spider
from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from mytest.items import DoubanmoiveItem
class MovieSpider(CrawlSpider):
    """Crawl the Douban Top 250 listing and scrape every movie detail page.

    The first rule follows the paginated ``top250?start=N`` listing pages.
    The second rule hands each ``/subject/<id>`` page to ``parse_item``.
    """

    name = "doubanmovie"
    allowed_domains = ["movie.douban.com"]
    # The scheme separator needs two slashes; "https:/host/..." leaves the
    # URL without a host.
    start_urls = ["https://movie.douban.com/top250"]
    # NOTE(review): SgmlLinkExtractor is reported as deprecated by Scrapy at
    # start-up; it is kept because it is the extractor this module imports.
    rules = [
        # Accept both schemes so links found from the https start page match.
        Rule(SgmlLinkExtractor(
            allow=r'https?://movie\.douban\.com/top250\?start=\d+.*')),
        Rule(SgmlLinkExtractor(
            allow=r'https?://movie\.douban\.com/subject/\d+'),
            callback="parse_item"),
    ]

    def parse_item(self, response):
        """Build a ``DoubanmoiveItem`` from one movie detail page.

        Every field holds the list of values matched by its XPath query, so
        a field may be an empty list when the page lacks that element.

        :param response: the downloaded detail page.
        :returns: the populated ``DoubanmoiveItem``.
        """
        sel = Selector(response)
        # Must be the class name imported from mytest.items (spelled "moive").
        item = DoubanmoiveItem()
        item['name'] = sel.xpath(
            '//*[@id="content"]/h1/span[1]/text()').extract()
        # The year is shown in parentheses; keep only the digits.
        item['year'] = sel.xpath(
            '//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
        item['score'] = sel.xpath(
            '//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
        item['director'] = sel.xpath(
            '//*[@id="info"]/span[1]/a/text()').extract()
        # NOTE(review): assumes Douban marks genres with property="v:genre"
        # inside the #info block -- confirm against the live page.
        item['classification'] = sel.xpath(
            '//*[@id="info"]/span[@property="v:genre"]/text()').extract()
        item['actor'] = sel.xpath(
            '//*[@id="info"]/span[3]/a[1]/text()').extract()
        return item
pipelines.py
from scrapy import log
from twisted.enterprise import adbapi
from scrapy.http import Request
import MySQLdb
import MySQLdb.cursors
class DoubanmoivePipeline(object):
    """Store scraped movies in MySQL, skipping titles that are already saved.

    Database work is submitted to a Twisted ``adbapi`` connection pool via
    ``runInteraction``; failures are reported through ``handle_error``.
    """

    def __init__(self):
        """Create the MySQL connection pool used for all inserts."""
        # NOTE(review): database credentials are hard-coded here; consider
        # reading them from the project settings instead.
        self.dbpool = adbapi.ConnectionPool(
            'MySQLdb',
            db='JY',
            user='root',
            passwd='508122guoyumei',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=False
        )

    def process_item(self, item, spider):
        """Queue the insert for *item* and pass the item on unchanged.

        :param item: the scraped movie item.
        :param spider: the spider that produced the item (unused).
        :returns: the same *item*.
        """
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    @staticmethod
    def _join_field(item, key):
        """Return the values stored under *key* joined by '/'.

        A missing or empty field yields an empty string.
        """
        return '/'.join(item.get(key) or [])

    def _conditional_insert(self, tx, item):
        """Insert *item* into the ``Movie`` table unless its title exists.

        :param tx: cursor-like transaction object supplied by the pool.
        :param item: the scraped movie item; ``name``, ``year``, ``score``
            and ``director`` must each hold at least one value.
        """
        movie_name = item['name'][0]
        # Look in the same table the insert below writes to; the original
        # query read from "Moive", so the duplicate check never saw the
        # rows stored in "Movie".
        tx.execute("select * from Movie where movieName= %s", (movie_name,))
        result = tx.fetchone()
        log.msg(result, level=log.DEBUG)
        print(result)
        if result:
            log.msg("Item already stored in db:%s" % item, level=log.DEBUG)
            return

        # Multi-valued fields are stored as one '/'-separated string.
        classification = self._join_field(item, 'classification')
        actor = self._join_field(item, 'actor')
        tx.execute(
            "insert into Movie(movieName,movieReleasedate,movieScore,movieDirector,movieType,movieActor) values (%s,%s,%s,%s,%s,%s)",
            (movie_name, item['year'][0], item['score'][0],
             item['director'][0], classification, actor))
        log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        """Log a failure raised by the database interaction."""
        log.err(e)
settings.py
# Scrapy settings for the "mytest" project.
BOT_NAME = 'mytest'
SPIDER_MODULES = ['mytest.spiders']
NEWSPIDER_MODULE = 'mytest.spiders'
LOG_LEVEL = 'DEBUG'
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
COOKIES_ENABLED = True
ITEM_PIPELINES = {
    # The path must name the class exactly as it is defined in the pipelines
    # module, which spells it "Doubanmoive"; the "Doubanmovie" spelling made
    # Scrapy fail at start-up with a NameError.
    'mytest.pipelines.DoubanmoivePipeline': 300,
}
USER_AGENT = 'mytest (+http://www.yourdomain.com)'
错误
~/mytest$ scrapy crawl doubanmovie
/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py:107: ScrapyDeprecationWarning: SgmlLinkExtractor is deprecated and will be removed in future releases. Please use scrapy.contrib.linkextractors.LinkExtractor
ScrapyDeprecationWarning
2015-05-22 10:13:29+0800 [scrapy] INFO: Scrapy 0.25.0-454-gfa1039f started (bot: mytest)
2015-05-22 10:13:29+0800 [scrapy] INFO: Optional features available: ssl, http11
2015-05-22 10:13:29+0800 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'mytest.spiders', 'SPIDER_MODULES': ['mytest.spiders'], 'USER_AGENT': 'mytest (+http://www.yourdomain.com)', 'DOWNLOAD_DELAY': 2, 'BOT_NAME': 'mytest'}
2015-05-22 10:13:29+0800 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, CoreStats, SpiderState
2015-05-22 10:13:29+0800 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-05-22 10:13:29+0800 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-05-22 10:13:29+0800 [-] ERROR: Unhandled error in Deferred:
2015-05-22 10:13:29+0800 [-] Unhandled Error
Traceback (most recent call last):
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 57, in run
self.crawler_process.crawl(spname, **opts.spargs)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 105, in crawl
d = crawler.crawl(*args, **kwargs)
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 1181, in unwindGenerator
return _inlineCallbacks(None, gen, Deferred())
--- <exception caught here> ---
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 1039, in _inlineCallbacks
result = g.send(result)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 60, in crawl
self.engine = self._create_engine()
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 72, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "/usr/lib/pymodules/python2.7/scrapy/core/engine.py", line 63, in __init__
self.scraper = Scraper(crawler)
File "/usr/lib/pymodules/python2.7/scrapy/core/scraper.py", line 67, in __init__
self.itemproc = itemproc_cls.from_crawler(crawler)
File "/usr/lib/pymodules/python2.7/scrapy/middleware.py", line 50, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/usr/lib/pymodules/python2.7/scrapy/middleware.py", line 29, in from_settings
mwcls = load_object(clspath)
File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 49, in load_object
raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
exceptions.NameError: Module 'mytest.pipelines' doesn't define any object named 'DoubanmoviePipeline'
你在 settings.py 的 ITEM_PIPELINES 里配置的是 `DoubanmoviePipeline` 类,但是 pipelines 模块中实际定义的只有 `DoubanmoivePipeline` 类(movie 被拼成了 moive)。把这两处的类名改成一致即可。