# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from urllib import parse
import datetime
from ArticleSpider.items import JobboleArticleItem
from ArticleSpider.utilss.common import get_md5
class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        post_urls = response.css("#archive .floated-thumb .post-thumb a::attr(href)").extract()
        for post_url in post_urls:
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)
        # next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        # print(next_url)
        # if next_url:
        #     yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
        # the callback is only passed here, not called

    def parse_detail(self, response):
        article_item = JobboleArticleItem()
        article_item["url_object_id"] = get_md5(response.url)
        title = response.xpath('//*[@class="entry-header"]/h1/text()').extract()
        create_time = response.css("p.entry-meta-hide-on-mobile::text").extract()[0].strip().replace("·", "").strip()
        # try:
        #     create_time = datetime.datetime.strptime(create_time, "%Y/%m/%d").date()
        # except Exception as e:
        #     create_time = datetime.datetime.now().date()
        article_item["title"] = title
        article_item["create_time"] = create_time
        yield article_item
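For reference, get_md5 imported from ArticleSpider.utilss.common is my own small helper that hashes the URL; it looks roughly like this (reconstructed from memory, the exact body may differ from my local file):

import hashlib

def get_md5(url):
    # md5 works on bytes, so encode str URLs first
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()  # 32-char hex str, e.g. '1a8e6c64968ed6db401b5769221f9b4f'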
That is the main spider; below is the code in the pipeline.
import codecs
import json
import MySQLdb
import MySQLdb.cursors

class MysqlPipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect('localhost', 'root', 'root', 'mysql', charset='utf8', use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into title(title, create_time, url_object_id)
            VALUES (%s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["create_time"], item["url_object_id"]))
        self.conn.commit()
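The insert statement assumes a table named title in the mysql database. I created it with a one-off script roughly like the one below; the column types are my best recollection, not a dump of the real table (create_time is a VARCHAR here because the strptime conversion in the spider is commented out, so a plain string gets inserted):

import MySQLdb

conn = MySQLdb.connect('localhost', 'root', 'root', 'mysql', charset='utf8', use_unicode=True)
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS title (
        url_object_id VARCHAR(32) NOT NULL PRIMARY KEY,
        title VARCHAR(255),
        create_time VARCHAR(20)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()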
Let me describe the problem. Right now, if the pipeline that writes to MySQL is not enabled, the spider crawls article titles and publish dates without trouble; as soon as I enable the pipeline in settings, the run errors out (the snippet below shows how I enabled it).
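For completeness, this is how the pipeline is switched on in settings.py; the priority value 300 is just the number I happened to pick:

# settings.py
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.MysqlPipeline': 300,
}

With that enabled, every run ends with the following error: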
2017-08-01 14:09:37 [scrapy.core.scraper] ERROR: Error processing {'create_time': '2017/07/31',
'title': ['Neo4j 图数据库基础'],
'url_object_id': '1a8e6c64968ed6db401b5769221f9b4f'}
Traceback (most recent call last):
File "C:\python3\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Users\微软中国\ArticleSpider\ArticleSpider\pipelines.py", line 36, in process_item
self.cursor.execute(insert_sql, (item["title"], item["create_time"], item["url_object_id"]))
File "C:\python3\lib\site-packages\MySQLdb\cursors.py", line 234, in execute
args = tuple(map(db.literal, args))
File "C:\python3\lib\site-packages\MySQLdb\connections.py", line 316, in literal
s = self.escape(o, self.encoders)
File "C:\python3\lib\site-packages\MySQLdb\converters.py", line 90, in quote_tuple
return "(%s)" % (','.join(escape_sequence(t, d)))
TypeError: sequence item 0: expected str instance, bytes found
Going by the error message, the data I am writing is of type bytes, yet when I debug, the values clearly show up as str. I don't know MySQL very well, but I did set up the database, and the primary key is set as well. Could some expert please take a look at where this synchronous MySQL write goes wrong?
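One thing I noticed while poking at it (just a guess, please correct me if I'm wrong): extract() returns a list, so item["title"] is ['Neo4j 图数据库基础'] rather than a plain str, and the traceback does go through quote_tuple in converters.py, which only fires for sequences. This standalone snippet should reproduce the same TypeError on my setup:

import MySQLdb

conn = MySQLdb.connect('localhost', 'root', 'root', 'mysql', charset='utf8', use_unicode=True)
print(conn.literal('Neo4j 图数据库基础'))    # a plain str escapes fine
print(conn.literal(['Neo4j 图数据库基础']))  # a list hits quote_tuple and raises the TypeError

If that really is the cause, changing the spider to title = response.xpath('//*[@class="entry-header"]/h1/text()').extract_first("") (a str instead of a list) should make the insert go through, but I would still like someone to confirm.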