利用 Scrapy 框架爬取 segmentfault 上的 Python 博客文章,保存到数据库,然后发送到自己的邮箱
先显示部分源码:
# coding:utf-8
#! /usr/bin/python
'''
Author fiz
Date:2016-03-30
Crawl the content of SegmentFault blog posts.
'''
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import FormRequest, Request
from scrapy.selector import Selector
from Scrapy_demo.items import *
from scrapy import log
from scrapy.utils.response import get_base_url
import urlparse
class SegeblogSpider(CrawlSpider):
    """Crawl the SegmentFault Python-tag blog listing pages.

    Starting from page 1 of the listing, the spider follows every link that
    looks like another listing page and hands each page to ``parse_item1``,
    which builds one ``CnblogsItem`` per post entry found on the page.
    """

    name = 'blog'
    allowed_domains = ['segmentfault.com']
    start_urls = ['https://segmentfault.com/t/python/blogs?page=1',]
    # The "?" must be escaped so it is matched literally. The page number is
    # matched with \d+; the previous pattern "page={1,}" applied the
    # quantifier to the "=" sign instead of to the digits.
    rules = [
        Rule(
            sle(allow=(r't/python/blogs\?page=\d+',)),
            follow=True,
            callback='parse_item1',
        ),
    ]

    def parse_item1(self, response):
        """Extract the post entries of one listing page.

        :param response: the downloaded listing page.
        :return: list of ``CnblogsItem`` with ``title``, ``link``,
            ``listUrl`` and ``desc`` filled in.
        """
        sel = Selector(response)
        base_url = get_base_url(response)
        items = []
        # Every <section> under div.tab-content; presumably one per post --
        # verify against the live page markup.
        # NOTE: title, URL and description are only loosely coupled in the
        # markup; this extraction could be tightened later.
        for section in sel.css('div.tab-content').css("section"):
            titles = section.css("a").xpath('text()').extract()
            links = section.css("h2.title").css('a').xpath('@href').extract()
            if not titles or not links:
                # A section without a title or post link is not a post entry;
                # skipping it keeps one odd section from aborting the page.
                continue
            descs = section.css("div.views ").xpath("text()").extract()

            item = CnblogsItem()
            # Text of the first <a> in the section.
            item['title'] = titles[0]
            # Absolute URL of the post page (the href on the page is relative).
            item['link'] = 'https://segmentfault.com' + links[0]
            # The listing page this entry was found on.
            item['listUrl'] = base_url
            # Text of the "div.views" element; empty when the element is absent.
            item['desc'] = descs[0] if descs else u''
            items.append(item)
        return items
添加 Pipeline 保存到数据库
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import signals
import json
import codecs
from twisted.enterprise import adbapi
from datetime import datetime
from hashlib import md5
import MySQLdb
import MySQLdb.cursors
class JsonWithEncodingCnblogsPipeline(object):
    """Item pipeline that saves every item as one JSON line.

    Items are written to ``Segement.json`` in the working directory, encoded
    as UTF-8 and without ASCII-escaping, so non-ASCII text stays readable.
    """

    def __init__(self):
        # Opened for writing, so the file is truncated on every run.
        self.file = codecs.open('Segement.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append *item* to the output file and pass it on unchanged.

        :param item: the scraped item (anything ``dict()`` accepts).
        :param spider: the spider that produced the item (unused).
        :return: the same *item*, for the next pipeline stage.
        """
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file.

        ``close_spider`` is the hook Scrapy invokes on item pipelines when
        the spider finishes, so this is where the file gets flushed and
        closed.
        """
        self.file.close()

    def spider_closed(self, spider):
        """Backward-compatible alias for :meth:`close_spider`."""
        self.close_spider(spider)
class MySQLStoreCnblogsPipeline(object):
    """Item pipeline that upserts every item into the ``cnblogsinfo`` table.

    Database work runs through a Twisted ``adbapi`` connection pool. Rows are
    keyed by the MD5 of the item's link, so an already-stored link is updated
    rather than inserted again.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the ``MYSQL_*`` entries of *settings*."""
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the upsert for *item*; called by Scrapy for every item.

        :return: a Deferred that fires with *item* whether or not the
            database write succeeded (failures are logged, not propagated).
        """
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        # Always hand the item on to the next pipeline stage.
        d.addBoth(lambda _: item)
        return d

    def _do_upinsert(self, conn, item, spider):
        """Update the row for the item's link, or insert it if missing.

        :param conn: cursor-like object passed in by ``runInteraction``;
            only ``execute`` and ``fetchone`` are used.
        """
        linkmd5id = self._get_linkmd5id(item)
        # Naive UTC timestamp, second precision, e.g. "2016-03-30 12:00:00".
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
        conn.execute("""
            select 1 from cnblogsinfo where linkmd5id = %s
        """, (linkmd5id, ))
        ret = conn.fetchone()

        if ret:
            conn.execute("""
                update cnblogsinfo set title = %s, description = %s, link = %s, listUrl = %s, updated = %s where linkmd5id = %s
            """, (item['title'], item['desc'], item['link'], item['listUrl'], now, linkmd5id))
        else:
            conn.execute("""
                insert into cnblogsinfo(linkmd5id, title, description, link, listUrl, updated)
                values(%s, %s, %s, %s, %s, %s)
            """, (linkmd5id, item['title'], item['desc'], item['link'], item['listUrl'], now))

    def _get_linkmd5id(self, item):
        """Return the hex MD5 digest of the item's link.

        The digest is the row key used to avoid storing the same URL twice.
        Text links are encoded as UTF-8 first, so a URL containing non-ASCII
        characters no longer fails inside ``md5``.
        """
        link = item['link']
        if not isinstance(link, bytes):
            link = link.encode('utf-8')
        return md5(link).hexdigest()

    def _handle_error(self, failure, item, spider):
        """Log a failed database interaction instead of silently dropping it.

        :param failure: the error object passed by the Deferred errback.
        """
        # Local import: the surrounding module does not import logging.
        import logging
        logging.getLogger(__name__).error(
            "Failed to store item %r: %s", item, failure)
邮件发送代码:
# coding:utf-8
'''读取mysql中的数据然后发送到邮箱实现浏览的'''
import MySQLdb
import MySQLdb.cursors
import smtplib
from email.header import Header
from email.mime.text import MIMEText
import sys
reload(sys)
# Initial text of the e-mail body; db_operate() appends to this global.
# An earlier assignment of a Chinese title string was dropped: it was
# overwritten by this line before anything could read it.
msg_content = 'ok'
def test(content):
    """Send *content* as an HTML e-mail through the 163.com SMTP server.

    :param content: HTML text used as the message body (sent as UTF-8).
    :return: None
    :raises smtplib.SMTPException: if login or delivery fails.
    """
    # NOTE(review): the account, password and recipient are hard-coded;
    # they should be moved to configuration or environment variables.
    msg = MIMEText(content, 'html', 'utf-8')
    msg['From'] = '18818261892@163.com <18818261892@163.com>'
    msg['Subject'] = Header(u'text', 'utf8').encode()
    msg['To'] = u'飞轮海 <1848406889@qq.com>'

    server = smtplib.SMTP('smtp.163.com', 25)
    try:
        server.login('18818261892@163.com', 'LBQ139196')
        server.sendmail('18818261892@163.com', ['1848406889@qq.com'], msg.as_string())
    finally:
        # Always end the SMTP session, even when login or sending fails.
        server.quit()
    print('finished is ok!ooooo')
def db_operate():
    """Read every row of ``cnblogsinfo`` and e-mail the collected content.

    Each row appends an HTML fragment to the module-level ``msg_content``;
    the accumulated text is then printed and passed to ``test`` for sending.
    MySQL errors are reported on stdout rather than raised.
    """
    global msg_content
    conn = None
    cur = None
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='1234', db='cnblogsdb', port=3306, charset="utf8")
        cur = conn.cursor()
        # Original author's note: the table held 945 rows at the time.
        cur.execute('select * from cnblogsinfo')
        rows = cur.fetchall()
        for row in rows:
            # Presumably row[1] is the title and row[4] a URL column --
            # this depends on the table's column order; verify the schema.
            msg_content += ('<html><body><h1> %s </h1><p>send by <a href= %s>Python</a>...</p></body></html>') % (row[1], row[4],)
            msg_content = msg_content + str(row[4]) + 'ok'
        print(msg_content)
        test(msg_content)
        print('finished is ok!ooooo')
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    finally:
        # Release the cursor and connection on every path, including errors.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
# Run the mailer only when executed as a script. The guard must test
# ``__name__``; a bare ``name`` is undefined and raises NameError.
if __name__ == '__main__':
    db_operate()
# NOTE(review): this module-level script looks like an earlier, inlined copy
# of db_operate() + test() and runs on import; consider removing it. Its
# string literals were split across lines and stripped of their <p>/<a> tags;
# the two format strings below are restored to take the two arguments they
# are formatted with -- confirm against the original source.
msg_content = 'ok'
conn = None
cur = None
try:
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234', db='cnblogsdb', port=3306, charset="utf8")
    cur = conn.cursor()
    # Original author's note: the table held 945 rows at the time.
    cur.execute('select * from cnblogsinfo')
    rows = cur.fetchall()
    for row in rows:
        print("%s, %s, %s, %s" % (row[0], row[1], row[2], row[3]))
        print(row[1])
        msg_content += ('<html><body><h1> %s </h1><p>send by <a href= %s>Python</a>...</p></body></html>') % (row[1], row[4],)
        msg_content += ('<html><body><h1> segmentfault</h1><p> %s </p><p>send by <a href= %s>Python</a>...</p></body></html>') % (row[1], row[4],)
        msg_content = msg_content + str(row[4]) + 'ok'
    print(msg_content)

    # NOTE(review): hard-coded account, password and recipient; move them to
    # configuration or environment variables.
    msg = MIMEText(msg_content, 'html', 'utf-8')
    msg['From'] = '18818261892@163.com <18818261892@163.com>'
    msg['Subject'] = Header(u'text', 'utf8').encode()
    msg['To'] = u'飞轮海 <1848406889@qq.com>'
    server = smtplib.SMTP('smtp.163.com', 25)
    try:
        server.login('18818261892@163.com', 'LBQ139196')
        server.sendmail('18818261892@163.com', ['1848406889@qq.com'], msg.as_string())
    finally:
        # Always end the SMTP session, even when login or sending fails.
        server.quit()
    print('finished is ok!ooooo')
except MySQLdb.Error as e:
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    # Release the cursor and connection on every path, including errors.
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()
结果显示:
源码已分享到 GitHub
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。