利用Scrapy框架爬取segmentfault上的博客,保存到数据库然后发送到自己的邮箱
图片描述

先显示部分源码:

# coding:utf-8

#! /usr/bin/python

'''
Author fiz
Date:2016-03-30
Segement Blog 内容爬去
'''

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import FormRequest, Request
from scrapy.selector import Selector
from Scrapy_demo.items import *
from scrapy import log
from scrapy.utils.response import get_base_url
import urlparse

class SegeblogSpider(CrawlSpider):
    """Crawl Python-tagged blog posts from segmentfault.com.

    Follows the paginated listing pages matched by ``rules`` and yields one
    ``CnblogsItem`` per post: title, post URL, the listing URL it came from,
    and the view-count text as the description.
    """

    name = 'blog'
    allowed_domains = ['segmentfault.com']
    start_urls = ['https://segmentfault.com/t/python/blogs?page=1', ]
    # NOTE: the '?' must be escaped in the link-extractor regex.
    rules = [
        Rule(sle(allow=(r't/python/blogs\?page={1,}',)), follow=True,
             callback='parse_item1'),
    ]

    def parse_item1(self, response):
        """Parse one listing page and return a list of CnblogsItem."""
        sel = Selector(response)
        base_url = get_base_url(response)
        # One <section> per post inside the tab content on each page.
        posts = sel.css('div.tab-content').css('section')

        items = []
        for post in posts:
            item = CnblogsItem()
            # Post title: text of the first <a> inside the section.
            item['title'] = post.css('a').xpath('text()').extract()[0]
            # Absolute URL of the post page (href is site-relative).
            item['link'] = ('https://segmentfault.com' +
                            post.css('h2.title').css('a').xpath('@href').extract()[0])
            # The listing page this item was scraped from.
            item['listUrl'] = base_url
            # View-count text reused as the description field.
            item['desc'] = post.css('div.views ').xpath('text()').extract()[0]
            items.append(item)
        return items
  1. 添加Pipeline保存到数据库

# -*- coding: utf-8 -*-

Define your item pipelines here

Don't forget to add your pipeline to the ITEM_PIPELINES setting

See: http://doc.scrapy.org/en/latest/topics/i...

from scrapy import signals
import json
import codecs
from twisted.enterprise import adbapi
from datetime import datetime
from hashlib import md5
import MySQLdb
import MySQLdb.cursors

'''
保存Json格式
'''

class JsonWithEncodingCnblogsPipeline(object):
    """Write every scraped item to ``Segement.json``, one JSON object per line."""

    def __init__(self):
        self.file = codecs.open('Segement.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append *item* as a JSON line (non-ASCII kept readable) and pass it on."""
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # BUG FIX: Scrapy invokes ``close_spider`` on pipelines automatically;
        # the original method was named ``spider_closed`` and was never called
        # (no signal was connected), so the file handle leaked.
        self.file.close()

    # Backward-compatible alias for any code that connected the signal manually.
    spider_closed = close_spider

class MySQLStoreCnblogsPipeline(object):
    """Store items in the MySQL table ``cnblogsinfo`` via a Twisted adbapi pool.

    Rows are keyed by the MD5 of the post link, so re-crawled posts update the
    existing row instead of creating duplicates.
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the MYSQL_* settings (Scrapy entry point)."""
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the upsert on the pool; called by Scrapy for every item."""
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        # Always hand the item back to the next pipeline stage.
        d.addBoth(lambda _: item)
        return d

    def _do_upinsert(self, conn, item, spider):
        """Update the row with the same link MD5 if present, else insert one."""
        linkmd5id = self._get_linkmd5id(item)
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
        conn.execute("""
                select 1 from cnblogsinfo where linkmd5id = %s
        """, (linkmd5id, ))
        ret = conn.fetchone()

        if ret:
            conn.execute("""
                update cnblogsinfo set title = %s, description = %s, link = %s, listUrl = %s, updated = %s where linkmd5id = %s
            """, (item['title'], item['desc'], item['link'], item['listUrl'], now, linkmd5id))
        else:
            conn.execute("""
                insert into cnblogsinfo(linkmd5id, title, description, link, listUrl, updated)
                values(%s, %s, %s, %s, %s, %s)
            """, (linkmd5id, item['title'], item['desc'], item['link'], item['listUrl'], now))

    def _get_linkmd5id(self, item):
        """MD5 hex digest of the post URL — dedup key for re-crawls."""
        return md5(item['link']).hexdigest()

    def _handle_error(self, failure, item, spider):
        """Log database failures.

        BUG FIX: the original body was ``pass`` (with the log call commented
        out), silently swallowing every DB error.
        """
        import logging  # local import: this module configures no logger of its own
        logging.getLogger(__name__).error(
            'MySQL pipeline error for item %r: %s', item, failure)
  1. 邮件发送代码:

coding:utf-8

'''读取mysql中的数据然后发送到邮箱实现浏览'''

import MySQLdb
import MySQLdb.cursors
import smtplib
from email.header import Header
from email.mime.text import MIMEText
import sys
# NOTE(review): Python-2-only idiom — reload(sys) is normally followed by
# sys.setdefaultencoding('utf-8'); nothing follows here, so it is a no-op.
reload(sys)

# Initial message body (Chinese: "Python articles crawled from segmentfault").
msg_content = u'segmentfault有关Python文章爬去'

# NOTE(review): this immediately overwrites the assignment above — looks like
# two alternatives left in from the blog draft; confirm which value is wanted.
msg_content = 'ok'

def test(content):
    """Send *content* as an HTML e-mail via smtp.163.com.

    :param content: HTML body of the message.
    :return: None (prints a confirmation on success).
    """
    msg = MIMEText(content, 'html', 'utf-8')
    msg['From'] = '18818261892@163.com <18818261892@163.com>'
    msg['Subject'] = Header(u'text', 'utf8').encode()
    msg['To'] = u'飞轮海 <1848406889@qq.com>'
    # SECURITY: account and password are hard-coded in source — move them to
    # configuration or environment variables before publishing.
    server = smtplib.SMTP('smtp.163.com', 25)
    server.login('18818261892@163.com', 'LBQ139196')
    server.sendmail('18818261892@163.com', ['1848406889@qq.com'], msg.as_string())
    print('finished is ok!ooooo')

def db_operate():
    """Read all rows from ``cnblogsinfo`` and e-mail them as an HTML digest.

    Appends one HTML fragment per row to the global ``msg_content`` and hands
    the result to :func:`test` for delivery.
    """
    global msg_content
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='1234',
                               db='cnblogsdb', port=3306, charset="utf8")
        cur = conn.cursor()
        cur.execute('select * from cnblogsinfo')
        rows = cur.fetchall()
        for row in rows:
            # row[1] presumably title, row[4] a URL column — TODO confirm
            # against the actual cnblogsinfo table schema.
            msg_content += ('<html><body><h1> %s </h1><p>send by <a href= %s>Python</a>...</p></body></html>') % (row[1], row[4],)
        if rows:
            # BUG FIX: guard the post-loop use of ``row`` — it is undefined
            # (NameError) when the table is empty.
            msg_content = msg_content + str(row[4]) + 'ok'
        print(msg_content)
        test(msg_content)
        print('finished is ok!ooooo')
        cur.close()
        conn.close()
    except MySQLdb.Error as e:  # 'as' syntax works on both Python 2.6+ and 3
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))

# BUG FIX: the guard must test the dunder ``__name__``; the bare ``name``
# raised NameError as soon as the module ran.
if __name__ == '__main__':
    db_operate()

# Standalone (non-function) variant of db_operate(): fetch every row from
# ``cnblogsinfo`` and mail the result as HTML.  Reconstructed from the garbled
# original — the blog renderer stripped the HTML tags out of the two string
# literals below, so their exact markup is a best-effort restoration.
msg_content = 'ok'

try:
    conn = MySQLdb.connect(host='localhost', user='root', passwd='1234',
                           db='cnblogsdb', port=3306, charset="utf8")
    cur = conn.cursor()
    cur.execute('select * from cnblogsinfo')
    rows = cur.fetchall()
    for row in rows:
        print("%s, %s, %s, %s" % (row[0], row[1], row[2], row[3]))
        print(row[1])
        # row[1] presumably the title, row[4] a URL column — TODO confirm
        # against the cnblogsinfo schema.
        msg_content += ('<html><body><h1> %s </h1><p>send by <a href= %s>Python</a>...</p></body></html>') % (row[1], row[4],)
        msg_content += ('<html><body><h1> segmentfault</h1><p>%s</p><p>send by <a href= %s>Python</a>...</p></body></html>') % (row[1], row[4],)
    if rows:
        # Guarded: ``row`` is undefined after the loop when the table is empty.
        msg_content = msg_content + str(row[4]) + 'ok'
    print(msg_content)

    msg = MIMEText(msg_content, 'html', 'utf-8')
    msg['From'] = '18818261892@163.com <18818261892@163.com>'
    msg['Subject'] = Header(u'text', 'utf8').encode()
    msg['To'] = u'飞轮海 <1848406889@qq.com>'
    # SECURITY: hard-coded credentials — move to configuration before reuse.
    server = smtplib.SMTP('smtp.163.com', 25)
    server.login('18818261892@163.com', 'LBQ139196')
    server.sendmail('18818261892@163.com', ['1848406889@qq.com'], msg.as_string())
    print('finished is ok!ooooo')

    cur.close()
    conn.close()
except MySQLdb.Error as e:  # 'as' syntax works on both Python 2.6+ and 3
    print("Mysql Error %d: %s" % (e.args[0], e.args[1]))

  1. 结果显示:
    图片描述

源码分享github


FIZLIN
514 声望8 粉丝

跟我走吧,天亮就出发


引用和评论

0 条评论