Python 爬取 SegmentFault 上的博客文章

利用 Scrapy 框架爬取 SegmentFault 上的博客文章，保存到数据库，然后发送到自己的邮箱
图片描述

先显示部分源码：

# coding:utf-8

#!/usr/bin/python

'''
Author fiz
Date:2016-03-30
Segement Blog 内容爬去
'''

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import FormRequest, Request
from scrapy.selector import Selector
from Scrapy_demo.items import *
from scrapy import log
from scrapy.utils.response import get_base_url
import urlparse

class SegeblogSpider(CrawlSpider):
    """Crawl the paginated SegmentFault Python blog listing.

    Starts at page 1 of ``/t/python/blogs``, follows every pagination link
    matched by ``rules`` and hands each listing page to ``parse_item1``.
    """

    name = 'blog'
    allowed_domains = ['segmentfault.com']
    start_urls = ['https://segmentfault.com/t/python/blogs?page=1',]
    # The "?" must be escaped to be matched literally; a raw string keeps the
    # backslash intact and ``\d+`` matches the page number itself.
    rules = [
        Rule(
            sle(allow=(r't/python/blogs\?page=\d+',)),
            follow=True,
            callback='parse_item1',
        ),
    ]

    def parse_item1(self, response):
        """Extract one ``CnblogsItem`` per ``<section>`` of a listing page.

        Each item carries ``title``, ``link`` (absolute post URL),
        ``listUrl`` (base URL of the listing page being parsed) and ``desc``
        (text of the ``div.views`` node).

        :param response: the downloaded listing page.
        :return: list of populated items; sections missing any of the
            expected nodes are skipped instead of aborting the whole page.
        """
        sel = Selector(response)
        base_url = get_base_url(response)

        items = []
        # Every <section> under div.tab-content is one entry of the listing.
        for section in sel.css('div.tab-content').css("section"):
            titles = section.css("a").xpath('text()').extract()
            links = section.css("h2.title").css('a').xpath('@href').extract()
            descs = section.css("div.views ").xpath("text()").extract()
            if not (titles and links and descs):
                # Indexing [0] on an empty result would raise IndexError and
                # discard every item already collected from this page.
                continue

            item = CnblogsItem()
            item['title'] = titles[0]
            # hrefs are site-relative, so prefix the host.
            item['link'] = 'https://segmentfault.com' + links[0]
            item['listUrl'] = base_url
            item['desc'] = descs[0]
            items.append(item)
        return items

添加 Pipeline 保存到数据库

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
#
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy import signals
import json
import codecs
from twisted.enterprise import adbapi
from datetime import datetime
from hashlib import md5
import MySQLdb
import MySQLdb.cursors

'''
保存Json格式
'''

class JsonWithEncodingCnblogsPipeline(object):
    """Write every item to ``Segement.json`` as one UTF-8 JSON object per line."""

    def __init__(self):
        # Opened in the current working directory; truncated on every run.
        self.file = codecs.open('Segement.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as a single JSON line and pass it on unchanged.

        :param item: mapping-like item; converted with ``dict(item)``.
        :param spider: the spider that produced the item (unused).
        :return: the same *item*, so later pipelines still receive it.
        """
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file.

        ``close_spider`` is the hook name Scrapy invokes on item pipelines;
        the original ``spider_closed`` was never connected to a signal, so
        the file was left open.

        :param spider: the spider being closed (unused).
        """
        self.file.close()

    # Keep the old method name working for any code that called it directly.
    spider_closed = close_spider

class MySQLStoreCnblogsPipeline(object):
    """Store items in the ``cnblogsinfo`` MySQL table, keyed by the MD5 of their link."""

    def __init__(self, dbpool):
        """Keep the connection pool used to run the database interactions.

        :param dbpool: object exposing ``runInteraction`` (built in
            ``from_settings`` as a ``twisted.enterprise.adbapi.ConnectionPool``).
        """
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the ``MYSQL_*`` entries of *settings*.

        Reads ``MYSQL_HOST``, ``MYSQL_DBNAME``, ``MYSQL_USER`` and
        ``MYSQL_PASSWD``.

        :param settings: mapping-like settings object.
        :return: a pipeline instance bound to a new connection pool.
        """
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the upsert of *item* and return the resulting deferred.

        :param item: mapping with ``title``, ``desc``, ``link`` and ``listUrl``.
        :param spider: the spider that produced the item.
        :return: a deferred that always fires with *item*, whether or not
            the database write succeeded.
        """
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        # Whatever happened above, hand the item on to the next pipeline.
        d.addBoth(lambda _: item)
        return d

    def _do_upinsert(self, conn, item, spider):
        """Update the row for this link if it exists, otherwise insert it.

        :param conn: cursor-like object providing ``execute`` and ``fetchone``.
        :param item: mapping with ``title``, ``desc``, ``link`` and ``listUrl``.
        :param spider: the spider that produced the item (unused).
        """
        linkmd5id = self._get_linkmd5id(item)
        now = datetime.utcnow().replace(microsecond=0).isoformat(' ')
        conn.execute("""
            select 1 from cnblogsinfo where linkmd5id = %s
    """, (linkmd5id, ))
        ret = conn.fetchone()

        if ret:
            conn.execute("""
            update cnblogsinfo set title = %s, description = %s, link = %s, listUrl = %s, updated = %s where linkmd5id = %s
        """, (item['title'], item['desc'], item['link'], item['listUrl'], now, linkmd5id))
        else:
            conn.execute("""
            insert into cnblogsinfo(linkmd5id, title, description, link, listUrl, updated)
            values(%s, %s, %s, %s, %s, %s)
        """, (linkmd5id, item['title'], item['desc'], item['link'], item['listUrl'], now))

    def _get_linkmd5id(self, item):
        """Return the hex MD5 digest of ``item['link']``.

        The digest is the row key used to avoid storing the same link twice.
        Text links are encoded as UTF-8 first because ``md5`` only accepts
        bytes; byte strings are hashed as they are.

        :param item: mapping with a ``link`` entry.
        :return: 32-character hexadecimal digest.
        """
        link = item['link']
        if not isinstance(link, bytes):
            link = link.encode('utf-8')
        return md5(link).hexdigest()

    def _handle_error(self, failure, item, spider):
        """Log a failed database interaction instead of silently dropping it.

        :param failure: the failure passed by the deferred's errback chain;
            expected to provide ``getTraceback()``.
        :param item: the item whose write failed (unused).
        :param spider: the spider that produced the item (unused).
        """
        # Local import: this module does not import logging at file level.
        import logging
        logging.getLogger(__name__).error(
            "MySQL upsert failed: %s", failure.getTraceback())

邮件发送代码：

# coding:utf-8

'''读取mysql中的数据然后发送到邮箱实现浏览的'''

import MySQLdb
import MySQLdb.cursors
import smtplib
from email.header import Header
from email.mime.text import MIMEText
import sys
reload(sys)

# msg_content = u'segmentfault有关Python文章爬去'

# Initial text of the e-mail body; db_operate() declares this name global and
# appends the generated HTML to it.
msg_content = 'ok'

def test(content):
    """Send *content* as an HTML e-mail through ``smtp.163.com``.

    :param content: HTML text used as the message body.
    :return: None
    """
    # NOTE(review): the account name and password are hard-coded below;
    # they should be moved to configuration and the password rotated.
    msg = MIMEText(content, 'html', 'utf-8')
    msg['From'] = '18818261892@163.com <18818261892@163.com>'
    msg['Subject'] = Header(u'text', 'utf8').encode()
    msg['To'] = u'飞轮海 <1848406889@qq.com>'

    server = smtplib.SMTP('smtp.163.com', 25)
    try:
        server.login('18818261892@163.com', 'LBQ139196')
        server.sendmail('18818261892@163.com', ['1848406889@qq.com'], msg.as_string())
    finally:
        # Always release the connection, even when login or sending fails.
        try:
            server.quit()
        except smtplib.SMTPException:
            # The session is already broken; just drop the socket.
            server.close()
    print('finished is ok!ooooo')

def db_operate():
    """Read every row of ``cnblogsinfo`` and mail the rows as one HTML message.

    Appends one HTML fragment per row to the module-level ``msg_content``,
    then the value of column 4 of the last row (if any) and ``'ok'``, prints
    the result and passes it to ``test``. MySQL errors are printed, not
    raised. The cursor and connection are closed on every path.

    :return: None
    """
    global msg_content
    conn = None
    cur = None
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='1234',
                               db='cnblogsdb', port=3306, charset="utf8")
        cur = conn.cursor()
        cur.execute('select * from cnblogsinfo')
        rows = cur.fetchall()

        # Columns 1 and 4 are presumably the title and a URL -- confirm
        # against the table definition, which is not visible here.
        fragments = [
            '<html><body><h1> %s </h1><p>send by <a href= %s>Python</a>...</p></body></html>' % (row[1], row[4],)
            for row in rows
        ]
        msg_content += ''.join(fragments)
        if rows:
            # The original read the loop variable after the loop, which
            # raised NameError when the table was empty.
            msg_content = msg_content + str(rows[-1][4])
        msg_content = msg_content + 'ok'

        print(msg_content)
        test(msg_content)
        print('finished is ok!ooooo')
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
    finally:
        # Release database resources whether or not the query succeeded.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

# Run the mail job only when executed as a script. The guard must test
# ``__name__``; the bare ``name`` raised NameError.
if __name__ == '__main__':
    db_operate()

# msg_content ='ok'
# try:
# conn=MySQLdb.connect(host='localhost',user='root',passwd='1234',db='cnblogsdb',port=3306,charset = "utf8")
# cur=conn.cursor()
# i = cur.execute('select * from cnblogsinfo')#945条数据mscontet 需要使用945
# rows = cur.fetchall()
# for row in rows:
# print "%s, %s, %s, %s" % (row[0], row[1], row[2], row[3])
# print row[1]
# msg_content +=('<html><body><h1> %s </h1><p>send by <a href= %s>Python</a>...</p></body></html>') %( row[1],row[4],)
# msg_content +=('<html><body><h1> segmentfault</h1><p>send by Python...</p></body></html>') %( row[1],row[4],)
# msg_content = msg_content+str(row[4])+'ok'
# print msg_content
# msg = MIMEText(msg_content, 'html', 'utf-8')
# server = smtplib.SMTP('smtp.163.com', 25)
# server.login('18818261892@163.com', 'LBQ139196')
# msg['From'] = '18818261892@163.com <18818261892@163.com>'
# msg['Subject'] = Header(u'text', 'utf8').encode()
# msg['To'] = u'飞轮海 <1848406889@qq.com>'
# server.sendmail('18818261892@163.com', ['1848406889@qq.com'], msg.as_string())
# print 'finished is ok!ooooo'
# cur.close()
# conn.close()
# except MySQLdb.Error,e:
# print "Mysql Error %d: %s" % (e.args[0], e.args[1])

结果显示：

源码分享github

python爬去segementfault上的博客文章

coding:utf-8

! /usr/bin/python

-- coding: utf-8 --

Define your item pipelines here

Don't forget to add your pipeline to the ITEM_PIPELINES setting

See: http://doc.scrapy.org/en/latest/topics/i...

coding:utf-8

msg_content = u'segmentfault有关Python文章爬去'

msg_content ='ok'

try:

conn=MySQLdb.connect(host='localhost',user='root',passwd='1234',db='cnblogsdb',port=3306,charset = "utf8")

cur=conn.cursor()

i = cur.execute('select * from cnblogsinfo')#945条数据mscontet 需要使用945

rows = cur.fetchall()

for row in rows:

print "%s, %s, %s, %s" % (row[0], row[1], row[2], row[3])

print row[1]

msg_content +=('<html><body><h1> %s </h1>

msg_content +=('<html><body><h1> segmentfault</h1>

msg_content = msg_content+str(row[4])+'ok'

print msg_content

msg = MIMEText(msg_content, 'html', 'utf-8')

server = smtplib.SMTP('smtp.163.com', 25)

server.login('18818261892@163.com', 'LBQ139196')

msg['From'] = '18818261892@163.com <18818261892@163.com>'

msg['Subject'] = Header(u'text', 'utf8').encode()

msg['To'] = u'飞轮海 <1848406889@qq.com>'

server.sendmail('18818261892@163.com', ['1848406889@qq.com'], msg.as_string())

print 'finished is ok!ooooo'

cur.close()

conn.close()

except MySQLdb.Error,e:

print "Mysql Error %d: %s" % (e.args[0], e.args[1])

FIZLIN

引用和评论

go--读取文件的方式

Python爬虫实战：从入门到精通，代码全解析！

虾皮Shopee商品信息采集

douyin_search_comment_tool | 2025自研python软件采集抖音评论区数据

【GUI软件】调用YouTube的API接口，采集关键词搜索结果，并封装成界面工具！

深入研究：淘宝天猫商品详情查询API详解

【爬虫工具】2025微博采集软件，根据搜索关键词批量爬帖子，突破50页限制！