1.抓取界面

def getHtml(url, timeout=10):
    """Fetch *url* and return the response body as text.

    A desktop-browser User-Agent is sent so sites that reject bare
    scripts still answer.  ``timeout`` (seconds, new parameter with a
    backward-compatible default) prevents ``requests.get`` from
    blocking forever on an unresponsive host — without it a single
    dead connection hangs the whole scrape.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text

2.生成多页列表

def changeurl(start_url, page):
    """Build the URL of every page from 1 through *page*.

    Each URL is *start_url* with the page number appended; the result
    is returned as a list in page order.
    """
    return [start_url + str(page_no) for page_no in range(1, page + 1)]

3.抓取百度贴吧小说代码

# -*- coding:utf-8 -*-
import requests
import re
# Python 2-only hack: reload sys so setdefaultencoding() reappears,
# then force UTF-8 so Chinese text can be printed/written without
# UnicodeEncodeError.  (Removed in Python 3 — see the note at the end.)
reload(__import__('sys')).setdefaultencoding('utf-8')


def changeurl(start_url, page):  # args: (base url, number of pages)
    """Return [start_url+"1", ..., start_url+str(page)] in order."""
    page_no = 1
    result = []
    while page_no <= page:
        result.append(start_url + str(page_no))
        page_no += 1
    return result
    
def getHtml(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Sends a browser User-Agent header so the site serves normal pages.
    ``timeout`` (seconds, new parameter with a backward-compatible
    default) keeps ``requests.get`` from hanging forever on a dead
    connection — the original call had no timeout at all.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
# Scrape the first 3 pages of a Baidu Tieba thread (author-only view,
# see_lz=1) and append every floor's text content to wangyuan.txt.
start_url = "http://tieba.baidu.com/p/3826846894?see_lz=1&pn="
# build one URL per page via changeurl()
all_link = changeurl(start_url, 3)
# 'with' guarantees the file is closed even if a request or regex fails
# (the original open()/close() pair leaked the handle on any exception).
with open('wangyuan.txt', 'w+') as f:
    for link in all_link:
        # download the page source for this thread page
        Yuanma = getHtml(link)
        # one match per floor; the literal run of spaces mirrors the
        # page markup exactly, so it must not be reformatted
        neirongs = re.findall('<div id="post_content_.*?" class="d_post_content .*?">            (.*?)</div>', Yuanma, re.S)
        for neirong in neirongs:
            # strip line breaks and one known inline-image fragment
            neirong = neirong.replace('<br>', '')
            neirong = neirong.replace('<img class="BDE_Image" pic_type="0" width="560" height="395" src="http://imgsrc.baidu.com/forum/w%3D580/sign=b03b9d4da5ec08fa260013af69ef3d4d/77a6ef345982b2b7979a003634adcbef77099b19.jpg" pic_ext="jpeg"  ><img class="BDE_Image" pic_type="0" width="560" height="150" src="http://imgsrc.baidu.com/forum/w%3D580/sign=24f197924ac2d562f208d0e5d71390f3/2e3e9300baa1cd11ef424bd2bc12c8fcc1ce2daa.jpg" pic_ext="jpeg"  >','')
            # BUG FIX: the original passed re.S as the 4th POSITIONAL
            # argument of re.sub, which is `count` — so at most 16
            # links were removed and DOTALL was never enabled.
            neirong = re.sub('<a href=.*?</a>', "", neirong, flags=re.S)
            f.write(neirong)

结果展示:
图片描述

4.抓取极客学院内容

4.1爬极客学院课程的块的正则

def geteveryclass(html):
    """Split *html* into a list of <li id=...>...</li> course chunks."""
    course_block = '(<li id=.*?</li>)'
    return re.findall(course_block, html, re.S)

4.2从每一个课程块中获取信息

def getinfo(eachclass):
    """Extract one course's fields from its <li> HTML chunk.

    Returns a dict with the string values 'title', 'content',
    'classtime', 'classlevel' and 'learnnum', taken straight from the
    page markup.
    """
    title_match = re.search('<h2 class="lesson-info-h2"><a href=".*?" target="_blank" jktag=".*?">(.*?)</a>', eachclass, re.S)
    content_match = re.search('''<p style=.*?>(.*?)</p>''', eachclass, re.S)
    # plain <em> tags hold duration then difficulty, in that order;
    # the learner-count <em> carries a class attribute so it is skipped
    plain_ems = re.findall('<em>(.*?)</em>', eachclass, re.S)
    learn_match = re.search('<em class="learn-number".*?>(.*?)</em>', eachclass, re.S)
    return {
        'title': title_match.group(1),
        'content': content_match.group(1),
        'classtime': plain_ems[0],
        'classlevel': plain_ems[1],
        'learnnum': learn_match.group(1),
    }

4.3保存信息快

def saveinfo(classinfo):
    """Write every course dict in *classinfo* to info.txt.

    Each record becomes a title/content/classtime/classlevel/learnnum
    group of lines followed by a separator.  The file is truncated and
    rewritten on every call.
    """
    # 'with' closes the file even on error — the original never called
    # close(), so buffered output could be lost on interpreter exit.
    with open('info.txt', 'w+') as f:
        for each in classinfo:
            # write() takes a single string; writelines() was misused here
            f.write('title:' + each['title'] + '\n')
            f.write('content:' + each['content'] + '\n')
            f.write('classtime:' + each['classtime'] + '\n')
            f.write('classlevel:' + each['classlevel'] + '\n')
            f.write('learnnum:' + each['learnnum'] + '\n\n\n')
            f.write("======================")

4.4主程序代码

# Accumulate every course's info dict, then write them all at once.
classinfo=[]
# base listing URL; changeurl() appends the page number
start_url='http://www.jikexueyuan.com/course/?pageNum='
# build URLs for the first 2 listing pages
all_links=changeurl(start_url,2)
for link in all_links:  # walk every listing page
    print u'正在处理中'+link
    # download the page source — NOTE(review): getsource() is only
    # defined in the full listing (section 4.5), not in this excerpt
    html=getsource(link)
    # split the page into per-course <li> chunks
    everyclass=geteveryclass(html)
    # 'ench' looks like a typo for 'each', but it is used consistently
    for ench in everyclass:
        # parse one course's title/content/time/level/learner count
        neirong=getinfo(ench)

        classinfo.append(neirong)

saveinfo(classinfo)

4.5总代码

# -*- coding:utf-8 -*-
import requests
import re
# Python 2-only hack: reload sys to restore setdefaultencoding(),
# then force UTF-8 so Chinese output does not raise UnicodeEncodeError.
reload(__import__('sys')).setdefaultencoding('utf-8')
# fetch the raw page source (comment restored — original was mojibake GBK-encoded Chinese)
def getsource(url, timeout=10):
    """Download *url* and return the response body as text.

    ``timeout`` (seconds, new parameter with a backward-compatible
    default) stops ``requests.get`` from blocking forever on an
    unresponsive host — the original call had no timeout.
    """
    html = requests.get(url, timeout=timeout)
    return html.text
# build the list of per-page URLs (comment restored — original was mojibake)
def changeurl(start_url, page):
    """Make one URL per page by suffixing the page number onto
    *start_url*; pages run from 1 through *page* inclusive."""
    return [start_url + str(i) for i in range(1, page + 1)]
# get each course's info block (comment restored — original was mojibake)
def geteveryclass(html):
    """Return every <li id=...>...</li> course block found in *html*,
    in document order."""
    return re.findall('(<li id=.*?</li>)', html, re.S)
def getinfo(eachclass):
    """Extract one course's fields from its <li> HTML chunk.

    Returns a dict with string values for 'title', 'content',
    'classtime', 'classlevel' and 'learnnum'.  Raises AttributeError
    or IndexError if the site's markup changes and a pattern no
    longer matches.

    The original carried a large commented-out draft of the same
    logic (flagged "#有问题" / "has a problem"); that dead code has
    been removed.
    """
    info = {}
    info['title'] = re.search('<h2 class="lesson-info-h2"><a href=".*?" target="_blank" jktag=".*?">(.*?)</a>', eachclass, re.S).group(1)
    info['content'] = re.search('''<p style=.*?>(.*?)</p>''', eachclass, re.S).group(1)
    # plain <em> tags hold duration then difficulty, in that order;
    # the learner-count <em> has a class attribute so it is not matched
    timeandlevel = re.findall('<em>(.*?)</em>', eachclass, re.S)
    info['classtime'] = timeandlevel[0]
    info['classlevel'] = timeandlevel[1]
    info['learnnum'] = re.search('<em class="learn-number".*?>(.*?)</em>', eachclass, re.S).group(1)
    return info
def saveinfo(classinfo):
    """Write every course dict in *classinfo* to info.txt.

    Each record is written as title/content/classtime/classlevel/
    learnnum lines followed by a separator; the file is truncated and
    rewritten each call.
    """
    # 'with' guarantees the handle is closed — the original opened the
    # file and never closed it, risking lost buffered output.
    with open('info.txt', 'w+') as f:
        for each in classinfo:
            # write() is the right call for single strings (writelines
            # expects an iterable of lines)
            f.write('title:' + each['title'] + '\n')
            f.write('content:' + each['content'] + '\n')
            f.write('classtime:' + each['classtime'] + '\n')
            f.write('classlevel:' + each['classlevel'] + '\n')
            f.write('learnnum:' + each['learnnum'] + '\n\n\n')
            f.write("======================")
# Accumulate every course's info dict here, then save them in one go.
classinfo=[]
# base listing URL; changeurl() appends the page number
start_url='http://www.jikexueyuan.com/course/?pageNum='
# build URLs for the first 2 listing pages
all_links=changeurl(start_url,2)
for link in all_links:  # walk every listing page
    print u'正在处理中'+link
    # download the page source with getsource() defined above
    html=getsource(link)
    # split the page into per-course <li> chunks
    everyclass=geteveryclass(html)
    # 'ench' looks like a typo for 'each', but it is used consistently
    for ench in everyclass:
        # parse one course's title/content/time/level/learner count
        neirong=getinfo(ench)

        classinfo.append(neirong)

saveinfo(classinfo)

结果展示:
图片描述

后续备注:在 Python 3.4 及以上版本中,`sys.setdefaultencoding` 已被移除,请改用下面的写法:

# -*- coding:utf-8 -*-
import requests
import re
import importlib
import sys
importlib.reload(sys)

即可处理汉语


叫我瞄大人
467 声望81 粉丝

喜欢追星的非科班的编程爱好者