1. Fetching the page
def getHtml(url):
    # build request headers (pretend to be a desktop Chrome browser)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
    # fetch the page with requests
    gethtml = requests.get(url, headers=headers)
    return gethtml.text
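As a quick usage sketch (the thread URL is the one used in section 3; any page URL works):
page_source = getHtml('http://tieba.baidu.com/p/3826846894?see_lz=1&pn=1')
print(len(page_source))  # rough sanity check that some HTML actually came back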
2. Generating the list of page URLs
def changeurl(start_url, page):  # arguments: starting url, number of pages
    # build one URL per page by appending the page number to start_url
    urls = []
    for i in range(1, page + 1):
        url = start_url + str(i)
        urls.append(url)
    return urls
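For example, with the Tieba thread used below and 3 pages, the call produces three page links (illustrative output shown in the comments):
urls = changeurl('http://tieba.baidu.com/p/3826846894?see_lz=1&pn=', 3)
# urls == ['http://tieba.baidu.com/p/3826846894?see_lz=1&pn=1',
#          'http://tieba.baidu.com/p/3826846894?see_lz=1&pn=2',
#          'http://tieba.baidu.com/p/3826846894?see_lz=1&pn=3']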
3. Full code: scraping a novel from a Baidu Tieba thread
# -*- coding:utf-8 -*-
import requests
import re

# make printed/written Chinese text use UTF-8 (Python 2 setdefaultencoding hack)
reload(__import__('sys')).setdefaultencoding('utf-8')

def changeurl(start_url, page):  # arguments: starting url, number of pages
    urls = []
    for i in range(1, page + 1):
        url = start_url + str(i)
        urls.append(url)
    return urls

def getHtml(url):
    # build request headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
    # fetch the page with requests
    gethtml = requests.get(url, headers=headers)
    return gethtml.text

# open wangyuan.txt for writing ('w+')
f = open('wangyuan.txt', 'w+')
# starting url of the thread
start_url = "http://tieba.baidu.com/p/3826846894?see_lz=1&pn="
# build all page urls with changeurl()
all_link = changeurl(start_url, 3)
# loop over every page url
for link in all_link:
    # fetch the page source with getHtml()
    Yuanma = getHtml(link)
    # grab the content of every floor (post) with a regex
    neirongs = re.findall('<div id="post_content_.*?" class="d_post_content .*?"> (.*?)</div>', Yuanma, re.S)
    # loop over the content of each floor
    for neirong in neirongs:
        # clean up the captured content
        neirong = neirong.replace('<br>', '')
        neirong = neirong.replace('<img class="BDE_Image" pic_type="0" width="560" height="395" src="http://imgsrc.baidu.com/forum/w%3D580/sign=b03b9d4da5ec08fa260013af69ef3d4d/77a6ef345982b2b7979a003634adcbef77099b19.jpg" pic_ext="jpeg" ><img class="BDE_Image" pic_type="0" width="560" height="150" src="http://imgsrc.baidu.com/forum/w%3D580/sign=24f197924ac2d562f208d0e5d71390f3/2e3e9300baa1cd11ef424bd2bc12c8fcc1ce2daa.jpg" pic_ext="jpeg" >', '')
        neirong = re.sub('<a href=.*?</a>', "", neirong, flags=re.S)
        f.write(neirong)
# close the file
f.close()
Result:
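The hard-coded <img ...> replacement above only removes the two images that appear in this particular thread. A more general cleanup could strip every remaining tag instead; here is a minimal sketch (the helper name clean_content is mine, not from the original code):
import re

def clean_content(raw):
    # drop <br> line breaks, then strip any remaining HTML tag
    text = raw.replace('<br>', '\n')
    text = re.sub(r'<[^>]+>', '', text)
    return text

# usage inside the loop above: f.write(clean_content(neirong))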
4. Scraping Jikexueyuan course listings
4.1 Regex for extracting each course block
def geteveryclass(html):
    # each course sits in its own <li id=...>...</li> block
    everyclass = re.findall('(<li id=.*?</li>)', html, re.S)
    return everyclass
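Assuming the course-list page source has already been fetched (for example with getHtml from section 1), usage looks like this (sketch; the live page layout may differ from when this was written):
html = getHtml('http://www.jikexueyuan.com/course/?pageNum=1')
blocks = geteveryclass(html)
print(len(blocks))  # number of <li id=...> course blocks found on this page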
4.2 Extracting the info from each course block
def getinfo(eachclass):
    info = {}  # dictionary for one course
    info['title'] = re.search('<h2 class="lesson-info-h2"><a href=".*?" target="_blank" jktag=".*?">(.*?)</a>', eachclass, re.S).group(1)
    info['content'] = re.search('<p style=.*?>(.*?)</p>', eachclass, re.S).group(1)
    timeandlevel = re.findall('<em>(.*?)</em>', eachclass, re.S)
    info['classtime'] = timeandlevel[0]
    info['classlevel'] = timeandlevel[1]
    info['learnnum'] = re.search('<em class="learn-number".*?>(.*?)</em>', eachclass, re.S).group(1)
    return info
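One caveat: re.search returns None when its pattern does not match, so .group(1) raises AttributeError for any course block whose markup deviates. A defensive sketch (the helper name search_or_default is mine, not from the original code):
import re

def search_or_default(pattern, text, default=''):
    # return the first capture group, or default when the pattern does not match
    match = re.search(pattern, text, re.S)
    return match.group(1) if match else default

# e.g. a guarded title lookup inside getinfo:
# info['title'] = search_or_default('<h2 class="lesson-info-h2"><a href=".*?" target="_blank" jktag=".*?">(.*?)</a>', eachclass)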
4.3 Saving the info blocks
def saveinfo(classinfo):
    f = open('info.txt', 'w+')
    for each in classinfo:
        f.writelines('title:' + each['title'] + '\n')
        f.writelines('content:' + each['content'] + '\n')
        f.writelines('classtime:' + each['classtime'] + '\n')
        f.writelines('classlevel:' + each['classlevel'] + '\n')
        f.writelines('learnnum:' + each['learnnum'] + '\n\n\n')
    f.writelines("======================")
    f.close()
4.4 Main program
classinfo = []
# starting url
start_url = 'http://www.jikexueyuan.com/course/?pageNum='
# build all page urls with changeurl()
all_links = changeurl(start_url, 2)
for link in all_links:  # loop over all page urls
    print u'processing ' + link
    # fetch the page source
    html = getsource(link)
    # get the list of course blocks from the source
    everyclass = geteveryclass(html)
    # loop over the course blocks, each holding one course
    for each in everyclass:
        # extract the info (title, content, ...) for this course
        neirong = getinfo(each)
        classinfo.append(neirong)
saveinfo(classinfo)
4.5 Full code
# -*- coding:utf-8 -*-
import requests
import re

# make printed Chinese text use UTF-8 (Python 2 setdefaultencoding hack)
reload(__import__('sys')).setdefaultencoding('utf-8')

# fetch the page source
def getsource(url):
    html = requests.get(url)
    return html.text

# build the urls of the different pages
def changeurl(start_url, page):
    urls = []
    for i in range(1, page + 1):
        url = start_url + str(i)
        urls.append(url)
    return urls

# get every course block from the page source
def geteveryclass(html):
    everyclass = re.findall('(<li id=.*?</li>)', html, re.S)
    return everyclass

# extract the info from one course block (note: has problems)
def getinfo(eachclass):
    info = {}  # dictionary for one course
    info['title'] = re.search('<h2 class="lesson-info-h2"><a href=".*?" target="_blank" jktag=".*?">(.*?)</a>', eachclass, re.S).group(1)
    info['content'] = re.search('<p style=.*?>(.*?)</p>', eachclass, re.S).group(1)
    timeandlevel = re.findall('<em>(.*?)</em>', eachclass, re.S)
    info['classtime'] = timeandlevel[0]
    info['classlevel'] = timeandlevel[1]
    info['learnnum'] = re.search('<em class="learn-number".*?>(.*?)</em>', eachclass, re.S).group(1)
    '''
    disabled earlier draft:
    content = re.search(<p style=.*?>(.*?)</p>,eachclass,re.S)
    timeandlevel = re.findall('<em>(.*?)</em>',eachclass,re.S)
    info['classtime'] = timeandlevel[0]
    info['classlevel'] = timeandlevel[1]
    info['learnnum'] = re.search('"learn-number">(.*?)</em>',eachclass,re.S).group(1)
    '''
    return info

def saveinfo(classinfo):
    f = open('info.txt', 'w+')
    for each in classinfo:
        f.writelines('title:' + each['title'] + '\n')
        f.writelines('content:' + each['content'] + '\n')
        f.writelines('classtime:' + each['classtime'] + '\n')
        f.writelines('classlevel:' + each['classlevel'] + '\n')
        f.writelines('learnnum:' + each['learnnum'] + '\n\n\n')
    f.writelines("======================")
    f.close()

classinfo = []
# starting url
start_url = 'http://www.jikexueyuan.com/course/?pageNum='
# build all page urls with changeurl()
all_links = changeurl(start_url, 2)
for link in all_links:  # loop over all page urls
    print u'processing ' + link
    # fetch the page source
    html = getsource(link)
    # get the list of course blocks from the source
    everyclass = geteveryclass(html)
    # loop over the course blocks
    for each in everyclass:
        # extract the info (title, content, ...) for this course
        neirong = getinfo(each)
        classinfo.append(neirong)
saveinfo(classinfo)
Result:
Follow-up note: Python 3.4+
In Python 3 the built-in reload has moved to importlib.reload, and sys.setdefaultencoding no longer exists, so calling importlib.reload(sys) on its own does not change any encoding. It is also unnecessary: Python 3 strings are Unicode by default, so Chinese text is handled without any hack; only imports requests and re are needed, plus an explicit encoding when opening the output file.
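A minimal Python 3 sketch of the fetch-and-save step (same thread URL and file name as in section 3); the only real change is opening the output file with an explicit UTF-8 encoding:
# -*- coding:utf-8 -*-
# Python 3: no sys/reload hack needed, strings are Unicode by default
import requests

url = 'http://tieba.baidu.com/p/3826846894?see_lz=1&pn=1'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
html = requests.get(url, headers=headers).text

with open('wangyuan.txt', 'w', encoding='utf-8') as f:
    f.write(html)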