import urllib2
from bs4 import BeautifulSoup
import time

baseurl = 'http://tieba.baidu.com/p/3138733512?see_lz=1&pn='

def filter(tag):
    # Match the <span class="red"> element that holds the thread's total page count.
    return tag.name == "span" and not tag.has_attr('style') and tag.has_attr('class') and tag['class'] == ['red']

class BaiDuCraw:
    def __init__(self, baseurl, title=None, soup=None):
        self.baseurl = baseurl
        self.title = title
        self.soup = soup

    def GetPage(self, pnum):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        totalurl = self.baseurl + str(pnum)
        try:
            req = urllib2.Request(totalurl, headers=headers)
            response = urllib2.urlopen(req)
            content = response.read().decode("utf-8")
            return content
        except urllib2.URLError as e:
            # URLError has no has_attr() method; use hasattr() to check for an HTTP status code.
            if hasattr(e, "code"):
                print e.code
            else:
                print e.reason
            return

    def writetofile(self):
        contentlist = self.soup.find_all(name="div", attrs={"class": "d_post_content j_d_post_content "})
        for content in contentlist:
            # Encode back to UTF-8 before writing; writing unicode with Chinese text raises UnicodeEncodeError.
            f.write(content.get_text().encode("utf-8"))
            f.write("\n")

    def start(self):
        # Fetch page 1 first to learn the total page count, then walk the remaining pages.
        content = self.GetPage(1)
        self.soup = BeautifulSoup(content, "lxml")
        self.writetofile()
        totalpage = self.soup.find_all(filter)
        totalnum = int(totalpage[0].get_text())
        for i in range(2, totalnum + 1):
            content = self.GetPage(i)
            self.soup = BeautifulSoup(content, "lxml")
            self.writetofile()
            time.sleep(2)

f = open("tieba.txt", "w+")
mycrawer = BaiDuCraw(baseurl)
mycrawer.start()
I haven't tested this specifically, but Baidu will definitely rate-limit you; at that point your only option is to find (buy) high-anonymity proxy IPs.
I scraped Taobao data before and ran into the same limits (request frequency / volume); high-anonymity IPs solved it completely, and they're cheap: 10 yuan buys 20,000.
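For reference, routing the requests above through a proxy with urllib2 looks roughly like this. This is only a minimal sketch: the proxy address is a made-up placeholder, so substitute one of your purchased high-anonymity IPs.

import urllib2

# Placeholder proxy address -- replace with one of your own high-anonymity proxies.
proxy = urllib2.ProxyHandler({'http': 'http://123.45.67.89:8080'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)  # every later urllib2.urlopen() call now goes through the proxy

After install_opener(), GetPage() needs no changes; the usual approach is to keep a list of proxies and re-install the opener with a different one whenever a request fails or gets blocked.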