1. Question: In the loadPage function, if the request is built with the headers, link_list comes back empty; if the headers are left off, it works normally. Why?
# coding=utf-8
import urllib
import urllib2
from lxml import etree
def loadPage(url):
    """
    Send a request to the given URL and fetch the server response (a tieba list page).
    """
    print("Downloading list page")
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
    }
    # NOTE: ua_headers is defined but deliberately not passed below; with
    # urllib2.Request(url, headers=ua_headers) the xpath returns an empty list (see question above).
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    content = etree.HTML(html)
    # href of every thread on the list page
    link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
    for link in link_list:
        fulllink = "http://tieba.baidu.com" + link
        loadImage(fulllink)
def loadImage(link):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
    }
    request = urllib2.Request(link, headers=headers)
    html = urllib2.urlopen(request).read()
    content = etree.HTML(html)
    # src of every image in the thread page
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    print(link_list)
    for link in link_list:
        writeImage(link)
def writeImage(link):
    """
    Save the downloaded image to a local file.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
    }
    request = urllib2.Request(link, headers=headers)
    image = urllib2.urlopen(request).read()
    # Use the last 10 characters of the image URL as the file name
    filename = link[-10:]
    print("Downloading " + filename)
    with open(filename, 'wb') as f:
        f.write(image)
def tiebaSpider(url, beginpage, endpage):
    """
    Build the full URL of each list page from beginpage to endpage and hand it to loadPage.
    """
    for page in range(beginpage, endpage + 1):
        pn = (page - 1) * 50  # each tieba list page shows 50 threads
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)
        loadPage(fullurl)
if __name__ == "__main__":
    kw = raw_input("Enter the name of the tieba to crawl: ")
    beginpage = int(raw_input("Enter the start page: "))
    endpage = int(raw_input("Enter the end page: "))
    url = "http://tieba.baidu.com/f?"
    key = urllib.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginpage, endpage)
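
To narrow the problem down, it helps to compare the raw HTML the server returns with and without the User-Agent header, before any xpath is applied. Below is a minimal diagnostic sketch (the placeholder URL and the output file names are illustrative, not part of the script above): it saves both responses to disk and reports whether the "t_con cleafix" markup that the xpath targets actually appears in each one.

# coding=utf-8
import urllib2

url = "http://tieba.baidu.com/f?kw=placeholder&pn=0"  # substitute a real list-page URL here

ua_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
}

# Fetch the same page twice: once without and once with the User-Agent header
html_plain = urllib2.urlopen(urllib2.Request(url)).read()
html_ua = urllib2.urlopen(urllib2.Request(url, headers=ua_headers)).read()

# Save both responses so they can be diffed in an editor
with open("list_no_ua.html", "wb") as f:
    f.write(html_plain)
with open("list_with_ua.html", "wb") as f:
    f.write(html_ua)

# Check whether the element class the xpath relies on is present at all
for name, html in [("without UA", html_plain), ("with UA", html_ua)]:
    print("%s: %d bytes, contains 't_con cleafix': %s"
          % (name, len(html), "t_con cleafix" in html))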