python的lxml.etree和headers在一起碰到的奇怪问题

新手上路,请多包涵

1. 问题:loadPage 函数里的 request 如果带上 headers(User-Agent),那么 link_list 就会为空;不带 headers 则正常,为什么?

# coding=utf-8

import urllib
import urllib2
from lxml import etree

def loadPage(url):
    """Download one list page and crawl every thread linked from it.

    Sends a plain ``urllib2`` request for ``url`` (no custom headers),
    parses the body with ``lxml.etree.HTML``, extracts the thread links
    with an XPath query and calls ``loadImage`` on each absolute link.

    Args:
        url: Full URL of the list page to download.
    """
    print("正在下载")
    # The request is sent without custom headers, exactly as before; the
    # unused ``ua_headers`` dict was dropped.
    # NOTE(review): the author reports that sending a browser User-Agent
    # makes the XPath below return an empty list -- presumably the site then
    # serves the thread list wrapped in HTML comments; confirm before adding
    # headers here.
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()

    content = etree.HTML(html)
    if content is None:
        # The parser may hand back None when the payload yields no tree;
        # there is nothing to crawl in that case.
        return

    link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
    for link in link_list:
        # The extracted hrefs are site-relative, so prefix the host.
        fulllink = "http://tieba.baidu.com" + link
        loadImage(fulllink)

def loadImage(link):
    """Fetch one page and hand every matching image URL to ``writeImage``.

    The page is requested with a browser User-Agent, parsed with
    ``lxml.etree.HTML``, and the ``src`` of each ``img`` element whose class
    is ``BDE_Image`` is printed and then downloaded.

    Args:
        link: Absolute URL of the page to scan for images.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
    }
    page_request = urllib2.Request(link, headers=browser_headers)
    page_source = urllib2.urlopen(page_request).read()
    tree = etree.HTML(page_source)
    image_urls = tree.xpath('//img[@class="BDE_Image"]/@src')
    print(image_urls)
    for image_url in image_urls:
        writeImage(image_url)


def writeImage(link):
    """Download the image at ``link`` and write it to a local file.

    The file name is the last ten characters of the URL; the raw response
    bytes are written to it in binary mode.

    Args:
        link: Absolute URL of the image to download.
    """
    request = urllib2.Request(
        link,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        },
    )
    payload = urllib2.urlopen(request).read()
    target_name = link[-10:]
    print("正在下载" + target_name)
    with open(target_name, 'wb') as out_file:
        out_file.write(payload)

def tiebaSpider(url, bginpage, endpage):
    """Crawl list pages ``bginpage`` through ``endpage`` (inclusive).

    For every page number a ``pn`` query parameter is appended to ``url``
    and the resulting URL is printed and passed to ``loadPage``.

    Args:
        url: Base list URL; ``&pn=<offset>`` is appended to it, so it is
            expected to already contain a query string.
        bginpage: Number of the first page to crawl (page 1 maps to pn=0).
        endpage: Number of the last page to crawl, inclusive.
    """
    # Iterate from the ``bginpage`` argument. The body previously read a
    # name ``beginpage`` that is not a parameter, so it only worked when a
    # module-level variable of that name happened to exist.
    for page in range(bginpage, endpage + 1):
        # Each page advances the pn offset by 50.
        pn = (page - 1) * 50
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)
        loadPage(fullurl)

if __name__ == "__main__":
    # Script entry point: ask for the forum name and page range, build the
    # list URL and start the crawl.
    kw = raw_input("请输入需要爬去的贴吧名:")
    # Python 2's input() evaluates whatever is typed as an expression;
    # read the text with raw_input() and convert it to int explicitly.
    beginpage = int(raw_input("请输入起始页:"))
    endpage = int(raw_input("请输入结束页:"))
    url = "http://tieba.baidu.com/f?"
    # urlencode percent-escapes the forum name for use in the query string.
    key = urllib.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginpage, endpage)


阅读 1.2k
撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题