python爬虫提示list index out of range

代码:

program: csdn博文爬虫
function: 实现对我的csdn主页所有博文的日期、主题、访问量、评论个数信息爬取
version: python 3.5.1
time: 2016/05/29
author: yr

import urllib.request,re,time,random,gzip

定义保存文件函数

def saveFile(data,i):

path = "E:\\projects\\Spider\\05_csdn\\papers\\paper_"+str(i+1)+".txt"
file = open(path,'wb')
page = '当前页:'+str(i+1)+'\n'
file.write(page.encode('gbk'))
#将博文信息写入文件(以utf-8保存的文件声明为gbk)
for d in data:
    d = str(d)+'\n'
    file.write(d.encode('gbk'))
file.close()

解压缩数据

def ungzip(data):

try:
    #print("正在解压缩...")
    data = gzip.decompress(data)
    #print("解压完毕...")
except:
    print("未经压缩,无需解压...")
return data

CSDN爬虫类

class CSDNSpider:

def __init__(self,pageIdx=1,url="http://blog.csdn.net/fly_yr/article/list/1"):
    #默认当前页
    self.pageIdx = pageIdx
    self.url = url[0:url.rfind('/') + 1] + str(pageIdx)
    self.headers = {
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Host": "blog.csdn.net"
    }

#求总页数
def getPages(self):
    req = urllib.request.Request(url=self.url, headers=self.headers)
    res = urllib.request.urlopen(req)

    # 从我的csdn博客主页抓取的内容是压缩后的内容,先解压缩
    data = res.read()
    data = ungzip(data)
    data = data.decode('utf-8')

    pages = r'<div.*?pagelist">.*?<span>.*?共(.*?)页</span>'
    #link = r'<div.*?pagelist">.*?<a.*?href="(.*?)".*?</a>'
    # 计算我的博文总页数
    pattern = re.compile(pages, re.DOTALL)
    pagesNum = re.findall(pattern, data)
    return pagesNum

#设置要抓取的博文页面
def setPage(self,idx):
    self.url = self.url[0:self.url.rfind('/')+1]+str(idx)

#读取博文信息
def readData(self):
    ret=[]
    str = r'<div.*?article_item">.*?<span class="(.*?)"></span>.*?link_title"><a href="(.*?)">(.*?)</a>.*?' + \
          r'<span class="link_postdate">(.*?)</span>.*?</a>(.∗?)(.∗?)</span>.*?' + \
          r'</a>.*?(.∗?)(.∗?)</span>' 
    req = urllib.request.Request(url=self.url, headers=self.headers)
    res = urllib.request.urlopen(req)

    # 从我的csdn博客主页抓取的内容是压缩后的内容,先解压缩
    data = res.read()
    data = ungzip(data)
    data = data.decode('utf-8')
    pattern = re.compile(str,re.DOTALL)
    items = re.findall(pattern,data)
    for item in items:  
        if item[0] == "ico ico_type_Original":  
            s = '原'  
        else:  
            s = '转'  
        ret.append('\r\n日期:' + item[3] + '\r\t' + s  
                   + '\r\n标题:' + item[2].strip()  
                   + '\r\n链接:http://blog.csdn.net' + item[1]  
                   + '\r\n阅读:' + item[4] + '\r\t评论:' + item[5] + '\r\n')  
    return ret  

定义爬虫对象

cs = CSDNSpider()

求取

pagesNum = int(cs.getPages())
print("博文总页数: ",pagesNum)

for idx in range(pagesNum):

cs.setPage(idx)
print("当前页:",idx+1)
#读取当前页的所有博文,结果为list类型
papers = cs.readData()
saveFile(papers,idx)

报错:
Traceback (most recent call last):
File "I:/python_work/pachong/request www.py", line 99, in <module>

pagesNum = int(cs.getPages())

File "I:/python_work/pachong/request www.py", line 63, in getPages

pagesNum = re.findall(pattern, data)[0]

IndexError: list index out of range

小白求解

阅读 9.5k
2 个回答

pagesNum 看下这个值
或者打印下 re.findall(pattern, data) 这个应该就明白啦
一般的debug 方法 就能解决你的问题

因为你的代码不是连接完整的,推测你63行前后应该有对list遍历操作,是不是对list有删除的操作?如果是的话
看一下这个https://segmentfault.com/q/10...

撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题