新手学习python,网上copy了一段爬虫,求助攻!
环境:win8、sublime text3
原文:http://blog.csdn.net/omuyejingfeng1/article/details/24182313
# -*- coding: utf-8 -*-
import locale
import urllib.request as request
import urllib.parse as parse
import string
import re
import os
import urllib.error as error
print(u"""
+++++++++++++++++++++++
学校:超神学院
专业:德玛班
姓名:德玛之力
version: python3.2
+++++++++++++++++=++++
""")
def baidu_tieba(url, begin_page, end_page):
count = 1
for i in range(begin_page, end_page + 1):
sName = 'f:/test/'+str(i).zfill(5)+'.html'
print (locale.getdefaultlocale())
print (u'正在下载第'+str(i)+u'个页面, 并保存为'+sName)
m = request.urlopen(url+str(i)).read()
#创建目录保存每个网页上的图片
dirpath = 'f:/test/'
dirname = str(i)
new_path = os.path.join(dirpath, dirname)
if not os.path.isdir(new_path):
os.makedirs(new_path)
page_data = m.decode('GBK')
page_image = re.compile('<img src=\"(.+?)\"')
for image in page_image.findall(page_data):
pattern = re.compile(r'^http://.*.png$')
if pattern.match(image):
try:
image_data = request.urlopen(image).read()
image_path = dirpath + dirname +'/'+str(count)+'.png'
count += 1
print(image_path)
with open(image_path, 'wb') as image_file:
image_file.write(image_data)
image_file.close()
except error.URLError as e:
print('Download failed')
with open(sName,'wb') as file:
file.write(m)
file.close()
if __name__ == "__main__":
url = "http://tieba.baidu.com/p/"
begin_page = 1
end_page = 3
baidu_tieba(url, begin_page, end_page)
输出乱码:
+++++++++++++++++++++++
ѧУ■■■■■ѧԺ
רҵ■■■■■
■■■■■■■■■■
version: python3.2
+++++++++++++++++=++++
('zh_CN', 'cp936')
■■■■■■■■ҳ■, ■■■■■■■f:/test/00001.html
Traceback (most recent call last):
File "E:\python\test_3.py", line 52, in <module>
baidu_tieba(url, begin_page, end_page)
File "E:\python\test_3.py", line 30, in baidu_tieba
page_data = m.decode('GBK')
UnicodeDecodeError: 'gbk' codec can't decode byte 0xae in position 3697: illegal multibyte sequence
>>>
编码问题,看看原网页是什么编码