This is a piece of multithreaded practice code, and it can scrape most of the data.
But it never reaches the final "all DONE" line.
Near the end of a run it can hit an item whose image address is empty, and that raises an error.
How can I improve it so that it tolerates these errors and runs to completion?
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : download.py
# @Software: PyCharm
import os, json, sys, getopt
import time
import Queue, threading
from bs4 import BeautifulSoup
import requests
try:
    opts, args = getopt.getopt(sys.argv[1:], 'd:')
except getopt.GetoptError, err:
    print str(err)
    exit()
BASE_DIR = 'xieemanhua'
for k, v in opts:
    if k == '-d':
        BASE_DIR = v
BASE_URL = 'http://m.wujiecao.cn'
SAVE_DIR = 'datas/'+BASE_DIR
THREAD_COUNT = 5
# fetch the HTML of a page; returns False on any request error
def getHtml(url, timeout=20):
    try:
        headers = {
            'Accept-Language': 'zh-cn',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
        }
        r = requests.get(url, headers=headers, timeout=timeout)
        r.encoding = 'utf-8'
        html = r.text
        return html
    except Exception, ex:
        return False
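# NOTE: on a failed request getHtml() returns False, and the callers below
# pass that straight to BeautifulSoup, which then raises on its own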
# scrape the paging list and build the URL of every list page
def getAllPageLists():
    print "=====start get all page lists====== %s" % time.ctime()
    soup = BeautifulSoup(getHtml(BASE_URL+'/'+BASE_DIR), 'html.parser')
    select = soup.find('select', class_='paging-select')
    option = select.find_all('option')
    lists = []
    for o in option:
        lists.append(BASE_URL+'/'+BASE_DIR+'/'+o['value'])
    return lists
# scrape one list page: collect url, title and cover picture of each entry
def getSingePageUrlLists(url):
    print "=====start getSingePageUrlLists("+os.path.basename(url)+") ====== %s" % time.ctime()
    soup = BeautifulSoup(getHtml(url), 'html.parser')
    ul = soup.find("ul", class_="pic")
    all_a = ul.find_all('a')
    datas = []
    for a in all_a:
        img = a.find('img')
        span = a.find('span', class_="bt")
        title = span.contents[0]
        data = {'url': BASE_URL+a['href'], 'title': title, 'pic': BASE_URL+img['lazysrc']}
        datas.append(data)
    return datas
# scrape a detail page and download its image
def getDetailPage(url, title):
    print "=====start getDetailPage("+os.path.basename(url)+") ====== %s" % time.ctime()
    soup = BeautifulSoup(getHtml(url), 'html.parser')
    div = soup.find('div', id="imgString")
    img = div.find('img')
    imgUrl = img['src']
    if imgUrl == '':
        return False
    response = requests.get(imgUrl, stream=True)
    if response.status_code != 200:
        return False
    image = response.content
    dir = SAVE_DIR
    if not os.path.exists(dir):
        os.mkdir(dir)
    baseName = os.path.basename(img['src'])
    fileName = dir+'/'+title+'.'+baseName.split('.')[1]
    try:
        open(fileName, "wb").write(image)
        print "=====write end====== %s" % time.ctime()
    except IOError:
        print("IO Error\n")
    return
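# NOTE: soup.find() returns None when an element is missing, so on a detail
# page with an empty image address the div/img lookups above raise before the
# imgUrl == '' check is ever reached -- this is where the error occurs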
# worker thread: take a list-page URL from pageQue, push every item it finds
# onto detailQue as a JSON string
class getLists(threading.Thread):
    def __init__(self, que, detailQue):
        threading.Thread.__init__(self)
        self.pageQue = que
        self.detailQue = detailQue
    def run(self):
        while True:
            url = self.pageQue.get()
            singePageLists = getSingePageUrlLists(url)
            for sl in singePageLists:
                self.detailQue.put(json.dumps(sl))
            self.pageQue.task_done()
# worker thread: take one item from detailQue and download its image
class getDetailLists(threading.Thread):
    def __init__(self, detailQue):
        threading.Thread.__init__(self)
        self.detailQue = detailQue
    def run(self):
        while True:
            data = self.detailQue.get()
            decodeData = json.loads(data)
            getDetailPage(decodeData['url'], decodeData['title'])
            self.detailQue.task_done()
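# NOTE: an uncaught exception inside either run() kills that worker before
# task_done() is called; the matching join() below then waits forever, which
# is why the final "all DONE" print is never reached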
pageQue = Queue.Queue()
detailQue = Queue.Queue()
if __name__ == '__main__':
    print "====start request====%s" % time.ctime()
    allLists = getAllPageLists()
    for u in allLists:
        pageQue.put(u)
    for i in range(THREAD_COUNT):
        t = getLists(pageQue, detailQue)
        t.setDaemon(True)
        t.start()
    for i in range(THREAD_COUNT):
        t = getDetailLists(detailQue)
        t.setDaemon(True)
        t.start()
    pageQue.join()
    detailQue.join()
    print 'all DONE at:', time.ctime()
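A minimal sketch of one possible fix, assuming the rest of the script stays as above: wrap each worker's loop body in try/except and move task_done() into a finally block, and guard the None/empty cases in getDetailPage. getLists.run() needs the same try/except/finally treatment. The extension lookup also uses the last dot-separated piece instead of the second one, which is a small robustness change of mine, not from the original.

# fault-tolerant detail worker: a failed item is logged and skipped instead
# of killing the thread
class getDetailLists(threading.Thread):
    def __init__(self, detailQue):
        threading.Thread.__init__(self)
        self.detailQue = detailQue
    def run(self):
        while True:
            data = self.detailQue.get()
            try:
                decodeData = json.loads(data)
                getDetailPage(decodeData['url'], decodeData['title'])
            except Exception, ex:
                print "skip item: %s" % ex
            finally:
                # always mark the item done, even after a failure,
                # so detailQue.join() can return
                self.detailQue.task_done()

# getDetailPage with the empty-image cases guarded
def getDetailPage(url, title):
    print "=====start getDetailPage("+os.path.basename(url)+") ====== %s" % time.ctime()
    html = getHtml(url)
    if not html:
        return False
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', id="imgString")
    img = div.find('img') if div is not None else None
    # img.get('src') is None when the attribute is missing and '' when empty
    if img is None or not img.get('src'):
        return False
    imgUrl = img['src']
    response = requests.get(imgUrl, stream=True)
    if response.status_code != 200:
        return False
    if not os.path.exists(SAVE_DIR):
        os.mkdir(SAVE_DIR)
    baseName = os.path.basename(imgUrl)
    fileName = SAVE_DIR+'/'+title+'.'+baseName.split('.')[-1]
    try:
        open(fileName, "wb").write(response.content)
        print "=====write end====== %s" % time.ctime()
    except IOError:
        print "IO Error"
    return True

With the finally in place, a bad item no longer leaves the queue count hanging, so both join() calls return and the last print runs.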