I am scraping PubMed pages. Since there are far too many pages to fetch, I split the work across 4 threads. The relevant code is as follows:
import re
import socket
import threading
import time
import urllib.request

from bs4 import BeautifulSoup, NavigableString

def getContent(stuff):
    text = ''
    if stuff == None:
        return None
    else:
        cool = stuff.descendants
        for parse in cool:
            if isinstance(parse, NavigableString):
                text += parse
            elif parse.name == 'p':
                text += '\n'
            elif re.match(r"h[0-9]+", parse.name):
                text += '\n'
            elif "li" == parse.name:
                text += '\n\t'
        return text
def writeToFile(dirName, fileName, content):
    with open(dirName + fileName, "a") as code:
        code.write(content)
def pageScrapy(dirName, baseUrl, pubmedId):
    url = baseUrl + pubmedId
    webpage = None
    flag = 1
    while 1 == flag:
        pageText = ""
        try:
            webpage = urllib.request.urlopen(url)
            pageText = webpage.read()
        except socket.timeout as e:
            print("Timeout, waiting 3 secs and trying again.")
            time.sleep(3)
            continue
        except Exception as e:
            if hasattr(e, 'reason'):
                print('Reason: ' + str(e.reason))
            elif hasattr(e, 'code'):
                print('Error code: ' + str(e.code))
            print("Waiting 5 secs to continue.")
            time.sleep(5)
            continue
        else:
            flag = 0
            webpage.close()
    soup = BeautifulSoup(pageText)
    rprt_abstract = soup.find('div', {'class': "rprt abstract"})
    TagElements = ['abstr']
    if rprt_abstract is not None:
        writeToFile(dirName, pubmedId, 'title\n' + getContent(rprt_abstract.find("h1")))
        for tagElem in TagElements:
            text = getContent(rprt_abstract.find('div', {'class': tagElem}))
            if text != None:
                writeToFile(dirName, pubmedId, text)
def task1(list, childDir):
    print("Task 1 executed.")
    baseUrl = "http://www.ncbi.nlm.nih.gov/pubmed/"
    basepath = "/home/yang/Documents/PubMedSpider/data/"
    #f = "trumpetcreeper_flower"
    f = childDir
    dirName = basepath + f + "/"
    #for pubmedId in list_child1:
    for pubmedId in list:
        pageScrapy(dirName, baseUrl, pubmedId)
    print("Task 1 has done")

def task2(list, childDir):  # same as above; task2, task3 and task4 are all identical to task1
print("多线程:")
starttime=time.time(); #记录开始时间
list=[27997051,27995952,27990492,27956742,,27931265]#当然不止这么一点儿
everyDownload=int(len(list)/4)
print(everyDownload)
list_child1=list[0: everyDownload]
list_child2=list[everyDownload: everyDownload*2]
list_child3=list[everyDownload*2: everyDownload*3]
list_child4=list[everyDownload*3: len(list)]
threads = [] #创建一个线程列表,用于存放需要执行的子线程
t1 = threading.Thread(target=task1,args=(list_child1,childDir,)) #创建第一个子线程,子线程的任务是调用task1函数,注意函数名后不能有()
threads.append(t1)#将这个子线程添加到线程列表中
t2 = threading.Thread(target=task2,args=(list_child2,childDir,))#创建第二个子线程
threads.append(t2)#将这个子线程添加到线程列表中
t3 = threading.Thread(target=task3,args=(list_child3,childDir,)) #创建第一个子线程,子线程的任务是调用task1函数,注意函数名后不能有()
threads.append(t3)#将这个子线程添加到线程列表中
t4 = threading.Thread(target=task4,args=(list_child4,childDir,)) #创建第一个子线程,子线程的任务是调用task1函数,注意函数名后不能有()
threads.append(t4)#将这个子线程添加到线程列表中
for t in threads: #遍历线程列表
t.setDaemon(True) #将线程声明为守护线程,必须在start() 方法调用之前设置,如果不设置为守护线程程序会被无限挂起
t.start() #启动子线程
t.join()
endtime=time.time();#记录程序结束时间
totaltime=endtime-starttime;#计算程序执行耗时
print ("耗时:{0:.5f}秒" .format(totaltime)); #格式输出耗时
print('---------------------------')
After it runs, the output is:
Multithreading:
already download's number is:0
need to download is:903
225
Task 1 executed.
Task 2 executed.
Task 3 executed.
Task 4 executed.
/usr/local/lib/python3.4/dist-packages/bs4/__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
The code that caused this warning is on line 888 of the file /usr/lib/python3.4/threading.py. To get rid of this warning, change code that looks like this:
BeautifulSoup([your markup])
to this:
BeautifulSoup([your markup], "lxml")
markup_type=markup_type))
Task 3 has done
Task 4 has done
Elapsed: 720.36273 s
---------------------------
There is no message saying thread 1 has finished, which is strange! And when I check the folder, there aren't 903 files at all, only 897 were downloaded. The threads hadn't finished, yet the whole program ended anyway. I'm a Python beginner, please help!
You only join the last thread. You need a second for loop that joins the threads one by one.
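Roughly like this (a minimal sketch reusing your threads list):

for t in threads:
    t.daemon = True
    t.start()          # start every worker first
for t in threads:
    t.join()           # then wait for every worker, not just the last one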
Even simpler: use a thread pool from concurrent.futures.
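For example, a sketch along these lines (assuming pageScrapy, dirName, baseUrl and the id list are the ones from your code); no manual splitting into four chunks is needed:

from concurrent.futures import ThreadPoolExecutor

# submit one pageScrapy call per PubMed id to a pool of 4 worker threads;
# leaving the with-block waits until all submitted tasks have finished
with ThreadPoolExecutor(max_workers=4) as pool:
    for pubmedId in list:
        pool.submit(pageScrapy, dirName, baseUrl, pubmedId)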
PS: t.setDaemon(True) is the old-style spelling; nowadays you should write t.daemon = True.
PPS: Don't write while 1 == flag, it reads oddly; Python isn't JavaScript. Your flag should be True or False, and then plain while flag is best. Although flag is a rather vague name anyway...
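That is, roughly:

t.daemon = True              # instead of t.setDaemon(True)

flag = True
while flag:                  # instead of: while 1 == flag
    # ... try the download ...
    flag = False             # clear the flag once the page was fetched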