import requests
from lxml import html
from requests.exceptions import RequestException
import time
import queue
import threading
class MyThread(threading.Thread):
    """Thread that runs a zero-argument callable when started.

    Thin wrapper kept for interface compatibility; `threading.Thread`
    with `target=func` would do the same job.
    """

    def __init__(self, func):
        # Use super() instead of the legacy explicit base-class call.
        super().__init__()
        self.func = func  # zero-argument callable executed by run()

    def run(self):
        self.func()
def worker():
    """Consume page numbers from the shared queue `q` and scrape each
    page's article URLs from qiushibaike.com.

    Runs until the queue is empty. Relies on module-level globals:
    `q` (queue.Queue of page numbers), `requests`, `html` (lxml),
    `RequestException`.
    """
    # Hoisted out of the loop: the header never changes per page.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    while True:
        try:
            # get_nowait() avoids the empty()/get() race: another thread
            # could drain the queue between the two calls.
            page = q.get_nowait()  # fetch a task
        except queue.Empty:
            break
        print('成功获取 : 第' + str(page) + '页url列表')
        main_page_url = 'https://www.qiushibaike.com/hot/page/' + str(page)
        url_list = []
        try:
            rep = requests.get(main_page_url, headers=headers)
            time.sleep(1)  # be polite to the server
            if rep.status_code == 200:
                print("第" + str(page) + "页链接成功")
                con = rep.content
                sel = html.fromstring(con)
                urls = sel.xpath('//a[@class="contentHerf"]/@href')
                for url in urls:
                    message_url = 'https://www.qiushibaike.com' + url
                    url_list.append(message_url)
                print(url_list)
        except RequestException:
            print("链接失败")
            # BUG FIX: the original `return None` ended this worker thread
            # entirely after the first failed request; `continue` lets it
            # move on to the next queued page instead.
            continue
        time.sleep(1)
def main():
    """Queue up pages 1-6 and run `threadNum` worker threads to
    scrape them, blocking until all workers finish."""
    # Enqueue the first 6 pages as tasks for the workers.
    for page_no in range(1, 7):
        q.put(page_no)

    # Spawn the worker threads (threadNum of them).
    pool = [MyThread(worker) for _ in range(threadNum)]
    for t in pool:
        t.start()

    # Wait for every worker to drain the queue and exit.
    for t in pool:
        t.join()
# BUG FIX: the original `if name == '__main__':` raises NameError —
# the standard dunder variable is `__name__`.
if __name__ == '__main__':
    q = queue.Queue()  # shared task queue of page numbers
    threadNum = 2      # number of worker threads
    main()
worker 里面每次 return 就只运行前两个线程然后结束。正常来说,应该是运行两个线程之后继续运行两个线程。还是我返回数据的方法不对?望解答。
只是你的数据返回方式不对。
在函数中,一旦执行return就代表函数执行完毕.
在这两个线程,每个线程调用的worker函数,该函数中的while循环在执行return返回数据后,该函数终止,那么循环也就结束.