pythpn多线程爬虫queue队列无法返回数据

import requests
from lxml import html
from requests.exceptions import RequestException
import time
import queue
import threading

class MyThread(threading.Thread):

def __init__(self, func):
    threading.Thread.__init__(self)
    self.func = func

def run(self):
    self.func()

def worker():

while not q.empty():
    page = q.get()  # 获得任务
    print('成功获取 : 第' + str(page) + '页url列表')
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    main_page_url = 'https://www.qiushibaike.com/hot/page/' + str(page)
    url_list = []
    try:
        rep = requests.get(main_page_url, headers = headers)
        time.sleep(1)
        if rep.status_code == 200:
            print("第" +str(page) + "页链接成功")
            con = rep.content
            sel = html.fromstring(con)
            urls = sel.xpath('//a[@class="contentHerf"]/@href')
            for url in urls:                    
                message_url = 'https://www.qiushibaike.com' + url
                url_list.append(message_url)
            print(url_list)
            **#return url_list**
                            
    except RequestException:
        print("链接失败")
        return None
    time.sleep(1)

def main():

threads = []
#all_url = []
#url_list = worker()
for page in range(1, 7):    #爬前6页
    q.put(page)
for i in range(threadNum): #开启2个线程
    thread = MyThread(worker)
    thread.start()
    #all_url.append(url_list)
    threads.append(thread)
for thread in threads:
    thread.join()   #运行2个线程后再运行2个线程

if name == '__main__':

q = queue.Queue()
threadNum = 2   #线程数量 
main()

worker里面每次return就只运行前两个线程然后结束 正常来说 是运行两个线程之后继续运行两个线程。还是我返回数据的方法不对 望解答

阅读 1.9k
1 个回答

只是你的数据返回方式不对而且.
在函数中,一旦执行return就代表函数执行完毕.
在这两个线程,每个线程调用的worker函数,该函数中的while循环在执行return返回数据后,该函数终止,那么循环也就结束.

撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题