Crawler output file is empty, asking for help

import json
import requests
import csv
from multiprocessing.dummy import Pool as ThreadPool
import time
import random
from lxml import etree

def spider(page):
    # 'page' here is actually a search keyword (one shop name)
    url = 'http://waimai.meituan.com/search/wx4g19983su8/rt?keyword=' + str(page)

    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate, sdch',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'Cache-Control':'max-age=0',
        'Connection':'keep-alive',
        'Cookie':'w_uuid=Lk80hpGbK2WdGpo3knW8qZw2M6dDGUODsDrcGIPj8TdyhDChRFTV-fjxkGWwEQ9w; _lxsdk=15d9af78c28c8-0ee719fb4f1d34-57e1b3c-144000-15d9af78c29c8; Hm_lvt_f66b37722f586a240d4621318a5a6ebe=1501555925,1501555942,1501629715; __utma=211559370.58204263.1500387690.1501555925.1501629715.2; __utmz=211559370.1501629715.2.2.utmcsr=baidu|utmccn=baidu|utmcmd=organic|utmcct=zt_search; __mta=45569435.1501629712300.1501629720518.1501629728116.3; uuid=9dd0ef628497b76a1925.1501543494.0.0.0; _lxsdk_cuid=15e4aa6effc9e-0b23fb07f9c1bf-57e1b3c-144000-15e4aa6effdc8; _ga=GA1.2.58204263.1500387690; _gid=GA1.2.1025663946.1504513189; w_cid=110101; w_cpy_cn="%E4%B8%9C%E5%9F%8E%E5%8C%BA"; w_cpy=dongchengqu; waddrname="%E6%9C%AA%E7%9F%A5"; w_geoid=wx4g19983su8; w_ah="39.91065189242363,116.43523581326008,%E6%9C%AA%E7%9F%A5|39.8989349976182,116.50381989777088,%E7%99%BE%E5%AD%90%E6%B9%BE|39.96550491079688,116.30504373461008,%E6%B5%B7%E6%B7%80%E5%8C%BA|40.004531890153885,116.47530399262905,%E6%9C%9B%E4%BA%AC%2C%E6%9D%A5%E5%B9%BF%E8%90%A5%2C%E8%8A%B1%E5%AE%B6%E5%9C%B0"; JSESSIONID=s1gvzkp9v2th1ed70m8qr5yyz; _ga=GA1.3.58204263.1500387690; _gid=GA1.3.1025663946.1504513189; _gat=1; w_utmz="utm_campaign=(direct)&utm_source=(direct)&utm_medium=(none)&utm_content=(none)&utm_term=(none)"; w_visitid=039dc5a0-4720-4cef-afc1-e43eeec3671a; __mta=45569435.1501629712300.1501629728116.1504518924133.4',
        'Host':'waimai.meituan.com',
        'Upgrade-Insecure-Requests':'1'
    }
    param = {
        'keyword': str(page)
    }

    timeout = random.choice(range(10, 20))
    html = requests.get(url, headers=headers, timeout=timeout, params=param)
    selector = etree.HTML(html.text)

    # one <li> per shop card in the search results
    content_field = selector.xpath('//*[@class="result-content"]/ul/li')
    
    item = {}
    for each in content_field:
        name = each.xpath('a/div[1]/p[1]/text()')
        time = each.xpath('a/div[1]/p[4]/text()')
        order = each.xpath('a/div[1]/p[3]/span[2]/text()')

        item['name'] = "".join(name)
        item['time'] = "".join(time)
        item['order'] = "".join(order)

        fieldnames = ['name', 'time', 'order']

        # the file is reopened and the header rewritten on every iteration
        with open('mtjd3.csv', 'a', newline='', errors='ignore') as f:
            f_csv = csv.DictWriter(f, fieldnames=fieldnames)
            f_csv.writeheader()
            f_csv.writerow(item)


if __name__ == '__main__':
    f = open('mtjd3.csv', 'w')  # truncates any previous output

    list = ["大虾来了", "夹克的虾", "簋街仔仔", "辣私房", "烧虾师"]

    page = []
    for i in list:
        page.append(i)
    print(page)

    pool = ThreadPool(4)
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()

When I run this, the output file is empty. I can't tell whether the problem is in how the iteration is set up or in the function itself. Any help appreciated.

2 Answers

Er, this code is quite something.

I suggest you move

    with open('mtjd3.csv', 'a', newline='', errors='ignore') as f:
        f_csv = csv.DictWriter(f, fieldnames=fieldnames)
        f_csv.writeheader()
        f_csv.writerow(item)

out of the loop: open the file with with open first, then run the for loop inside it, and once item is parsed just call f_csv.writerow(item). Otherwise every iteration reopens the file, recreates the csv.DictWriter, and rewrites the header; a CSV file only needs its header written once.
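
A minimal sketch of that restructuring (the file name, field names, and XPath expressions are taken from the spider above; it assumes content_field has already been parsed the same way):

    fieldnames = ['name', 'time', 'order']

    # Open the file and write the header exactly once, then reuse the
    # same DictWriter for every parsed row.
    with open('mtjd3.csv', 'a', newline='', errors='ignore') as f:
        f_csv = csv.DictWriter(f, fieldnames=fieldnames)
        f_csv.writeheader()
        for each in content_field:
            row = {
                'name': "".join(each.xpath('a/div[1]/p[1]/text()')),
                'time': "".join(each.xpath('a/div[1]/p[4]/text()')),
                'order': "".join(each.xpath('a/div[1]/p[3]/span[2]/text()')),
            }
            f_csv.writerow(row)

Note that spider runs under ThreadPool(4), so each worker call would still write its own header; to get a single header, have spider return its rows and write the CSV once in the main process instead.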

The problem is probably with

    url = 'http://waimai.meituan.com/search/wx4g19983su8/rt?keyword=' + str(page)

You have already put the keyword into the URL here, so it conflicts with the later

    param = {
        'keyword': str(page)
    }

Try changing it to url = 'http://waimai.meituan.com/search/wx4g19983su8/rt'.
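
A sketch of the request with the duplicate parameter removed, letting requests build the query string from param (headers, page, and the timeout choice are from the original code; raise_for_status is standard requests, added here so an HTTP error page is not silently parsed into zero results):

    url = 'http://waimai.meituan.com/search/wx4g19983su8/rt'
    param = {'keyword': str(page)}  # requests appends this as ?keyword=...

    timeout = random.choice(range(10, 20))
    html = requests.get(url, headers=headers, timeout=timeout, params=param)
    html.raise_for_status()  # fail loudly on 4xx/5xx instead of returning an empty page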
