Why is the file I scraped blank?

Why does my scraper write a blank file?

I'm trying to scrape store names, addresses, and related information from Dianping (dianping.com).

import requests
from bs4 import BeautifulSoup
import sys
import random
import time

base_url = 'https://www.dianping.com/sear...'
deep = 5

def get_html(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status
        r.encoding = 'utf-8'
        return r.text
    except:
        return 'Something Wrong'

def get_content(url):
    lists_ = []
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    info = soup.find_all('li', {'class': ' '})
    for i in info:
        list_ = {}
        try:
            list_['store_name'] = i.find('a', {'h4': ' '}).text.strip()
            list_['url'] = i.find('a', {'target': ' _blank'})['href']
            list_['addr'] = i.find('span', {'class': ' addr'}).text.strip()
            lists_.append(list_)
        except:
            print('Something Wrong')
    return lists_

def outfill(dict):
    with open('JBS_Store.txt', 'a+', encoding='utf-8') as f:
        for list_ in dict:
            f.write('Store name: {}\t Address: {}\t URL: {}\n'.fomate(list_['store_name'], list_['addr'], list_['url']))
        print('Scraping done')

def main(baer_url, deep):
    url_list = []
    for i in range(0, deep):
        url_list.append(baer_url + '/p' + str(i))
    print('Printing done')

    for url in url_list:
        content = get_content(url)
        outfill(content)
    print('All information saved')

if name == '__main__':
    main(base_url, deep)

3 Answers

The main problem is how the HTTP headers are handled. Dianping uses the Cookie header as part of its anti-scraping checks, so a scraper that sends no Cookie is simply dropped: you get a block page instead of the listing, your selectors match nothing, and the output file stays empty.
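
You can see the drop for yourself by comparing a bare request with one that carries browser-like headers and a Cookie. This is a minimal diagnostic sketch; the header values are placeholders, and exactly what a blocked response looks like can vary:

import requests

url = 'https://www.dianping.com/search/keyword/3/0_%E9%9D%99%E5%8D%9A%E5%A3%AB/p1'
probe_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Cookie': 'PASTE_YOUR_COOKIE_HERE',  # placeholder: copy from your browser
}

for label, hdrs in (('no headers', None), ('with cookie', probe_headers)):
    r = requests.get(url, headers=hdrs, timeout=30)
    # A blocked request usually shows up as a 403, a redirect to a
    # verification page, or a body without the shop-all-list element.
    print(label, r.status_code, len(r.text), 'shop-all-list' in r.text)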

And even with a Cookie added, the code as posted is still hard to run: r.raise_for_status is referenced but never called, .fomate is a typo for .format, and the entry guard reads if name == instead of if __name__ ==. Here is a corrected version:

import requests
from bs4 import BeautifulSoup

base_url = 'https://www.dianping.com/search/keyword/3/0_%E9%9D%99%E5%8D%9A%E5%A3%AB/p{offset}'
headers = {
    'Accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.3',
    'Host':
    'www.dianping.com',
    'Cookie':
    'PASTE_YOUR_COOKIE_HERE'  # replace with the Cookie from your own browser session
}
deep = 5


def http_request(url):
    r = requests.get(url=url, headers=headers)
    r.encoding = 'utf8'
    return r.text


def html_parse(html):
    soup = BeautifulSoup(html, 'lxml')
    info = soup.find('div', {'id': 'shop-all-list'}).find_all('li')
    return [{
        'store_name':
        i.find('div', {
            'class': 'tit'
        }).find('h4').text.strip(),
        'store_url':
        i.find('div', {
            'class': 'tit'
        }).find('a').get('href'),
        'store_address':
        i.find('div', {
            'class': 'tag-addr'
        }).find('span', {
            'class': 'addr'
        }).text.strip()
    } for i in info]


def out(data):
    with open('JBS_Store.txt', 'a+', encoding='utf-8') as f:
        for i in data:
            f.write('Store name: {}\t Address: {}\t URL: {}\n'.format(
                i['store_name'], i['store_address'], i['store_url']))


def main(deep):
    for i in range(1, deep + 1):
        url = base_url.format(offset=i)
        html = http_request(url=url)
        data = html_parse(html=html)
        out(data=data)
    print('Scraping done')


if __name__ == '__main__':
    main(deep=deep)
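
One more improvement worth making: the question imports random and time but never uses them, presumably intending to pause between requests. A short randomized delay between pages makes the scraper less likely to trip rate limiting. A sketch of how main could pace itself, reusing base_url, http_request, html_parse, and out from the code above (the 1-3 second range is an arbitrary guess):

import random
import time

def main(deep):
    for i in range(1, deep + 1):
        url = base_url.format(offset=i)
        html = http_request(url=url)
        out(data=html_parse(html=html))
        # Pause 1-3 seconds between pages (an arbitrary range; tune it
        # to whatever the site tolerates).
        time.sleep(random.uniform(1, 3))
    print('Scraping done')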

Have you considered anti-scraping measures?


f.close
