python Beautifulsoup 应用问题

新手上路,请多包涵

问题出现的环境背景及自己尝试过哪些方法

bs4
Spyder(python3.6)

相关代码

-- coding: utf-8 --

"""
Created on Wed Aug 1 03:07:33 2018

@author: stephen zheng
"""

import requests
from bs4 import BeautifulSoup
import xlwt
import json
import time
headers = {

    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }

def get_region_url(html):

# 获得行政区的url
soup=BeautifulSoup(html,'lxml')
url_list=[i['href'] for i in soup.find('div',id='region-nav').find_all('a')]
return url_list

def get_content(url,headers = None,proxy=None):

html = requests.get(url,headers=headers).content
return html

def get_url(html):

soup = BeautifulSoup(html,'lxml')
shop_list = soup.find_all('div',class_='tit')
return [i.find('a')['href'] for i in shop_list]

def get_details_content(html):

soup = BeautifulSoup(html,'lxml')
# 人均价格
price = soup.find('span',id='avgPriceTitle').text
# 评分
evaluation = soup.find('span',id='comment_score').find_all('span',class_='item')
# 星级评定
the_star = soup.find('div',class_='brief-info').find('span')['title'] 
# 店名
title = soup.find('div',class_='shop-name').find('span').text
# 评论数量
comments = soup.find('span',id='sub-title').text
# 地址
address = soup.find('span',itemprop='street-address').text
print (u'店名:'+title)
for ev in evaluation:
    print (ev.text)
    print (price)
    print (u'评论数量:'+comments)
    print (u'地址:'+address.strip())
    print (u'总体评价星级:'+the_star)
    print ('=======================')
    return (title,evaluation[0].text,evaluation[1].text,evaluation[2].text,price,comments,address,the_star)

if name == '__main__':

items = []
start_url = 'https://www.dianping.com/search/keyword/1/0_%E7%AF%AE%E7%90%83%E9%A6%86'
base_url = 'http://www.dianping.com'
headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Cookie':'_hc.v=2f4046e6-6012-4664-6e8b-cdd151ed44e7.1494257443; PHOENIX_ID=0a017918-15c393c3773-116bcd2f;__utma=1.2147215388.1495608855.1495608855.1495622249.2; __utmc=1; __utmz=1.1495608855.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); s_ViewType=10; JSESSIONID=9F1079723C06E82D7555D3373D5DD9B7; aburl=1; cy=2; cye=beijing; __mta=209559469.1495614452018.1495625059663.1495625292763.21'
        }
start_html = get_content(start_url)
region_url_list = get_content(start_html)
region_url_list - [base_url+url for url in region_url_list]
for url in region_url_list:
    for i in range(1,10):
        url_list_page = get_url(get_content(url+'p'+str(i)))
        url_list = [base_url+url for url in url_list_page]
        for url in url_list:
            print (url)
            details_html = get_content(url,headers=headers)
            item = get_details_content(details_html)
            items.append(item)
            newTable='DZDP.xls'
            wb = xlwt.Workbook(encoding='UTF-8')
            ws = wb.add_sheet('test1')
            headData = ['商户名字','评分','评分','评分','人均价格','评论数量','地址','商户星级']

for colnum in range(0,8):

ws.write(0,colnum,headData(colnum),xlwt.easyxf('font:bold on'))
index = 1
lens = len(items)
for j in range(0,lens):
    for i in range(0,8):
        print(items[j][i])
        ws.write(index,i,items[j][i])
    index+=1
wb.save(newTable)### 问题描述

你期待的结果是什么?实际看到的错误信息又是什么?

runfile('C:/Users/stephen/Desktop/untitled0.py')
Traceback (most recent call last):

File "<ipython-input-11-ac3a27dc0ab0>", line 1, in <module>

runfile('C:/Users/stephen/Desktop/untitled0.py')

File "C:UsersstephenAnaconda3libsite-packagesspyderutilssitesitecustomize.py", line 705, in runfile

execfile(filename, namespace)

File "C:UsersstephenAnaconda3libsite-packagesspyderutilssitesitecustomize.py", line 102, in execfile

exec(compile(f.read(), filename, 'exec'), namespace)

File "C:/Users/stephen/Desktop/untitled0.py", line 64, in <module>

region_url_list = get_content(start_html)

File "C:/Users/stephen/Desktop/untitled0.py", line 23, in get_content

html = requests.get(url,headers=headers).content

File "C:UsersstephenAnaconda3libsite-packagesrequestsapi.py", line 72, in get

return request('get', url, params=params, **kwargs)

File "C:UsersstephenAnaconda3libsite-packagesrequestsapi.py", line 58, in request

return session.request(method=method, url=url, **kwargs)

File "C:UsersstephenAnaconda3libsite-packagesrequestssessions.py", line 508, in request

resp = self.send(prep, **send_kwargs)

File "C:UsersstephenAnaconda3libsite-packagesrequestssessions.py", line 612, in send

adapter = self.get_adapter(url=request.url)

File "C:UsersstephenAnaconda3libsite-packagesrequestssessions.py", line 703, in get_adapter

raise InvalidSchema("No connection adapters were found for '%s'" % url)

InvalidSchema: No connection adapters were found for '<html lang="en">

<head>

<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title> 美食, 餐厅餐饮, 团购,生活,优惠券-大众点评网</title>
<!--网页标题左侧显示-->
<link rel="icon" href="//www.dpfile.com/app/pc-common/dp_favicon.ico" type="image/x-icon">
<!--收藏夹显示图标-->
<link rel="shortcut icon" href="//www.dpfile.com/app/pc-common/dp_favicon.ico" type="image/x-icon">
<!--1. 首先引入页头模块css,保证页头模块css在前,首先渲染  -->
<link rel="stylesheet" type="text/css" href="//www.dpfile.com/app/pc-common/index.min.css">
<!--其他模块css在后-->
<link rel="stylesheet" href="//www.dpfile.com/app/dpindex-new-static/static/new_404_pc.min.css" type="text/css"/>    <!--2. 引入页头模块 js  -->
<!--3. 注入页头需要的参数  -->
<script type="text/javascript">
    window._DP_HeaderData = {
        'cityId': '1',  // 城市id , 必须,默认: '1'
        'cityCName': '', // 城市中文名, 必须, 默认:上海
        'cityEnName': '', // 城市英文名,默认:shanghai
        'pageType': 'index', // 页面类型,枚举类型 'index' || 'search', 必须, 默认:'index'
        'userId': '',   // 用户id, 若用户已经登录,必须传,未登录时,可不传此参数,或者传空字符串'',默认:'', 未登录状态
        'userName':'',   // 用户名, 若用户已经登录,必须传,未登录时,可不传此参数,或者传空字符串'',默认:'', 未登录状态
        'dpId': '', // 点评id, 可选参数, 默认:''
        'uesrLng': '0.0', // 用户所在位置经度,可选参数,默认:'0.0';
        'userLan': '0.0', // 用户所在位置维度,可选参数,默认:'0.0';
        'clientIp': '' //用户终端ip地址,可选参数, 默认 '127.0.0.1'
    }
</script>
<!--4. 如果有必要,可以引入其他模块或第三方js,比如:灵犀web4.0打点配置-->
<script type="text/javascript">
    (function (win, doc, ns) { var cacheFunName = '_MeiTuanALogObject'; win[cacheFunName] = ns; if (!win[ns]) { var _LX = function () { _LX.q.push(arguments); return _LX; }; _LX.q = _LX.q || []; _LX.l = +new Date(); win[ns] = _LX; } })(window, document, 'LXAnalytics');
</script>
<script src="http://analytics.meituan.net/analytics.js" type="text/javascript" charset="utf-8" async defer></script>

</head>
<div class="header-container">

<div id="top-nav" class="top-nav">

</div>
<div id="logo-input" class="logo-input">
    <div class="logo-input-container clearfix">
        <a title="大众点评网" href="/" class="logo"></a>

        <div class="search-box">
            <div class="search-bar ">
        <span class="search-container clearfix">
            <i class="i-search"></i>
        </span>
                <p class="hot-search J-hot-search">

                </p>
            </div>
        </div>
        <div class="qrcode-container">
            <i class="close"></i>
            <div class="qrcode">
                <p class="qrcode-text">手机点评</p>
                <div class="qrcode-img"></div>
            </div>
        </div>
    </div>
</div>

<div class="cate-container">
    <div class="nav-header">
        <div class="navbar">
            <a href="//www.dianping.com"><span class="cate-item all-cate">全部分类</span></a>
            <a target="_blank" class="cate-item other-cate" href="http://t.dianping.com/shanghai">团购</a>
            <a target="_blank" class="cate-item other-cate" href="http://s.dianping.com/event/shanghai">霸王餐</a>
            <i class = "hot-icon"></i>
            <a target="_blank" class="cate-item other-cate" href="http://s.dianping.com/shanghai/group?utm_source=dp_pc_other">社区论坛</a>
        </div>
    </div>
    <div class="gradient"></div>

</div>

</div>
<div class="not-found">

<div class="not-found-content">
    <div class="img-not-found"></div>
    <div class="not-found-right">
        <p class="not-found-words">抱歉!页面无法访问......</p>
        <p class="not-found-words1">错误信息:</p>
        <p class="not-found-words1">    currentDate:2018-08-01 21:30:44</p>
        <p class="not-found-words1">    userIp:36.149.210.62, 10.71.192.26</p>
        <p class="not-found-words1">    userAgent:python-requests/2.18.4</p>
        <a class="back-to-home" href="http://www.dianping.com">
            <button type="button">去大众点评首页</button>
        </a>
    </div>
</div>

</div>

<!--页尾部分-->
<div class="footer-container">

<div id="channel-footer" class="channel-footer">
    <p class="links"> <a target="_blank" href="http://www.dianping.com/help/center/rule?name=about1" rel="nofollow">关于大众点评</a>| <a target="_blank" href="https://dpapp-appeal.meituan.com/#/shopCreditRegulationPC" rel="nofollow">诚信公约</a>| <a target="_blank" href="//www.dianping.com/help" rel="nofollow">网站帮助</a>| <a target="_blank" href="http://www.dianping.com/sitemap/c1c10">网站地图</a>| <a target="_blank" href="//www.dianping.com/business/" rel="nofollow">推广服务</a>| <a target="_blank" href="http://www.dianping.com/help/center/rule?name=media1" rel="nofollow">媒体报道</a>| <a target="_blank" href="http://careers.dianping.com" rel="nofollow">人才招聘</a>|
        <!--新增footer links--> <span class="links-container"> <a class="ext-links" href="javascript:void(0);" rel="nofollow">最新咨询</a>| </span> <a target="_blank" href="http://www.dianping.com/forum" rel="nofollow">站务论坛</a>| <a target="_blank" href="http://www.dianping.com/help/center/rule?name=about4" rel="nofollow">联系我们</a>| <a target="_blank" href="http://developer.dianping.com" rel="nofollow">开发者</a>| <a target="_blank" href="https://developer.meituan.com/?applyFrom=dianping_c_pc_busines" rel="nofollow">聚宝盆餐饮开放平台</a> </p>
    <!--新增 footer links 面板-->
    <div class="ext-container Hide">
        <div class="link-items Hide">
            <a target="_blank" href="http://www.dianping.com/wedding/wenda"><span>结婚问答</span></a>
            <a target="_blank" href="http://www.dianping.com/home/wenda"><span>家装问答</span></a>
            <a target="_blank" href="http://www.dianping.com/home-tuku"><span>家装图库</span></a>
            <a target="_blank" href="//m.dianping.com/home-tuku"><span>家装图库手机版</span></a>
            <a target="_blank" href="http://www.dianping.com/wedding"><span>结婚资讯</span></a>
            <a target="_blank" href="http://www.dianping.com/plastic/item"><span>整形项目大全</span></a>
            <a target="_blank" href="http://www.dianping.com/plastic/wenda"><span>丽人问答</span></a>
            <a target="_blank" href="http://www.dianping.com/movie"><span>点评电影</span></a>
            <a target="_blank" href="http://www.dianping.com/baby/wenda"><span>育儿问答</span></a>
        </div>
    </div>
    <p class="rights"> <span style="margin-right:10px;">&copy;2003-2018 dianping.com, All Rights Reserved.</span> <span>本站发布的所有内容,未经许可,不得转载,详见 <a rel="nofollow" class="G" href="//www.dianping.com/help/center/rule?name=base2">《知识产权声明》</a>。 </span> </p>
</div>
<script> (function(){var h=navigator.userAgent;var i=navigator.appName;var b=i.indexOf("Microsoft Internet Explorer")!==-1;if(!b){return false}var d=/MSIE (\d+).0/g;var e=d.exec(h);if(e&&e.length&&e[1]<9){var j='<div class="browser-overlay"></div><div id="browser-ie-con" class="browser-ie-con"><div id="browser-close" class="close">×</div><div class="browser-download chrome"><a href="//www.google.cn/chrome/browser/desktop/index.html?utm_dp" target="_black" title="chrome"></a></div><div class="browser-download firefox"><a href="//www.firefox.com.cn/download/?utm_dp" target="_black" title="firefox"></a></div></div>';var f=document.createElement("div");f.id="browser-update-ie";f.className="browser-update-ie";f.innerHTML=j;document.body.appendChild(f);var a=document.documentElement.clientWidth||document.body.clientWidth;var c=document.getElementById("browser-ie-con").offsetWidth;var g=(a-c)/2;document.getElementById("browser-ie-con").style.left=g+"px";document.getElementById("browser-close").attachEvent("onclick",function(){document.getElementById("browser-update-ie").style.display="none"},false)}})(); </script>
阅读 1.9k
撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进