问题出现的环境背景及自己尝试过哪些方法
bs4
Spyder(python3.6)
相关代码
-- coding: utf-8 --
"""
Created on Wed Aug 1 03:07:33 2018
@author: stephen zheng
"""
import requests
from bs4 import BeautifulSoup
import xlwt
import json
import time
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
def get_region_url(html):
# 获得行政区的url
soup=BeautifulSoup(html,'lxml')
url_list=[i['href'] for i in soup.find('div',id='region-nav').find_all('a')]
return url_list
def get_content(url,headers = None,proxy=None):
html = requests.get(url,headers=headers).content
return html
def get_url(html):
soup = BeautifulSoup(html,'lxml')
shop_list = soup.find_all('div',class_='tit')
return [i.find('a')['href'] for i in shop_list]
def get_details_content(html):
soup = BeautifulSoup(html,'lxml')
# 人均价格
price = soup.find('span',id='avgPriceTitle').text
# 评分
evaluation = soup.find('span',id='comment_score').find_all('span',class_='item')
# 星级评定
the_star = soup.find('div',class_='brief-info').find('span')['title']
# 店名
title = soup.find('div',class_='shop-name').find('span').text
# 评论数量
comments = soup.find('span',id='sub-title').text
# 地址
address = soup.find('span',itemprop='street-address').text
print (u'店名:'+title)
for ev in evaluation:
print (ev.text)
print (price)
print (u'评论数量:'+comments)
print (u'地址:'+address.strip())
print (u'总体评价星级:'+the_star)
print ('=======================')
return (title,evaluation[0].text,evaluation[1].text,evaluation[2].text,price,comments,address,the_star)
if name == '__main__':
items = []
start_url = 'https://www.dianping.com/search/keyword/1/0_%E7%AF%AE%E7%90%83%E9%A6%86'
base_url = 'http://www.dianping.com'
headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
'Cookie':'_hc.v=2f4046e6-6012-4664-6e8b-cdd151ed44e7.1494257443; PHOENIX_ID=0a017918-15c393c3773-116bcd2f;__utma=1.2147215388.1495608855.1495608855.1495622249.2; __utmc=1; __utmz=1.1495608855.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); s_ViewType=10; JSESSIONID=9F1079723C06E82D7555D3373D5DD9B7; aburl=1; cy=2; cye=beijing; __mta=209559469.1495614452018.1495625059663.1495625292763.21'
}
start_html = get_content(start_url)
region_url_list = get_content(start_html)
region_url_list - [base_url+url for url in region_url_list]
for url in region_url_list:
for i in range(1,10):
url_list_page = get_url(get_content(url+'p'+str(i)))
url_list = [base_url+url for url in url_list_page]
for url in url_list:
print (url)
details_html = get_content(url,headers=headers)
item = get_details_content(details_html)
items.append(item)
newTable='DZDP.xls'
wb = xlwt.Workbook(encoding='UTF-8')
ws = wb.add_sheet('test1')
headData = ['商户名字','评分','评分','评分','人均价格','评论数量','地址','商户星级']
for colnum in range(0,8):
ws.write(0,colnum,headData(colnum),xlwt.easyxf('font:bold on'))
index = 1
lens = len(items)
for j in range(0,lens):
for i in range(0,8):
print(items[j][i])
ws.write(index,i,items[j][i])
index+=1
wb.save(newTable)### 问题描述
你期待的结果是什么?实际看到的错误信息又是什么?
runfile('C:/Users/stephen/Desktop/untitled0.py')
Traceback (most recent call last):
File "<ipython-input-11-ac3a27dc0ab0>", line 1, in <module>
runfile('C:/Users/stephen/Desktop/untitled0.py')
File "C:UsersstephenAnaconda3libsite-packagesspyderutilssitesitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:UsersstephenAnaconda3libsite-packagesspyderutilssitesitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/stephen/Desktop/untitled0.py", line 64, in <module>
region_url_list = get_content(start_html)
File "C:/Users/stephen/Desktop/untitled0.py", line 23, in get_content
html = requests.get(url,headers=headers).content
File "C:UsersstephenAnaconda3libsite-packagesrequestsapi.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "C:UsersstephenAnaconda3libsite-packagesrequestsapi.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:UsersstephenAnaconda3libsite-packagesrequestssessions.py", line 508, in request
resp = self.send(prep, **send_kwargs)
File "C:UsersstephenAnaconda3libsite-packagesrequestssessions.py", line 612, in send
adapter = self.get_adapter(url=request.url)
File "C:UsersstephenAnaconda3libsite-packagesrequestssessions.py", line 703, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
InvalidSchema: No connection adapters were found for '<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title> 美食, 餐厅餐饮, 团购,生活,优惠券-大众点评网</title>
<!--网页标题左侧显示-->
<link rel="icon" href="//www.dpfile.com/app/pc-common/dp_favicon.ico" type="image/x-icon">
<!--收藏夹显示图标-->
<link rel="shortcut icon" href="//www.dpfile.com/app/pc-common/dp_favicon.ico" type="image/x-icon">
<!--1. 首先引入页头模块css,保证页头模块css在前,首先渲染 -->
<link rel="stylesheet" type="text/css" href="//www.dpfile.com/app/pc-common/index.min.css">
<!--其他模块css在后-->
<link rel="stylesheet" href="//www.dpfile.com/app/dpindex-new-static/static/new_404_pc.min.css" type="text/css"/> <!--2. 引入页头模块 js -->
<!--3. 注入页头需要的参数 -->
<script type="text/javascript">
window._DP_HeaderData = {
'cityId': '1', // 城市id , 必须,默认: '1'
'cityCName': '', // 城市中文名, 必须, 默认:上海
'cityEnName': '', // 城市英文名,默认:shanghai
'pageType': 'index', // 页面类型,枚举类型 'index' || 'search', 必须, 默认:'index'
'userId': '', // 用户id, 若用户已经登录,必须传,未登录时,可不传此参数,或者传空字符串'',默认:'', 未登录状态
'userName':'', // 用户名, 若用户已经登录,必须传,未登录时,可不传此参数,或者传空字符串'',默认:'', 未登录状态
'dpId': '', // 点评id, 可选参数, 默认:''
'uesrLng': '0.0', // 用户所在位置经度,可选参数,默认:'0.0';
'userLan': '0.0', // 用户所在位置维度,可选参数,默认:'0.0';
'clientIp': '' //用户终端ip地址,可选参数, 默认 '127.0.0.1'
}
</script>
<!--4. 如果有必要,可以引入其他模块或第三方js,比如:灵犀web4.0打点配置-->
<script type="text/javascript">
(function (win, doc, ns) { var cacheFunName = '_MeiTuanALogObject'; win[cacheFunName] = ns; if (!win[ns]) { var _LX = function () { _LX.q.push(arguments); return _LX; }; _LX.q = _LX.q || []; _LX.l = +new Date(); win[ns] = _LX; } })(window, document, 'LXAnalytics');
</script>
<script src="http://analytics.meituan.net/analytics.js" type="text/javascript" charset="utf-8" async defer></script>
</head>
<div class="header-container">
<div id="top-nav" class="top-nav">
</div>
<div id="logo-input" class="logo-input">
<div class="logo-input-container clearfix">
<a title="大众点评网" href="/" class="logo"></a>
<div class="search-box">
<div class="search-bar ">
<span class="search-container clearfix">
<i class="i-search"></i>
</span>
<p class="hot-search J-hot-search">
</p>
</div>
</div>
<div class="qrcode-container">
<i class="close"></i>
<div class="qrcode">
<p class="qrcode-text">手机点评</p>
<div class="qrcode-img"></div>
</div>
</div>
</div>
</div>
<div class="cate-container">
<div class="nav-header">
<div class="navbar">
<a href="//www.dianping.com"><span class="cate-item all-cate">全部分类</span></a>
<a target="_blank" class="cate-item other-cate" href="http://t.dianping.com/shanghai">团购</a>
<a target="_blank" class="cate-item other-cate" href="http://s.dianping.com/event/shanghai">霸王餐</a>
<i class = "hot-icon"></i>
<a target="_blank" class="cate-item other-cate" href="http://s.dianping.com/shanghai/group?utm_source=dp_pc_other">社区论坛</a>
</div>
</div>
<div class="gradient"></div>
</div>
</div>
<div class="not-found">
<div class="not-found-content">
<div class="img-not-found"></div>
<div class="not-found-right">
<p class="not-found-words">抱歉!页面无法访问......</p>
<p class="not-found-words1">错误信息:</p>
<p class="not-found-words1"> currentDate:2018-08-01 21:30:44</p>
<p class="not-found-words1"> userIp:36.149.210.62, 10.71.192.26</p>
<p class="not-found-words1"> userAgent:python-requests/2.18.4</p>
<a class="back-to-home" href="http://www.dianping.com">
<button type="button">去大众点评首页</button>
</a>
</div>
</div>
</div>
<!--页尾部分-->
<div class="footer-container">
<div id="channel-footer" class="channel-footer">
<p class="links"> <a target="_blank" href="http://www.dianping.com/help/center/rule?name=about1" rel="nofollow">关于大众点评</a>| <a target="_blank" href="https://dpapp-appeal.meituan.com/#/shopCreditRegulationPC" rel="nofollow">诚信公约</a>| <a target="_blank" href="//www.dianping.com/help" rel="nofollow">网站帮助</a>| <a target="_blank" href="http://www.dianping.com/sitemap/c1c10">网站地图</a>| <a target="_blank" href="//www.dianping.com/business/" rel="nofollow">推广服务</a>| <a target="_blank" href="http://www.dianping.com/help/center/rule?name=media1" rel="nofollow">媒体报道</a>| <a target="_blank" href="http://careers.dianping.com" rel="nofollow">人才招聘</a>|
<!--新增footer links--> <span class="links-container"> <a class="ext-links" href="javascript:void(0);" rel="nofollow">最新咨询</a>| </span> <a target="_blank" href="http://www.dianping.com/forum" rel="nofollow">站务论坛</a>| <a target="_blank" href="http://www.dianping.com/help/center/rule?name=about4" rel="nofollow">联系我们</a>| <a target="_blank" href="http://developer.dianping.com" rel="nofollow">开发者</a>| <a target="_blank" href="https://developer.meituan.com/?applyFrom=dianping_c_pc_busines" rel="nofollow">聚宝盆餐饮开放平台</a> </p>
<!--新增 footer links 面板-->
<div class="ext-container Hide">
<div class="link-items Hide">
<a target="_blank" href="http://www.dianping.com/wedding/wenda"><span>结婚问答</span></a>
<a target="_blank" href="http://www.dianping.com/home/wenda"><span>家装问答</span></a>
<a target="_blank" href="http://www.dianping.com/home-tuku"><span>家装图库</span></a>
<a target="_blank" href="//m.dianping.com/home-tuku"><span>家装图库手机版</span></a>
<a target="_blank" href="http://www.dianping.com/wedding"><span>结婚资讯</span></a>
<a target="_blank" href="http://www.dianping.com/plastic/item"><span>整形项目大全</span></a>
<a target="_blank" href="http://www.dianping.com/plastic/wenda"><span>丽人问答</span></a>
<a target="_blank" href="http://www.dianping.com/movie"><span>点评电影</span></a>
<a target="_blank" href="http://www.dianping.com/baby/wenda"><span>育儿问答</span></a>
</div>
</div>
<p class="rights"> <span style="margin-right:10px;">©2003-2018 dianping.com, All Rights Reserved.</span> <span>本站发布的所有内容,未经许可,不得转载,详见 <a rel="nofollow" class="G" href="//www.dianping.com/help/center/rule?name=base2">《知识产权声明》</a>。 </span> </p>
</div>
<script> (function(){var h=navigator.userAgent;var i=navigator.appName;var b=i.indexOf("Microsoft Internet Explorer")!==-1;if(!b){return false}var d=/MSIE (\d+).0/g;var e=d.exec(h);if(e&&e.length&&e[1]<9){var j='<div class="browser-overlay"></div><div id="browser-ie-con" class="browser-ie-con"><div id="browser-close" class="close">×</div><div class="browser-download chrome"><a href="//www.google.cn/chrome/browser/desktop/index.html?utm_dp" target="_black" title="chrome"></a></div><div class="browser-download firefox"><a href="//www.firefox.com.cn/download/?utm_dp" target="_black" title="firefox"></a></div></div>';var f=document.createElement("div");f.id="browser-update-ie";f.className="browser-update-ie";f.innerHTML=j;document.body.appendChild(f);var a=document.documentElement.clientWidth||document.body.clientWidth;var c=document.getElementById("browser-ie-con").offsetWidth;var g=(a-c)/2;document.getElementById("browser-ie-con").style.left=g+"px";document.getElementById("browser-close").attachEvent("onclick",function(){document.getElementById("browser-update-ie").style.display="none"},false)}})(); </script>