在学习Python网络爬虫与信息提取实例1的时候从这个页面爬取大学排名信息,源代码如下
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
return""
def fillUnivList(ulist, url):
soup = BeautifulSoup(url, 'html.parser')
for tr in soup.find('tbody').children:
if isinstance(tr, bs4.element.Tag):
tds = tr('td')
ulist.append([tds[0].content, tds[1].string, tds[3].string])
def printUnivList(ulist, num):
print("{:^10}\t{:^6}\t{:^10}".format("排名", "学校名称", "总分"))
for i in range(num):
u = ulist[i]
print("{:^10}\t{:^6}\t{:^10}".format(u[0], u[1], u[2]))
def main():
uinfo = []
url = 'http://zuihaodaxue.cn/zuihaodaxuepaiming2017.html'
html = getHTMLText(url)
fillUnivList(uinfo, html)
printUnivList(uinfo, 20)
main()
但是在F5之后,出现报错,仅能输出第一行信息,具体输出内容为
排名 学校名称 总分
Traceback (most recent call last):
File "C:/Users/smile/Documents/Python/new1.py", line 34, in <module>
main()
File "C:/Users/smile/Documents/Python/new1.py", line 32, in main
printUnivList(uinfo, 20)
File "C:/Users/smile/Documents/Python/new1.py", line 25, in printUnivList
print("{:^10}\t{:^6}\t{:^10}".format(u[0], u[1], u[2]))
TypeError: unsupported format string passed to NoneType.__format__
>>>
打表查看uinfo,发现每一个子列表的第一项为"None"
而每一段tr标签包含的内容为
<tr class="alt"><td>1<td><div align="left">清华大学</div></td><td>北京</td><td>94.0 </td><td class="hidden-xs need-hidden indicator5">100.0 </td><td class="hidden-xs need-hidden indicator6" style="display:none;">97.70%</td><td class="hidden-xs need-hidden indicator7" style="display:none;">40938</td><td class="hidden-xs need-hidden indicator8" style="display:none;">1.381</td><td class="hidden-xs need-hidden indicator9" style="display:none;">1373</td><td class="hidden-xs need-hidden indicator10" style="display:none;">111</td><td class="hidden-xs need-hidden indicator11" style="display:none;">1263428</td><td class="hidden-xs need-hidden indicator12" style="display:none;">613524</td><td class="hidden-xs need-hidden indicator13" style="display:none;">7.04%</td></tr>
第一个td标签内包含第二个td标签,导致无法爬取?望各位明白人赐教
改了一下下面的代码:
结果如下: