代码如下:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://ent.sina.com.cn")
page = BeautifulSoup(html)
def get_next_target(page):
start_link = page.find('<a href=')
if start_link == -1:
url, end_quote = None, 0
return url, end_quote
else:
start_quote = page.find('"', start_link)
end_quote = page.find('"', start_quote + 1)
url = page[start_quote + 1:end_quote]
return url, end_quote
def print_all_links(page):
while True:
url, endpos = get_next_target(page)
if url:
print( url)
page = page[endpos:]
else:
break
print_all_links(get_next_target(page))
执行结果如下:
复制下来就是:
python@ubuntu:~$ python3 aa.py
/usr/lib/python3/dist-packages/bs4/__init__.py:166: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
To get rid of this warning, change this:
BeautifulSoup([your markup])
to this:
BeautifulSoup([your markup], "lxml")
markup_type=markup_type))
Traceback (most recent call last):
File "aa.py", line 35, in <module>
print_all_links(get_next_target(page))
File "aa.py", line 19, in get_next_target
end_quote = page.find('"', start_quote + 1)
TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'
这个是我自制的一个爬虫函数,然后出错了。这个到底错在哪里呢?看不懂
https://www.crummy.com/softwa...
find
的第一个参数必须是标签名,你传一个引号当然找不到