#coding=gbk
import os
import re
import urllib
import urllib.error
import urllib.request

import requests
# headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36"}
# Listing page that is handed to craw() when this script runs.
# The dots in the address were previously written as the character "点",
# which made the URL impossible to resolve.
url = "http://www.58pic.com/tupian/xinnian-0-0-1.html"
def craw(
    url,
    save_dir="D:/a1/",
    pattern=r"http://pic\.qiantucdn\.com.+?/format/webp",
):
    """Download every image linked from the page at *url*.

    The page is fetched and scanned with *pattern*; each whole match is
    treated as an image URL and saved as ``<save_dir>/<n>.webp``, where
    ``n`` is the 1-based position of the match on the page.  A download
    that fails with ``URLError`` is reported and skipped, so its number is
    simply left unused.

    Args:
        url: Address of the page to scan for image links.
        save_dir: Directory that receives the files; created if missing.
        pattern: Regular expression whose *entire* match is an image URL.

    Returns:
        The paths of the files that were downloaded, in page order.
    """
    with urllib.request.urlopen(url) as response:
        # Decode the body properly: str(bytes) would give the "b'...'" repr,
        # not the markup itself.
        charset = response.headers.get_content_charset() or "utf-8"
        html = response.read().decode(charset, errors="replace")

    # finditer + group(0) yields the full URL even if the pattern contains
    # capture groups (findall would return tuples of the groups instead).
    image_urls = [match.group(0) for match in re.finditer(pattern, html)]
    print(image_urls)

    os.makedirs(save_dir, exist_ok=True)
    saved = []
    for index, image_url in enumerate(image_urls, start=1):
        target = os.path.join(save_dir, str(index) + ".webp")
        try:
            # Fetch this single URL (not the whole list of matches).
            urllib.request.urlretrieve(image_url, filename=target)
        except urllib.error.URLError as err:
            print("failed to download {}: {}".format(image_url, err))
            continue
        print(image_url)
        saved.append(target)
    return saved
# Start the crawl on the module-level `url` as soon as this file is executed.
craw(url)
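# A suggestion: the page's DOM structure is very clear, so it would be
# simpler to extract the image links with the lxml module's XPath support
# instead of a regular expression.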