代码1:
#!/usr/bin/env python
# coding=utf-8
#import importlib,sys
#import sys
#sys.setdefaultencoding('gbk')
'''import sys
import imp
import sys
reload(sys)
sys.setdefaultencoding('utf8')
'''
'''
import sys
sys.setdefaultencoding('utf-8')
import jieba
import json'''
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse as parse
import ssl
import re
import os,os.path
import codecs
import requests
def getHtml(url):
    """Fetch *url* and return the raw response body as bytes.

    The body is also stored in the module-level ``html`` variable, which
    the original implementation assigned as a side effect.

    Args:
        url: Any URL that ``urllib.request.urlopen`` can open.

    Returns:
        The undecoded response body (``bytes``).
    """
    global html
    # The context manager closes the response even when read() raises,
    # so the underlying connection is never leaked.
    with urllib.request.urlopen(url) as page:
        html = page.read()
    return html
def file(url1, file_name, name):
    """Download *url1* and write the response body to the file *name*.

    Args:
        url1: URL of the archive to fetch.
        file_name: Only printed after the download; it is not used as a
            path (the caller passes its save directory here).
        name: Path of the file the downloaded bytes are written to.

    Side effects:
        Increments the module-level counter ``i`` and prints the
        module-level counter ``x``; both must exist before the call.
    """
    global i
    print(url1)
    # No explicit 'Host' or 'Accept-Encoding': requests derives Host from
    # the URL (and keeps it right across redirects), and it advertises only
    # the content encodings it can actually decode.  Forcing 'br'/'sdch'
    # could leave still-compressed bytes in ``request.content``.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
        'Referer': 'https://www.lfd.uci.edu/~gohlke/pythonlibs/',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    request = requests.get(url=url1, headers=headers)
    i += 1
    # 'with' guarantees the file is flushed and closed even if write() fails.
    with open(name, 'wb') as out:
        out.write(request.content)
    print(file_name)
    print("Completed : .... %d ..." % x)
# Downloads are written to (and existing files are looked up in) the
# current working directory.
save_path = os.getcwd()
url = 'https://www.lfd.uci.edu/'
# NOTE(review): the page fetched here is overwritten immediately by the
# hard-coded HTML sample assigned to ``html`` right below, so this request
# has no effect on the rest of the script.
html = getHtml(url)
html='''
<li><a href="javascript:;" onclick=" javascript:dl([101,105,52,56,106,100,50,118,54,95,110,53,119,47,116,99,51,104,113,108,45,112,115,97,46,49,109], "E53AD6:=<F4?C5G5GHC>D?2C>D?2C;098FI471G;@B"); "javascript: dl("" title="[614 KB] [Oct 17, 2019]">ad3‑2.2.1‑cp38‑cp38‑win_amd64.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,108,116,49,47,105,53,99,50,104,97,119,46,115,100,113,51,110,112,118,56,106,45], "<7D>AB5139=?E7;7;2E6A?CE6A?CE:4@?7;:80"); "javascript: dl("" title="[544 KB] [Oct 17, 2019]">ad3‑2.2.1‑cp38‑cp38‑win32.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,52,108,119,95,116,104,109,45,100,47,50,55,51,99,105,46,49,53,97,106,115,113,112,110,54,118], "D:CEFIA49B8<7:?:?@7=F<;7=F<;672>G3B68H0?251"); "javascript: dl("" title="[609 KB] [Oct 02, 2018]">ad3‑2.2.1‑cp37‑cp37m‑win_amd64.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,100,50,106,53,51,116,105,115,49,108,46,97,45,104,118,47,119,113,55,112,110,109,99], "712AC>35?;04<1:1:8<FC4B<FC4BE<@6D41:@=9"); "javascript: dl("" title="[540 KB] [Oct 02, 2018]">ad3‑2.2.1‑cp37‑cp37m‑win32.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,119,50,116,49,54,115,112,106,104,99,51,100,46,108,109,95,97,53,110,118,113,105,47,45,52], "517D6CA2F@;:G1<1<3G96:4G96:4>G0EB?@>;4H<08="); "javascript: dl("" title="[598 KB] [Oct 02, 2018]">ad3‑2.2.1‑cp36‑cp36m‑win_amd64.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,110,54,49,113,104,45,108,99,53,116,46,106,115,51,112,50,109,97,118,119,105,47,100], "<?;3>B89EAF=5?:?:257>=157>=1@5CD0=?:C46"); "javascript: dl("" title="[534 KB] [Oct 02, 2018]">ad3‑2.2.1‑cp36‑cp36m‑win32.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,53,119,105,112,46,51,104,106,108,95,49,97,47,118,54,109,45,52,99,110,116,113,115,100,50], "FH7E3=0D<B350<;G5@H4H4:@B350@B350?@12C9;?G>A4168"); "javascript: dl("" title="[596 KB] [Oct 02, 2018]">ad3‑2.2.1‑cp35‑cp35m‑win_amd64.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,46,104,53,109,47,99,106,115,50,110,119,100,49,113,118,108,51,45,116,112,97,105], "786=C>2B45C@24D;@A8080<A5C@2A5C@23A:E9@80:1?"); "javascript: dl("" title="[533 KB] [Oct 02, 2018]">ad3‑2.2.1‑cp35‑cp35m‑win32.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,106,45,105,47,112,95,115,104,119,49,109,116,52,51,53,108,118,97,99,110,113,50,46,100,54], "6E0D4@>;3B4=<3AG=1EFEF91B4=<1B4=<:182C5A:GH<F87?"); "javascript: dl("" title="[599 KB] [Oct 02, 2018]">ad3‑2.2.1‑cp34‑cp34m‑win_amd64.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,97,119,113,51,109,49,118,47,110,100,104,50,53,106,112,45,99,108,52,115,46,116,105], "C;=2>6<E7@>3B7093?;D;D5?@>3B?@>3B4?1F83;D1:A"); "javascript: dl("" title="[539 KB] [Oct 02, 2018]">ad3‑2.2.1‑cp34‑cp34m‑win32.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,52,49,116,45,53,100,51,97,109,118,54,113,105,46,50,106,95,112,119,47,55,108,115,110,104,99], "F>?;A942CIA>DC7563>=>=13IA>D3IA>D83B<G@785:0=BHE"); "javascript: dl("" title="[642 KB] [Oct 02, 2018]">ad3‑2.2.1‑cp27‑cp27m‑win_amd64.whl</a></li>
<li><a href="javascript:;" onclick=" javascript:dl([101,46,53,49,106,110,108,47,50,99,118,105,112,109,116,119,51,97,100,113,45,115,104,55], "D73B;91=68;7F6@A?C70702C8;7FC8;7F<C>:4?70>E5"); "javascript: dl("" title="[556 KB] [Oct 02, 2018]">ad3‑2.2.1‑cp27‑cp27m‑win32.whl</a></li>
</ul>
'''
print('html done')

# Link text of an archive anchor: everything between the ']">' that closes
# the title attribute and the next '<', provided it ends in a known archive
# extension.
_ARCHIVE_LINK_TEXT = re.compile(r'\]">([^<>"]+?\.(?:whl|gz|zip))(?=<)')


def _archive_names(page_text):
    """Return the archive file names shown as link text in *page_text*.

    Anchors whose text does not end in .whl/.gz/.zip are ignored instead of
    being kept as a raw 59-character slice of markup.
    """
    # The page text uses U+2011 (non-breaking hyphen) inside file names.
    # It cannot be printed on consoles with legacy code pages such as gbk,
    # and NOTE(review): the download server is expected to name the files
    # with plain ASCII '-' -- confirm against a real download URL.
    return [text.replace('\u2011', '-')
            for text in _ARCHIVE_LINK_TEXT.findall(page_text)]


name_list = html
x = 1  # 1-based number of the download currently in progress
files = os.listdir(save_path)
print(files)
print(type(name_list))
# str() on bytes would yield the "b'...'" repr with escaped non-ASCII
# characters, so decode a raw page body instead.
if isinstance(name_list, bytes):
    name_list = name_list.decode('utf-8', errors='replace')
else:
    name_list = str(name_list)
name_list1 = _archive_names(name_list)
i = 0  # download counter incremented inside file()
print(name_list1)
for name in name_list1:
    # Skip archives that are already present in the save directory.
    if name in files:
        continue
    print('no:' + str(x))
    print('\ndownload ' + name)
    string = 'https://download.lfd.uci.edu/pythonlibs/s2jqpv5t/' + name
    print('00' + save_path)
    file(string, save_path, name)
    x = x + 1
print('09' + name_list)
print('finished')
'''
'''
import sys
sys.setdefaultencoding('utf-8')
import jieba
import json'''
#from bs4 import BeautifulSoup
import urllib.request
import urllib.parse as parse
import ssl
import re
import os,os.path
import codecs
import requests
def getText(html):
    """Request the page at *html* and return its body as text.

    Despite its name, the ``html`` parameter is the URL to fetch.

    Args:
        html: URL passed to ``requests.get``.

    Returns:
        The response body decoded as UTF-8 (``str``).
    """
    res = requests.get(html)
    # Force UTF-8 rather than relying on the encoding requests guesses
    # from the response headers.
    res.encoding = 'utf-8'
    return res.text
def file(url1, file_name, name):
    """Download *url1* and write the response body to the file *name*.

    Args:
        url1: URL of the archive to fetch.
        file_name: Only printed after the download; it is not used as a
            path (the caller passes its save directory here).
        name: Path of the file the downloaded bytes are written to.

    Side effects:
        Increments the module-level counter ``i`` and prints the
        module-level counter ``x``; both must exist before the call.
    """
    global i
    print(url1)
    # The original sent 'Host: https://files.pythonhosted.org/packages/'.
    # A Host header must carry only the host name (optionally a port), so a
    # full URL there is malformed and can make the server reject the
    # request.  requests fills in the correct Host from the URL on its own.
    # 'Accept-Encoding' is likewise left to requests so that it advertises
    # only encodings it can decode ('br'/'sdch' may not be supported).
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
        'Referer': 'https://pypi.org/',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    request = requests.get(url=url1, headers=headers)
    i += 1
    # 'with' guarantees the file is flushed and closed even if write() fails.
    with open(name, 'wb') as out:
        out.write(request.content)
    print(file_name)
    print("Completed : .... %d ..." % x)
'''for i in range(len(name_list)):
j=0
if name_list[i-24:i+1]=='https://pypi.org/project/':
name_list1.append(name_list[i+1:i+60])'''
def get(url):
    """Collect project links from the page at *url*.

    For every occurrence of '/project/' in the page text, appends
    'https://pypi.org' plus the 28 characters starting at that occurrence
    (the marker and the 19 characters after it) to the module-level list
    ``name_list1``.  The appended strings still contain trailing markup.

    Args:
        url: URL of the page to scan, fetched through ``getText``.

    Side effects:
        Appends to ``name_list1`` and prints the module-level ``count``.
    """
    global name_list1
    res = getText(url)
    print('html done,page:' + str(count) + '\n')
    marker = '/project/'
    window = len(marker) + 19
    # str.find walks the occurrences directly instead of slicing the page
    # once per character.
    start = res.find(marker)
    while start != -1:
        name_list1.append('https://pypi.org' + res[start:start + window])
        start = res.find(marker, start + 1)
def trim(list1):
    """Cut each link in *list1* at the first '/' after the project prefix.

    Each entry is searched for a '/' starting at index 25, which is the
    length of 'https://pypi.org/project/'.  The text before that slash is
    kept; entries with no such slash are dropped.

    Args:
        list1: Iterable of link strings.

    Returns:
        A new list with the trimmed links, in input order.
    """
    prefix_len = len('https://pypi.org/project/')  # == 25
    list2 = []
    for link in list1:
        end = link.find('/', prefix_len)
        if end != -1:
            list2.append(link[:end])
    return list2
def get1(url):
    """Collect candidate download links from a project's '#files' page.

    Requests ``url + '#files'`` and, for every occurrence of
    'https://files' in the page text, appends the 183 characters starting
    at that occurrence (the marker plus the 170 characters after it) to the
    module-level list ``namelist``.  The chunks still contain trailing
    markup that the caller has to cut off.

    Args:
        url: Project page URL without the '#files' fragment.

    Side effects:
        Appends to ``namelist``.
    """
    global namelist
    url = url + '#files'
    res = requests.get(url)
    res.encoding = 'utf-8'
    page = res.text
    marker = 'https://files'
    window = len(marker) + 170
    # str.find walks the occurrences directly instead of slicing the page
    # once per character.
    start = page.find(marker)
    while start != -1:
        namelist.append(page[start:start + window])
        start = page.find(marker, start + 1)
# Downloads are written to (and existing files are looked up in) the
# current working directory.
save_path = os.getcwd()
x = 1  # 1-based number of the download currently in progress
files = os.listdir(save_path)

# A download URL at the start of a chunk: no quotes, whitespace or angle
# brackets, ending at the first archive extension.
_ARCHIVE_URL = re.compile(r'[^"\'\s<>]+?\.(?:whl|gz|zip)')


def _archive_url(chunk):
    """Return the archive URL that *chunk* starts with, or None."""
    match = _ARCHIVE_URL.match(chunk)
    return match.group(0) if match else None


# Collect project links from the search-result pages; get() reads the
# module-level ``count`` for its progress message.
name_list1 = []
for count in range(1):
    get('https://pypi.org/search/?c=Programming+Language+%3A%3A+Python+%3A%3A+3&page=' + str(count))

# Collect download-link chunks from the first four project pages.
namelist = []
h = 0
for y in trim(name_list1):
    get1(y)
    print(namelist)
    if h == 3:
        break
    h += 1

i = 0  # download counter incremented inside file()
for chunk in namelist:
    name = _archive_url(chunk)
    # Chunks without an archive URL used to fall through with a stale cut
    # position and a stale (or unbound) ``filename``; skip them instead.
    if name is None:
        continue
    filename = name.rsplit('/', 1)[-1]
    if filename in files:
        continue
    print('no:' + str(x))
    print('\ndownload ' + name)
    print('00' + save_path)
    file(name, save_path, filename)
    # Remember the file so a repeated link on the page is not fetched again.
    files.append(filename)
    print('\n........' + filename + '..........complete\n')
    x = x + 1
print('09')
print('finished')
求高手解决