刚好学习到python爬取网站m3u8视频,爬取视频时遇到了些坑,记录一下
1、打开开发者工具,播放视频后,发现只有以.m3u8和.ts结尾的文件,所以下载m3u8文件,m3u8文件里面全是ts下载链接列表,还有加密key的链接,下载key文件
2、解密加密过后的ts视频需要安装AES库
pip install pycryptodome
如导入 from Crypto.Cipher import AES 报错,可以
C:\Python36\Lib\site-packages\crypto 改成
C:\Python36\Lib\site-packages\Crypto
3、python代码
# -*- coding:utf-8 -*-
import os
import sys
from importlib import reload
import requests
import datetime
from Crypto import Random
from Crypto.Cipher import AES
#根据m3u8文件下载地址获取m3u8文件内容,并截取ts下载连接
def download(url):
    """Fetch an m3u8 playlist, download each .ts segment it lists,
    AES-decrypt them when a key is set, and merge the pieces into one file.

    url: download URL of the m3u8 playlist.
    Raises BaseException when the URL does not point at an m3u8 playlist
    or no .ts segment links are found in it.
    """
    download_path = os.path.join(os.getcwd(), "download")
    if not os.path.exists(download_path):
        os.mkdir(download_path)
    # One sub-folder per run, named by timestamp, so runs never collide.
    download_path = os.path.join(
        download_path, datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))
    os.mkdir(download_path)

    all_content = requests.get(url).text  # playlist body
    if "#EXTM3U" not in all_content:
        raise BaseException("非M3U8的链接")

    # Collect the .ts segment links directly from the fetched playlist.
    # (The original read a separate local file through an undefined
    # global `m3u8_path`, which raised NameError when run standalone.)
    urls = []
    for line in all_content.splitlines():
        line = line.strip()
        if line.endswith(".ts"):
            # Resolve relative segment paths against the playlist URL.
            if not line.startswith("http"):
                line = url.rsplit("/", 1)[0] + "/" + line
            urls.append(line)
    print("地址列表", urls)

    if not urls:
        raise BaseException("未找到对应的下载链接")

    key = "26ebaf4ddfa89fad"  # AES-128 key; empty string disables decryption

    for index, pd_url in enumerate(urls):  # iterate over the ts links
        print("下载地址", pd_url)
        start = datetime.datetime.now().replace(microsecond=0)
        res = requests.get(pd_url)
        c_fule_name = pd_url.rsplit("/")[-1]
        print('保存地址', download_path, '保存名字', c_fule_name)
        if len(key):  # AES decryption branch
            # HLS uses an all-zero IV when the playlist's EXT-X-KEY tag
            # carries no IV attribute (RFC 8216). The original generated
            # a *random* IV, which corrupts the first 16 bytes of every
            # decrypted segment.
            iv = b"\x00" * AES.block_size
            cryptor = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv)
            with open(os.path.join(download_path, format(index) + ".mp4"), 'ab') as f:
                f.write(cryptor.decrypt(res.content))
        else:
            with open(os.path.join(download_path, c_fule_name), 'ab') as f:
                f.write(res.content)
        end = datetime.datetime.now().replace(microsecond=0)
        print("耗时:%s" % (end - start))

    # The original wrote `print` followed by a bare string literal on the
    # next line, which prints nothing in Python 3.
    print("下载完成")
    merge_file(download_path)
#合并ts文件
def merge_file(path):
    """Concatenate the downloaded pieces under *path* into a single
    new.mp4 (numeric order), deleting the individual pieces afterwards.

    The original shelled out to Windows `copy /b *`, which merges the
    pieces in directory (alphabetical) order — so 10.mp4 sorted before
    2.mp4 and the video was scrambled. Pure-Python concatenation in
    numeric order fixes the ordering and is portable.
    """
    def _order(name):
        # Numeric stems ("0", "1", ..., "10") sort numerically; any
        # other name falls back to plain name order after them.
        stem = os.path.splitext(name)[0]
        return (0, int(stem), "") if stem.isdigit() else (1, 0, name)

    pieces = sorted(
        (n for n in os.listdir(path) if n.endswith((".ts", ".mp4"))),
        key=_order)
    tmp_target = os.path.join(path, "new.tmp")
    with open(tmp_target, "wb") as out:
        for name in pieces:
            src = os.path.join(path, name)
            with open(src, "rb") as f:
                out.write(f.read())
            os.remove(src)  # mirror the original's `del /Q *.ts *.mp4`
    os.rename(tmp_target, os.path.join(path, "new.mp4"))
if __name__ == '__main__':
    # Example playlist URL; segments are saved under ./download/<timestamp>/
    # and merged at the end. Requires network access.
    url = "https://video.buycar5.cn/20200912/WPkXE88O/1000kb/hls/index.m3u8"
    download(url)
4、对于没有AES加密的的ts,直接下载ts后合并就行
import datetime
import requests
import os
import time
# Python的urllib3软件包的证书认证及警告的禁用
import urllib3
urllib3.disable_warnings()
# Path of the local m3u8 playlist file to parse.
m3u8_path = "C:\\Users\\Administrator\\Desktop\\报表文档\\index1.m3u8"
# Request headers; some hosts reject requests without a browser User-Agent.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"}
# 从m3u8文件中取出并生成ts文件的下载链接
def get_ts_urls(m3u8_path, base_url):
    """Read a local m3u8 playlist and return the list of .ts download URLs.

    m3u8_path: path of the playlist file on disk.
    base_url:  prefix joined onto relative segment paths; absolute
               (http/https) segment lines are returned unchanged.
               The original accepted but silently ignored base_url.
    """
    urls = []
    with open(m3u8_path, "r") as file:
        for line in file:
            line = line.strip()
            # Testing after strip() also catches a final .ts line that
            # has no trailing newline (the original's endswith(".ts\n")
            # missed it).
            if line.endswith(".ts"):
                if line.startswith(("http://", "https://")):
                    urls.append(line)
                else:
                    urls.append(base_url + line)
    print("地址列表", urls)
    return urls
'''
取出下载链接并下载
ts_urls 从文件获取到的ts下载连接列表
download_path 保存地址
ts文件合并命令: copy /b E:\pythonProject\tsfiles\*.ts D:\PySpider\new.mp4
'''
def download(ts_urls, download_path):
    """Download each .ts segment in *ts_urls* into *download_path*,
    naming the pieces 0.ts, 1.ts, ... in playlist order.

    Segments whose target file already exists are skipped, and a failed
    request skips only that segment instead of aborting the whole run
    (the original `return`-ed on the first error). The original also
    ignored *download_path* and hard-coded "E:/pythonProject/tsfiles".
    """
    os.makedirs(download_path, exist_ok=True)
    for i, ts_url in enumerate(ts_urls):
        ts_path = download_path + "/{0}.ts".format(i)
        print("存放地址:", ts_path)
        file_name = ts_url.split("/")[-1]
        if os.path.exists(ts_path):
            # Already downloaded in an earlier run — intent of the
            # original's dead `else` branch (`ts_path is not None` was
            # always true, so it never ran).
            print("{} 已经存在,开始下载下一个ts".format(file_name))
            continue
        print("ts_url", ts_url)
        print("开始下载 %s" % file_name)
        # time.sleep(0.5)  # throttle if the host rate-limits crawlers
        start = datetime.datetime.now().replace(microsecond=0)
        try:
            response = requests.get(headers=header, url=ts_url, stream=True)
        except Exception as e:
            print("异常请求:%s" % e.args)
            continue
        with open(ts_path, "wb+") as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)
        end = datetime.datetime.now().replace(microsecond=0)
        print("耗时:%s" % (end - start))
# 将已经下载的ts文件的路径进行排序
def file_walker(path):
    """Return the downloaded piece paths under *path*, sorted by their
    numeric file name (0.ts, 1.ts, 2.ts, ..., 10.ts).

    The original sorted with int(x[10:-3]), a hard-coded slice that only
    worked when *path* was exactly 10 characters ("./tsfiles/"); sorting
    on the basename's stem works for any path.
    """
    file_list = []
    for root, dirs, files in os.walk(path):  # generator over the tree
        for fn in files:
            file_list.append(str(root + '/' + fn))
    file_list.sort(key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
    print(file_list)
    return file_list
# 将所有下载好的ts文件组合成一个文件
# 0.: 下载好的一堆ts文件的文件夹
# combine_path: 组合好的文件的存放位置
# file_name: 组合好的视频文件的文件名
def combine(ts_path, combine_path, file_name):
    """Merge every piece under *ts_path* (numeric order via file_walker)
    into one file at combine_path + file_name + '.ts'.

    ts_path:      folder holding the downloaded .ts pieces.
    combine_path: folder for the combined output file.
    file_name:    output file name without extension.

    Uses `with` for the per-piece reads so handles are closed promptly —
    the original opened each piece without ever closing it.
    """
    print(ts_path)
    file_list = file_walker(ts_path)
    file_path = combine_path + file_name + '.ts'
    print(file_path)
    with open(file_path, 'wb+') as fw:
        for piece in file_list:
            with open(piece, 'rb') as fr:
                fw.write(fr.read())
if __name__ == '__main__':
    # base_url is joined onto relative segment paths found in the playlist.
    # urls = get_ts_urls(m3u8_path=m3u8_path, base_url="https://www.zhuticlub.com:65")
    urls = get_ts_urls(m3u8_path=m3u8_path, base_url="https://www.zhuticlub.com:65")
    download(urls, "./tsfiles")
    # NOTE(review): combine reads "./tsfiles/" while download wrote to
    # "./tsfiles" — same folder, trailing slash only.
    combine("./tsfiles/", "D://PySpider//", "西虹市首富")
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。