2

刚好学习到python爬取网站m3u8视频,爬取视频时遇到了些坑,记录一下

1、打开开发者工具,播放视频后,发现只有以.m3u8和.ts结尾的文件,所以下载m3u8文件,m3u8文件里面全是ts下载链接列表,还有加密key的链接,下载key文件

2、解密加密过后的ts视频需要安装AES库

  pip install  pycryptodome

如导入 from Crypto.Cipher import AES 报错,可以

  C:\Python36\Lib\site-packages\crypto 改成
  C:\Python36\Lib\site-packages\Crypto

3、python代码

# -*- coding:utf-8 -*-  
import os
import sys
from importlib import reload
import requests
import datetime
from Crypto import Random
from Crypto.Cipher import AES



# Fetch the m3u8 playlist at *url*, download every .ts segment it lists,
# AES-decrypt the segments, and merge them into one file.
def download(url):
    """Download and decrypt all .ts segments of the m3u8 playlist at *url*.

    Segments are saved under ./download/<timestamp>/ and finally merged by
    merge_file(). Raises BaseException when the URL is not an m3u8 playlist
    or when no segment links are found.
    """
    download_path = os.path.join(os.getcwd(), "download")
    if not os.path.exists(download_path):
        os.mkdir(download_path)

    # One sub-folder per run, named by timestamp, so runs never collide.
    download_path = os.path.join(download_path, datetime.datetime.now().strftime('%Y%m%d_%H%M%S'))
    os.mkdir(download_path)

    all_content = requests.get(url).text  # playlist body
    if "#EXTM3U" not in all_content:
        raise BaseException("非M3U8的链接")

    # Collect segment links straight from the playlist we just fetched.
    # (The original read a local file via an undefined global `m3u8_path`.)
    urls = []
    for line in all_content.splitlines():
        line = line.strip()
        if line.endswith(".ts"):
            urls.append(line)

    print("地址列表", urls)
    if not urls:
        raise BaseException("未找到对应的下载链接")

    # NOTE(review): the key is hard-coded; a real playlist carries it in the
    # #EXT-X-KEY line (URI/METHOD parsing was commented out in the original).
    key = "26ebaf4ddfa89fad"
    for index, pd_url in enumerate(urls):
        print("下载地址", pd_url)
        start = datetime.datetime.now().replace(microsecond=0)
        res = requests.get(pd_url)
        c_fule_name = pd_url.rsplit("/")[-1]
        print('保存地址', download_path, '保存名字', c_fule_name)

        if len(key):  # AES decrypt
            # When #EXT-X-KEY gives no IV, HLS defaults the CBC IV to the
            # segment's media sequence number (big-endian, 16 bytes).
            # The original used a *random* IV, which corrupts the first
            # 16-byte block of every segment.
            iv = index.to_bytes(16, 'big')
            cryptor = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv)
            with open(os.path.join(download_path, format(index) + ".mp4"), 'ab') as f:
                f.write(cryptor.decrypt(res.content))
        else:
            with open(os.path.join(download_path, c_fule_name), 'ab') as f:
                f.write(res.content)

        end = datetime.datetime.now().replace(microsecond=0)
        print("耗时:%s" % (end - start))

    print("下载完成")
    merge_file(download_path)


# Merge every segment file in *path* into a single new.mp4 (Windows only).
def merge_file(path):
    """Binary-concatenate all files in *path* into new.mp4 via `copy /b`."""
    os.chdir(path)
    # Concatenate into a temp file first, then remove the individual
    # segments, and finally give the result its proper name.
    for command in ("copy /b * new.tmp", "del /Q *.ts", "del /Q *.mp4"):
        os.system(command)
    os.rename("new.tmp", "new.mp4")



 if __name__ == '__main__':
 url = "https://video.buycar5.cn/20200912/WPkXE88O/1000kb/hls/index.m3u8"
 download(url)

4、对于没有AES加密的ts,直接下载ts后合并就行

import datetime
import requests
import os
import time

# Silence urllib3's certificate-verification warnings so console output
# stays readable when requests is used against HTTPS hosts.
import urllib3

urllib3.disable_warnings()

# Local filesystem path of the already-downloaded m3u8 playlist file.
m3u8_path = "C:\\Users\\Administrator\\Desktop\\报表文档\\index1.m3u8"
# Request header; some hosts reject requests without a browser User-Agent.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"}


# Read the local m3u8 playlist and build the list of .ts segment URLs.
def get_ts_urls(m3u8_path, base_url):
    """Return the .ts segment URLs listed in the playlist at *m3u8_path*.

    Relative segment names are resolved against *base_url* (the original
    accepted this parameter but never used it, so relative entries came
    back as bare file names); absolute URLs pass through unchanged.
    """
    urls = []
    with open(m3u8_path, "r") as file:
        for line in file:
            # strip() (instead of requiring a trailing "\n") also catches
            # a final segment line with no newline at end of file.
            line = line.strip()
            if line.endswith(".ts"):
                if not line.startswith(("http://", "https://")):
                    line = base_url + line
                urls.append(line)

    print("地址列表", urls)
    return urls

'''
取出下载链接并下载  
ts_urls   从文件获取到的ts下载连接列表
download_path    保存地址
  ts文件合并命令:  copy /b E:\pythonProject\tsfiles\*.ts   D:\PySpider\new.mp4
'''
def download(ts_urls, download_path):
    """Download every segment in *ts_urls* into *download_path*.

    Segments are saved as "<index>.ts" so the later merge keeps playback
    order; segments already on disk are skipped.
    """
    # Make sure the target directory exists before writing into it.
    os.makedirs(download_path, exist_ok=True)

    for i, ts_url in enumerate(ts_urls):
        # The original hard-coded "E:/pythonProject/tsfiles" and silently
        # ignored the download_path parameter.
        ts_path = os.path.join(download_path, "{0}.ts".format(i))
        print("存放地址:", ts_path)
        file_name = ts_url.split("/")[-1]

        # Skip already-downloaded segments. The original tested
        # `ts_path is not None`, which is always true, so its
        # "already exists" branch was dead code.
        if os.path.exists(ts_path):
            print("{} 已经存在,开始下载下一个ts".format(file_name))
            continue

        print("ts_url", ts_url)
        print("开始下载 %s" % file_name)
        # time.sleep(0.5)  # throttle if the host starts rejecting requests
        start = datetime.datetime.now().replace(microsecond=0)
        try:
            response = requests.get(headers=header, url=ts_url, stream=True)
        except Exception as e:
            print("异常请求:%s" % e.args)
            return

        # Stream to disk in 1 KiB chunks instead of holding the whole
        # segment in memory.
        with open(ts_path, "wb+") as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)

        end = datetime.datetime.now().replace(microsecond=0)
        print("耗时:%s" % (end - start))


# Collect the downloaded ts file paths and sort them into playback order.
def file_walker(path):
    """Return all file paths under *path*, sorted by numeric file stem.

    Segments are saved as "<index>.ts", so sorting by int(stem) restores
    playback order (a plain string sort would put "10.ts" before "2.ts").
    """
    file_list = []
    for root, dirs, files in os.walk(path):  # generator over the tree
        for fn in files:
            file_list.append(str(root + '/' + fn))
    # The original sorted with int(x[10:-3]), a hard-coded slice that only
    # matched the exact prefix "./tsfiles/"; derive the numeric index from
    # the basename so any directory works.
    file_list.sort(key=lambda p: int(os.path.splitext(os.path.basename(p))[0]))
    print(file_list)
    return file_list


# 将所有下载好的ts文件组合成一个文件
# 0.: 下载好的一堆ts文件的文件夹
# combine_path: 组合好的文件的存放位置
# file_name: 组合好的视频文件的文件名
def combine(ts_path, combine_path, file_name):
    print(ts_path)
    file_list = file_walker(ts_path)
    file_path = combine_path + file_name + '.ts'
    print(file_path)
    with open(file_path, 'wb+') as fw:
        for i in range(len(file_list)):
            fw.write(open(file_list[i], 'rb').read())


if __name__ == '__main__':
    # Parse the local playlist into segment URLs, download the segments
    # into ./tsfiles, then merge them into one video file.
    urls = get_ts_urls(m3u8_path=m3u8_path, base_url="https://www.zhuticlub.com:65")
    download(urls, "./tsfiles")
    combine("./tsfiles/", "D://PySpider//", "西虹市首富")

最後の溫柔
9 声望0 粉丝