• Software Environment
  Ubuntu 20.04
  Python 3.8
  s3fs 0.4.2
# coding: utf-8
# author: qbit
# date: 2021-08-08
# summary: walk an S3 directory in the AWS China region

import s3fs

# If the operating system is already configured for command-line
# access to S3, these arguments can be omitted
fs = s3fs.S3FileSystem(
    key="aws_access_key_id",
    secret="aws_secret_access_key",
    client_kwargs={"region_name": "cn-northwest-1"}     # China (Ningxia) region
)

print(fs.ls("qbit-cn-northwest-1"))
for parent, dirs, files in fs.walk("qbit-cn-northwest-1/data"):
    for file in files:
        pathfile = f"{parent}/{file}"
        print(f"{fs.size(pathfile)/1024:8.2f} KB {pathfile}")
• Multiprocess sample code
# coding: utf-8
# author: qbit
# date: 2021-08-11
# summary: walk an S3 directory in the AWS China region with multiple processes

import time
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, as_completed

import s3fs

fs = s3fs.S3FileSystem()
s3root = 's3://qbit-cn-northwest-1/data'

def StatOneFile(s3path):
    r''' Stat one parquet file on S3 (return its size in bytes) '''
    return fs.size(s3path)

def StatFiles():
    print(fs.ls(s3root))
    startTime = time.time()
    infileCount = 0
    fileCount = 0
    maxFileSize = 0
    totalFileSize = 0
    batchList = list()
    for parent, dirs, files in fs.walk(s3root):
        for file in files:
            s3path = f's3://{parent}/{file}'
            batchList.append(s3path)
            infileCount += 1
            if len(batchList) >= 128:
                with ProcessPoolExecutor(max_workers=8) as executor:
                    futureDict = {executor.submit(StatOneFile, s3path): s3path for s3path in batchList}
                    for future in as_completed(futureDict):
                        s3path = futureDict[future]
                        filesize = future.result()
                        fileCount += 1
                        totalFileSize += filesize
                        if maxFileSize < filesize:
                            maxFileSize = filesize
                batchList.clear()   # reset the batch
                print(f'cost: {(time.time()-startTime):.2f} s, fileCount: {fileCount}, maxFileSize: {maxFileSize} B, totalFileSize: {totalFileSize} B')
    if len(batchList) > 0:
        with ProcessPoolExecutor(max_workers=8) as executor:
            futureDict = {executor.submit(StatOneFile, s3path): s3path for s3path in batchList}
            for future in as_completed(futureDict):
                s3path = futureDict[future]
                filesize = future.result()
                fileCount += 1
                totalFileSize += filesize
                if maxFileSize < filesize:
                    maxFileSize = filesize
        batchList.clear()   # reset the batch
    print(f'cost: {(time.time()-startTime):.2f} s, fileCount: {fileCount}, maxFileSize: {maxFileSize} B, totalFileSize: {totalFileSize} B')

if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')
    StatFiles()
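
One cost in the code above is that a fresh 8-worker ProcessPoolExecutor is created and torn down for every batch of 128 files. A minimal alternative sketch (an assumed variant, not the author's code) collects all paths first and streams them through a single long-lived pool with executor.map; with the 'spawn' start method each worker re-imports the module and builds its own S3FileSystem handle, so no connection is shared across processes.

# coding: utf-8
# Minimal sketch (assumed variant, not the original code): one long-lived
# process pool that stats every file found under the prefix.
import time
import multiprocessing
from concurrent.futures import ProcessPoolExecutor

import s3fs

fs = s3fs.S3FileSystem()
s3root = 's3://qbit-cn-northwest-1/data'

def StatOneFile(s3path):
    r''' Return the size in bytes of one file on S3 '''
    return fs.size(s3path)

def StatFiles():
    startTime = time.time()
    # Collect every file path under the prefix first.
    pathList = [f's3://{parent}/{file}'
                for parent, dirs, files in fs.walk(s3root)
                for file in files]
    # One pool for the whole job; chunksize reduces inter-process overhead.
    with ProcessPoolExecutor(max_workers=8) as executor:
        sizeList = list(executor.map(StatOneFile, pathList, chunksize=16))
    print(f'cost: {(time.time()-startTime):.2f} s, '
          f'fileCount: {len(sizeList)}, '
          f'maxFileSize: {max(sizeList, default=0)} B, '
          f'totalFileSize: {sum(sizeList)} B')

if __name__ == '__main__':
    multiprocessing.set_start_method('spawn')
    StatFiles()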
This article is from qbit snap
