Ubuntu 20.04
Python 3.8
s3fs 0.4.2
# coding: utf-8
# author: qbit
# date: 2021-08-08
# summary: 遍历中国区 S3 目录
import s3fs
# If the operating system is already configured for command-line access to S3,
# the constructor arguments below can be omitted.
fs = s3fs.S3FileSystem(
    key="aws_access_key_id",
    secret="aws_secret_access_key",
    client_kwargs={"region_name": "cn-northwest-1"},  # China (Ningxia) region
)

# Show the entries at the top level of the bucket.
print(fs.ls("qbit-cn-northwest-1"))

# Walk everything below the data prefix and print each file's size in KB.
for parent, _dirs, names in fs.walk("qbit-cn-northwest-1/data"):
    for name in names:
        pathfile = f"{parent}/{name}"
        size_kb = fs.size(pathfile) / 1024
        print(f"{size_kb:8.2f} KB {pathfile}")
# coding: utf-8
# author: qbit
# date: 2021-08-11
# summary: 多进程遍历中国区 S3 目录
import time
import multiprocessing
from concurrent.futures import ProcessPoolExecutor, as_completed
import s3fs
# Shared S3 filesystem handle, built without explicit credentials
# (presumably they are picked up from the environment's AWS configuration).
# It lives at module level so that worker processes, which re-import this
# module under the 'spawn' start method, each get their own instance.
fs = s3fs.S3FileSystem()
# Root prefix whose files are walked and measured.
s3root = 's3://qbit-cn-northwest-1/data'
def StatOneFile(s3path):
    ''' Return the size reported by ``fs.size`` for one S3 object.

    The original author used this for parquet files, but nothing here is
    parquet-specific. Runs inside a pool worker process.
    '''
    filesize = fs.size(s3path)
    return filesize
def _StatBatch(executor, batch):
    ''' Size every path in ``batch`` on ``executor``.

    Returns a ``(count, totalSize, maxSize)`` tuple for the batch. Any
    exception raised in a worker is re-raised here by ``future.result()``.
    '''
    futures = [executor.submit(StatOneFile, s3path) for s3path in batch]
    count = 0
    totalSize = 0
    maxSize = 0
    for future in as_completed(futures):
        filesize = future.result()
        count += 1
        totalSize += filesize
        maxSize = max(maxSize, filesize)
    return count, totalSize, maxSize


def StatFiles(batchSize=128, maxWorkers=8):
    ''' Walk ``s3root`` and report file count, largest size and total size.

    File paths are collected into batches of ``batchSize`` and sized in
    parallel by a pool of ``maxWorkers`` processes. A progress line is
    printed after every full batch and a summary line is always printed at
    the end.

    The pool is created once and reused for all batches; starting a new
    pool per batch is costly, especially with the 'spawn' start method.
    '''
    print(fs.ls(s3root))
    startTime = time.time()
    fileCount = 0
    maxFileSize = 0
    totalFileSize = 0
    batchList = []

    with ProcessPoolExecutor(max_workers=maxWorkers) as executor:

        def Flush():
            ''' Size the pending batch (if any), fold it into the totals, print progress. '''
            nonlocal fileCount, maxFileSize, totalFileSize
            if batchList:  # nothing to submit when the batch is empty
                count, totalSize, maxSize = _StatBatch(executor, batchList)
                fileCount += count
                totalFileSize += totalSize
                maxFileSize = max(maxFileSize, maxSize)
                batchList.clear()
            print(f'cost: {(time.time()-startTime):.2f} s, fileCount: {fileCount}, maxFileSize: {maxFileSize} B, totalFileSize: {totalFileSize} B')

        for parent, dirs, files in fs.walk(s3root):
            for file in files:
                batchList.append(f's3://{parent}/{file}')
                if len(batchList) >= batchSize:
                    Flush()

        # Handle the leftover partial batch and print the final summary.
        Flush()
if __name__ == '__main__':
    # Select the 'spawn' start method before any pool is created, so worker
    # processes start from a fresh interpreter instead of a forked copy.
    multiprocessing.set_start_method('spawn')
    StatFiles()
本文出自 qbit snap
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。