使用 modelscope 跑 OpenBMB/MiniCPM-V-2_6-int4,绕开 huggingface 的网络封锁

完整版

https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6

# test.py
import torch
from PIL import Image
from modelscope import AutoModel, AutoTokenizer

# Load the checkpoint in bfloat16, switch it to eval mode and move it to the GPU.
# attn_implementation may be 'sdpa' or 'flash_attention_2'; 'eager' is not an option here.
model = (
    AutoModel.from_pretrained(
        'OpenBMB/MiniCPM-V-2_6',
        trust_remote_code=True,
        attn_implementation='sdpa',
        torch_dtype=torch.bfloat16,
    )
    .eval()
    .cuda()
)
tokenizer = AutoTokenizer.from_pretrained(
    'OpenBMB/MiniCPM-V-2_6',
    trust_remote_code=True,
)

# One user turn whose content is the RGB image followed by the text question.
question = 'What is in the image?'
image = Image.open('image.png').convert('RGB')
msgs = [{'role': 'user', 'content': [image, question]}]

# Blocking request: the image is carried inside msgs, so the image argument is None.
res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
print(res)

# Streaming request: both sampling=True and stream=True must be set;
# model.chat then returns a generator of text pieces instead of the full answer.
res = model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    stream=True,
)

# Echo each piece as soon as it arrives while also keeping the complete answer.
generated_text = ""
for new_text in res:
    generated_text = generated_text + new_text
    print(new_text, end='', flush=True)

int4 量化版本

https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6-int4

import torch
from PIL import Image
from modelscope import AutoModel, AutoTokenizer
from decord import VideoReader, cpu    # pip install decord

# Load the int4 checkpoint and its tokenizer from ModelScope.
# No dtype, attention implementation or explicit .cuda() call is given here
# (presumably the quantized checkpoint handles dtype and device placement
# itself -- confirm against the model card).
model = AutoModel.from_pretrained('OpenBMB/MiniCPM-V-2_6-int4', trust_remote_code=True)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6-int4', trust_remote_code=True)

# Upper bound on how many frames encode_video keeps from a single video.
MAX_NUM_FRAMES = 64

def encode_video(video_path):
    """Sample frames from a video and return them as PIL images.

    Frame indices are taken every ``sample_fps`` frames, where ``sample_fps``
    is the video's average frame rate rounded to an integer. If that yields
    more than ``MAX_NUM_FRAMES`` indices, they are thinned to exactly
    ``MAX_NUM_FRAMES`` evenly spaced ones.

    Args:
        video_path: Path of the video, passed to decord's ``VideoReader``.

    Returns:
        A list of ``PIL.Image`` objects, one per sampled frame.
    """
    def uniform_sample(items, n):
        # Split items into n equal-width bins and pick the element at the
        # midpoint of each bin.
        gap = len(items) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [items[i] for i in idxs]

    vr = VideoReader(video_path, ctx=cpu(0))
    # Step between sampled frame indices. round() yields 0 for videos slower
    # than 0.5 fps and range() rejects a zero step, so clamp the step to 1.
    sample_fps = max(1, round(vr.get_avg_fps() / 1))  # FPS
    frame_idx = list(range(0, len(vr), sample_fps))
    if len(frame_idx) > MAX_NUM_FRAMES:
        frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
    # Decode the selected frames in one batch, then wrap each as a PIL image.
    frames = vr.get_batch(frame_idx).asnumpy()
    frames = [Image.fromarray(v.astype('uint8')) for v in frames]
    print('num frames:', len(frames))
    return frames

# Local clip to describe, and the text question asked about it.
video_path = "(讲真话)甄嬛组团逗鸟,全程阴阳怪气没有一句废话#甄嬛传 #甄嬛传十级观众 #甄嬛传解说 #抖音野生娱评人_e4c12bb53bcee4a00bc62b297f0ebaba.mp4"
question = "Describe the video"

# Single user turn: every sampled frame first, then the question.
frames = encode_video(video_path)
msgs = [{'role': 'user', 'content': [*frames, question]}]

# Decode params for video
params = {
    "use_image_id": False,
    # can be set to 1 if CUDA runs out of memory and the video resolution exceeds 448*448
    "max_slice_nums": 1,
}

# The frames are carried inside msgs, so the image argument is None.
answer = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, **params)
print(answer)

universe_king
3.4k 声望 · 680 粉丝