Use lazy import for flash_attn
#12
by
HwwwH
- opened
README.md
CHANGED
@@ -1,17 +1,7 @@
|
|
1 |
---
|
2 |
-
pipeline_tag:
|
3 |
datasets:
|
4 |
- openbmb/RLAIF-V-Dataset
|
5 |
-
library_name: transformers
|
6 |
-
language:
|
7 |
-
- multilingual
|
8 |
-
tags:
|
9 |
-
- minicpm-v
|
10 |
-
- vision
|
11 |
-
- ocr
|
12 |
-
- multi-image
|
13 |
-
- video
|
14 |
-
- custom_code
|
15 |
---
|
16 |
|
17 |
<h1>A GPT-4V Level MLLM for Single Image, Multi Image and Video on Your Phone</h1>
|
@@ -73,8 +63,7 @@ Note: For proprietary models, we calculate token density based on the image enco
|
|
73 |
<summary>Click to view video results on Video-MME and Video-ChatGPT.</summary>
|
74 |
<div align="center">
|
75 |
|
76 |
-
|
77 |
-
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64abc4aa6cadc7aca585dddf/jmrjoRr8SFLkrstjDmpaV.png)
|
78 |
|
79 |
</div>
|
80 |
|
@@ -281,7 +270,7 @@ def encode_video(video_path):
|
|
281 |
print('num frames:', len(frames))
|
282 |
return frames
|
283 |
|
284 |
-
video_path
|
285 |
frames = encode_video(video_path)
|
286 |
question = "Describe the video"
|
287 |
msgs = [
|
|
|
1 |
---
|
2 |
+
pipeline_tag: visual-question-answering
|
3 |
datasets:
|
4 |
- openbmb/RLAIF-V-Dataset
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
---
|
6 |
|
7 |
<h1>A GPT-4V Level MLLM for Single Image, Multi Image and Video on Your Phone</h1>
|
|
|
63 |
<summary>Click to view video results on Video-MME and Video-ChatGPT.</summary>
|
64 |
<div align="center">
|
65 |
|
66 |
+
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64abc4aa6cadc7aca585dddf/_T1mw5yhqNCqVdYRTQOGu.png)
|
|
|
67 |
|
68 |
</div>
|
69 |
|
|
|
270 |
print('num frames:', len(frames))
|
271 |
return frames
|
272 |
|
273 |
+
video_path="video_test.mp4"
|
274 |
frames = encode_video(video_path)
|
275 |
question = "Describe the video"
|
276 |
msgs = [
|