Use lazy import for flash_attn

#12
Files changed (1)
  1. README.md +3 -14
README.md CHANGED
@@ -1,17 +1,7 @@
1
  ---
2
- pipeline_tag: image-text-to-text
3
  datasets:
4
  - openbmb/RLAIF-V-Dataset
5
- library_name: transformers
6
- language:
7
- - multilingual
8
- tags:
9
- - minicpm-v
10
- - vision
11
- - ocr
12
- - multi-image
13
- - video
14
- - custom_code
15
  ---
16
 
17
  <h1>A GPT-4V Level MLLM for Single Image, Multi Image and Video on Your Phone</h1>
@@ -73,8 +63,7 @@ Note: For proprietary models, we calculate token density based on the image enco
73
  <summary>Click to view video results on Video-MME and Video-ChatGPT.</summary>
74
  <div align="center">
75
 
76
- <!-- ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64abc4aa6cadc7aca585dddf/_T1mw5yhqNCqVdYRTQOGu.png) -->
77
- ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64abc4aa6cadc7aca585dddf/jmrjoRr8SFLkrstjDmpaV.png)
78
 
79
  </div>
80
 
@@ -281,7 +270,7 @@ def encode_video(video_path):
281
  print('num frames:', len(frames))
282
  return frames
283
 
284
- video_path ="video_test.mp4"
285
  frames = encode_video(video_path)
286
  question = "Describe the video"
287
  msgs = [
 
1
  ---
2
+ pipeline_tag: visual-question-answering
3
  datasets:
4
  - openbmb/RLAIF-V-Dataset
 
 
 
 
 
 
 
 
 
 
5
  ---
6
 
7
  <h1>A GPT-4V Level MLLM for Single Image, Multi Image and Video on Your Phone</h1>
 
63
  <summary>Click to view video results on Video-MME and Video-ChatGPT.</summary>
64
  <div align="center">
65
 
66
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/64abc4aa6cadc7aca585dddf/_T1mw5yhqNCqVdYRTQOGu.png)
 
67
 
68
  </div>
69
 
 
270
  print('num frames:', len(frames))
271
  return frames
272
 
273
+ video_path="video_test.mp4"
274
  frames = encode_video(video_path)
275
  question = "Describe the video"
276
  msgs = [