Merve Noyan committed
Commit 57cd770 β€’ 1 Parent(s): a93f663
pages/.DS_Store ADDED
Binary file (10.2 kB)
 
pages/18_Florence-2.py CHANGED
@@ -58,10 +58,15 @@ st.image("pages/Florence-2/image_6.jpg", use_column_width=True)
 st.markdown(""" """)
 
 st.info("""
-Ressources:
-[Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks](https://arxiv.org/abs/2311.06242)
+Resources:
+
+- [Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks](https://arxiv.org/abs/2311.06242)
 by Bin Xiao, Haiping Wu, Weijian Xu, Xiyang Dai, Houdong Hu, Yumao Lu, Michael Zeng, Ce Liu, Lu Yuan (2023)
-[Hugging Face blog post](https://huggingface.co/blog/finetune-florence2)""", icon="πŸ“š")
+
+- [Hugging Face blog post](https://huggingface.co/blog/finetune-florence2)
+
+- [All the fine-tuned Florence-2 models](https://huggingface.co/models?search=florence-2)
+""", icon="πŸ“š")
 
 st.markdown(""" """)
 st.markdown(""" """)
pages/21_Llava-NeXT-Interleave.py CHANGED
@@ -66,10 +66,14 @@ st.image("pages/Llava-NeXT-Interleave/image_6.jpg", use_column_width=True)
 st.markdown(""" """)
 
 st.info("""
-Ressources:
-[LLaVA-NeXT: Tackling Multi-image, Video, and 3D in Large Multimodal Models](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/)
+Resources:
+
+- [LLaVA-NeXT: Tackling Multi-image, Video, and 3D in Large Multimodal Models](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/)
 by Feng Li, Renrui Zhang, Hao Zhang, Yuanhan Zhang, Bo Li, Wei Li, Zejun Ma, Chunyuan Li (2024)
-[GitHub](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/inference/docs/LLaVA-NeXT-Interleave.md)""", icon="πŸ“š")
+[GitHub](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/inference/docs/LLaVA-NeXT-Interleave.md)
+- [Transformers Documentation](https://huggingface.co/docs/transformers/en/model_doc/llava)
+- [Demo](https://huggingface.co/spaces/merve/llava-next-interleave)
+""", icon="πŸ“š")
 
 st.markdown(""" """)
 st.markdown(""" """)
pages/22_Chameleon.py CHANGED
@@ -67,11 +67,17 @@ st.image("pages/Chameleon/image_6.jpg", use_column_width=True)
 st.markdown(""" """)
 
 st.info("""
-Ressources:
-[Chameleon: Mixed-Modal Early-Fusion Foundation Models](https://arxiv.org/abs/2405.09818)
+Resources:
+
+- [Chameleon: Mixed-Modal Early-Fusion Foundation Models](https://arxiv.org/abs/2405.09818)
 by Chameleon Team (2024)
-[GitHub](https://github.com/facebookresearch/chameleon)
-[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/chameleon)""", icon="πŸ“š")
+
+- [GitHub](https://github.com/facebookresearch/chameleon)
+
+- [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/chameleon)
+
+- [Demo](https://huggingface.co/spaces/merve/chameleon-7b)
+""", icon="πŸ“š")
 
 st.markdown(""" """)
 st.markdown(""" """)
pages/24_SAMv2.py CHANGED
@@ -67,11 +67,14 @@ st.image("pages/SAMv2/image_4.jpg", use_column_width=True)
 st.markdown(""" """)
 
 st.info("""
-Ressources:
-[SAM 2: Segment Anything in Images and Videos]()
+Resources:
+
+- [SAM 2: Segment Anything in Images and Videos]()
 by Nikhila Ravi, Valentin Gabeur, Yuan-Ting Hu, Ronghang Hu, Chaitanya Ryali, Tengyu Ma, Haitham Khedr, Roman RΓ€dle, Chloe Rolland, Laura Gustafson, Eric Mintun, Junting Pan, Kalyan Vasudev Alwala, Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr DollΓ‘r, Christoph Feichtenhofer (2024)
-[GitHub](https://github.com/facebookresearch/segment-anything-2)
-[Hugging Face documentation]()""", icon="πŸ“š")
+
+- [GitHub](https://github.com/facebookresearch/segment-anything-2)
+
+- [Model and Demos Collection](https://huggingface.co/collections/merve/sam2-66ac9deac6fca3bc5482fe30)""", icon="πŸ“š")
 
 st.markdown(""" """)
 st.markdown(""" """)
@@ -85,4 +88,4 @@ with col2:
         switch_page("Home")
 with col3:
     if st.button('Next paper', use_container_width=True):
-        switch_page("Home")
+        switch_page("UDOP")
pages/25_UDOP.py ADDED
@@ -0,0 +1,78 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("UDOP")
+
+st.success("""[Original tweet](https://x.com/mervenoyann/status/1767200350530859321) (Mar 11, 2024)""", icon="ℹ️")
+st.markdown(""" """)
+
+st.markdown("""New foundation model for document understanding and generation in transformers 🀩
+UDOP by MSFT is a bleeding-edge model capable of many tasks, including question answering, document editing and more! 🀯
+Check out the [demo](https://huggingface.co/spaces/merve/UDOP).
+Technical details 🧢""")
+st.markdown(""" """)
+
+st.image("pages/UDOP/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+UDOP is a model that combines vision, text and layout. πŸ“
+This model is very interesting because the input representation truly captures the nature of the document modality: the text, where the text is, and the layout of the document all matter!""")
+st.markdown(""" """)
+
+
+st.markdown("""
+If you know T5, UDOP resembles it: the model is pre-trained on both self-supervised and supervised objectives over text, image and layout.
+To switch between tasks, one simply changes the task-specific prompt at the beginning, e.g. for QA, one prepends "Question answering.".""")
+st.markdown(""" """)
+
+st.image("pages/UDOP/image_2.png", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+As for the architecture, it's like T5, except it has a single encoder that takes in text, image and layout, and two decoders (a text-layout decoder and a vision decoder) combined into one.
+The vision decoder is a masked autoencoder (hence the document-editing capabilities).
+""")
+st.image("pages/UDOP/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+For me, the most interesting capabilities are document reconstruction, document editing and layout re-arrangement (see below πŸ‘‡). This decoder isn't released, though, because it could be used maliciously to fake documents.
+""")
+st.markdown(""" """)
+
+st.image("pages/UDOP/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Overall, the model performs very well on the document understanding benchmark (DUE), and also on information extraction (FUNSD, CORD) and classification (RVL-CDIP) across the vision, text and layout modalities πŸ‘‡
+""")
+st.markdown(""" """)
+
+st.image("pages/UDOP/image_5.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+- [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623)
+by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal (2022)
+
+- [GitHub](https://github.com/microsoft/UDOP)
+
+- [Hugging Face Models](https://huggingface.co/microsoft/udop-large)
+
+- [Hugging Face documentation](https://huggingface.co/docs/transformers/en/model_doc/udop)""", icon="πŸ“š")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+    if st.button('Previous paper', use_container_width=True):
+        switch_page("SAMv2")
+with col2:
+    if st.button('Home', use_container_width=True):
+        switch_page("Home")
+with col3:
+    if st.button('Next paper', use_container_width=True):
+        switch_page("MiniGemini")
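The page above notes that UDOP switches tasks purely through a textual task prefix. Below is a hedged sketch of prompted document QA with the transformers integration linked in the resources; it assumes transformers β‰₯ 4.39 (when UDOP support landed) and uses the sample dataset "nielsr/funsd-layoutlmv3" as an illustrative source of OCR words and boxes (processor argument details may vary by version):

```python
# Hedged sketch: prompted document QA with UDOP via transformers.
from datasets import load_dataset
from transformers import AutoProcessor, UdopForConditionalGeneration

processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large")

example = load_dataset("nielsr/funsd-layoutlmv3", split="train")[0]

# The task-specific prefix selects the task; here: question answering.
question = "Question answering. What is the date on the form?"

encoding = processor(
    example["image"],         # page image
    question,                 # prompt = task prefix + query
    example["tokens"],        # pre-extracted OCR words
    boxes=example["bboxes"],  # one bounding box per word
    return_tensors="pt",
)
predicted_ids = model.generate(**encoding, max_new_tokens=20)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])
```

Swapping the prefix retargets the same checkpoint to another task, which is exactly the T5-style design the page describes.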
pages/26_MiniGemini.py ADDED
@@ -0,0 +1,76 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("MiniGemini")
+
+st.success("""[Original tweet](https://x.com/mervenoyann/status/1783864388249694520) (April 26, 2024)""", icon="ℹ️")
+st.markdown(""" """)
+
+st.markdown("""
+MiniGemini is the coolest VLM, let's explain 🧢
+""")
+st.markdown(""" """)
+
+st.image("pages/MiniGemini/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""MiniGemini is a vision language model that understands both image and text, and also generates text and an image that goes best with the context! 🀯
+""")
+st.markdown(""" """)
+
+st.image("pages/MiniGemini/image_2.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+This model has two image encoders (one CNN and one ViT) in parallel to capture the details in images.
+I saw the same design in DocOwl 1.5.
+It then has a decoder to output text, and also a prompt to be sent to SDXL for image generation (which works very well!)""")
+st.markdown(""" """)
+
+st.image("pages/MiniGemini/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""They adopt CLIP's ViT as the low-resolution visual embedding encoder and a CNN-based one for high-resolution image encoding (precisely, a pre-trained ConvNeXt).
+""")
+st.markdown(""" """)
+
+st.image("pages/MiniGemini/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""Thanks to the second encoder it can grasp details in images, which also comes in handy for e.g. document tasks (see the examples below, they're mind-blowing IMO).
+""")
+st.markdown(""" """)
+
+st.image("pages/MiniGemini/image_5.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""According to their reporting, the model performs very well across many benchmarks compared to LLaVA 1.5 and Gemini Pro.
+""")
+st.markdown(""" """)
+
+st.image("pages/MiniGemini/image_6.png", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+
+- [Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models](https://huggingface.co/papers/2403.18814)
+by Yanwei Li, Yuechen Zhang, Chengyao Wang, Zhisheng Zhong, Yixin Chen, Ruihang Chu, Shaoteng Liu, Jiaya Jia (2024)
+
+- [GitHub](https://github.com/dvlab-research/MGM)
+
+- [Link to Model Repository](https://huggingface.co/YanweiLi/MGM-13B-HD)""", icon="πŸ“š")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+    if st.button('Previous paper', use_container_width=True):
+        switch_page("UDOP")
+with col2:
+    if st.button('Home', use_container_width=True):
+        switch_page("Home")
+with col3:
+    if st.button('Next paper', use_container_width=True):
+        switch_page("ColPali")
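The dual-encoder design described on this page boils down to low-resolution ViT tokens querying a high-resolution CNN feature map. Here is a toy PyTorch sketch of that "mining" idea; the module name, dimensions and residual wiring are illustrative assumptions, not Mini-Gemini's actual code:

```python
import torch
import torch.nn as nn

class PatchInfoMining(nn.Module):
    """Toy sketch: each low-res ViT token cross-attends into high-res CNN features.
    Hypothetical module, loosely following the paper's description."""

    def __init__(self, dim: int = 1024, num_heads: int = 8):
        super().__init__()
        self.cross_attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)

    def forward(self, lr_tokens: torch.Tensor, hr_feats: torch.Tensor) -> torch.Tensor:
        # lr_tokens: (B, N_lr, dim) from the CLIP ViT (low resolution)
        # hr_feats:  (B, N_hr, dim) from the ConvNeXt (high resolution), flattened
        mined, _ = self.cross_attn(query=lr_tokens, key=hr_feats, value=hr_feats)
        return lr_tokens + mined  # enriched visual tokens handed to the LLM decoder

lr = torch.randn(1, 576, 1024)    # e.g. 24x24 ViT patches
hr = torch.randn(1, 2304, 1024)   # e.g. 48x48 CNN feature cells
print(PatchInfoMining()(lr, hr).shape)  # torch.Size([1, 576, 1024])
```

The point of the design is that the token count fed to the LLM stays at the cheap low-resolution budget while each token can still pull in fine-grained detail.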
pages/27_ColPali.py ADDED
@@ -0,0 +1,93 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("ColPali")
+
+st.success("""[Original tweet](https://x.com/mervenoyann/status/1811003265858912670) (Jul 10, 2024)""", icon="ℹ️")
+st.markdown(""" """)
+
+st.markdown("""
+Forget any document retrievers, use ColPali πŸ’₯πŸ’₯
+
+Document retrieval is usually done through OCR + layout detection, but that is overkill and doesn't work well! πŸ€“
+
+ColPali uses a vision language model, which is better at document understanding πŸ“‘
+""")
+st.markdown(""" """)
+
+st.image("pages/ColPali/image_1.png", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Check out the [ColPali model](https://huggingface.co/vidore/colpali) (MIT license!)
+Check out the [blog](https://huggingface.co/blog/manu/colpali)
+
+The authors also released a new benchmark for document retrieval, the [ViDoRe Leaderboard](https://huggingface.co/spaces/vidore/vidore-leaderboard), submit your model!""")
+st.markdown(""" """)
+
+st.image("pages/ColPali/image_2.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Regular document retrieval systems use OCR + layout detection + another model to retrieve information from documents, and then use the output representations in applications like RAG πŸ₯²
+
+Meanwhile, modern image encoders demonstrate out-of-the-box document understanding capabilities!""")
+st.markdown(""" """)
+
+st.markdown("""
+ColPali marries the idea of modern vision language models with retrieval 🀝
+
+The authors apply contrastive fine-tuning to SigLIP on documents and pool the outputs (they call this BiSigLIP). Then they feed the patch embedding outputs to PaliGemma and create BiPali πŸ–‡οΈ
+""")
+st.markdown(""" """)
+
+st.image("pages/ColPali/image_3.png", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+BiPali natively feeds image patch embeddings to an LLM, which enables leveraging ColBERT-like late-interaction computations between text tokens and image patches (hence the name ColPali!) 🀩
+""")
+st.markdown(""" """)
+
+st.markdown("""
+The authors created the ViDoRe benchmark by collecting PDF documents and generating queries with Claude-3 Sonnet.
+
+See below how ColPali performs on ViDoRe compared to every other model πŸ‘‡πŸ»
+""")
+st.markdown(""" """)
+
+st.image("pages/ColPali/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""Aside from the performance improvements, ColPali is also very fast for offline indexing!
+""")
+st.markdown(""" """)
+
+st.image("pages/ColPali/image_5.png", use_column_width=True)
+st.markdown(""" """)
+
+
+st.info("""
+Resources:
+- [ColPali: Efficient Document Retrieval with Vision Language Models](https://huggingface.co/papers/2407.01449)
+by Manuel Faysse, Hugues Sibille, Tony Wu, Bilel Omrani, Gautier Viaud, CΓ©line Hudelot, Pierre Colombo (2024)
+
+- [GitHub](https://github.com/illuin-tech/colpali)
+
+- [Link to Models](https://huggingface.co/models?search=vidore)
+
+- [Link to Leaderboard](https://huggingface.co/spaces/vidore/vidore-leaderboard)""", icon="πŸ“š")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+    if st.button('Previous paper', use_container_width=True):
+        switch_page("MiniGemini")
+with col2:
+    if st.button('Home', use_container_width=True):
+        switch_page("Home")
+with col3:
+    if st.button('Next paper', use_container_width=True):
+        switch_page("Home")
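A minimal sketch of the ColBERT-style late-interaction (MaxSim) scoring the page describes, between text-token and image-patch embeddings. The shapes and the 128-dim projection are illustrative of ColPali's setup, not the released code:

```python
import torch
import torch.nn.functional as F

def late_interaction_score(q: torch.Tensor, d: torch.Tensor) -> torch.Tensor:
    # q: (num_query_tokens, dim) text-token embeddings, L2-normalized
    # d: (num_patches, dim) image-patch embeddings, L2-normalized
    sim = q @ d.T                       # all token-patch similarities
    return sim.max(dim=1).values.sum()  # MaxSim: best patch per token, summed

# Toy usage: rank two "pages" against one query.
torch.manual_seed(0)
query = F.normalize(torch.randn(12, 128), dim=-1)  # 12 query tokens
pages = [F.normalize(torch.randn(1024, 128), dim=-1) for _ in range(2)]
scores = torch.stack([late_interaction_score(query, p) for p in pages])
print(scores.argmax().item())  # index of the best-matching page
```

Because page embeddings are computed once and queries only need this cheap max-sum at search time, this is also why offline indexing is fast, as the page notes.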
pages/ColPali/image_1.png ADDED
pages/ColPali/image_2.jpeg ADDED
pages/ColPali/image_3.png ADDED
pages/ColPali/image_4.jpeg ADDED
pages/ColPali/image_5.png ADDED
pages/MiniGemini/image_1.jpeg ADDED
pages/MiniGemini/image_2.jpeg ADDED
pages/MiniGemini/image_3.jpeg ADDED
pages/MiniGemini/image_4.jpeg ADDED
pages/MiniGemini/image_5.jpeg ADDED
pages/MiniGemini/image_6.png ADDED
pages/UDOP/image_1.jpeg ADDED
pages/UDOP/image_2.png ADDED
pages/UDOP/image_3.jpeg ADDED
pages/UDOP/image_4.jpeg ADDED
pages/UDOP/image_5.jpeg ADDED