akhaliq (HF staff) committed
Commit b6ef90b
1 Parent(s): d7dd1ce

Update app.py

Files changed (1)
  1. app.py +58 -47
app.py CHANGED
@@ -1,9 +1,11 @@
 import gradio as gr
-from gradio_multimodalchatbot import MultimodalChatbot
-from gradio.data_classes import FileData
+from PIL import Image
+import requests
 import os
 from together import Together
 import base64
+from threading import Thread
+import time
 
 # Initialize Together client
 client = Together()
@@ -16,61 +18,70 @@ def encode_image(image_path):
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')
 
-def call_llama_vision_api(prompt: str, image_path: str) -> str:
-    getDescriptionPrompt = "You are a UX/UI designer. Describe the attached screenshot or UI mockup in detail. I will feed in the output you give me to a coding model that will attempt to recreate this mockup, so please think step by step and describe the UI in detail. Pay close attention to background color, text color, font size, font family, padding, margin, border, etc. Match the colors and sizes exactly. Make sure to mention every part of the screenshot including any headers, footers, etc. Use the exact text from the screenshot."
+def bot_streaming(message, history, max_new_tokens=250):
+    txt = message["text"]
+    messages = []
+    images = []
 
-    base64_image = encode_image(image_path)
+    for i, msg in enumerate(history):
+        if isinstance(msg[0], tuple):
+            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image(msg[0][0])}"}}]})
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
+        elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
+            pass
+        elif isinstance(history[i-1][0], str) and isinstance(msg[0], str):
+            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
 
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "text", "text": getDescriptionPrompt + "\n\n" + prompt},
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/jpeg;base64,{base64_image}"
-                    },
-                },
-            ],
-        }
-    ]
+    if len(message["files"]) == 1:
+        if isinstance(message["files"][0], str): # examples
+            image_path = message["files"][0]
+        else: # regular input
+            image_path = message["files"][0]["path"]
+        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image(image_path)}"}}]})
+    else:
+        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
 
     stream = client.chat.completions.create(
         model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        messages=messages,
+        max_tokens=max_new_tokens,
         stream=True,
     )
 
-    response = ""
+    buffer = ""
     for chunk in stream:
-        content = chunk.choices[0].delta.content or ""
-        response += content
-    return response
-
-def chat(message, history):
-    user_message = message["text"]
-    files = message.get("files", [])
-
-    if files and files[0]["file"].path:
-        image_path = files[0]["file"].path
-        response = call_llama_vision_api(user_message, image_path)
-    else:
-        response = "I'm sorry, but I need an image to analyze. Please upload an image along with your question."
+        if chunk.choices[0].delta.content is not None:
+            buffer += chunk.choices[0].delta.content
+            time.sleep(0.01)
+            yield buffer
 
-    history.append((message, {"text": response, "files": []}))
-    return history
-
-with gr.Blocks() as demo:
-    gr.Markdown("# Llama 3.2 Vision Multimodal Chatbot Demo")
-    gr.Markdown("Upload an image and enter your message to analyze using the Llama 3.2 Vision model.")
-
-    chatbot = MultimodalChatbot(
-        value=[],
-        height=800,
-    )
-
-    chatbot.submit(chat, [chatbot.messages, chatbot.messages], [chatbot.messages])
+demo = gr.ChatInterface(
+    fn=bot_streaming,
+    title="Multimodal Llama",
+    examples=[
+        [{"text": "Which era does this piece belong to? Give details about the era.", "files":["./examples/rococo.jpg"]}, 200],
+        [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]}, 250],
+        [{"text": "What happens when you take out white cat from this chain?", "files":["./examples/ai2d_test.jpg"]}, 250],
+        [{"text": "Which company was this invoice addressed to?", "files":["./examples/invoice.png"]}, 250],
+        [{"text": "Where to find this monument? Can you give me other recommendations around the area?", "files":["./examples/wat_arun.jpg"]}, 250],
+    ],
+    textbox=gr.MultimodalTextbox(),
+    additional_inputs=[
+        gr.Slider(
+            minimum=10,
+            maximum=500,
+            value=250,
+            step=10,
+            label="Maximum number of new tokens to generate",
+        )
+    ],
+    cache_examples=False,
+    description="Try Multimodal Llama by Meta with the Together API in this demo. Upload an image, and start chatting about it, or simply try one of the examples below.",
+    stop_btn="Stop Generation",
+    fill_height=True,
+    multimodal=True
+)
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(debug=True)
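
For reference, the core pattern the new bot_streaming generator relies on is: base64-encode the local image, embed it as a data: URL in an image_url content part, call the Together chat-completions endpoint with stream=True, and yield the accumulated text so gr.ChatInterface can render it incrementally. Below is a minimal standalone sketch of that call pattern, not part of the commit; it assumes TOGETHER_API_KEY is set in the environment, and "sample.jpg" is a placeholder path rather than a file from this repo.

import base64
from together import Together

# Reads TOGETHER_API_KEY from the environment, same as the app above.
client = Together()

def stream_image_description(prompt, image_path, max_tokens=250):
    """Yield the growing response text for a prompt about a local image."""
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")

    stream = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            ],
        }],
        max_tokens=max_tokens,
        stream=True,
    )

    buffer = ""
    for chunk in stream:
        # Each chunk carries a small delta of the reply; accumulate and yield.
        if chunk.choices[0].delta.content is not None:
            buffer += chunk.choices[0].delta.content
            yield buffer

if __name__ == "__main__":
    # "sample.jpg" is a placeholder; point it at any local JPEG.
    for partial in stream_image_description("Describe this image.", "sample.jpg"):
        pass
    print(partial)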