kasper-boy committed on
Commit
dc9dbbb
1 Parent(s): 1a4e427

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PIL import Image, ImageDraw, ImageFont
3
+ import scipy.io.wavfile as wavfile
4
+ from transformers import pipeline
5
+
6
# Hugging Face pipelines. They are built once at import time and then shared
# by every call made from the handlers below.
narrator = pipeline(task="text-to-speech", model="kakao-enterprise/vits-ljs")
object_detector = pipeline(task="object-detection", model="facebook/detr-resnet-101")

# Drawing settings for the annotated image.
FONT_PATH = None  # Path to a custom font file; leave as None to use Pillow's default font.
FONT_SIZE = 20  # Requested size of the label text.
BOX_COLOR = "red"  # Outline colour of each detection rectangle.
TEXT_BACKGROUND_COLOR = "red"  # Fill colour painted behind each label.
TEXT_COLOR = "white"  # Colour of the label text itself.
16
+
17
+
18
def generate_audio(text):
    """Narrate *text* with the text-to-speech pipeline and save it as a WAV file.

    Args:
        text: Sentence to synthesise.

    Returns:
        Path of the WAV file that was written, or ``None`` when *text* is
        empty or when synthesis or writing fails (the error is printed).
    """
    import os
    import tempfile

    if not text:
        # Nothing to narrate, so skip the model call entirely.
        return None

    try:
        narrated_text = narrator(text)

        # Give every call its own file. A fixed "output.wav" is shared by all
        # requests, so an overlapping call could overwrite the audio before
        # the first caller has read it.
        fd, output_path = tempfile.mkstemp(prefix="narration_", suffix=".wav")
        os.close(fd)
        try:
            # The pipeline result carries the waveform under "audio"; [0]
            # selects its first entry, as the original code did.
            wavfile.write(output_path, rate=narrated_text["sampling_rate"],
                          data=narrated_text["audio"][0])
        except Exception:
            # Do not leave an empty or partial file behind on failure.
            os.remove(output_path)
            raise
        return output_path
    except Exception as e:
        # Best-effort: the caller treats None as "no audio available".
        print(f"Error generating audio: {e}")
        return None
29
+
30
+
31
def count_objects(detection_objects):
    """Count how many detections carry each label.

    Args:
        detection_objects: Iterable of detection dicts, each with a
            ``'label'`` key.

    Returns:
        A plain ``dict`` mapping each label to its number of occurrences,
        keyed in the order the labels were first seen.
    """
    from collections import Counter

    # Counter does the tallying; convert back to a plain dict so callers get
    # exactly the type they received before.
    return dict(Counter(detection['label'] for detection in detection_objects))
40
+
41
+
42
# Labels whose plural is not formed by a suffix rule, or that are already plural.
_IRREGULAR_PLURALS = {
    "knife": "knives",
    "mouse": "mice",
    "sheep": "sheep",
    "skis": "skis",
    "scissors": "scissors",
}


def _pluralize(label):
    """Return the plural form of *label* using a few common English rules."""
    if label in _IRREGULAR_PLURALS:
        return _IRREGULAR_PLURALS[label]
    if label.endswith(("s", "x", "z", "ch", "sh")):
        # bus -> buses, wine glass -> wine glasses, sandwich -> sandwiches
        return label + "es"
    if len(label) > 1 and label.endswith("y") and label[-2] not in "aeiou":
        # consonant + y: cherry -> cherries (but toy -> toys)
        return label[:-1] + "ies"
    return label + "s"


def generate_text_from_objects(object_counts):
    """Build an English sentence listing the detected objects.

    Args:
        object_counts: Mapping of label -> count, as produced by
            ``count_objects``.

    Returns:
        A sentence such as ``"This picture contains 2 cats, 1 dog and 1 bus."``.
        Items are listed in the mapping's iteration order, separated by
        commas with ``" and"`` before the last one. An empty mapping yields
        ``"This picture contains no recognizable objects."``.
    """
    if not object_counts:
        # Without this guard the sentence would read "This picture contains.".
        return "This picture contains no recognizable objects."

    phrases = [
        f"{count} {_pluralize(label) if count > 1 else label}"
        for label, count in object_counts.items()
    ]

    if len(phrases) == 1:
        listing = phrases[0]
    else:
        # "a, b and c": commas between all but the last pair.
        listing = ", ".join(phrases[:-1]) + " and " + phrases[-1]

    return f"This picture contains {listing}."
56
+
57
+
58
def _load_label_font(font_path, font_size):
    """Load the label font at *font_size*, falling back to Pillow's default font."""
    if font_path:
        return ImageFont.truetype(font_path, font_size)
    try:
        # Newer Pillow releases accept a size for the built-in font, so the
        # requested font_size is honoured even without a custom font file.
        return ImageFont.load_default(size=font_size)
    except (TypeError, ImportError, OSError):
        # Older Pillow (no ``size`` argument) or no FreeType support: use the
        # fixed-size built-in font, as before.
        return ImageFont.load_default()


def draw_bounding_boxes(image, detections, font_path=FONT_PATH, font_size=FONT_SIZE):
    """Return a copy of *image* with every detection's box and label drawn on it.

    Args:
        image: PIL image to annotate; it is copied, not modified.
        detections: Iterable of detection dicts with a ``'box'`` dict
            (``xmin``, ``ymin``, ``xmax``, ``ymax``), a ``'label'`` and a
            ``'score'``.
        font_path: Optional path to a font file; when falsy, Pillow's default
            font is used.
        font_size: Size of the label text.

    Returns:
        The annotated copy of *image*.
    """
    draw_image = image.copy()
    draw = ImageDraw.Draw(draw_image)
    font = _load_label_font(font_path, font_size)

    for detection in detections:
        box = detection['box']
        xmin, ymin, xmax, ymax = box['xmin'], box['ymin'], box['xmax'], box['ymax']
        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline=BOX_COLOR, width=3)

        label = detection['label']
        score = detection['score']
        text = f"{label} {score:.2f}"

        # Paint a filled rectangle exactly under the text so the label stays
        # readable on any image content, then draw the text over it.
        left, top, right, bottom = draw.textbbox((xmin, ymin), text, font=font)
        draw.rectangle([(left, top), (right, bottom)], fill=TEXT_BACKGROUND_COLOR)
        draw.text((xmin, ymin), text, fill=TEXT_COLOR, font=font)

    return draw_image
77
+
78
+
79
def detect_object(image):
    """Detect objects in *image* and return an annotated image plus narration.

    Runs the object detector, draws the detections on a copy of the image,
    turns the per-label counts into a sentence and synthesises that sentence
    as audio.

    Args:
        image: PIL image to analyse, or ``None`` when no image was supplied.

    Returns:
        ``(annotated_image, audio_path)``. Both are ``None`` when *image* is
        ``None`` or when any step raises (the error is printed).
        ``audio_path`` alone may be ``None`` if only narration failed.
    """
    if image is None:
        # No input (e.g. the form was submitted without an upload): return
        # the same empty result as the error path, without calling the
        # detector or printing an error.
        return None, None

    try:
        detections = object_detector(image)
        processed_image = draw_bounding_boxes(image, detections)
        object_counts = count_objects(detections)
        natural_text = generate_text_from_objects(object_counts)
        processed_audio = generate_audio(natural_text)
        return processed_image, processed_audio
    except Exception as e:
        # UI boundary: report the failure and return empty outputs rather
        # than propagating the exception to the interface.
        print(f"Error in object detection: {e}")
        return None, None
90
+
91
+
92
# Gradio front end: one image in, the annotated image and the spoken summary out.
image_input = gr.Image(label="Select Image", type="pil")
annotated_output = gr.Image(label="Processed Image", type="pil")
audio_output = gr.Audio(label="Generated Audio")

demo = gr.Interface(
    fn=detect_object,
    inputs=[image_input],
    outputs=[annotated_output, audio_output],
    title="AI-Powered Object Detection with Audio Feedback",
    description=(
        "Upload an image and get object detection results using the DETR model "
        "with a ResNet-101 backbone with Audio Feedback"
    ),
)

# Start the web server for the interface.
demo.launch()