jadehardouin committed 0e893b5 (parent: eef299f): Update models.py
models.py CHANGED
@@ -131,10 +131,10 @@ class OpenAIModelGPT3_5(BaseTCOModel):
 
         return cost_per_input_token, cost_per_output_token, labor
 
-class
+class DIYLlama2Model(BaseTCOModel):
 
     def __init__(self):
-        self.set_name("(
+        self.set_name("(Deploy yourself) Llama 2 70B")
         self.set_latency("27s")
         super().__init__()
 
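Note: the removed side of this hunk is truncated in this view, so the old class header and display name are not recoverable here. Judging from the setters used in this hunk and the getters used in the ModelPage hunk below, BaseTCOModel presumably exposes a small name/latency contract along these lines; a hedged sketch, not the actual base class from models.py:

    class BaseTCOModel:
        # Hypothetical sketch of the contract implied by the diff;
        # the real base class lives elsewhere in models.py.
        def set_name(self, name: str) -> None:
            self._name = name

        def get_name(self) -> str:
            return self._name

        def set_latency(self, latency: str) -> None:
            self._latency = latency

        def get_latency(self) -> str:
            return self._latency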
@@ -144,7 +144,7 @@ class OpenSourceLlama2Model(BaseTCOModel):
         input_tokens_cost_per_token = 0.00052
         r = maxed_out / 100
         return input_tokens_cost_per_token * 0.65 / r, output_tokens_cost_per_token * 0.65/ r
-
+
         self.source = gr.Markdown("""<span style="font-size: 16px; font-weight: 600; color: #212529;">Source</span>""")
         self.info = gr.Markdown("The cost per input and output tokens values below are from [these benchmark results](https://www.cursor.so/blog/llama-inference#user-content-fn-llama-paper) that were obtained using the following initial configurations.",
             interactive=False,
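Note: the per-token prices here scale a benchmark figure by 0.65 and divide by the utilization ratio r = maxed_out / 100, so the quoted benchmark price is recovered exactly when the hardware is 65% maxed out. A quick check of that arithmetic (the input price is the one from this hunk; maxed_out is an illustrative value):

    input_tokens_cost_per_token = 0.00052
    maxed_out = 65                 # percent utilization, illustrative
    r = maxed_out / 100            # 0.65
    # 0.00052 * 0.65 / 0.65 == 0.00052: the factors cancel at 65%
    print(input_tokens_cost_per_token * 0.65 / r)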
@@ -176,7 +176,7 @@ class OpenSourceLlama2Model(BaseTCOModel):
         )
         self.maxed_out.change(on_maxed_out_change, inputs=[self.maxed_out, self.input_tokens_cost_per_token, self.output_tokens_cost_per_token], outputs=[self.input_tokens_cost_per_token, self.output_tokens_cost_per_token])
 
-        self.labor = gr.Number(
+        self.labor = gr.Number(5000, visible=False,
             label="($) Labor cost per month",
             info="This is an estimate of the labor cost of the AI engineer in charge of deploying the model",
             interactive=True
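Note: the added arguments give the labor field an initial value and hide it. In Gradio, the first positional argument of gr.Number is value, so gr.Number(5000, visible=False, ...) creates a hidden field whose value, 5000, can still flow into event handlers. A minimal standalone sketch of the same pattern:

    import gradio as gr

    with gr.Blocks() as demo:
        # Hidden numeric input with a default of 5000; invisible to the
        # user but still usable as an input to event handlers.
        labor = gr.Number(5000, visible=False,
                          label="($) Labor cost per month",
                          interactive=True)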
@@ -266,16 +266,15 @@ class ModelPage:
 
     def compute_cost_per_token(self, *args):
         begin=0
-        current_model = args[-3]
+        current_model = args[-3]
         current_input_tokens = args[-2]
         current_output_tokens = args[-1]
         for model in self.models:
             model_n_args = len(model.get_components_for_cost_computing())
             if current_model == model.get_name():
-
                 model_args = args[begin:begin+model_n_args]
                 cost_per_input_token, cost_per_output_token, labor_cost = model.compute_cost_per_token(*model_args)
-                model_tco = cost_per_input_token * current_input_tokens + cost_per_output_token * current_output_tokens
+                model_tco = cost_per_input_token * current_input_tokens.value + cost_per_output_token * current_output_tokens.value
                 latency = model.get_latency()
 
                 return model_tco, latency, labor_cost
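Note: compute_cost_per_token receives the values of every model's cost components flattened into *args, with the selected model name and the two token counts appended at the end; begin is the offset of the current model's slice (its increment happens outside this hunk). The switch to current_input_tokens.value / current_output_tokens.value suggests the token counts arrive here as Gradio components rather than plain numbers. A hedged, standalone sketch of the arg-slicing dispatch pattern:

    def dispatch(args, models):
        # Hypothetical illustration of the pattern above; each entry
        # of models is (name, n_args, compute_fn).
        current_model = args[-3]
        begin = 0
        for name, n_args, compute_fn in models:
            if current_model == name:
                # Slice out only this model's arguments.
                return compute_fn(*args[begin:begin + n_args])
            begin += n_args  # skip past this model's slice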