|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
from masklid import MaskLID |
|
from huggingface_hub import hf_hub_download |
|
from fasttext.FastText import _FastText |
|
|
|
def render_metadata():
    """Return the HTML snippet used as the demo's description.

    The markup is a centered paragraph holding a GitHub-stars badge that
    links to the MaskLID repository, a link to the paper, and usage notes
    about the alpha/beta settings and about the model being reloaded on
    every submission.

    Returns:
        str: The static HTML text.
    """
    return """
<p align="center">
<a href="https://github.com/cisnlp/MaskLID"><img alt="GitHub stars" src="https://img.shields.io/github/stars/cisnlp/MaskLID"></a>
This is the demo for <a href="https://arxiv.org/abs/2406.06263">MaskLID</a> paper (ACL 2024). You can see the whole code in our GitHub. Please also note that if you increase the number of languages, you also need larger alpha and beta values.
MaskLID does not add much overhead to language identification. You first fix the languages your model is limited to and then run the MaskLID code. However, in this demo, we load the model each time (that takes couple of seconds) you hit submit to ensure the results are not cached and to make it possible to change the set of languages each time. We may later change the demo code to resolve this.
</p>
"""
|
|
|
|
|
def get_model_path():
    """Fetch the model file from the Hugging Face Hub.

    Requests ``model_v3.bin`` from the ``cis-lmu/glotlid`` repository via
    ``hf_hub_download``.

    Returns:
        The value returned by ``hf_hub_download`` (the local path of the
        downloaded file).
    """
    return hf_hub_download(
        repo_id="cis-lmu/glotlid",
        filename="model_v3.bin",
    )
|
|
|
|
|
def get_masklid():
    """Create a new MaskLID instance and list the labels offered by the demo.

    A fresh ``MaskLID`` object is built on every call from the path returned
    by ``get_model_path()``.

    Returns:
        tuple: ``(masklid_model, labels)``, where ``labels`` is the underlying
        model's label list, in its original order, without the entries that
        start with ``__label__und`` or ``__label__zxx``.
    """
    detector = MaskLID(get_model_path())

    # Labels with these prefixes are dropped from the selectable set.
    excluded_prefixes = ('__label__und', '__label__zxx')
    usable_labels = [
        label
        for label in detector.model.get_labels()
        if not label.startswith(excluded_prefixes)
    ]

    return detector, usable_labels
|
|
|
def predict_codeswitch(text, top_labels=200, beta=20, alpha=3, max_lambda=3, min_length=10, min_prob=0.90, max_retry=3, alpha_step_increase=3, beta_step_increase=5):
    """Run MaskLID code-switch prediction on ``text`` with a limited label set.

    A fresh model is obtained from ``get_masklid()`` on every call, restricted
    to the first ``top_labels`` entries of the filtered label list, and then
    asked for its code-switch prediction.

    Args:
        text: The input text to analyse.
        top_labels: How many labels (taken from the front of the filtered
            label list) the model is limited to. Coerced to ``int`` because
            it is used as a slice bound.
        beta: Clamped to at most ``top_labels`` before being forwarded.
        alpha: Clamped to at most the (clamped) ``beta`` before being
            forwarded.
        max_lambda, min_length, min_prob, max_retry, alpha_step_increase,
        beta_step_increase: Forwarded unchanged to
            ``MaskLID.predict_codeswitch``.

    Returns:
        Whatever ``MaskLID.predict_codeswitch`` returns for the given input.
    """
    # UI sliders may hand over whole numbers as floats; a slice bound must be
    # an int, so normalise before it is used in ``labels[:top_labels]``.
    top_labels = int(top_labels)

    # Keep the invariant alpha <= beta <= top_labels.
    beta = min(beta, top_labels)
    alpha = min(alpha, beta)

    masklid_model, labels = get_masklid()

    # Restrict the model to the selected labels. This relies on MaskLID's
    # private helper and attributes because the label set is chosen per
    # request rather than at construction time.
    masklid_model.language_indices = masklid_model._compute_language_indices(labels[:top_labels])

    # Fetch the full label list once instead of once per selected index.
    all_labels = masklid_model.model.get_labels()
    masklid_model.labels = [all_labels[i] for i in masklid_model.language_indices]

    return masklid_model.predict_codeswitch(
        text,
        beta=beta,
        alpha=alpha,
        max_lambda=max_lambda,
        min_length=min_length,
        min_prob=min_prob,
        max_retry=max_retry,
        alpha_step_increase=alpha_step_increase,
        beta_step_increase=beta_step_increase,
    )
|
|
|
# ---------------------------------------------------------------------------
# Gradio user interface
# ---------------------------------------------------------------------------

# Free-text input, pre-filled with a mixed-language example sentence.
inputs = gr.Textbox(
    lines=2,
    label="Enter the text",
    value="bir kahve dükkanında geçen film tadında güzel bir şarkıya ayrılsın gece falling in love at a coffee shop",
)

# One slider per tunable argument of predict_codeswitch. The insertion order
# matters: the values are passed positionally after the text input, so it
# must follow the function's parameter order.
parameters = dict(
    # The upper bound is the number of selectable labels, which requires
    # building a model once while this module is being executed.
    top_labels=gr.Slider(minimum=2, maximum=len(get_masklid()[1]), step=1, value=200, label="Limit LID to X Top Languages"),
    beta=gr.Slider(minimum=1, maximum=100, value=20, step=1, label="Beta"),
    alpha=gr.Slider(minimum=1, maximum=30, value=3, step=1, label="Alpha"),
    max_lambda=gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Max Iteration"),
    min_length=gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Min Length"),
    min_prob=gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.01, label="Min Probability"),
    max_retry=gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Max Retry In total"),
    alpha_step_increase=gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Alpha Step Increase"),
    beta_step_increase=gr.Slider(minimum=1, maximum=15, value=5, step=1, label="Beta Step Increase"),
)

# The prediction result is displayed as JSON.
output = gr.JSON(label="Output")

# Assemble the interface and start serving it.
gr.Interface(
    fn=predict_codeswitch,
    inputs=[inputs] + list(parameters.values()),
    outputs=output,
    title="MaskLID (Code-Switch Language Identification)",
    description=render_metadata(),
    cache_examples=False,
).launch()