import spaces
import gradio as gr

# gr.load("models/kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0").launch()

import re

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the LoRA fine-tuned Gemma-2B Hinglish model and its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0")
model = AutoModelForCausalLM.from_pretrained("kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0")

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


@spaces.GPU
def get_response(input_text: str) -> str:
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                "Please answer the following sentence as requested",  # instruction
                input_text,  # input
                "",  # output - leave this blank for generation!
            )
        ],
        return_tensors="pt",
    ).to(device)

    outputs = model.generate(**inputs, max_new_tokens=256, use_cache=True)
    output = tokenizer.batch_decode(outputs)[0]

    # Extract the text after "### Response:" up to the end-of-sequence token
    # (or the end of the string if no <eos> is present). The lazy group needs a
    # terminator, otherwise it would always match an empty string.
    response_pattern = re.compile(r"### Response:\n(.*?)(?:<eos>|$)", re.DOTALL)
    response_match = response_pattern.search(output)

    if response_match:
        response = response_match.group(1).strip()
        return response
    else:
        return "Response not found"


interface = gr.Interface(
    fn=get_response,
    inputs=[
        gr.Textbox(
            label="Enter your input text here",
            value="Germany ka capital city kya hai?",
            placeholder="Input to LLM",
            lines=5,
        )
    ],
    outputs=[gr.Textbox(label="LLM Output", lines=5)],
    title="Gemma Hinglish Model Inference",
    description="🤗 + 🦥 = 🔥 This model is based on google/gemma-2b and has been LoRA fine-tuned on English & Hindi language instruction datasets.",
)

interface.launch()