Spaces:

Mykes
/

test

Sleeping

App Files Files Community

test / app.py

Mykes

Update app.py

4bf96b6 verified 2 months ago

raw

history blame contribute delete

No virus

2.46 kB

	import gradio as gr
	from llama_cpp import Llama
	from huggingface_hub import hf_hub_download

	# Download the model
	model_name = "Mykes/med_tinyllama_gguf"
	filename = "unsloth.Q4_K_M.gguf"
	model_path = hf_hub_download(repo_id=model_name, filename=filename)

	# Initialize the model
	# model = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_batch=32, use_mmap=True, use_mlock=True, rope_freq_base=10000, rope_freq_scale=1.0)
	model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)
	# def preload_model(model, preload_tokens=1024):
	# # Dummy call to load model into RAM by accessing parts of it
	# try:
	# dummy_input = " " * preload_tokens
	# _ = model(dummy_input, max_tokens=1)
	# print("Model preloaded into RAM.")
	# except Exception as e:
	# print(f"Error preloading model: {e}")

	# # Preload the model into RAM
	# preload_model(model)
	def respond(
	message,
	history: list[tuple[str, str]],
	system_message,
	max_tokens,
	temperature,
	top_p,
	):
	history = history[-3:]
	# Construct the prompt
	prompt = f"<s>{system_message}\n\n"
	for user_msg, assistant_msg in history:
	prompt += f"<\|user\|>{user_msg}<\|end\|></s> <\|assistant\|>{assistant_msg}<\|end\|></s>"
	prompt += f"<\|user\|>{message}<\|end\|></s> <\|assistant\|>"

	# Generate response
	response = ""
	for token in model(
	prompt,
	max_tokens=max_tokens,
	temperature=temperature,
	top_p=top_p,
	stream=True,
	stop=["<\|end\|>", "</s>"]
	):
	response += token['choices'][0]['text']
	yield response.strip()

	# Create the Gradio interface
	demo = gr.ChatInterface(
	respond,
	undo_btn="Отменить",
	clear_btn="Очистить",
	additional_inputs=[
	# gr.Textbox(value="You are a friendly medical assistant.", label="System message"),
	gr.Textbox(value="", label="System message"),
	gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
	gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
	gr.Slider(
	minimum=0.1,
	maximum=1.0,
	value=0.9,
	step=0.05,
	label="Top-p (nucleus sampling)",
	),
	],
	title="Med TinyLlama Chat",
	description="Chat with the Med TinyLlama model for medical information.",
	)

	if __name__ == "__main__":
	demo.launch()