turing-20.0

Runtime error

App Files Files Community

turing-20.0 / app.py

brunodoti

Update app.py

beb7111 about 1 year ago

raw

history blame

11.4 kB

	"""Run codes."""
	# pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
	# ruff: noqa: E501
	import gc
	import os
	import platform
	import random
	import time
	from dataclasses import asdict, dataclass
	from pathlib import Path

	# from types import SimpleNamespace
	import gradio as gr
	import psutil
	from about_time import about_time
	from ctransformers import AutoModelForCausalLM
	from dl_hf_model import dl_hf_model
	from loguru import logger




	# Download location of the GGML model file; passed to dl_hf_model() further down.
	# The commented-out lines are alternative model files kept for reference.
	# NOTE(review): the trailing "2.87G" on the active line is identical to the note on
	# the q2_K line above it and looks copied - confirm the real size of the q4_K_M file.
	# url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q2_K.bin"
	#url = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/blob/main/llama-2-7b-chat.ggmlv3.q2_K.bin" # 2.87G
	url = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/blob/main/llama-2-7b-chat.ggmlv3.q4_K_M.bin" # 2.87G


	# `prompt_template` is rebound seven times in a row. Only the LAST assignment
	# (the short [INST] <<SYS>> ... {question} [/INST] form) is in effect when
	# generate() calls prompt_template.format(question=...). Everything before it
	# is a superseded experiment.
	#
	# Variant 1 (superseded): instruction/response layout, placeholder {user_prompt}.
	prompt_template = """Abaixo está uma instrução que descreve uma tarefa. Escreva uma resposta que complete adequadamente o pedido.
	### Instrução: {user_prompt}
	### Resposta:
	"""

	# Variant 2 (superseded): long system message, placeholder {prompt}.
	prompt_template = """Sistema: Você é um assistente prestativo, respeitoso e honesto de Tecnologia da Informação (TI). Sempre responda da maneira mais prestativa possível em português, mantendo a segurança em mente. Suas respostas não devem incluir nenhum conteúdo prejudicial, antiético, racista, sexista, tóxico, perigoso ou ilegal. Por favor, certifique-se de que suas respostas sejam socialmente imparciais e positivas. Se uma pergunta não fizer sentido ou não for factualmente coerente, explique o motivo em vez de dar uma resposta incorreta. Se você não souber a resposta para uma pergunta, por favor, não compartilhe informações falsas.
	Usuário: {prompt}
	Assistente: """

	# Variant 3 (superseded): short system message, placeholder {prompt}.
	prompt_template = """Sistema: Você é um assistente prestativo Tecnologia da Informação (TI).
	Usuário: {prompt}
	Assistente: """

	# Variant 4 (superseded): question/answer layout with a step-by-step cue, placeholder {question}.
	prompt_template = """Pergunta: {question}
	Resposta: Vamos resolver isso passo a passo para ter certeza de que temos a resposta certa."""

	# Variant 5 (superseded): fixed sample question, no placeholder at all.
	# NOTE(review): the bare "<>" markers look like stripped "<<SYS>>" / "<</SYS>>" tags - confirm.
	prompt_template = """[INST] <>
	Você é um assistente prestativo, respeitoso e honesto. Sempre responda da maneira mais prestativa possível em português. Pense passo a passo.
	<>
	Qual time da NFL venceu o Super Bowl no ano em que Justin Bieber nasceu?
	[/INST]"""

	# Variant 6 (superseded): [INST]/<<SYS>> layout, placeholder {question}.
	# NOTE(review): "pouco prestativo" reads as "not very helpful", which contradicts
	# the rest of the sentence - probably a typo, harmless only because this variant is overwritten.
	prompt_template = """[INST] <<SYS>>
	Você é um assistente pouco prestativo de Tecnologia da Informação (TI). Sempre responda da maneira mais prestativa possível em português. Pense passo a passo. <</SYS>>
	{question} [/INST]
	"""

	# Variant 7 (ACTIVE): the template actually used; its only placeholder is {question}.
	prompt_template = """[INST] <<SYS>>
	Você é um assistente de Tecnologia da Informação (TI) prestativo.
	<</SYS>>
	{question} [/INST]"""


	# Derive a "stop" marker from the active template: take the second-to-last
	# non-blank line and keep the text up to its first ":" plus the colon itself.
	# In the code visible here the value is only logged (the `stop` option of
	# GenerationConfig is commented out).
	_template_lines = [line for line in prompt_template.splitlines() if line.strip()]
	stop_string = _template_lines[-2].split(":")[0] + ":"

	logger.debug(f"{stop_string=}")

	# Physical core count minus one, but never below 1.
	# psutil.cpu_count(logical=False) returns None when the count cannot be
	# determined; the previous `psutil.cpu_count(...) - 1` raised TypeError then.
	_physical_cores = psutil.cpu_count(logical=False)
	cpu_count: int = max((_physical_cores or 1) - 1, 1)
	logger.debug(f"{cpu_count=}")

	# Drop any previous model reference and run a garbage-collection pass
	# before the new model is loaded.
	LLM = None
	gc.collect()

	try:
	    # Fetch the model file; a failure here is logged and ends the process
	    # with exit status 1.
	    model_loc, file_size = dl_hf_model(url)
	except Exception as download_error:
	    logger.error(download_error)
	    raise SystemExit(1) from download_error
	else:
	    # Only reached when the download succeeded.
	    LLM = AutoModelForCausalLM.from_pretrained(model_loc, model_type="llama")

	logger.info(f"done load llm {model_loc=} {file_size=}G")

	# Pin the process time zone via the TZ environment variable and ask the C
	# runtime to re-read it.
	os.environ["TZ"] = "Asia/Shanghai"
	try:
	    time.tzset()  # type: ignore # pylint: disable=no-member
	except AttributeError:
	    # time.tzset() exists only on Unix; on Windows the attribute is missing.
	    # Catching just AttributeError keeps any unrelated failure visible.
	    logger.warning("Windows, cant run time.tzset()")

	# Disabled snippet parked inside a throwaway string bound to `_`; the text is
	# never executed. (The SimpleNamespace import it would need is commented out
	# at the top of the file.)
	_ = """
	ns = SimpleNamespace(
	response="",
	generator=(_ for _ in []),
	)
	# """

	@dataclass
	class GenerationConfig:
	    """Keyword options forwarded to the model call.

	    ``generate()`` expands an instance with ``asdict(config)`` and passes the
	    result as keyword arguments to the model callable, so every field name
	    here must be an option that callable accepts.
	    """

	    # Sampling options.
	    temperature: float = 0.7
	    top_k: int = 50
	    top_p: float = 0.9
	    repetition_penalty: float = 1.0
	    # Generation limit and seed.
	    max_new_tokens: int = 5000
	    seed: int = 42
	    # Callers in this file set reset=True for each request.
	    reset: bool = False
	    # When True the model call is iterated chunk by chunk (see bot()).
	    stream: bool = True
	    # threads: int = cpu_count
	    # stop: list[str] = field(default_factory=lambda: [stop_string])


	def generate(
	    question: str,
	    llm=LLM,
	    config: "GenerationConfig | None" = None,
	):
	    """Run model inference, will return a Generator if streaming is true.

	    Args:
	        question: text substituted for ``{question}`` in ``prompt_template``.
	        llm: callable model; defaults to the module-level ``LLM`` as it was
	            bound when this function was defined.
	        config: generation options. ``None`` (the default) means a fresh
	            ``GenerationConfig()`` with default values.

	    Returns:
	        Whatever ``llm(prompt, **options)`` returns.
	    """
	    # Build the default per call instead of sharing one mutable dataclass
	    # instance, created at definition time, across all calls.
	    if config is None:
	        config = GenerationConfig()

	    prompt = prompt_template.format(question=question)

	    return llm(
	        prompt,
	        **asdict(config),
	    )


	# Log the default generation options once at import time.
	logger.debug(f"{asdict(GenerationConfig())=}")


	def user(user_message, history):
	    """Record a new user turn and keep the message in the input box.

	    Adds ``[user_message, None]`` to ``history`` in place (``None`` marks the
	    answer as not yet produced) and returns ``(user_message, history)``.
	    """
	    pending_turn = [user_message, None]
	    history.extend([pending_turn])
	    return user_message, history


	def user1(user_message, history):
	    """Record a new user turn and clear the input box.

	    Adds ``[user_message, None]`` to ``history`` in place and returns
	    ``("", history)``; the empty string replaces the typed message.
	    """
	    pending_turn = [user_message, None]
	    history.extend([pending_turn])
	    return "", history


	def bot_(history):
	    """Model-free stand-in for ``bot``: type out a canned reply.

	    Picks one of two fixed replies at random, reveals
	    ``"<user message>: <reply>"`` one character at a time in
	    ``history[-1][1]`` (yielding ``history`` after each character, 0.02 s
	    apart), then replaces the text with the bare reply and yields once more.
	    """
	    latest_turn = history[-1]
	    user_message = latest_turn[0]
	    resp = random.choice(["Como você está?", "I love you"])
	    bot_message = user_message + ": " + resp

	    latest_turn[1] = ""
	    for shown in range(1, len(bot_message) + 1):
	        # Reveal one more character on every step.
	        latest_turn[1] = bot_message[:shown]
	        time.sleep(0.02)
	        yield history

	    latest_turn[1] = resp
	    yield history


	def bot(history):
	    """Stream the model's answer to the latest user turn into ``history``.

	    Reads the question from ``history[-1][0]`` and iterates over
	    ``generate(...)``. After every chunk the text received so far, prefixed
	    with the delay until the first chunk, is stored in ``history[-1][1]`` and
	    ``history`` is yielded. A final yield drops the prefix and appends a
	    timing summary.

	    Args:
	        history: list of ``[user_message, bot_message]`` pairs; the last pair
	            is updated in place.

	    Yields:
	        The same ``history`` list after each update.
	    """
	    user_message = history[-1][0]
	    answer = ""

	    logger.debug(f"{user_message=}")

	    with about_time() as atime:  # type: ignore
	        first_chunk = True
	        prefix = ""
	        then = time.time()

	        logger.debug("about to generate")

	        config = GenerationConfig(reset=True)
	        for elm in generate(user_message, config=config):
	            if first_chunk:
	                # Record how long the first chunk took to arrive.
	                logger.debug("in the loop")
	                prefix = f"({time.time() - then:.2f}s) "
	                first_chunk = False
	                print(prefix, end="", flush=True)
	                logger.debug(f"{prefix=}")
	            print(elm, end="", flush=True)

	            # Keep a running string instead of re-joining all chunks each time.
	            answer += elm
	            history[-1][1] = prefix + answer
	            yield history

	    # An empty generation would otherwise divide by zero in the per-character rate.
	    if answer:
	        summary = (
	            f"(time elapsed: {atime.duration_human}, "  # type: ignore
	            f"{atime.duration/len(answer):.2f}s/char)"  # type: ignore
	        )
	    else:
	        summary = f"(time elapsed: {atime.duration_human}, no output generated)"  # type: ignore

	    history[-1][1] = answer + f"\n{summary}"
	    yield history


	def predict_api(prompt):
	    """Answer ``prompt`` in one non-streaming model call.

	    Returns the value produced by ``generate``; if anything raises, the
	    exception is logged and the string ``f"{exc=}"`` is returned instead.
	    """
	    logger.debug(f"{prompt=}")
	    try:
	        api_options = {
	            "temperature": 0.2,
	            "top_k": 10,
	            "top_p": 0.9,
	            "repetition_penalty": 1.0,
	            "max_new_tokens": 5000,  # adjust as needed
	            "seed": 42,
	            "reset": True,  # reset history (cache)
	            "stream": False,
	        }
	        response = generate(prompt, config=GenerationConfig(**api_options))
	        logger.debug(f"api: {response=}")
	    except Exception as exc:
	        logger.error(exc)
	        response = f"{exc=}"

	    return response


	# Custom stylesheet handed to gr.Blocks(css=css). Of the classes defined here,
	# only "xsmall" is referenced by components in the visible code.
	css = """
	.importantButton {
	background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
	border: none !important;
	}
	.importantButton:hover {
	background: linear-gradient(45deg, #ff00e0,#8500ff, #6e00ff) !important;
	border: none !important;
	}
	.disclaimer {font-variant-caps: all-small-caps; font-size: xx-small;}
	.xsmall {font-size: x-small;}
	"""
	# NOTE(review): `etext` is not referenced anywhere in the visible code - confirm it is still needed.
	etext = """In Turing-15.0🤖🧠 """




	logger.info("start block")

	# Build the chat UI: an info accordion, the chatbot pane, the message box with
	# Submit / Stop / Clear buttons, a hidden "advanced" row and a hidden API panel.
	# NOTE(review): nesting below was reconstructed from a listing whose
	# indentation had been stripped - confirm the layout against the running app.
	with gr.Blocks(
	    title="Turing-15.0🤖",
	    theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
	    css=css,
	) as block:
	    # buff_var = gr.State("")
	    # `style=` is not an Accordion parameter (at best ignored with a warning,
	    # at worst a TypeError during startup), so it is no longer passed.
	    with gr.Accordion("Turing-15.0🤖🧠", open=False):

	        gr.Markdown(
	            """<div style="text-align: center;">
	<h5>Gradio </h5><br>
	Utilize -- como favorecimento : Exemplo : "como reiniciar o serviço de de --DNS no Windows server 2019 64 bits" // e ao final de cada imput coloque ponto final // utilize dizima destribuitiva // escreva sem erros de português .
	</div>""",
	            elem_classes="xsmall",
	        )

	    # chatbot = gr.Chatbot().style(height=700) # 500
	    chatbot = gr.Chatbot(height=500)

	    # buff = gr.Textbox(show_label=False, visible=True)

	    with gr.Row():
	        with gr.Column(scale=5):
	            msg = gr.Textbox(
	                label="Chat Message Box",
	                placeholder="Claro, estou aqui para ajudar. O que você gostaria de perguntar ou discutir? Por favor, digite sua pergunta ou tópico e eu ficarei feliz em responder.",
	                show_label=False,
	                # container=False,
	                lines=6,
	                max_lines=30,
	                show_copy_button=True,
	                # ).style(container=False)
	            )
	        with gr.Column(scale=1, min_width=50):
	            with gr.Row():
	                submit = gr.Button("Submit", elem_classes="xsmall")
	                stop = gr.Button("Stop", visible=True)
	                clear = gr.Button("Clear History", visible=True)

	    # Hidden row: these widgets are created but no event handler is attached
	    # to them in the visible code.
	    with gr.Row(visible=False):
	        with gr.Accordion("Advanced Options:", open=False):
	            with gr.Row():
	                with gr.Column(scale=2):
	                    system = gr.Textbox(
	                        label="System Prompt",
	                        value=prompt_template,
	                        show_label=False,
	                        container=False,
	                        # ).style(container=False)
	                    )
	                with gr.Column():
	                    with gr.Row():
	                        change = gr.Button("Change System Prompt")
	                        reset = gr.Button("Reset System Prompt")

	    # Pressing Enter in the box keeps the typed message (user); the Submit
	    # button clears it (user1). Both then stream the answer via bot().
	    msg_submit_event = msg.submit(
	        # fn=conversation.user_turn,
	        fn=user,
	        inputs=[msg, chatbot],
	        outputs=[msg, chatbot],
	        queue=True,
	        show_progress="full",
	        # api_name=None,
	    ).then(bot, chatbot, chatbot, queue=True)
	    submit_click_event = submit.click(
	        # fn=lambda x, y: ("",) + user(x, y)[1:], # clear msg
	        fn=user1,  # clear msg
	        inputs=[msg, chatbot],
	        outputs=[msg, chatbot],
	        queue=True,
	        # queue=False,
	        show_progress="full",
	        # api_name=None,
	    ).then(bot, chatbot, chatbot, queue=True)
	    # Stop cancels whichever of the two event chains is currently running.
	    stop.click(
	        fn=None,
	        inputs=None,
	        outputs=None,
	        cancels=[msg_submit_event, submit_click_event],
	        queue=False,
	    )
	    # Returning None to the chatbot output empties the conversation.
	    clear.click(lambda: None, None, chatbot, queue=False)

	    with gr.Accordion("For Chat/Translation API", open=False, visible=False):
	        input_text = gr.Text()
	        api_btn = gr.Button("Go", variant="primary")
	        out_text = gr.Text()

	    # Registered under api_name="api".
	    api_btn.click(
	        predict_api,
	        input_text,
	        out_text,
	        api_name="api",
	    )

	# block.load(update_buff, [], buff, every=1)
	# block.load(update_buff, [buff_var], [buff_var, buff], every=1)

	# concurrency_count=5, max_size=20
	# max_size=36, concurrency_count=14
	# CPU cpu_count=2 16G, model 7G
	# CPU UPGRADE cpu_count=8 32G, model 7G

	# does not work
	# (abandoned attempt to derive the queue concurrency from memory / core count,
	# parked in a string so it is never executed)
	_ = """
	# _ = int(psutil.virtual_memory().total / 10**9 // file_size - 1)
	# concurrency_count = max(_, 1)
	if psutil.cpu_count(logical=False) >= 8:
	# concurrency_count = max(int(32 / file_size) - 1, 1)
	else:
	# concurrency_count = max(int(16 / file_size) - 1, 1)
	# """

	# Fixed value passed to the queue as its concurrency_count.
	concurrency_count = 1
	logger.info(f"{concurrency_count=}")

	# Enable the request queue (at most 5 waiting requests), then start the app.
	queued_block = block.queue(concurrency_count=concurrency_count, max_size=5)
	queued_block.launch(debug=True)