PMAlpha / app.py
Sergidev's picture
com201
08bf4ea verified
raw
history blame
1.83 kB
from fastapi import FastAPI, Request, BackgroundTasks
from fastapi.responses import HTMLResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from modules.pmbl import PMBL
import torch
from queue import Queue
import asyncio
# Startup diagnostics: log GPU visibility so deployment problems show up in logs.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

# Disable the auto-generated OpenAPI docs endpoints for this deployment.
app = FastAPI(docs_url=None, redoc_url=None)
app.mount("/static", StaticFiles(directory="static"), name="static")
app.mount("/templates", StaticFiles(directory="templates"), name="templates")

# Model wrapper around a local GGUF file; presumably gpu_layers=50 offloads
# 50 layers to the GPU -- TODO confirm against modules.pmbl.PMBL.
pmbl = PMBL("./PMB-7b.Q6_K.gguf", gpu_layers=50)

# NOTE(review): request_queue is created but never used anywhere in this file.
request_queue = Queue()
@app.head("/")
@app.get("/")
def index() -> HTMLResponse:
    """Serve the chat UI page.

    Returns:
        HTMLResponse: the contents of templates/index.html.

    HEAD is registered alongside GET so health checks / uptime monitors work.
    """
    # Explicit encoding: without it, open() uses the platform locale encoding,
    # which can mis-decode the UTF-8 template on some hosts (e.g. Windows).
    with open("templates/index.html", encoding="utf-8") as f:
        return HTMLResponse(content=f.read())
async def process_request(user_input: str, mode: str):
    """Stream the model's reply for a single chat request.

    Fetches chat history relevant to *mode* and the user's message, then
    yields the model's response chunk by chunk as it is generated.
    """
    relevant_history = pmbl.get_chat_history(mode, user_input)
    reply_stream = pmbl.generate_response(user_input, relevant_history, mode)
    async for piece in reply_stream:
        yield piece
@app.post("/chat")
async def chat(request: Request, background_tasks: BackgroundTasks):
    """Stream a chat completion as plain text.

    Expects a JSON body with "user_input" and "mode" keys. On failure
    (bad JSON, missing keys) logs the error and returns {"error": ...}.

    Note: background_tasks is injected by FastAPI but currently unused;
    kept to avoid changing the endpoint signature.
    """
    try:
        data = await request.json()
        user_input = data["user_input"]
        mode = data["mode"]
        # process_request is already an async generator, so it can be handed
        # to StreamingResponse directly -- the extra wrapper generator the
        # original used added nothing. Errors raised *during* streaming still
        # escape this handler; only setup errors are caught here.
        return StreamingResponse(
            process_request(user_input, mode), media_type="text/plain"
        )
    except Exception as e:
        print(f"[SYSTEM] Error: {str(e)}")
        return {"error": str(e)}
@app.post("/sleep")
async def sleep():
    """Trigger the model's sleep-mode routine.

    Returns a success message on completion; on any failure, logs the
    error and returns it as {"error": ...}.
    """
    try:
        pmbl.sleep_mode()
    except Exception as exc:
        print(f"[SYSTEM] Error: {str(exc)}")
        return {"error": str(exc)}
    return {"message": "Sleep mode completed successfully"}