PMAlpha

Sleeping

File size: 1,825 Bytes

from fastapi import FastAPI, Request, BackgroundTasks
from fastapi.responses import HTMLResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from modules.pmbl import PMBL
import torch
from queue import Queue
import asyncio

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

app = FastAPI(docs_url=None, redoc_url=None)

app.mount("/static", StaticFiles(directory="static"), name="static")
app.mount("/templates", StaticFiles(directory="templates"), name="templates")

pmbl = PMBL("./PMB-7b.Q6_K.gguf", gpu_layers=50)
request_queue = Queue()

@app.head("/")
@app.get("/")
def index() -> HTMLResponse:
    with open("templates/index.html") as f:
        return HTMLResponse(content=f.read())

async def process_request(user_input: str, mode: str):
    history = pmbl.get_chat_history(mode, user_input)
    async for chunk in pmbl.generate_response(user_input, history, mode):
        yield chunk

@app.post("/chat")
async def chat(request: Request, background_tasks: BackgroundTasks):
    try:
        data = await request.json()
        user_input = data["user_input"]
        mode = data["mode"]

        async def stream_response():
            async for chunk in process_request(user_input, mode):
                yield chunk

        return StreamingResponse(stream_response(), media_type="text/plain")
    except Exception as e:
        print(f"[SYSTEM] Error: {str(e)}")
        return {"error": str(e)}

@app.post("/sleep")
async def sleep():
    try:
        pmbl.sleep_mode()
        return {"message": "Sleep mode completed successfully"}
    except Exception as e:
        print(f"[SYSTEM] Error: {str(e)}")
        return {"error": str(e)}