from fastapi import FastAPI, Request, BackgroundTasks
from fastapi.responses import HTMLResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from modules.pmbl import PMBL
import torch
from queue import Queue
import asyncio
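
# Report CUDA availability at startup so it is obvious whether inference will run on the GPU or fall back to the CPU.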
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
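
# Disable the interactive API docs (Swagger/ReDoc); the app only serves the chat UI and its endpoints.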
app = FastAPI(docs_url=None, redoc_url=None)
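
# Serve front-end assets and HTML templates as static files.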
app.mount("/static", StaticFiles(directory="static"), name="static") |
|
app.mount("/templates", StaticFiles(directory="templates"), name="templates") |
|
|
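
# Load the local PMB-7b GGUF model (Q6_K quantized); gpu_layers presumably controls how many layers are offloaded to the GPU.
# request_queue is created for queuing incoming requests but is not referenced elsewhere in this file.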
pmbl = PMBL("./PMB-7b.Q6_K.gguf", gpu_layers=50)
request_queue = Queue()
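

# Serve the chat UI page for both GET and HEAD requests.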
@app.head("/")
@app.get("/")
def index() -> HTMLResponse:
    with open("templates/index.html") as f:
        return HTMLResponse(content=f.read())
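

# Pull the chat history for the given mode, then stream the model's response chunk by chunk.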
async def process_request(user_input: str, mode: str):
    history = pmbl.get_chat_history(mode, user_input)
    async for chunk in pmbl.generate_response(user_input, history, mode):
        yield chunk
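

# Chat endpoint: reads the user's input and mode from the JSON body and streams the reply as plain text.
# Errors are logged to the console and returned as a JSON object rather than raising an HTTP error.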
@app.post("/chat")
async def chat(request: Request, background_tasks: BackgroundTasks):
    try:
        data = await request.json()
        user_input = data["user_input"]
        mode = data["mode"]

        async def stream_response():
            async for chunk in process_request(user_input, mode):
                yield chunk

        return StreamingResponse(stream_response(), media_type="text/plain")
    except Exception as e:
        print(f"[SYSTEM] Error: {str(e)}")
        return {"error": str(e)}
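

# Sleep endpoint: runs the PMBL sleep-mode routine and reports whether it completed.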
@app.post("/sleep")
async def sleep():
    try:
        pmbl.sleep_mode()
        return {"message": "Sleep mode completed successfully"}
    except Exception as e:
        print(f"[SYSTEM] Error: {str(e)}")
        return {"error": str(e)}