import asyncio
from queue import Empty, Queue

import torch
from fastapi import FastAPI, Request, BackgroundTasks
from fastapi.responses import HTMLResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles

from modules.pmbl import PMBL
# Log GPU availability at import time so deployment logs show whether the
# model will actually run on CUDA.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

# Interactive API docs are disabled (docs_url/redoc_url=None); the app only
# serves its own HTML frontend plus the /chat and /sleep endpoints below.
app = FastAPI(docs_url=None, redoc_url=None)
app.mount("/static", StaticFiles(directory="static"), name="static")
app.mount("/templates", StaticFiles(directory="templates"), name="templates")

# Model wrapper around the local GGUF file; gpu_layers=50 presumably offloads
# 50 layers to the GPU — TODO confirm against PMBL's constructor.
pmbl = PMBL("./PMB-7b.Q6_K.gguf", gpu_layers=50)

# FIFO of pending chat requests, drained by queue_worker() so model
# generations are serialized rather than run concurrently.
request_queue: Queue = Queue()
@app.head("/")
@app.get("/")
def index() -> HTMLResponse:
    """Serve the chat frontend.

    Registered for both GET (page load) and HEAD (health checks) on "/".
    """
    # Explicit UTF-8: the default text encoding is platform-dependent
    # (PEP 597), which can corrupt the page on non-UTF-8 locales.
    with open("templates/index.html", encoding="utf-8") as f:
        return HTMLResponse(content=f.read())
async def process_request(user_input: str, mode: str):
    """Stream the model's response for one queued chat request.

    Fetches the relevant chat history for *mode*, then yields response
    chunks from the model as they are produced.
    """
    chat_history = pmbl.get_chat_history(mode, user_input)
    async for piece in pmbl.generate_response(user_input, chat_history, mode):
        yield piece
@app.post("/chat")
async def chat(request: Request, background_tasks: BackgroundTasks):
    """Queue a chat request and stream the model's response as plain text.

    Expects a JSON body: {"user_input": str, "mode": str}. The request is
    handed to queue_worker() via request_queue so generations run one at a
    time; the future resolves to an async generator of text chunks.

    NOTE: ``background_tasks`` is injected by FastAPI but currently unused;
    it is kept so the endpoint signature stays stable.
    """
    try:
        data = await request.json()
        user_input = data["user_input"]
        mode = data["mode"]

        async def response_generator():
            future = asyncio.Future()
            request_queue.put((future, user_input, mode))
            # Awaiting the future yields its result directly — the original
            # redundantly awaited and then called future.result().
            chunk_stream = await future
            async for chunk in chunk_stream:
                yield chunk

        return StreamingResponse(response_generator(), media_type="text/plain")
    except Exception as e:
        # Boundary catch: report the failure as JSON instead of a bare 500.
        # Errors raised after streaming has begun are not caught here.
        print(f"[SYSTEM] Error: {str(e)}")
        return {"error": str(e)}
async def queue_worker():
    """Drain request_queue forever, serializing model generations.

    Each queue item is ``(future, user_input, mode)``; the future is
    resolved with the async generator from process_request() so the
    waiting /chat endpoint can stream it.
    """
    while True:
        # EAFP: get_nowait() closes the check-then-act gap of the original
        # empty()/get() pair (get() could block if another consumer raced us).
        try:
            future, user_input, mode = request_queue.get_nowait()
        except Empty:
            pass
        else:
            future.set_result(process_request(user_input, mode))
        # Poll interval; also yields control back to the event loop.
        await asyncio.sleep(0.1)
@app.on_event("startup")
async def startup_event():
    """Start the background queue worker when the app boots.

    A reference to the task is kept on ``app.state``: the event loop holds
    only weak references to tasks, so a task whose result is discarded can
    be garbage-collected mid-flight and silently stop draining the queue.
    """
    app.state.queue_worker_task = asyncio.create_task(queue_worker())
@app.post("/sleep")
async def sleep():
    """Trigger the model's sleep-mode maintenance pass."""
    try:
        pmbl.sleep_mode()
    except Exception as e:
        print(f"[SYSTEM] Error: {str(e)}")
        return {"error": str(e)}
    return {"message": "Sleep mode completed successfully"}