Sergidev committed on
Commit
08bf4ea
1 Parent(s): 9689eed
Files changed (1)
  1. app.py +16 -5
app.py CHANGED
@@ -1,8 +1,10 @@
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, Request, BackgroundTasks
 from fastapi.responses import HTMLResponse, StreamingResponse
 from fastapi.staticfiles import StaticFiles
 from modules.pmbl import PMBL
 import torch
+from queue import Queue
+import asyncio
 
 print(f"CUDA available: {torch.cuda.is_available()}")
 print(f"CUDA device count: {torch.cuda.device_count()}")
@@ -15,6 +17,7 @@ app.mount("/static", StaticFiles(directory="static"), name="static")
 app.mount("/templates", StaticFiles(directory="templates"), name="templates")
 
 pmbl = PMBL("./PMB-7b.Q6_K.gguf", gpu_layers=50)
+request_queue = Queue()
 
 @app.head("/")
 @app.get("/")
@@ -22,15 +25,23 @@ def index() -> HTMLResponse:
     with open("templates/index.html") as f:
         return HTMLResponse(content=f.read())
 
+async def process_request(user_input: str, mode: str):
+    history = pmbl.get_chat_history(mode, user_input)
+    async for chunk in pmbl.generate_response(user_input, history, mode):
+        yield chunk
+
 @app.post("/chat")
-async def chat(request: Request):
+async def chat(request: Request, background_tasks: BackgroundTasks):
     try:
         data = await request.json()
         user_input = data["user_input"]
         mode = data["mode"]
-        history = pmbl.get_chat_history(mode, user_input)
-        response_generator = pmbl.generate_response(user_input, history, mode)
-        return StreamingResponse(response_generator, media_type="text/plain")
+
+        async def stream_response():
+            async for chunk in process_request(user_input, mode):
+                yield chunk
+
+        return StreamingResponse(stream_response(), media_type="text/plain")
     except Exception as e:
         print(f"[SYSTEM] Error: {str(e)}")
         return {"error": str(e)}
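
For context: the new handler simply relays whatever pmbl.generate_response yields, so it assumes that method is an async generator producing text chunks. Below is a minimal client-side sketch for consuming the /chat endpoint; it is not part of the commit, and it assumes the app is served on localhost:8000 and that "default" is an accepted mode value.

# Hypothetical client sketch (not from the commit). Assumes the FastAPI app
# above runs on http://localhost:8000 and that "default" is a valid mode.
import httpx

def stream_chat(user_input: str, mode: str = "default") -> None:
    payload = {"user_input": user_input, "mode": mode}
    # httpx.stream keeps the connection open so the text/plain body produced
    # by StreamingResponse can be read chunk by chunk as the model generates.
    with httpx.stream("POST", "http://localhost:8000/chat",
                      json=payload, timeout=None) as resp:
        for chunk in resp.iter_text():
            print(chunk, end="", flush=True)

if __name__ == "__main__":
    stream_chat("Summarize our last conversation.")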