ffreemt committed on
Commit
3e124e7
1 Parent(s): 58d404e

Update embed_files

Browse files
Files changed (4) hide show
  1. app.py +75 -78
  2. main.py +43 -8
  3. requirements-dev.txt +5 -1
  4. run-main.sh +1 -0
app.py CHANGED
@@ -47,21 +47,24 @@ CPU times: user 1min 27s, sys: 8.09 s, total: 1min 35s
47
  Wall time: 1min 37s
48
 
49
  """
50
- # pylint: disable=broad-exception-caught, unused-import, invalid-name, line-too-long, too-many-return-statements, import-outside-toplevel, no-name-in-module, no-member, too-many-branches, unused-variable, too-many-arguments, global-statement
51
  import os
52
  import time
53
  from copy import deepcopy
54
  from math import ceil
55
  from pathlib import Path
56
- from tempfile import _TemporaryFileWrapper
 
57
  from textwrap import dedent
58
  from types import SimpleNamespace
59
  from typing import List
60
 
61
  import gradio as gr
 
62
  import more_itertools as mit
63
  import torch
64
- from about_time import about_time
 
65
  from charset_normalizer import detect
66
  from chromadb.config import Settings
67
 
@@ -77,9 +80,8 @@ from langchain.document_loaders import (
77
  TextLoader,
78
  )
79
  from langchain.embeddings import (
80
- HuggingFaceInstructEmbeddings,
81
  SentenceTransformerEmbeddings,
82
- )
83
  from langchain.llms import HuggingFacePipeline, OpenAI
84
  from langchain.memory import ConversationBufferMemory
85
  from langchain.text_splitter import (
@@ -112,6 +114,14 @@ if api_key is not None:
112
  os.environ.setdefault("OPENAI_API_BASE", sk_base)
113
  elif api_key.startswith("pk-"):
114
  os.environ.setdefault("OPENAI_API_BASE", pk_base)
 
 
 
 
 
 
 
 
115
 
116
  ROOT_DIRECTORY = Path(__file__).parent
117
  PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/db"
@@ -128,6 +138,7 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
128
  ns_initial = SimpleNamespace(
129
  db=None,
130
  qa=None,
 
131
  ingest_done=None,
132
  files_info=None,
133
  files_uploaded=[],
@@ -140,7 +151,7 @@ ns = deepcopy(ns_initial)
140
 
141
 
142
  def load_single_document(file_path: str | Path) -> List[Document]:
143
- """Loads a single document from a file path."""
144
  try:
145
  _ = Path(file_path).read_bytes()
146
  encoding = detect(_).get("encoding")
@@ -350,6 +361,28 @@ def process_files(
350
  logger.info(f"Loaded {len(documents)} document(s) ")
351
  logger.info(f"Split into {len(texts)} chunk(s) of text")
352
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  # initialize if necessary
354
  if ns.db is None:
355
  logger.info(f"loading {ns.model_name:}")
@@ -366,19 +399,21 @@ def process_files(
366
  )
367
  logger.info("done creating vectorstore")
368
 
369
- total = ceil(len(texts) / 101)
370
  if progress is None:
371
  # for text in progress.tqdm(
372
- for idx, text in enumerate(mit.chunked_even(texts, 101)):
373
  logger.debug(f"-{idx + 1} of {total}")
374
  ns.db.add_documents(documents=text)
375
  else:
376
  # for text in progress.tqdm(
377
- for idx, text in enumerate(progress.tqdm(
378
- mit.chunked_even(texts, 101),
379
- total=total,
380
- desc="Processing docs",
381
- )):
 
 
382
  logger.debug(f"{idx + 1} of {total}")
383
  ns.db.add_documents(documents=text)
384
  logger.debug(f" done all {total}")
@@ -394,15 +429,15 @@ def process_files(
394
  # return_source_documents=True,
395
  )
396
 
397
- ns.ingest_done = True
398
- _ = [
399
- [Path(doc.metadata.get("source")).name, len(doc.page_content)]
400
- for doc in documents
401
- ]
402
- ns.files_info = _
403
-
404
  logger.debug(f"{ns.ingest_done=}, exit process_files")
405
- return f"done file(s): {dict(ns.files_info)}"
 
 
 
 
 
 
 
406
 
407
 
408
  def respond(message, chat_history):
@@ -445,6 +480,8 @@ def respond(message, chat_history):
445
  except Exception as exc:
446
  logger.error(exc)
447
  bot_message = f"bummer! {exc}"
 
 
448
 
449
  chat_history.append((message, bot_message))
450
 
@@ -571,17 +608,20 @@ def gen_local_llm(model_id="TheBloke/vicuna-7B-1.1-HF"):
571
  else:
572
  model = LlamaForCausalLM.from_pretrained(model_id)
573
 
574
- pipe = pipeline(
575
- "text-generation",
576
- model=model,
577
- tokenizer=tokenizer,
578
- max_length=2048,
579
- temperature=0,
580
- top_p=0.95,
581
- repetition_penalty=1.15,
582
- )
 
 
 
 
583
 
584
- local_llm = HuggingFacePipeline(pipeline=pipe)
585
  return local_llm
586
 
587
 
@@ -666,7 +706,9 @@ def main1():
666
  logger.info(f"ROOT_DIRECTORY: {ROOT_DIRECTORY}")
667
 
668
  openai_api_key = os.getenv("OPENAI_API_KEY")
 
669
  logger.info(f"openai_api_key (env var/hf space SECRETS): {openai_api_key}")
 
670
 
671
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
672
  # name = gr.Textbox(label="Name")
@@ -724,57 +766,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
724
  upload_button.upload(upload_files, upload_button, file_output)
725
  process_btn.click(process_files, [], text2)
726
 
727
- def respond(message, chat_history):
728
- """Gen response."""
729
- logger.info(f"{ns.ingest_done=}")
730
- if ns.ingest_done is None: # no files processed yet
731
- bot_message = "Upload some file(s) for processing first."
732
- chat_history.append((message, bot_message))
733
- return "", chat_history
734
-
735
- logger.info(f"{ns.ingest_done=}")
736
- if not ns.ingest_done: # embedding database not doen yet
737
- bot_message = (
738
- "Waiting for ingest (embedding) to finish, "
739
- f"({ns.ingest_done=})"
740
- "be patient... You can switch the 'Upload files' "
741
- "Tab to check"
742
- )
743
- chat_history.append((message, bot_message))
744
- return "", chat_history
745
-
746
- _ = """
747
- if ns.qa is None: # load qa one time
748
- logger.info("Loading qa, need to do just one time.")
749
- ns.qa = load_qa()
750
- logger.info("Done loading qa, need to do just one time.")
751
- # """
752
- if ns.qa is None:
753
- bot_message = "Looks like the bot is not ready. Try again later..."
754
- chat_history.append((message, bot_message))
755
- return "", chat_history
756
-
757
- try:
758
- res = ns.qa(message)
759
- answer = res.get("result")
760
- docs = res.get("source_documents")
761
- if docs:
762
- bot_message = f"{answer}\n({docs})"
763
- else:
764
- bot_message = f"{answer}"
765
- except Exception as exc:
766
- logger.error(exc)
767
- bot_message = f"bummer! {exc}"
768
-
769
- chat_history.append((message, bot_message))
770
-
771
- return "", chat_history
772
-
773
  msg.submit(respond, [msg, chatbot], [msg, chatbot])
774
  clear.click(lambda: None, None, chatbot, queue=False)
775
 
776
  if __name__ == "__main__":
777
- demo.queue(concurrency_count=20).launch(share=share)
778
 
779
  _ = """
780
  run_localgpt
 
47
  Wall time: 1min 37s
48
 
49
  """
50
+ # pylint: disable=broad-except, unused-import, invalid-name, line-too-long, too-many-return-statements, import-outside-toplevel, no-name-in-module, no-member, too-many-branches, unused-variable, too-many-arguments, global-statement
51
  import os
52
  import time
53
  from copy import deepcopy
54
  from math import ceil
55
  from pathlib import Path
56
+
57
+ # from tempfile import _TemporaryFileWrapper
58
  from textwrap import dedent
59
  from types import SimpleNamespace
60
  from typing import List
61
 
62
  import gradio as gr
63
+ import httpx
64
  import more_itertools as mit
65
  import torch
66
+
67
+ # from about_time import about_time
68
  from charset_normalizer import detect
69
  from chromadb.config import Settings
70
 
 
80
  TextLoader,
81
  )
82
  from langchain.embeddings import (
 
83
  SentenceTransformerEmbeddings,
84
+ ) # HuggingFaceInstructEmbeddings,
85
  from langchain.llms import HuggingFacePipeline, OpenAI
86
  from langchain.memory import ConversationBufferMemory
87
  from langchain.text_splitter import (
 
114
  os.environ.setdefault("OPENAI_API_BASE", sk_base)
115
  elif api_key.startswith("pk-"):
116
  os.environ.setdefault("OPENAI_API_BASE", pk_base)
117
+ # resetip
118
+ try:
119
+ url = "https://api.pawan.krd/resetip"
120
+ headers = {"Authorization": f"{api_key}"}
121
+ httpx.post(url, headers=headers)
122
+ except Exception as exc_:
123
+ logger.error(exc_)
124
+ raise
125
 
126
  ROOT_DIRECTORY = Path(__file__).parent
127
  PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/db"
 
138
  ns_initial = SimpleNamespace(
139
  db=None,
140
  qa=None,
141
+ texts=[],
142
  ingest_done=None,
143
  files_info=None,
144
  files_uploaded=[],
 
151
 
152
 
153
  def load_single_document(file_path: str | Path) -> List[Document]:
154
+ """Load a single document from a file path."""
155
  try:
156
  _ = Path(file_path).read_bytes()
157
  encoding = detect(_).get("encoding")
 
361
  logger.info(f"Loaded {len(documents)} document(s) ")
362
  logger.info(f"Split into {len(texts)} chunk(s) of text")
363
 
364
+ total = ceil(len(texts) / 101)
365
+ ns.texts = texts
366
+
367
+ ns.ingest_done = True
368
+ _ = [
369
+ [Path(doc.metadata.get("source")).name, len(doc.page_content)]
370
+ for doc in documents
371
+ ]
372
+ ns.files_info = _
373
+
374
+ _ = (
375
+ f"done file(s): {dict(ns.files_info)}, splitted to "
376
+ f"{total} chunks. \n\nThe following embedding takes "
377
+ f"step 0-{total - 1}. (Each step lasts about 18 secs "
378
+ " on a free tier instance on huggingface space.)"
379
+ )
380
+
381
+ return _
382
+
383
+
384
+ def embed_files(progress=gr.Progress()):
385
+ """Embed ns.files_uploaded."""
386
  # initialize if necessary
387
  if ns.db is None:
388
  logger.info(f"loading {ns.model_name:}")
 
399
  )
400
  logger.info("done creating vectorstore")
401
 
402
+ total = ceil(len(ns.texts) / 101)
403
  if progress is None:
404
  # for text in progress.tqdm(
405
+ for idx, text in enumerate(mit.chunked_even(ns.texts, 101)):
406
  logger.debug(f"-{idx + 1} of {total}")
407
  ns.db.add_documents(documents=text)
408
  else:
409
  # for text in progress.tqdm(
410
+ for idx, text in enumerate(
411
+ progress.tqdm(
412
+ mit.chunked_even(ns.texts, 101),
413
+ total=total,
414
+ desc="Processing docs",
415
+ )
416
+ ):
417
  logger.debug(f"{idx + 1} of {total}")
418
  ns.db.add_documents(documents=text)
419
  logger.debug(f" done all {total}")
 
429
  # return_source_documents=True,
430
  )
431
 
 
 
 
 
 
 
 
432
  logger.debug(f"{ns.ingest_done=}, exit process_files")
433
+
434
+ _ = (
435
+ f"Done {total} chunks. You can now "
436
+ "switch to Query Docs Tab to chat. "
437
+ "You can chat in a language you prefer, "
438
+ "independent of the document language. Have fun."
439
+ )
440
+ return _
441
 
442
 
443
  def respond(message, chat_history):
 
480
  except Exception as exc:
481
  logger.error(exc)
482
  bot_message = f"bummer! {exc}"
483
+ if "empty" in str(exc):
484
+ bot_message = f"{bot_message} (probably invalid apikey)"
485
 
486
  chat_history.append((message, bot_message))
487
 
 
608
  else:
609
  model = LlamaForCausalLM.from_pretrained(model_id)
610
 
611
+ local_llm = None
612
+ if model is not None: # to please pyright
613
+ pipe = pipeline(
614
+ "text-generation",
615
+ model=model, # type: ignore
616
+ tokenizer=tokenizer,
617
+ max_length=2048,
618
+ temperature=0,
619
+ top_p=0.95,
620
+ repetition_penalty=1.15,
621
+ )
622
+
623
+ local_llm = HuggingFacePipeline(pipeline=pipe)
624
 
 
625
  return local_llm
626
 
627
 
 
706
  logger.info(f"ROOT_DIRECTORY: {ROOT_DIRECTORY}")
707
 
708
  openai_api_key = os.getenv("OPENAI_API_KEY")
709
+ openai_api_base = os.getenv("OPENAI_API_BASE")
710
  logger.info(f"openai_api_key (env var/hf space SECRETS): {openai_api_key}")
711
+ logger.info(f"openai_api_base: {openai_api_base}")
712
 
713
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
714
  # name = gr.Textbox(label="Name")
 
766
  upload_button.upload(upload_files, upload_button, file_output)
767
  process_btn.click(process_files, [], text2)
768
 
769
+ # Query docs TAB
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
770
  msg.submit(respond, [msg, chatbot], [msg, chatbot])
771
  clear.click(lambda: None, None, chatbot, queue=False)
772
 
773
  if __name__ == "__main__":
774
+ demo.queue(concurrency_count=20).launch()
775
 
776
  _ = """
777
  run_localgpt
main.py CHANGED
@@ -2,13 +2,43 @@
2
  # pylint: disable=invalid-name, unused-import, broad-except,
3
  from copy import deepcopy
4
 
 
 
5
  import gradio as gr
6
- from app import ingest, ns, ns_initial, process_files, upload_files, respond
7
- from load_api_key import load_api_key, pk_base, sk_base
8
  from loguru import logger
9
 
 
 
 
 
 
 
 
 
 
 
 
10
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
11
- with gr.Tab("Upload files"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  # Upload files and generate vectorstore
13
  with gr.Row():
14
  file_output = gr.File()
@@ -20,12 +50,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
20
  file_count="multiple",
21
  )
22
  with gr.Row():
23
- text2 = gr.Textbox("Gen embedding")
24
- process_btn = gr.Button("Click to embed")
 
 
 
25
 
26
  reset_btn = gr.Button("Reset everything", visible=False)
27
 
28
- with gr.Tab("Query docs"):
29
  # interactive chat
30
  chatbot = gr.Chatbot()
31
  msg = gr.Textbox(label="Query")
@@ -38,11 +71,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
38
  globals().update(**{"ns": deepcopy(ns_initial)})
39
  return f"reset done: ns={ns}"
40
 
41
- reset_btn.click(reset_all, [], text2)
42
-
43
  upload_button.upload(upload_files, upload_button, file_output)
44
  process_btn.click(process_files, [], text2)
 
 
45
 
 
46
  msg.submit(respond, [msg, chatbot], [msg, chatbot])
47
  clear.click(lambda: None, None, chatbot, queue=False)
48
 
 
2
  # pylint: disable=invalid-name, unused-import, broad-except,
3
  from copy import deepcopy
4
 
5
+ from textwrap import dedent
6
+
7
  import gradio as gr
 
 
8
  from loguru import logger
9
 
10
+ from app import (
11
+ embed_files,
12
+ ingest,
13
+ ns,
14
+ ns_initial,
15
+ process_files,
16
+ respond,
17
+ upload_files,
18
+ )
19
+ from load_api_key import load_api_key, pk_base, sk_base
20
+
21
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
22
+ with gr.Tab("Upload files"): # Tab1
23
+ with gr.Accordion("Info", open=False):
24
+ _ = """
25
+ # multilingual dokugpt/多语dokugpt
26
+
27
+ 和你的文件对话: 可用中文向外语文件提问或用外语向中文文件提问
28
+
29
+ Talk to your docs (.pdf, .docx, .epub, .txt .md and
30
+ other text docs): You can ask questions in a language you prefer, independent of the document language.
31
+
32
+ It
33
+ takes quite a while to ingest docs (5-30 min. depending
34
+ on net, RAM, CPU etc.).
35
+
36
+ Send empty query (hit Enter) to check embedding status and files info ([filename, numb of chars])
37
+
38
+ Homepage: https://huggingface.co/spaces/mikeee/localgpt
39
+ """
40
+ gr.Markdown(dedent(_))
41
+
42
  # Upload files and generate vectorstore
43
  with gr.Row():
44
  file_output = gr.File()
 
50
  file_count="multiple",
51
  )
52
  with gr.Row():
53
+ text2 = gr.Textbox("Process docs")
54
+ process_btn = gr.Button("Click to process")
55
+ with gr.Row():
56
+ text_embed = gr.Textbox("Generate embeddings")
57
+ embed_btn = gr.Button("Click to embed")
58
 
59
  reset_btn = gr.Button("Reset everything", visible=False)
60
 
61
+ with gr.Tab("Query docs"): # Tab2
62
  # interactive chat
63
  chatbot = gr.Chatbot()
64
  msg = gr.Textbox(label="Query")
 
71
  globals().update(**{"ns": deepcopy(ns_initial)})
72
  return f"reset done: ns={ns}"
73
 
74
+ # Tab1
 
75
  upload_button.upload(upload_files, upload_button, file_output)
76
  process_btn.click(process_files, [], text2)
77
+ embed_btn.click(embed_files, [], text_embed)
78
+ reset_btn.click(reset_all, [], text2)
79
 
80
+ # Tab2
81
  msg.submit(respond, [msg, chatbot], [msg, chatbot])
82
  clear.click(lambda: None, None, chatbot, queue=False)
83
 
requirements-dev.txt CHANGED
@@ -1,2 +1,6 @@
1
  ipython
2
- pylint
 
 
 
 
 
1
  ipython
2
+ isort
3
+ black
4
+ pydocstyle
5
+ pyright
6
+ pylint
run-main.sh ADDED
@@ -0,0 +1 @@
 
 
1
+ nodemon -w app.py -w main.py -x python main.py