ffreemt committed on
Commit
21c3825
1 Parent(s): e882a86

Add test.epub to docs

Browse files
Files changed (4) hide show
  1. .gitignore +3 -0
  2. app.py +4 -2
  3. docs/test.epub +0 -0
  4. epub_loader.py +38 -0
.gitignore CHANGED
@@ -1,3 +1,6 @@
1
  .venv
2
  db
3
  dummy
 
 
 
 
1
  .venv
2
  db
3
  dummy
4
+ .ENV
5
+ .env
6
+ __pycache__
app.py CHANGED
@@ -289,12 +289,12 @@ def ingest(
289
  ]
290
 
291
 
 
292
  # https://huggingface.co/TheBloke/vicuna-7B-1.1-HF
293
  def gen_local_llm(model_id="TheBloke/vicuna-7B-1.1-HF"):
294
  """Gen a local llm.
295
 
296
  localgpt run_localgpt
297
-
298
  https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2
299
  with torch.device(“cuda”):
300
  model = AutoModelForCausalLM.from_pretrained(“gpt2-large”, torch_dtype=torch.float16)
@@ -354,7 +354,9 @@ def load_qa(device=None, model_name: str = "hkunlp/instructor-base"):
354
  llm = gen_local_llm() # "TheBloke/vicuna-7B-1.1-HF" 12G?
355
 
356
  qa = RetrievalQA.from_chain_type(
357
- llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
 
 
358
  )
359
 
360
  logger.info("Done qa")
 
289
  ]
290
 
291
 
292
+ # TheBloke/Wizard-Vicuna-7B-Uncensored-HF
293
  # https://huggingface.co/TheBloke/vicuna-7B-1.1-HF
294
  def gen_local_llm(model_id="TheBloke/vicuna-7B-1.1-HF"):
295
  """Gen a local llm.
296
 
297
  localgpt run_localgpt
 
298
  https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2
299
  with torch.device(“cuda”):
300
  model = AutoModelForCausalLM.from_pretrained(“gpt2-large”, torch_dtype=torch.float16)
 
354
  llm = gen_local_llm() # "TheBloke/vicuna-7B-1.1-HF" 12G?
355
 
356
  qa = RetrievalQA.from_chain_type(
357
+ llm=llm, chain_type="stuff",
358
+ retriever=retriever,
359
+ return_source_documents=True,
360
  )
361
 
362
  logger.info("Done qa")
docs/test.epub ADDED
Binary file (261 kB). View file
 
epub_loader.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Loads an epub file into a list of documents."""
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import List, Union
5
+
6
+ from epub2txt import epub2txt
7
+ from langchain.docstore.document import Document
8
+ from langchain.document_loaders.base import BaseLoader
9
+ from loguru import logger
10
+
11
+
12
+ @dataclass
13
+ class EpubLoader(BaseLoader):
14
+ """Load an epub file into a list of documents.
15
+
16
+ Args:
17
+ file_path: file path or url to epub
18
+ Returns:
19
+ self.load() -> list of Documents
20
+ """
21
+ file_path: Union[str, Path]
22
+
23
+ def load(self) -> List[Document]:
24
+ """Load data into document objects."""
25
+ try:
26
+ texts = epub2txt(self.file_path, outputlist=True)
27
+ ch_titles = epub2txt.content_titles
28
+
29
+ except Exception as exc:
30
+ logger.error(exc)
31
+ raise
32
+
33
+ docs = []
34
+ for title, text in zip(ch_titles, texts):
35
+ metadata = {"source": self.file_path, "ch.": title}
36
+ docs.append(Document(page_content=text, metadata=metadata))
37
+
38
+ return docs