ChatPDF / chatpdf.py
sinksmell's picture
Duplicate from shibing624/ChatPDF
e86290c
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
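@description: Chat with a PDF/DOCX/Markdown/TXT document: retrieve the most similar passages and answer with a ChatGLM or LLaMA model.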
"""
from similarities import Similarity
from textgen import ChatGlmModel, LlamaModel
# Prompt sent to the generation model. `{context_str}` receives the retrieved
# passages and `{query_str}` the user question; both are filled in with
# `str.format` in `ChatPDF._generate_answer`. The Chinese instructions tell the
# model to answer concisely and professionally from the given context only, to
# say it cannot answer when the context is insufficient, not to fabricate
# content, and to answer in Chinese.
PROMPT_TEMPLATE = """\
基于以下已知信息,简洁和专业的来回答用户的问题。
如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。
已知内容:
{context_str}
问题:
{query_str}
"""
class ChatPDF:
    """Answer questions about a document using retrieval plus a chat model.

    A file is split into text passages, the passages are added to a
    ``Similarity`` corpus, and at query time the most similar passages are
    inserted into ``PROMPT_TEMPLATE`` and sent to the generation model.
    """

    # Characters that close a segment when re-joining the lines of a PDF page.
    # Both ASCII and fullwidth forms of ! ? ; : ) are accepted.
    # NOTE(review): the original list contained duplicate ASCII entries that
    # look like fullwidth characters lost to normalization - confirm.
    _SEGMENT_END_CHARS = frozenset(
        '.!?;:)]}"\''
        '。!?;:)…'
        '”’】》」』〕〉〗〞〟»'
    )

    def __init__(
            self,
            sim_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            gen_model_type: str = "chatglm",
            gen_model_name_or_path: str = "THUDM/chatglm-6b-int4",
            lora_model_name_or_path: str = None,
    ):
        """Create the similarity model and the generation model.

        :param sim_model_name_or_path: model passed to ``Similarity``.
        :param gen_model_type: ``"chatglm"`` or ``"llama"``.
        :param gen_model_name_or_path: model passed to the generation class.
        :param lora_model_name_or_path: optional LoRA name forwarded as ``lora_name``.
        :raises ValueError: if ``gen_model_type`` is not supported.
        """
        # Validate first so a bad type fails before any model is loaded.
        if gen_model_type not in ("chatglm", "llama"):
            raise ValueError('gen_model_type must be chatglm or llama.')
        self.sim_model = Similarity(model_name_or_path=sim_model_name_or_path)
        if gen_model_type == "chatglm":
            self.gen_model = ChatGlmModel(gen_model_type, gen_model_name_or_path, lora_name=lora_model_name_or_path)
        else:
            self.gen_model = LlamaModel(gen_model_type, gen_model_name_or_path, lora_name=lora_model_name_or_path)
        self.history = None  # chat history kept across queries when use_history=True
        self.pdf_path = None  # path of the most recently loaded file

    def load_pdf_file(self, pdf_path: str):
        """Extract text from a file and add it to the similarity corpus.

        The extractor is chosen by extension (``.pdf``, ``.docx``, ``.md``);
        anything else is read as UTF-8 plain text. Matching ignores case.
        """
        lowered = pdf_path.lower()
        if lowered.endswith('.pdf'):
            corpus = self.extract_text_from_pdf(pdf_path)
        elif lowered.endswith('.docx'):
            corpus = self.extract_text_from_docx(pdf_path)
        elif lowered.endswith('.md'):
            corpus = self.extract_text_from_markdown(pdf_path)
        else:
            corpus = self.extract_text_from_txt(pdf_path)
        self.sim_model.add_corpus(corpus)
        self.pdf_path = pdf_path

    @staticmethod
    def extract_text_from_pdf(file_path: str):
        """Extract text from a PDF file as a list of segments.

        Within each page, consecutive non-empty lines are concatenated
        (without a separator) until a line ends in one of
        ``_SEGMENT_END_CHARS``; text left over at the end of a page becomes
        its own segment.
        """
        import PyPDF2

        contents = []
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            for page in pdf_reader.pages:
                # `or ''` keeps a page that yields no text from raising.
                page_text = page.extract_text() or ''
                pending = []
                for line in page_text.splitlines():
                    line = line.strip()
                    if not line:
                        continue
                    pending.append(line)
                    if line[-1] in ChatPDF._SEGMENT_END_CHARS:
                        contents.append(''.join(pending))
                        pending = []
                # Flush the unfinished segment so it does not leak into the next page.
                if pending:
                    contents.append(''.join(pending))
        return contents

    @staticmethod
    def extract_text_from_txt(file_path: str):
        """Return the stripped, non-empty lines of a UTF-8 text file."""
        with open(file_path, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            return [line for line in stripped_lines if line]

    @staticmethod
    def extract_text_from_docx(file_path: str):
        """Return the stripped, non-empty paragraph texts of a DOCX file."""
        import docx

        document = docx.Document(file_path)
        stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
        return [text for text in stripped if text]

    @staticmethod
    def extract_text_from_markdown(file_path: str):
        """Render a Markdown file to HTML and return its non-empty text lines."""
        import markdown
        from bs4 import BeautifulSoup

        with open(file_path, 'r', encoding='utf-8') as f:
            markdown_text = f.read()
        html = markdown.markdown(markdown_text)
        soup = BeautifulSoup(html, 'html.parser')
        stripped = (line.strip() for line in soup.get_text().splitlines())
        return [line for line in stripped if line]

    @staticmethod
    def _add_source_numbers(lst):
        """Prefix each string with a 1-based ``[n]`` marker and wrap it in quotes."""
        return [f'[{idx + 1}]\t "{item}"' for idx, item in enumerate(lst)]

    def _generate_answer(self, query_str, context_str, history=None, max_length=1024):
        """Fill ``PROMPT_TEMPLATE`` and ask the generation model.

        :return: ``(response, out_history)`` as returned by ``gen_model.chat``.
        """
        prompt = PROMPT_TEMPLATE.format(context_str=context_str, query_str=query_str)
        response, out_history = self.gen_model.chat(prompt, history, max_length=max_length)
        return response, out_history

    def query(
            self,
            query,
            topn: int = 5,
            max_length: int = 1024,
            max_input_size: int = 1024,
            use_history: bool = False
    ):
        """Answer ``query`` from the loaded corpus.

        :param query: the user question.
        :param topn: number of similar passages to retrieve.
        :param max_length: forwarded to the generation model.
        :param max_input_size: character budget; the context is truncated to
            ``max_input_size - len(PROMPT_TEMPLATE)`` characters (never below 0).
        :param use_history: if true, pass and update ``self.history``.
        :return: ``(response, history, reference_results)``. The tuple has
            three elements even when no passage is retrieved.
        """
        sim_contents = self.sim_model.most_similar(query, topn=topn)
        reference_results = [
            self.sim_model.corpus[corpus_id]
            for id_score_dict in sim_contents.values()
            for corpus_id in id_score_dict
        ]
        history = self.history if use_history else None
        if not reference_results:
            # Same arity as the normal return so callers can always unpack three values.
            return '没有提供足够的相关信息', history, reference_results

        reference_results = self._add_source_numbers(reference_results)
        # Clamp at 0: a negative stop index would slice from the end instead of emptying the context.
        context_budget = max(0, max_input_size - len(PROMPT_TEMPLATE))
        context_str = '\n'.join(reference_results)[:context_budget]

        # max_length is honoured whether or not history is used.
        response, out_history = self._generate_answer(query, context_str, history, max_length=max_length)
        if use_history:
            self.history = out_history
        return response, out_history, reference_results

    def _default_index_path(self):
        """Derive ``<loaded file without extension>_index.json``.

        :raises ValueError: if no file has been loaded yet.
        """
        import os

        if self.pdf_path is None:
            raise ValueError('index_path must be given when no file has been loaded.')
        # splitext only strips a real extension, so dotted directories and
        # extension-less file names are handled correctly.
        return os.path.splitext(self.pdf_path)[0] + '_index.json'

    def save_index(self, index_path=None):
        """Save the similarity index; defaults to a path next to the loaded file."""
        if index_path is None:
            index_path = self._default_index_path()
        self.sim_model.save_index(index_path)

    def load_index(self, index_path=None):
        """Load the similarity index; defaults to a path next to the loaded file."""
        if index_path is None:
            index_path = self._default_index_path()
        self.sim_model.load_index(index_path)
if __name__ == "__main__":
    import sys

    # sys.argv[0] is the script name, so a single user-supplied argument
    # gives len(sys.argv) == 2; the check must therefore be "> 1".
    if len(sys.argv) > 1:
        gen_model_name_or_path = sys.argv[1]
    else:
        print('Usage: python chatpdf.py <gen_model_name_or_path>')
        gen_model_name_or_path = "THUDM/chatglm-6b-int4"
    m = ChatPDF(gen_model_name_or_path=gen_model_name_or_path)
    m.load_pdf_file(pdf_path='sample.pdf')
    # query() returns a tuple whose first element is the answer text.
    response = m.query('自然语言中的非平行迁移是指什么?')
    print(response[0])
    response = m.query('本文作者是谁?')
    print(response[0])