|
from __future__ import annotations |
|
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type |
|
import logging |
|
import json |
|
import os |
|
from datetime import datetime |
|
import hashlib |
|
import csv |
|
import requests |
|
import re |
|
import html |
|
import markdown2 |
|
import torch |
|
import sys |
|
import gc |
|
from pygments.lexers import guess_lexer, ClassNotFound |
|
import time |
|
|
|
import gradio as gr |
|
from pypinyin import lazy_pinyin |
|
import tiktoken |
|
import mdtex2html |
|
from markdown import markdown |
|
from pygments import highlight |
|
from pygments.lexers import guess_lexer,get_lexer_by_name |
|
from pygments.formatters import HtmlFormatter |
|
|
|
from langchain.chains import LLMChain, RetrievalQA |
|
from langchain.chat_models import ChatOpenAI |
|
from langchain.document_loaders import PyPDFLoader, WebBaseLoader, UnstructuredWordDocumentLoader, DirectoryLoader |
|
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader |
|
from langchain.document_loaders.generic import GenericLoader |
|
from langchain.document_loaders.parsers import OpenAIWhisperParser |
|
from langchain.schema import AIMessage, HumanMessage |
|
from langchain.llms import HuggingFaceHub |
|
from langchain.llms import HuggingFaceTextGenInference |
|
from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings |
|
from langchain.tools import DuckDuckGoSearchRun |
|
from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever |
|
|
|
from langchain.embeddings.openai import OpenAIEmbeddings |
|
from langchain.prompts import PromptTemplate |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.vectorstores import Chroma |
|
from chromadb.errors import InvalidDimensionException |
|
import io |
|
from PIL import Image, ImageDraw, ImageOps, ImageFont |
|
import base64 |
|
from tempfile import NamedTemporaryFile |
|
|
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import word_tokenize |
|
from nltk.stem import WordNetLemmatizer |
|
nltk.download('punkt') |
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import numpy as np |
|
|
|
from reportlab.lib.pagesizes import inch, A4 |
|
from reportlab.platypus import SimpleDocTemplate, Frame, Spacer |
|
from reportlab.lib import colors |
|
from reportlab.lib.units import mm |
|
from reportlab.platypus import Paragraph, SimpleDocTemplate, Frame, Image, Table, ListFlowable, ListItem |
|
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle |
|
from reportlab.lib.units import cm |
|
|
|
|
|
# Configure the root logger at import time: INFO level, each record carrying
# timestamp, level, source file/line and the message.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s",
    level=logging.INFO,
)
|
|
|
|
|
|
|
|
|
|
|
# Reference phrases for an "I don't know" style answer; is_response_similar()
# compares short model answers against these.
ANTWORT_WEISS_NICHT = [
    "ich weiß nicht.",
    "ich weiß das nicht",
    "Ich habe dazu keine Antwort",
    "Ich bin nicht sicher",
    "Ich kann das nicht beantworten",
    "Es tut mir leid, aber ich kenne keinen",
    "Es tut mir leid, aber ich kann die Frage nicht beantworten.",
    "Es tut mir leid, aber ich kann die Frage nicht beantworten, da ich zu der Frage keine spezifischen Informatioen habe",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
# Shared instruction block appended to every prompt.
# The original literal started with "\A", an invalid escape sequence that left
# a stray backslash in the prompt text (and warns on recent Python versions);
# the backslash is removed here.
template = (
    "Antworte in deutsch, wenn es nicht explizit anders gefordert wird. "
    "Wenn du die Antwort nicht kennst, antworte direkt, dass du es nicht weißt.\n"
    "Versuche nicht es zu umschreiben. Versuche nicht, die Antwort zu erfinden oder aufzumocken. "
    "Halte die Antwort kurz aber ausführlich genug und exakt."
)

# Plain question answering; "{question}" is filled in by the prompt template.
llm_template = "Beantworte die Frage am Ende. " + template + "Frage: {question} "

# Asks for a headline of at most three words for the given text.
llm_template2 = "Fasse folgenden Text als Überschrift mit maximal 3 Worten zusammen. Text: {question} "

# Retrieval-augmented answering; expects "{context}" and "{question}".
rag_template = "Nutze die folgenden Kontext (Beginnend mit dem Wort 'Kontext:') aus Teilen aus den angehängten Dokumenten, um die Frage (Beginnend mit dem Wort 'Frage: ') am Ende zu beantworten. Wenn du die Frage aus dem folgenden Kontext nicht beantworten kannst, dann versuche eine Beantwortung aus deinen eigenen trainierten Daten zu finden. Mache das kenntlich, ob du dich auf den hier angehängten Kontext beziehst oder ob du anhand deiner Daten antwortest." + template + "Kontext: {context} Frage: {question}"
|
|
|
|
|
|
|
# Prompt objects built from the template strings above.
# Plain question prompt (single "question" variable).
LLM_CHAIN_PROMPT = PromptTemplate(
    template=llm_template,
    input_variables=["question"],
)

# Headline/summary prompt (single "question" variable).
LLM_CHAIN_PROMPT2 = PromptTemplate(
    template=llm_template2,
    input_variables=["question"],
)

# Retrieval prompt ("context" and "question" variables).
RAG_CHAIN_PROMPT = PromptTemplate(
    template=rag_template,
    input_variables=["context", "question"],
)
|
|
|
|
|
|
|
|
|
# Filesystem locations. CHROMA_DIR is concatenated onto PATH_WORK by the
# Chroma helpers below, which is why it starts with a slash.
PATH_WORK = "."
CHROMA_DIR = "/chroma/kkg"
CHROMA_PDF = "./chroma/kkg/pdf"
CHROMA_WORD = "./chroma/kkg/word"
CHROMA_EXCEL = "./chroma/kkg/excel"
YOUTUBE_DIR = "/youtube"
HISTORY_PFAD = "/data/history"

# Sample document sources.
PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
WEB_URL = "https://openai.com/research/gpt-4"
YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_response_similar(response, threshold=0.7):
    """Tell whether a short answer resembles one of the "I don't know" phrases.

    Args:
        response: The answer text to check.
        threshold: Cosine similarity that must be exceeded for a match.

    Returns:
        True if ``response`` is shorter than 160 characters and its TF-IDF
        cosine similarity to any entry of ``ANTWORT_WEISS_NICHT`` exceeds
        ``threshold``; otherwise False.
    """
    # Longer answers are never classified as "don't know".
    if len(response) >= 160:
        return False

    # Vectorize the reference phrases together with the candidate (last row).
    corpus = ANTWORT_WEISS_NICHT + [response]
    tfidf = TfidfVectorizer().fit_transform(corpus)

    # Similarity of the candidate against every reference phrase.
    scores = cosine_similarity(tfidf[-1], tfidf[:-1])
    if np.max(scores) > threshold:
        return True
    return False
|
|
|
|
|
|
|
|
|
|
|
def normalise_prompt (prompt):
    """Lower-case and tokenize a prompt, keeping only alphanumeric tokens.

    The prompt is lower-cased, split with ``word_tokenize``, tokens that are
    not purely alphanumeric are dropped, remaining non-word characters are
    stripped, and the tokens are re-joined with single spaces. The result is
    also printed to stdout.

    Args:
        prompt: The raw prompt text.

    Returns:
        The normalized prompt string.
    """
    words = [
        re.sub(r'\W+', '', token)
        for token in word_tokenize(prompt.lower())
        if token.isalnum()
    ]
    normalized_prompt = ' '.join(words)

    print("normaiserd prompt..................................")
    print(normalized_prompt)
    return normalized_prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_directory_loader(file_type, directory_path):
    """Build a DirectoryLoader for all files of one type below a directory.

    Args:
        file_type: File suffix including the dot. It selects the loader class
            and is used verbatim in the recursive glob ``**/*<file_type>``.
            Supported: '.pdf', '.word', '.docx', '.doc'.
        directory_path: Root directory to search.

    Returns:
        A ``DirectoryLoader`` configured with the matching loader class.

    Raises:
        KeyError: If ``file_type`` is not one of the supported suffixes.
    """
    loaders = {
        '.pdf': PyPDFLoader,
        # Kept for existing callers; note that this globs for "*.word" files.
        '.word': UnstructuredWordDocumentLoader,
        # Real Word file suffixes, so callers can actually match such files.
        '.docx': UnstructuredWordDocumentLoader,
        '.doc': UnstructuredWordDocumentLoader,
    }
    return DirectoryLoader(
        path=directory_path,
        glob=f"**/*{file_type}",
        loader_cls=loaders[file_type],
    )
|
|
|
|
|
def document_loading_splitting():
    """Load the PDF and Word documents and split them into chunks.

    Documents are loaded from ``CHROMA_PDF`` and ``CHROMA_WORD`` and split
    with a chunk size of 1500 and an overlap of 150.

    Returns:
        The list of document chunks produced by the text splitter.
    """
    # NOTE(review): '.word' makes the loader glob for "**/*.word"; files named
    # *.docx / *.doc would not be matched - confirm the intended suffix.
    sources = (('.pdf', CHROMA_PDF), ('.word', CHROMA_WORD))

    # Create both loaders before loading anything, then collect PDF documents
    # first and Word documents second.
    directory_loaders = [create_directory_loader(suffix, folder) for suffix, folder in sources]
    docs = []
    for directory_loader in directory_loaders:
        docs.extend(directory_loader.load())

    splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150, chunk_size = 1500)
    return splitter.split_documents(docs)
|
|
|
|
|
|
|
def document_storage_chroma(splits):
    """Embed the given chunks with OpenAI embeddings and persist them in Chroma.

    Args:
        splits: Document chunks to store.
    """
    embedding = OpenAIEmbeddings(disallowed_special = ())
    target_dir = PATH_WORK + CHROMA_DIR
    Chroma.from_documents(documents = splits, embedding = embedding, persist_directory = target_dir)
|
|
|
|
|
|
|
|
|
|
|
def document_storage_mongodb(splits):
    """Embed the given chunks with OpenAI embeddings and store them in MongoDB Atlas.

    Args:
        splits: Document chunks to store.
    """
    # NOTE(review): MongoDBAtlasVectorSearch, MONGODB_COLLECTION and
    # MONGODB_INDEX_NAME are not defined or imported in the visible part of
    # this file - confirm they exist before calling this function.
    embedding = OpenAIEmbeddings(disallowed_special = ())
    MongoDBAtlasVectorSearch.from_documents(
        documents = splits,
        embedding = embedding,
        collection = MONGODB_COLLECTION,
        index_name = MONGODB_INDEX_NAME,
    )
|
|
|
|
|
def document_retrieval_chroma(llm, prompt):
    """Open the persisted Chroma store with OpenAI embeddings.

    Args:
        llm: Not used by this function.
        prompt: Not used by this function.

    Returns:
        The ``Chroma`` vector store located at ``PATH_WORK + CHROMA_DIR``.
    """
    embedding_fn = OpenAIEmbeddings()
    return Chroma(embedding_function = embedding_fn, persist_directory = PATH_WORK + CHROMA_DIR)
|
|
|
|
|
|
|
|
|
def document_retrieval_chroma2():
    """Open the persisted Chroma store with OpenAI embeddings and report readiness.

    Returns:
        The ``Chroma`` vector store located at ``PATH_WORK + CHROMA_DIR``.
    """
    embedding_fn = OpenAIEmbeddings()
    vector_store = Chroma(embedding_function = embedding_fn, persist_directory = PATH_WORK + CHROMA_DIR)
    print ("Chroma DB bereit ...................")
    return vector_store
|
|
|
|
|
|
|
def document_retrieval_mongodb(llm, prompt):
    """Open the MongoDB Atlas vector store with OpenAI embeddings.

    Args:
        llm: Not used by this function.
        prompt: Not used by this function.

    Returns:
        The vector store created from the connection string.
    """
    # NOTE(review): MongoDBAtlasVectorSearch and the MONGODB_* constants are not
    # defined or imported in the visible part of this file - confirm they exist.
    namespace = MONGODB_DB_NAME + "." + MONGODB_COLLECTION_NAME
    embedding = OpenAIEmbeddings(disallowed_special = ())
    return MongoDBAtlasVectorSearch.from_connection_string(
        MONGODB_URI,
        namespace,
        embedding,
        index_name = MONGODB_INDEX_NAME,
    )
|
|
|
|
|
|
|
|
|
|
|
def llm_chain(llm, prompt):
    """Run the plain question prompt (``LLM_CHAIN_PROMPT``) against the model.

    Args:
        llm: The language model to use.
        prompt: Text inserted as the ``question`` variable.

    Returns:
        The output of the chain run.
    """
    chain = LLMChain(llm = llm, prompt = LLM_CHAIN_PROMPT)
    return chain.run({"question": prompt})
|
|
|
|
|
def llm_chain2(llm, prompt):
    """Run the headline prompt (``LLM_CHAIN_PROMPT2``) against the model.

    Args:
        llm: The language model to use.
        prompt: Text inserted as the ``question`` variable.

    Returns:
        The output of the chain run.
    """
    chain = LLMChain(llm = llm, prompt = LLM_CHAIN_PROMPT2)
    return chain.run({"question": prompt})
|
|
|
|
|
def rag_chain(llm, prompt, db):
    """Answer a question with retrieval over ``db`` using ``RAG_CHAIN_PROMPT``.

    Args:
        llm: The language model to use.
        prompt: The user question, passed as ``query``.
        db: Vector store; its retriever is asked for the 5 best matches.

    Returns:
        The ``"result"`` entry of the chain output.
    """
    retriever = db.as_retriever(search_kwargs = {"k": 5})
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        chain_type_kwargs = {"prompt": RAG_CHAIN_PROMPT},
        retriever = retriever,
        return_source_documents = True,
    )
    answer = qa_chain({"query": prompt})
    return answer["result"]
|
|
|
|
|
|
|
|
|
|
|
def rag_chain2(prompt, db, k=3):
    """Build a prompt that embeds the question and the retrieved chunks.

    No model is called here; the function only assembles text.

    Args:
        prompt: The user question.
        db: Vector store queried via ``similarity_search``.
        k: Number of chunks to retrieve.

    Returns:
        The instruction text followed by the numbered chunks, one per line.
    """
    header = "Nutze die folgenden Kontext Teile am Ende, um die Frage zu beantworten . " + template + "Frage: " + prompt + "Kontext Teile: "
    hits = db.similarity_search(prompt, k)

    # Each chunk is rendered with its default string form, numbered from 1.
    numbered = [f"{position}. {chunk}\n" for position, chunk in enumerate(hits, start=1)]
    return header + "".join(numbered)
|
|
|
|
|
|
|
|
|
|
|
def generate_prompt_with_history(text, history, max_length=4048):
    """Flatten the chat history and the new message into one prompt string.

    Each history turn is rendered as ``"\\n<user>\\n<assistant>"`` and the new
    message as ``"\\n<text>\\n"``; the pieces are concatenated in chronological
    order. The result is also printed to stdout.

    Args:
        text: The new user message.
        history: Sequence of (user, assistant) pairs.
        max_length: Not used by this function.

    Returns:
        The combined prompt string.
    """
    turns = ["\n{}\n{}".format(turn[0], turn[1]) for turn in history]
    turns.append("\n{}\n".format(text))

    # There is always at least the new message, so the result is never empty.
    history_text = "".join(turns)

    print("hist+prompt: ")
    print(history_text)
    return "" + history_text
|
|
|
|
|
|
|
|
|
def generate_prompt_with_history_openai(prompt, history):
    """Convert history plus the new prompt into OpenAI chat message dicts.

    Args:
        prompt: The new user message.
        history: Iterable of (user, assistant) pairs.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts: alternating user and
        assistant entries for the history, followed by the new user message.
        The list is also printed to stdout.
    """
    messages = []
    for human, assistant in history:
        messages.extend((
            {"role": "user", "content": human},
            {"role": "assistant", "content": assistant},
        ))
    messages.append({"role": "user", "content": prompt})

    print("openai history und prompt................")
    print(messages)
    return messages
|
|
|
|
|
|
|
def generate_prompt_with_history_hf(prompt, history):
    """Render history plus the new prompt in ``<human>``/``<bot>`` text format.

    Every turn becomes ``"\\n<human>:<user>\\n<bot>:<assistant>"``; the new
    prompt is appended as a final turn with an empty bot part.

    Args:
        prompt: The new user message.
        history: Sequence of (user, assistant) pairs.

    Returns:
        The concatenated prompt string. (The original built this string but
        never returned it, so callers always received None.)
    """
    history_transformer_format = [*history, [prompt, ""]]
    messages = "".join(
        "\n<human>:" + item[0] + "\n<bot>:" + item[1]
        for item in history_transformer_format
    )
    return messages
|
|
|
|
|
|
|
def generate_prompt_with_history_langchain(prompt, history):
    """Convert history plus the new prompt into LangChain message objects.

    Args:
        prompt: The new user message.
        history: Iterable of (user, assistant) pairs.

    Returns:
        A list alternating ``HumanMessage`` and ``AIMessage`` for the history,
        followed by a ``HumanMessage`` for the new prompt.
    """
    messages = []
    for human, ai in history:
        messages.extend((HumanMessage(content=human), AIMessage(content=ai)))
    messages.append(HumanMessage(content=prompt))
    return messages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_image(image_path, prompt, model_image, oai_key):
    """Build headers and payload for a chat request that includes an image.

    The image file is read and embedded as a base64 data URL.

    Args:
        image_path: Path of the image file to send.
        prompt: The user question about the image.
        model_image: Model name placed in the payload.
        oai_key: API key used for the bearer token.

    Returns:
        A ``(headers, payload)`` tuple of dicts.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_string = base64.b64encode(raw_bytes).decode('utf-8')

    # NOTE(review): llm_template still contains the literal "{question}"
    # placeholder here; the prompt is appended after it, not substituted.
    text_part = {
        "type": "text",
        "text": llm_template + prompt
    }
    # The data URL always declares image/jpeg, whatever the file type is.
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{encoded_string}"
        }
    }

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {oai_key}"
    }
    payload = {
        "model": model_image,
        "messages": [
            {
                "role": "user",
                "content": [text_part, image_part]
            }
        ],
        "max_tokens": 300
    }
    return headers, payload
|
|
|
|
|
def process_chatverlauf(prompt, model, oai_key):
    """Build headers and payload for a request asking for a two-word headline.

    Args:
        prompt: Text to summarize; only the first 50 characters are used.
        model: Model name placed in the payload.
        oai_key: API key used for the bearer token.

    Returns:
        A ``(headers, payload)`` tuple of dicts.
    """
    # Slicing is a no-op for prompts of 50 characters or fewer.
    shortened = prompt[:50]

    instruction = {
        "type": "text",
        "text": 'Gib folgendem Text eine Überschrift mit maximal 2 Worten' + shortened
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {oai_key}"
    }
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [instruction]
            }
        ],
        "max_tokens": 100
    }
    return headers, payload
|
|
|
def process_chatverlauf_hf(history, llm):
    """Ask the model for a headline (max. three words) for the chat history.

    Args:
        history: Sequence of (user, assistant) pairs.
        llm: The language model to use.

    Returns:
        The output of ``llm_chain2`` for the flattened history.
    """
    headline_request = generate_prompt_with_history("Gib folgendem Text eine Überschrift mit maximal 3 Worten", history)
    return llm_chain2(llm, headline_request)
|
|
|
|
|
|
|
def save_and_download(chat_history):
    """Write the chat history to a new ``.txt`` file under ``./temp``.

    The file is not deleted automatically; pass the returned path to
    ``cleanup`` once it is no longer needed.

    Args:
        chat_history: The text to write.

    Returns:
        The path of the created file.
    """
    temp_dir = "./temp"
    # The original failed with FileNotFoundError when ./temp did not exist yet.
    os.makedirs(temp_dir, exist_ok=True)

    # Explicit UTF-8 so umlauts are written correctly regardless of the locale.
    with NamedTemporaryFile(delete=False, mode="w", suffix=".txt", dir=temp_dir, encoding="utf-8") as tmp:
        temp_file_path = tmp.name
        tmp.write(chat_history)
    return temp_file_path
|
|
|
def cleanup(file_path):
    """Delete the file at ``file_path`` if it exists; otherwise do nothing.

    Args:
        file_path: Path of the file to remove.
    """
    if not os.path.exists(file_path):
        return
    os.remove(file_path)
|
|
|
|
|
|
|
|
|
|
|
def markdown_to_html_with_syntax_highlight(md_str):
    """Convert Markdown to HTML, highlighting fenced code blocks with Pygments.

    Args:
        md_str: Markdown text, possibly containing fenced code blocks.

    Returns:
        The HTML produced by ``markdown`` after the code blocks were replaced
        by highlighted ``<pre><code>`` fragments.
    """
    fence_pattern = r"```(\w+)?\n([\s\S]+?)\n```"

    def replacer(match):
        code = match.group(2)
        lang = (match.group(1) or "text").strip()

        # Without a usable tag, let Pygments guess and use the lexer's name.
        if lang == "text":
            lang = guess_lexer(code).name

        # Names that are not valid lexer aliases fall back to Python.
        try:
            lexer = get_lexer_by_name(lang, stripall=True)
        except ValueError:
            lexer = get_lexer_by_name("python", stripall=True)

        highlighted_code = highlight(code, lexer, HtmlFormatter())
        return f'<pre><code class="{lang}">{highlighted_code}</code></pre>'

    with_highlighting = re.sub(fence_pattern, replacer, md_str, flags=re.MULTILINE)
    return markdown(with_highlighting)
|
|
|
|
|
def normalize_markdown(md_text: str) -> str:
    """Normalize blank lines around Markdown list items.

    A blank line is inserted before a list that directly follows text, and
    blank lines between consecutive list items are removed. A blank line that
    follows a list item and precedes non-list text is kept.

    Args:
        md_text: The Markdown text.

    Returns:
        The normalized Markdown text.
    """
    list_marker = re.compile(r"^(\d+\.|-|\*|\+)\s")

    def is_list_item(raw_line):
        return list_marker.match(raw_line.strip()) is not None

    lines = md_text.split("\n")
    last_index = len(lines) - 1
    output = []
    inside_list = False

    for index, line in enumerate(lines):
        if is_list_item(line):
            # A list starting right after text needs a separating blank line.
            if not inside_list and index > 0 and lines[index - 1].strip() != "":
                output.append("")
            inside_list = True
            output.append(line)
            continue

        if inside_list and line.strip() == "":
            # Keep the blank line only if the next line is not a list item
            # (and this is not the last line).
            if index < last_index and not is_list_item(lines[index + 1]):
                output.append(line)
            continue

        inside_list = False
        output.append(line)

    return "\n".join(output)
|
|
|
|
|
def convert_mdtext(md_text):
    """Convert Markdown text to HTML and append ``ALREADY_CONVERTED_MARK``.

    Text outside fenced code blocks is normalized and rendered with
    ``markdown`` (if it contains inline code) or ``mdtex2html``; fenced code
    blocks are rendered with syntax highlighting.

    Args:
        md_text: The Markdown text.

    Returns:
        The HTML string followed by ``ALREADY_CONVERTED_MARK``.
    """
    # NOTE(review): ALREADY_CONVERTED_MARK is not defined in the visible part
    # of this file - confirm it exists.
    fenced_pattern = re.compile(r"```(.*?)(?:```|$)", re.DOTALL)
    inline_pattern = re.compile(r"`(.*?)`", re.DOTALL)

    code_blocks = fenced_pattern.findall(md_text)
    # split() interleaves text and captured code; every second item is text.
    text_parts = fenced_pattern.split(md_text)[::2]

    html_parts = []
    for text_part, code in zip(text_parts, code_blocks + [""]):
        if text_part.strip():
            text_part = normalize_markdown(text_part)
            if inline_pattern.search(text_part):
                html_parts.append(markdown(text_part, extensions=["tables"]))
            else:
                html_parts.append(mdtex2html.convert(text_part, extensions=["tables"]))
        if code.strip():
            fenced_code = f"\n```{code}\n\n```"
            html_parts.append(markdown_to_html_with_syntax_highlight(fenced_code))

    return "".join(html_parts) + ALREADY_CONVERTED_MARK
|
|
|
def convert_asis(userinput):
    """Wrap HTML-escaped user input in a whitespace-preserving paragraph.

    Args:
        userinput: The raw user text.

    Returns:
        The ``<p>`` element followed by ``ALREADY_CONVERTED_MARK``.
    """
    # NOTE(review): ALREADY_CONVERTED_MARK is not defined in the visible part
    # of this file - confirm it exists.
    escaped = html.escape(userinput)
    return f"<p style=\"white-space:pre-wrap;\">{escaped}</p>" + ALREADY_CONVERTED_MARK
|
|
|
def detect_converted_mark(userinput):
    """Return True if ``userinput`` ends with ``ALREADY_CONVERTED_MARK``.

    Args:
        userinput: The text to inspect.
    """
    # NOTE(review): ALREADY_CONVERTED_MARK is not defined in the visible part
    # of this file - confirm it exists.
    return userinput.endswith(ALREADY_CONVERTED_MARK)
|
|
|
|
|
|
|
def detect_language(code):
    """Split the language tag (first line) from the body of a code block.

    Args:
        code: Content of a fenced code block, starting right after the fence.

    Returns:
        ``(language, code_without_language)``. If ``code`` starts with a
        newline or has an empty first line, the language is ``""`` and the
        code is returned unchanged. Otherwise the language is the lower-cased
        first line, and the code has ``len(first_line)`` leading characters
        and any following whitespace removed.
    """
    first_line = "" if code.startswith("\n") else code.strip().split("\n", 1)[0]
    if not first_line:
        return "", code
    return first_line.lower(), code[len(first_line):].lstrip()
|
|
|
def convert_to_markdown(text):
    """Escape plain text so Markdown renders it with its original layout.

    Outside fenced code blocks, leading tabs and spaces become HTML entities,
    a leading ``#`` is backslash-escaped, and every line ends with two spaces
    (a Markdown hard line break). Lines inside fenced code blocks are copied
    unchanged. ``$`` is replaced by ``&#36;`` everywhere, including inside
    code blocks.

    The original replaced ``$`` with ``$`` and tab/space with themselves,
    i.e. the escaping was a no-op; the entities are restored here.

    Args:
        text: The plain text to convert.

    Returns:
        The converted text; every input line is terminated by a newline.
    """
    text = text.replace("$", "&#36;")

    def replace_leading_tabs_and_spaces(line):
        # Escape only the indentation; stop at the first other character.
        escaped = []
        for char in line:
            if char == "\t":
                escaped.append("&#9;")
            elif char == " ":
                escaped.append("&nbsp;")
            else:
                break
        return "".join(escaped) + line[len(escaped):]

    parts = []
    in_code_block = False

    for line in text.split("\n"):
        if line.startswith("```"):
            # A fence line toggles code mode and is copied as-is.
            in_code_block = not in_code_block
            parts.append(f"{line}\n")
        elif in_code_block:
            parts.append(f"{line}\n")
        else:
            line = replace_leading_tabs_and_spaces(line)
            line = re.sub(r"^(#)", r"\\\1", line)
            # Two trailing spaces force a hard line break in Markdown.
            parts.append(f"{line}  \n")

    return "".join(parts)
|
|
|
def add_language_tag(text):
    """Add a guessed language tag to fenced code blocks that have none.

    Code blocks that already carry a tag, and blocks for which no language
    can be guessed, are returned unchanged.

    The original appended a second closing fence to every matched block
    (producing "``````"), added an extra blank line to untagged blocks, and
    passed the closing fence to the lexer guess; all three are fixed here.

    Args:
        text: Markdown text.

    Returns:
        The text with language tags added where one could be guessed.
    """
    def detect_language(code_block):
        try:
            return guess_lexer(code_block).name.lower()
        except ClassNotFound:
            return ""

    # group(1): opening fence; group(2): "<tag>\n<code>```" incl. closing fence.
    code_block_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE)

    def replacement(match):
        fenced_body = match.group(2)
        if not fenced_body.startswith("\n"):
            # Already tagged: keep the block exactly as written.
            return match.group(0)

        # Guess on the code only, without the trailing closing fence.
        language = detect_language(fenced_body[:-3])
        if language:
            return f"```{language}{fenced_body}"
        return match.group(0)

    return code_block_pattern.sub(replacement, text)
|
|
|
def delete_last_conversation(chatbot, history):
    """Remove the most recent entry from both ``chatbot`` and ``history``.

    Both lists are modified in place; empty lists are left untouched.

    Args:
        chatbot: List of displayed chat entries.
        history: List of history entries.

    Returns:
        ``(chatbot, history, "Delete Done")``.
    """
    for entries in (chatbot, history):
        if len(entries) > 0:
            entries.pop()
    return chatbot, history, "Delete Done"
|
|
|
def reset_state():
    """Return two fresh empty lists and the status text ``"Reset Done"``."""
    first, second = [], []
    return first, second, "Reset Done"
|
|
|
def reset_textbox():
    """Return a Gradio update that empties a textbox, plus an empty string."""
    cleared = gr.update(value="")
    return cleared, ""
|
|
|
def cancel_outputing():
    """Return the status text ``"Stop Done"``."""
    status = "Stop Done"
    return status
|
|
|
|
|
|
|
|
|
def analyze_file(file):
    """Return the text after the last dot of ``file.name``.

    If the name contains no dot, the whole name is returned.

    Args:
        file: Any object with a ``name`` string attribute.
    """
    return file.name.rsplit('.', 1)[-1]
|
|
|
|
|
|
|
def get_filename(file_pfad):
    """Return the part of ``file_pfad`` after the last ``/``.

    Args:
        file_pfad: A path string using ``/`` as separator.

    Returns:
        The final path component, or a German error message string if the
        path contains no ``/`` at all.
    """
    _, separator, filename = file_pfad.rpartition('/')
    if not separator:
        return "Ein Fehler im Filenamen ist aufgetreten..."
    return filename
|
|
|
|
|
|
|
|
|
def submit_message(assistant_id, thread, client, user_message):
    """Add a user message to a thread and start a run for the assistant.

    Args:
        assistant_id: ID of the assistant that should process the thread.
        thread: Thread object; only its ``id`` is used.
        client: API client exposing ``beta.threads``.
        user_message: Text of the user message.

    Returns:
        The run object returned by ``client.beta.threads.runs.create``.
    """
    threads_api = client.beta.threads
    threads_api.messages.create(
        thread_id=thread.id, role="user", content=user_message
    )
    return threads_api.runs.create(
        thread_id=thread.id,
        assistant_id=assistant_id,
    )
|
|
|
def get_response(thread, client, assi_id):
    """List the messages of a thread in ascending order.

    Args:
        thread: Thread object; only its ``id`` is used.
        client: API client exposing ``beta.threads.messages``.
        assi_id: Not used by this function.

    Returns:
        Whatever ``client.beta.threads.messages.list`` returns.
    """
    messages_api = client.beta.threads.messages
    return messages_api.list(thread_id=thread.id, order="asc")
|
|
|
def create_thread_and_run(user_input, client, assi_id):
    """Create a new thread, post the user input to it and start a run.

    Args:
        user_input: Text of the user message.
        client: API client exposing ``beta.threads``.
        assi_id: ID of the assistant that should process the thread.

    Returns:
        ``(thread, run)``.
    """
    new_thread = client.beta.threads.create()
    started_run = submit_message(assi_id, new_thread, client, user_input)
    return new_thread, started_run
|
|
|
def pretty_print(messages):
    """Print each message as ``<role>: <text>`` under a ``# Messages`` header.

    Args:
        messages: Iterable of message objects with ``role`` and
            ``content[0].text.value``.
    """
    print("# Messages")
    for message in messages:
        text = message.content[0].text.value
        print(f"{message.role}: {text}")
    print()
|
|
|
|
|
def wait_on_run(run, thread, client):
    """Poll a run until its status is neither ``queued`` nor ``in_progress``.

    The run is re-fetched and then the function sleeps 0.5 seconds before the
    status is checked again.

    Args:
        run: The run to wait for.
        thread: Thread the run belongs to; only its ``id`` is used.
        client: API client exposing ``beta.threads.runs``.

    Returns:
        The most recently retrieved run object.
    """
    pending_states = ("queued", "in_progress")
    while run.status in pending_states:
        run = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id,
        )
        time.sleep(0.5)
    return run
|
|
|
|
|
|
|
def tavily_search(tavily_client, query):
    """Fetch search context for ``query`` from the Tavily client.

    Args:
        tavily_client: Client exposing ``get_search_context``.
        query: The search query.

    Returns:
        The result of ``get_search_context`` called with
        ``search_depth="advanced"`` and ``max_tokens=8000``.
    """
    return tavily_client.get_search_context(query, search_depth="advanced", max_tokens=8000)
|
|
|
|
|
|
|
def hugchat_search(chatbot, query):
    """Run a web-search-enabled query on the chatbot.

    Args:
        chatbot: Object exposing ``query(query, web_search=True)``.
        query: The search query.

    Returns:
        ``(text, link)`` taken from the query result.
    """
    answer = chatbot.query(query, web_search=True)
    return answer.text, answer.link
|
|
|
|
|
|
|
def openai_assistant_suche(client):
    """Create an assistant that can call a ``tavily_search`` function tool.

    The assistant uses the shared ``template`` text as instructions and the
    model ``gpt-4-1106-preview``.

    Args:
        client: API client exposing ``beta.assistants``.

    Returns:
        The created assistant object.
    """
    # JSON schema of the single tool parameter.
    query_schema = {
        "type": "object",
        "properties": {
            "query": {"type": "string", "description": "Die Suchanfrage, die die KI nicht beantworten konnte, hier hinein"},
        },
        "required": ["query"]
    }
    search_tool = {
        "type": "function",
        "function": {
            "name": "tavily_search",
            "description": "Get information on recent events from the web.",
            "parameters": query_schema
        }
    }
    return client.beta.assistants.create(
        instructions=template,
        model="gpt-4-1106-preview",
        tools=[search_tool]
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_picture(history, prompt):
    """Generate an image with DALL-E 3 and return it as a PIL image.

    Args:
        history: Not used by this function.
        prompt: Text description of the image to generate.

    Returns:
        The downloaded picture as a ``PIL.Image.Image``.

    Raises:
        requests.HTTPError: If downloading the generated image fails.
    """
    # Local imports: at module level the name ``Image`` is rebound to
    # reportlab's Image class by a later import, which has no ``open``.
    import io
    from PIL import Image as PILImage

    # NOTE(review): ``OpenAI`` is not imported in the visible part of this
    # file - confirm that it is available at module level.
    client = OpenAI()
    response = client.images.generate(model="dall-e-3", prompt=prompt, size="1024x1024", quality="standard", n=1)
    image_url = response.data[0].url

    download = requests.get(image_url, timeout=60)
    download.raise_for_status()

    # ``response.raw`` is already consumed for non-streaming requests, so the
    # image is decoded from the buffered body instead.
    return PILImage.open(io.BytesIO(download.content))
|
|
|
|
|
|
|
|
|
""" |
|
#Aufzählungen in der History erkennen und auch als Auflistung darstellen |
|
def erkennen_und_formatieren_von_aufzaehlungen_backup(text, styles): |
|
# Aufzählungszeichen oder Nummerierungen erkennen |
|
aufzaehlungszeichen = ['-', '*', '•'] |
|
nummerierung = [f'{i}.' for i in range(1, 11)] # Beispiel für einfache Nummerierungserkennung |
|
nummerierung2 = [f'{i}. ' for i in range(1, 11)] |
|
nummerierung3 = [f' {i}. ' for i in range(1, 11)] |
|
|
|
zeilen = text.split('\n') |
|
list_items = [] |
|
for zeile in zeilen: |
|
# Prüft, ob die Zeile mit einem Aufzählungszeichen oder einer Nummerierung beginnt |
|
if any(zeile.lstrip().startswith(zeichen) for zeichen in aufzaehlungszeichen + nummerierung + nummerierung2 + nummerierung3) : |
|
# Entfernt das Aufzählungszeichen/Nummerierung für die Darstellung |
|
for zeichen in aufzaehlungszeichen + nummerierung + nummerierung2 + nummerierung3: |
|
if zeile.lstrip().startswith(zeichen): |
|
zeile = zeile.lstrip()[len(zeichen):].lstrip() |
|
break |
|
list_items.append(ListItem(Paragraph(zeile, styles['BodyText']))) |
|
else: |
|
# Wenn die Zeile nicht als Teil einer Aufzählung erkannt wird, breche die Schleife ab |
|
break |
|
if list_items: |
|
# Eine Aufzählung wurde erkannt |
|
return ListFlowable(list_items, bulletType='bullet', start='bulletchar', bulletFontName='Helvetica') |
|
else: |
|
# Keine Aufzählung erkannt, gebe einen normalen Paragraph zurück |
|
return Paragraph(text, styles['BodyText']) |
|
|
|
#Aufzählungen in der History erkennen und auch als Auflistung darstellen |
|
def erkennen_und_formatieren_von_aufzaehlungen(text, styles): |
|
# Aufzählungszeichen oder Nummerierungen erkennen |
|
aufzaehlungszeichen = ['-', '*', '•'] |
|
# Regulärer Ausdruck für Nummerierungen (z.B. "1.", "2.") |
|
# Verbesserter regulärer Ausdruck, der optionale Leerzeichen vor der Nummerierung berücksichtigt |
|
nummerierung_regex = r"^\s*\d+\.\s*" # Optional Leerzeichen, gefolgt von Ziffern und einem Punkt, dann Leerzeichen |
|
zeilen = text.split('\n') |
|
list_items = [] |
|
for zeile in zeilen: |
|
# Prüft, ob die Zeile mit einem Aufzählungszeichen beginnt |
|
print("zeile:.............................") |
|
print(zeile) |
|
if any(zeile.lstrip().startswith(zeichen) for zeichen in aufzaehlungszeichen) or re.match(nummerierung_regex, zeile.lstrip()): |
|
# Entfernt das Aufzählungszeichen/Nummerierung für die Darstellung |
|
if (re.match(nummerierung_regex, zeile.lstrip())): |
|
cleaned_line = re.sub(nummerierung_regex, '', zeile.lstrip(), 1).lstrip() # Entfernt nummerierte Aufzählungszeichen |
|
else: |
|
for zeichen in aufzaehlungszeichen: |
|
if zeile.lstrip().startswith(zeichen): |
|
cleaned_line = zeile.lstrip()[len(zeichen):].lstrip() |
|
break |
|
print(cleaned_line) |
|
list_items.append(ListItem(Paragraph(cleaned_line, styles['BodyText']))) |
|
else: |
|
# Wenn die Zeile nicht als Teil einer Aufzählung erkannt wird, breche die Schleife ab |
|
# und behandle den gesamten Text als normalen Paragraphen, wenn keine Liste erkannt wurde |
|
if not list_items: |
|
return Paragraph(text, styles['BodyText']) |
|
break |
|
if list_items: |
|
# Eine Aufzählung wurde erkannt |
|
return ListFlowable(list_items, bulletType='bullet', start='bulletchar', bulletFontName='Helvetica') |
|
else: |
|
# Keine Aufzählung erkannt, gebe einen normalen Paragraph zurück |
|
return Paragraph(text, styles['BodyText']) |
|
""" |
|
|
|
|
|
|
|
|
|
def verarbeite_text_und_aufzaehlungen(text, styles):
    """Split text into paragraphs and bullet lists for the PDF export.

    Lines starting (after leading whitespace) with ``-``, ``*``, ``•`` or a
    number followed by a dot are collected into list items; consecutive other
    lines are joined with spaces into one paragraph.

    Args:
        text: The text to convert.
        styles: Style sheet; its ``'BodyText'`` style is used throughout.

    Returns:
        A list of ``Paragraph`` and ``ListFlowable`` objects in text order.
    """
    aufzaehlungszeichen = ['-', '*', '•']
    nummerierung_regex = r"^\s*\d+\.\s*"

    elements = []
    list_items = []
    paragraph_text = []

    for zeile in text.split('\n'):
        stripped = zeile.lstrip()
        is_numbered = re.match(nummerierung_regex, stripped)
        bullet = next((zeichen for zeichen in aufzaehlungszeichen if stripped.startswith(zeichen)), None)

        if bullet is None and not is_numbered:
            # Plain text line: close a pending list, then collect the line.
            if list_items:
                elements.append(ListFlowable(list_items, bulletType='bullet', start='bulletchar', bulletFontName='Helvetica'))
                list_items = []
            paragraph_text.append(zeile)
            continue

        # List line: flush the text collected so far as one paragraph.
        if paragraph_text:
            elements.append(Paragraph(' '.join(paragraph_text), styles['BodyText']))
            paragraph_text = []

        # Strip the numbering or bullet character before rendering.
        if is_numbered:
            cleaned_line = re.sub(nummerierung_regex, '', stripped, count=1).lstrip()
        else:
            cleaned_line = stripped[len(bullet):].lstrip()
        list_items.append(ListItem(Paragraph(cleaned_line, styles['BodyText'])))

    # Flush whatever is still pending at the end of the text.
    if paragraph_text:
        elements.append(Paragraph(' '.join(paragraph_text), styles['BodyText']))
    if list_items:
        elements.append(ListFlowable(list_items, bulletType='bullet', start='bulletchar', bulletFontName='Helvetica'))

    return elements
|
|
|
|
|
|
|
|
|
def on_each_page(canvas, doc):
    """Draw today's date right-aligned near the top of an A4 page.

    The date is also printed to stdout.

    Args:
        canvas: Canvas to draw on.
        doc: Not used by this function.
    """
    page_width, page_height = A4
    current_date = datetime.now().strftime("%Y-%m-%d")

    canvas.saveState()
    canvas.setFont('Times-Roman', 10)
    print(current_date)
    canvas.drawRightString(page_width - 72, page_height - 28, current_date)
    canvas.restoreState()
|
|
|
|
|
def erstellePdf(file_path_download, ueberschrift, dic_history):
    """Write the chat history as an A4 PDF.

    The document has a title followed, for every history entry, by the user
    text, the assistant text (with bullet lists detected) and a separator.

    Args:
        file_path_download: Path of the PDF file to create.
        ueberschrift: Title shown at the top of the document.
        dic_history: Mapping of user message to assistant answer.
    """
    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle('NewStyle', fontName='Helvetica', fontSize=11))
    line_style = ParagraphStyle('LineStyle', fontSize=4, leading=6, borderPadding=0,
                                spaceBefore=0, spaceAfter=0, textColor='black')
    # Separate sample style sheet used for the assistant text.
    list_style = getSampleStyleSheet()

    headline_nutzer = Paragraph('Nutzer:', styles['Heading3'])
    headline_assi = Paragraph('Assistent:', styles['Heading3'])

    elements = [Paragraph(ueberschrift, styles['Title'])]
    for nutzer, assi in dic_history.items():
        elements.append(headline_nutzer)
        elements.append(Paragraph(nutzer, styles['NewStyle']))
        elements.append(Spacer(1, 2*mm))

        elements.append(headline_assi)
        for elem in verarbeite_text_und_aufzaehlungen(assi, list_style):
            if isinstance(elem, list):
                elements.extend(elem)
            else:
                elements.append(elem)

        # Separator after each question/answer pair: space, thin rule, space.
        elements.append(Spacer(1, 8*mm))
        elements.append(Paragraph('_' * 100, line_style))
        elements.append(Spacer(1, 8*mm))

    doc = CustomDocTemplate(file_path_download, pagesize=A4)
    # NOTE(review): presumably meant as a per-page callback; the date header is
    # also drawn in CustomDocTemplate.handle_pageBegin - confirm this attribute
    # is actually read anywhere.
    doc.onPage = on_each_page
    doc.build(elements)
|
|
|
|
|
|
|
|
|
def hash_input(input_string):
    """Return the SHA-256 hex digest of ``input_string`` (default encoding UTF-8).

    Args:
        input_string: The text to hash.
    """
    digest = hashlib.sha256()
    digest.update(input_string.encode())
    return digest.hexdigest()
|
|
|
|
|
|
|
|
|
def transfer_input(inputs):
    """Pass the user input on, clear the textbox and show a button.

    Changes against the original: the unused ``reset_textbox()`` call is
    dropped, and the component-specific ``gr.Button.update`` (removed in
    Gradio 4) is replaced by the generic ``gr.update``, which the file already
    uses for the textbox.

    Args:
        inputs: The current textbox content.

    Returns:
        ``(inputs, textbox_update, button_update)`` where the updates empty
        the textbox and make the button visible.
    """
    return (
        inputs,
        gr.update(value=""),
        gr.update(visible=True),
    )
|
|
|
|
|
|
|
|
|
|
|
class State:
    """Holds a single ``interrupted`` flag."""

    # Class-level default; interrupt()/recover() set it per instance.
    interrupted = False

    def interrupt(self):
        """Set the ``interrupted`` flag on this instance."""
        self.interrupted = True

    def recover(self):
        """Clear the ``interrupted`` flag on this instance."""
        self.interrupted = False


# Module-level instance of State.
shared_state = State()
|
|
|
|
|
|
|
|
|
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
    """Tell whether ``s`` ends with a stop word or with the start of one.

    Args:
        s: The text to inspect.
        stop_words: Stop words to test against.

    Returns:
        True if ``s`` ends with any stop word or with any non-empty proper
        prefix of a stop word; otherwise False.
    """
    for stop_word in stop_words:
        # The full word plus all of its proper, non-empty prefixes.
        candidates = (stop_word, *(stop_word[:cut] for cut in range(1, len(stop_word))))
        if s.endswith(candidates):
            return True
    return False
|
|
|
|
|
|
|
|
|
|
|
|
|
class CustomDocTemplate(SimpleDocTemplate):
    """SimpleDocTemplate that stamps the current date on every page."""

    def handle_pageBegin(self):
        """Run the base page-begin handling, then draw today's date at (550, 800)."""
        self._handle_pageBegin()

        date_text = datetime.now().strftime("%Y-%m-%d")
        canvas = self.canv
        # Save and restore the graphics state so the font change stays local.
        canvas.saveState()
        canvas.setFont('Helvetica', 10)
        canvas.drawRightString(550, 800, date_text)
        canvas.restoreState()