# NOTE: removed web-scrape artifacts (Hugging Face Spaces page chrome, commit
# hashes, and a line-number gutter) that were pasted above the code and made
# this file unparseable as Python.
from typing import List
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from langchain_community.document_loaders import PyPDFLoader
from IPython.display import display
import os
# Best-effort install of poppler-utils (PDF rendering backend used by PDF
# loaders). `-y` avoids the interactive confirmation prompt, which would hang
# a non-interactive deployment; a non-zero exit status is surfaced instead of
# being silently discarded.
if os.system('apt-get install -y poppler-utils') != 0:
    print("Warning: could not install poppler-utils; PDF extraction may fail.")
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import streamlit as st
class PDFProcessor:
    """
    Extracts raw text content from PDF documents.
    """
    def extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """
        Extract text content from a list of PDF files.

        Args:
            file_paths (List[str]): Paths to the PDF documents to read.

        Returns:
            List[str]: Extracted text, one entry per successfully read page,
            in document order. Files that fail to load are skipped with a
            message rather than aborting the whole batch.
        """
        collected: List[str] = []
        for path in file_paths:
            try:
                pages = PyPDFLoader(path).load_and_split()
                for page in pages:
                    content = page.page_content
                    if isinstance(content, str):
                        collected.append(content)
                    elif isinstance(content, bytes):
                        # Defensive: decode byte payloads, dropping bad sequences.
                        collected.append(content.decode('utf-8', errors='ignore'))
                    else:
                        print(f"Unexpected type: {type(content)}")
            except Exception as err:
                # Keep going on a per-file basis; one bad PDF should not stop the rest.
                print(f"Failed to process {path}: {err}")
        return collected
class EmbeddingsProcessor:
    """
    Class for processing text to obtain embeddings using a transformer model.
    """
    def __init__(self, model_name: str):
        """
        Initialize the EmbeddingsProcessor with a pre-trained model.

        Args:
            model_name (str): The name of the pre-trained model to use for
                generating embeddings.
        """
        # Fall back to CPU when no GPU is present — the original hard-coded
        # 'cuda' and crashed on CPU-only machines.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate mean-pooled embeddings for a list of texts.

        Args:
            texts (List[str]): A list of text strings to embed.

        Returns:
            np.ndarray: Array of shape (len(texts), hidden_size); one
            mean-pooled hidden-state vector per input text.
        """
        encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
        # Inference only: no_grad skips autograd bookkeeping and saves memory.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        # Mean over the sequence dimension -> one vector per input text.
        return model_output.last_hidden_state.mean(dim=1).cpu().numpy()
def compute_similarity(template_embeddings: np.ndarray, contract_embeddings: np.ndarray) -> np.ndarray:
    """
    Compute cosine similarity between template and contract embeddings.

    Implemented directly with NumPy (row-normalise, then matrix product), so
    the result matches ``sklearn.metrics.pairwise.cosine_similarity(
    contract_embeddings, template_embeddings)`` without pulling in sklearn
    for a one-line computation.

    Args:
        template_embeddings (np.ndarray): (n_templates, dim) template vectors.
        contract_embeddings (np.ndarray): (n_contracts, dim) contract vectors.

    Returns:
        np.ndarray: (n_contracts, n_templates) matrix; entry [i, j] is the
        cosine similarity of contract i and template j.
    """
    def _row_normalize(m: np.ndarray) -> np.ndarray:
        # Guard all-zero rows: substituting norm 1 leaves the row at zero,
        # matching sklearn's behaviour for zero vectors (similarity 0).
        norms = np.linalg.norm(m, axis=1, keepdims=True)
        return m / np.where(norms == 0.0, 1.0, norms)

    return _row_normalize(contract_embeddings) @ _row_normalize(template_embeddings).T
def clear_folder(path):
    """Ensure *path* exists, then delete every regular file directly inside it.

    Subdirectories are left untouched; individual deletion failures are
    reported and the sweep continues.
    """
    if not os.path.exists(path):
        os.makedirs(path)
        return  # a brand-new directory has nothing to clear
    for entry in os.listdir(path):
        target = os.path.join(path, entry)
        if not os.path.isfile(target):
            continue  # only plain files are removed
        try:
            os.unlink(target)
        except Exception as err:
            print(f"Failed to delete {target}: {err}")
def save_uploaded_file(uploaded_file, path):
    """Persist an uploaded file's bytes into the *path* directory.

    Args:
        uploaded_file: Object exposing ``.name`` and ``.getbuffer()``
            (Streamlit's UploadedFile API).
        path: Destination directory; must already exist.

    Returns:
        bool: True on success, False if the file could not be written.
    """
    try:
        with open(os.path.join(path, uploaded_file.name), "wb") as f:
            f.write(uploaded_file.getbuffer())
        return True
    except Exception as e:
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt and hid the reason for the failure.
        print(f"Failed to save {uploaded_file.name}: {e}")
        return False
# Streamlit UI
st.title('PDF Similarity Checker')
# Two side-by-side upload panes: templates on the left, contracts on the right.
col1, col2 = st.columns(2)
# Clear the templates and contracts folders before uploading new files
# (every Streamlit rerun re-executes this script, so each run starts from a
# clean slate on disk; clear_folder also creates the folder if missing).
templates_folder = './templates'
contracts_folder = './contracts'
clear_folder(templates_folder)
clear_folder(contracts_folder)
with col1:
    st.header("Upload Templates")
    uploaded_files_templates = st.file_uploader("Choose PDF files", accept_multiple_files=True, type=['pdf'])
    os.makedirs(templates_folder, exist_ok=True)
    # Persist each uploaded template to disk so the PDF loader can read it by path.
    for uploaded_file in uploaded_files_templates:
        if save_uploaded_file(uploaded_file, templates_folder):
            st.write(f"Saved: {uploaded_file.name}")
with col2:
    st.header("Upload Contracts")
    # key="contracts" keeps this uploader's widget state separate from the templates one.
    uploaded_files_contracts = st.file_uploader("Choose PDF files", key="contracts", accept_multiple_files=True, type=['pdf'])
    os.makedirs(contracts_folder, exist_ok=True)
    for uploaded_file in uploaded_files_contracts:
        if save_uploaded_file(uploaded_file, contracts_folder):
            st.write(f"Saved: {uploaded_file.name}")
# Single-choice model selector; currently only one sentence-transformer is offered.
model_name = st.selectbox("Select Model", ['sentence-transformers/multi-qa-mpnet-base-dot-v1'], index=0)
def _collect_pdf_texts(processor, folder):
    """Return (file_paths, texts) for every readable PDF in *folder*, aligned.

    Each PDF is parsed exactly once (the original called the extractor twice
    per file) and files whose extraction fails are dropped from BOTH lists —
    the original kept them in the file list, misaligning table row labels
    with the embedding rows.

    NOTE(review): like the original, only the FIRST extracted page of each
    document is embedded — confirm whether whole-document text is intended.
    """
    files, texts = [], []
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        pages = processor.extract_text_from_pdfs([path])
        if pages:
            files.append(path)
            texts.append(pages[0])
    return files, texts

if st.button("Compute Similarities"):
    pdf_processor = PDFProcessor()
    embedding_processor = EmbeddingsProcessor(model_name)
    # Process templates and contracts, keeping file names aligned with texts.
    template_files, template_texts = _collect_pdf_texts(pdf_processor, templates_folder)
    contract_files, contract_texts = _collect_pdf_texts(pdf_processor, contracts_folder)
    if not template_texts or not contract_texts:
        # Embedding an empty list would crash the tokenizer; fail gracefully.
        st.warning("Upload at least one readable template and one contract first.")
    else:
        template_embeddings = embedding_processor.get_embeddings(template_texts)
        contract_embeddings = embedding_processor.get_embeddings(contract_texts)
        # similarities[i, j] = similarity of contract i vs template j.
        similarities = compute_similarity(template_embeddings, contract_embeddings)
        # Build one row per contract: SI No, file name, then one percentage per template.
        similarity_data = []
        for i, contract_file in enumerate(contract_files):
            row = [i + 1, os.path.basename(contract_file)]
            for j in range(len(template_files)):
                if i < similarities.shape[0] and j < similarities.shape[1]:
                    row.append(f"{similarities[i, j] * 100:.2f}%")
                else:
                    row.append("N/A")  # defensive: out-of-bounds indices
            similarity_data.append(row)
        # Create a DataFrame for the table.
        columns = ["SI No", "Contract"] + [os.path.basename(f) for f in template_files]
        similarity_df = pd.DataFrame(similarity_data, columns=columns)
        # Display, with an optional maximized (scrollable) view.
        if st.checkbox("Maximize Table View"):
            st.write("Similarity Scores Table (Maximized):")
            st.dataframe(similarity_df)
        else:
            st.write("Similarity Scores Table:")
            st.table(similarity_df.style.hide(axis="index"))
        # Download option.
        csv = similarity_df.to_csv(index=False).encode('utf-8')
        st.download_button(
            label="Download Similarity Table as CSV",
            data=csv,
            file_name='similarity_scores.csv',
            mime='text/csv',
        )