karthikeyan-r committed on
Commit
be240ea
1 Parent(s): 4828c45

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -0
app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import pandas as pd
3
+ from transformers import AutoTokenizer, AutoModel
4
+ import torch
5
+ from langchain_community.document_loaders import PyPDFLoader
6
+ from IPython.display import display
7
+ import os
8
# Install the poppler command-line tools when the module is loaded.
# `-y` answers apt-get's confirmation prompt automatically; without it the
# install aborts whenever the app runs without an interactive terminal.
# NOTE(review): this executes every time the script is run and its exit status
# is ignored — consider moving the dependency into the deployment image.
os.system('apt-get install -y poppler-utils')
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ import numpy as np
11
+ import streamlit as st
12
+
13
class PDFProcessor:
    """
    Turns PDF files on disk into plain-text strings.
    """
    def extract_text_from_pdfs(self, file_paths: List[str]) -> List[str]:
        """
        Collect the text of every document produced from the given PDF files.

        Each file is opened with ``PyPDFLoader`` and split with
        ``load_and_split()``; every resulting document contributes one string
        to the output, in order. A file that raises while being processed is
        reported on stdout and skipped; text already collected from it before
        the failure is kept.

        Args:
            file_paths (List[str]): Paths of the PDF documents to read.

        Returns:
            List[str]: One string per extracted document, across all files.
        """
        collected: List[str] = []
        for file_path in file_paths:
            try:
                documents = PyPDFLoader(file_path).load_and_split()
                for page in documents:
                    content = page.page_content
                    if isinstance(content, str):
                        collected.append(content)
                    elif isinstance(content, bytes):
                        # Drop undecodable bytes instead of failing the whole file.
                        collected.append(content.decode('utf-8', errors='ignore'))
                    else:
                        print(f"Unexpected type: {type(page.page_content)}")
            except Exception as e:
                print(f"Failed to process {file_path}: {e}")

        return collected
46
+
47
class EmbeddingsProcessor:
    """
    Class for processing text to obtain embeddings using a transformer model.

    The model is placed on the GPU when CUDA is available and on the CPU
    otherwise; the chosen device is stored in ``self.device``.
    """
    def __init__(self, model_name: str):
        """
        Initialize the EmbeddingsProcessor with a pre-trained model.

        Args:
            model_name (str): The name of the pre-trained model to use for generating embeddings.
        """
        # Fall back to the CPU so the app still starts on hosts without a GPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts.

        All texts are tokenized as one padded batch (``truncation=True`` is
        passed to the tokenizer), run through the model without gradient
        tracking, and reduced to one vector per text by averaging the token
        vectors of ``last_hidden_state``. Padding positions are excluded from
        the average via the attention mask, so a text's embedding does not
        depend on the length of the other texts in the batch.

        Args:
            texts (List[str]): A list of text strings for which to generate embeddings.

        Returns:
            np.ndarray: A NumPy array of embeddings for the provided texts, one row per text.
        """
        encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        token_embeddings = model_output.last_hidden_state
        attention_mask = encoded_input.get("attention_mask")
        if attention_mask is None:
            # No mask available: fall back to a plain mean over all positions.
            pooled = token_embeddings.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)
            # clamp guards against division by zero for an all-padding row.
            pooled = (token_embeddings * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1e-9)

        return pooled.detach().cpu().numpy()
75
+
76
def compute_similarity(template_embeddings: np.ndarray, contract_embeddings: np.ndarray) -> np.ndarray:
    """
    Score every contract against every template with cosine similarity.

    Args:
        template_embeddings (np.ndarray): Embeddings of the templates, one row each.
        contract_embeddings (np.ndarray): Embeddings of the contracts, one row each.

    Returns:
        np.ndarray: Similarity matrix with one row per contract and one column per template.
    """
    # Contracts are passed first so that they index the rows of the result.
    scores = cosine_similarity(X=contract_embeddings, Y=template_embeddings)
    return scores
88
+
89
def clear_folder(path):
    """
    Make sure ``path`` exists and remove the regular files directly inside it.

    The directory is created when it is missing. Entries that are not files
    (for example sub-directories) are left alone. A file that cannot be
    removed is reported on stdout and skipped.

    Args:
        path: Directory to create and/or empty.
    """
    folder_missing = not os.path.exists(path)
    if folder_missing:
        os.makedirs(path)

    for entry_name in os.listdir(path):
        file_path = os.path.join(path, entry_name)
        try:
            if not os.path.isfile(file_path):
                continue
            os.remove(file_path)
        except Exception as e:
            print(f"Failed to delete {file_path}: {e}")
99
+
100
def save_uploaded_file(uploaded_file, path):
    """
    Write an uploaded file into the directory ``path``.

    Only the final component of ``uploaded_file.name`` is used as the file
    name, so a client-supplied name such as ``../../x.pdf`` cannot place the
    file outside ``path``.

    Args:
        uploaded_file: Object exposing ``name`` and ``getbuffer()``.
        path: Existing directory to write the file into.

    Returns:
        bool: True when the file was written, False when the name is empty or
        writing failed (the failure is reported on stdout).
    """
    # The name comes from the uploader's browser: keep the base name only.
    safe_name = os.path.basename(uploaded_file.name)
    if not safe_name:
        print(f"Failed to save upload: invalid file name {uploaded_file.name!r}")
        return False

    try:
        # Read the payload first so a failing upload does not leave an empty file behind.
        data = uploaded_file.getbuffer()
        with open(os.path.join(path, safe_name), "wb") as f:
            f.write(data)
    except Exception as e:
        # Best-effort save: report and signal failure instead of raising, but
        # let KeyboardInterrupt/SystemExit propagate (unlike a bare except).
        print(f"Failed to save {safe_name}: {e}")
        return False
    return True
107
+
108
# Streamlit UI
st.title('PDF Similarity Checker')

col1, col2 = st.columns(2)

# Clear the templates and contracts folders before uploading new files
templates_folder = './templates'
contracts_folder = './contracts'

clear_folder(templates_folder)
clear_folder(contracts_folder)

with col1:
    st.header("Upload Templates")
    uploaded_files_templates = st.file_uploader("Choose PDF files", accept_multiple_files=True, type=['pdf'])
    os.makedirs(templates_folder, exist_ok=True)
    for uploaded_file in uploaded_files_templates or []:
        if save_uploaded_file(uploaded_file, templates_folder):
            st.write(f"Saved: {uploaded_file.name}")

with col2:
    st.header("Upload Contracts")
    uploaded_files_contracts = st.file_uploader("Choose PDF files", key="contracts", accept_multiple_files=True, type=['pdf'])
    os.makedirs(contracts_folder, exist_ok=True)
    for uploaded_file in uploaded_files_contracts or []:
        if save_uploaded_file(uploaded_file, contracts_folder):
            st.write(f"Saved: {uploaded_file.name}")

model_name = st.selectbox("Select Model", ['sentence-transformers/multi-qa-mpnet-base-dot-v1'], index=0)


def _extract_first_chunks(pdf_processor, folder):
    """Return index-aligned (file_paths, texts) for the PDFs in ``folder`` that yielded text."""
    kept_files = []
    kept_texts = []
    # Sorted so the table rows/columns come out in a deterministic order.
    for name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, name)
        # Parse each PDF once and reuse the result.
        chunks = pdf_processor.extract_text_from_pdfs([file_path])
        if not chunks:
            # Dropping the file here keeps file names and embeddings aligned.
            st.warning(f"Skipped {name}: no text could be extracted.")
            continue
        kept_files.append(file_path)
        # NOTE(review): only the first extracted chunk represents the document,
        # as before — confirm whether the whole document should be compared.
        kept_texts.append(chunks[0])
    return kept_files, kept_texts


if st.button("Compute Similarities"):
    pdf_processor = PDFProcessor()

    # Process templates and contracts
    template_files, template_texts = _extract_first_chunks(pdf_processor, templates_folder)
    contract_files, contract_texts = _extract_first_chunks(pdf_processor, contracts_folder)

    if not template_texts or not contract_texts:
        # Nothing to compare: avoid calling the tokenizer/similarity with empty input.
        st.warning("Upload at least one readable template PDF and one readable contract PDF first.")
        if "similarity_df" in st.session_state:
            del st.session_state["similarity_df"]
    else:
        embedding_processor = EmbeddingsProcessor(model_name)
        template_embeddings = embedding_processor.get_embeddings(template_texts)
        contract_embeddings = embedding_processor.get_embeddings(contract_texts)

        # Compute similarities (rows: contracts, columns: templates)
        similarities = compute_similarity(template_embeddings, contract_embeddings)

        # Build the table rows: SI No, contract file name, one percentage per template
        similarity_data = []
        for serial, (contract_file, scores) in enumerate(zip(contract_files, similarities), start=1):
            row = [serial, os.path.basename(contract_file)]
            row.extend(f"{score * 100:.2f}%" for score in scores)
            similarity_data.append(row)

        # Create a DataFrame for the table
        columns = ["SI No", "Contract"] + [os.path.basename(f) for f in template_files]
        # Keep the result in session state: the button is only True on the run
        # right after the click, so widgets rendered inside this branch would
        # disappear as soon as the checkbox below triggers a rerun.
        st.session_state["similarity_df"] = pd.DataFrame(similarity_data, columns=columns)

similarity_df = st.session_state.get("similarity_df")
if similarity_df is not None:
    # Display maximize option
    if st.checkbox("Maximize Table View"):
        st.write("Similarity Scores Table (Maximized):")
        st.dataframe(similarity_df)  # Maximized view
    else:
        st.write("Similarity Scores Table:")
        st.table(similarity_df.style.hide(axis="index"))  # Normal view

    # Download option
    csv = similarity_df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download Similarity Table as CSV",
        data=csv,
        file_name='similarity_scores.csv',
        mime='text/csv',
    )