shresthasingh committed on
Commit 7747b9d
1 parent: 0330932

Upload 2 files

Files changed (2)
  1. legal_doc_summarizer.py +92 -0
  2. requirements.txt +8 -0
legal_doc_summarizer.py ADDED
@@ -0,0 +1,92 @@
+ import streamlit as st
+ import fitz  # PyMuPDF
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
+ import warnings
+ from sklearn.exceptions import ConvergenceWarning
+
+ # Suppress warnings
+ warnings.filterwarnings("ignore", category=FutureWarning)
+ warnings.filterwarnings("ignore", category=ConvergenceWarning)
+
+ # Summarization pipeline setup
+ model_name = "shresthasingh/my_awesome_billsum_model"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+ summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
+
+ # NER pipeline setup
+ ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
+ ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
+ ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
+
+ def extract_text_from_pdf(pdf_file):
+     # Read the uploaded file's bytes and extract text page by page
+     doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+     text = ""
+     for page in doc:
+         text += page.get_text()
+     return text
+
+ def summarize_text(text, min_length=30):
+     return summarizer(text, min_length=min_length, do_sample=False, truncation=True)[0]['summary_text']
+
+ def chunk_text(text, chunk_size=512):
+     # Split on whitespace into chunks of at most chunk_size words
+     words = text.split()
+     return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+
+ def recursive_summarize(text, chunk_size=300, min_length=30):
+     # Base case: the text already fits in a single chunk
+     if len(text.split()) <= chunk_size:
+         return summarize_text(text, min_length)
+
+     # Summarize each chunk, then recurse on the joined summaries
+     chunks = chunk_text(text, chunk_size)
+     summaries = [summarize_text(chunk, min_length) for chunk in chunks]
+     return recursive_summarize(' '.join(summaries), chunk_size, min_length)
+
+ def extract_named_entities(text, chunk_size=256):
+     chunks = chunk_text(text, chunk_size)
+     entities = {'PER': set(), 'ORG': set(), 'LOC': set()}
+
+     for chunk in chunks:
+         ner_results = ner_pipeline(chunk)
+         for result in ner_results:
+             entity_type = result['entity'].split('-')[-1]  # e.g. 'B-PER' -> 'PER'
+             if entity_type in entities:
+                 # Strip WordPiece continuation markers from subword tokens
+                 entities[entity_type].add(result['word'].replace('##', ''))
+
+     return entities
+
+ def process_legal_document(pdf_file):
+     # Extract text from PDF
+     text = extract_text_from_pdf(pdf_file)
+
+     # Generate summary
+     summary = recursive_summarize(text)
+
+     # Extract named entities
+     entities = extract_named_entities(text)
+
+     return summary, entities
+
+ # Streamlit App
+ st.title("Legal Document Summarizer")
+ st.write("Upload PDF documents to generate summaries and extract named entities.")
+
+ uploaded_files = st.file_uploader("Choose PDF files", type="pdf", accept_multiple_files=True)
+
+ if uploaded_files:
+     for uploaded_file in uploaded_files:
+         summary, entities = process_legal_document(uploaded_file)
+
+         st.write(f"**File:** {uploaded_file.name}")
+         st.write(f"**Summary:** {summary}")
+
+         st.write("**Named Entities:**")
+         st.write(f"**Persons:** {', '.join(entities['PER'])}")
+         st.write(f"**Organizations:** {', '.join(entities['ORG'])}")
+         st.write(f"**Locations:** {', '.join(entities['LOC'])}")
+
+ st.write("Note: The analysis results are displayed above.")
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ PyMuPDF
+ bert-extractive-summarizer
+ transformers
+ torch
+ sentence-transformers
+ pytesseract
+ Pillow
+ scikit-learn
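Note that streamlit itself is not listed above; if this repository runs as a Streamlit-SDK Space, Hugging Face installs it automatically, but a local environment needs it in addition to these packages. A local run would then be:

pip install -r requirements.txt streamlit
streamlit run legal_doc_summarizer.py

Of the listed packages, bert-extractive-summarizer, sentence-transformers, pytesseract, and Pillow are not imported by legal_doc_summarizer.py, so a minimal local install could omit them.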