# Hacky fix for HF environment issues: pin older versions of a few
# dependencies before anything that depends on them gets imported.
import os
os.system("pip uninstall -y spaces")
os.system("pip install spaces==0.17.0")
os.system("pip uninstall -y gradio")
os.system("pip uninstall -y pydantic")
os.system("pip uninstall -y typer")
os.system("pip install typer==0.4.0")
os.system("pip install pydantic==1.8.2 --use-deprecated=legacy-resolver")
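# Note: runtime pip installs only affect modules that have not been imported
# yet, so this block has to stay above the appStore imports below.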

import appStore.vulnerability_analysis as vulnerability_analysis
import appStore.doc_processing as processing
from appStore.rag import run_query
from utils.uploadAndExample import add_upload, get_tabs
from utils.vulnerability_classifier import label_dict
import streamlit as st
import pandas as pd
import plotly.express as px


st.set_page_config(page_title='Vulnerability Analysis',
                   initial_sidebar_state='expanded', layout='wide')

with st.sidebar:
    # Choose between uploading a document and using the bundled example
    choice = st.radio(label='Select the Document',
                      help='You can upload your own documents '
                           'or use the example document',
                      options=('Upload Document', 'Try Example'),
                      horizontal=True)
    add_upload(choice)
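    # add_upload presumably stores the chosen document (and a 'filename_*'
    # session key per file) in st.session_state; those keys are read further
    # down to build the per-document result tabs.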

with st.container():
    st.markdown("<h2 style='text-align: center;'> Vulnerability Analysis </h2>", unsafe_allow_html=True)
    st.write(' ')

with st.expander("ℹ️ - About this app", expanded=False):
    st.write(
        """
        The Vulnerability Analysis App is an open-source digital tool
        that helps policy analysts and other users extract and filter
        references to different vulnerable groups from public documents.
        """)

    st.write("""
        What Happens in background?
        
        - Step 1: Once the document is provided to app, it undergoes *Pre-processing*.\
        In this step the document is broken into smaller paragraphs \
        (based on word/sentence count).
        - Step 2: The paragraphs are then fed to the **Vulnerability Classifier** which detects if
        the paragraph contains any references to vulnerable groups.
        """)
                  
    st.write("")


# Pipeline steps, run in order: pre-processing first, then classification
apps = [processing.app, vulnerability_analysis.app]
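# Each step presumably reads its inputs from st.session_state and writes its
# results back there (e.g. 'combined_files_df', which is read below), so the
# button handler only needs to call the steps in order.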

multiplier_val = 1 / len(apps)
if st.button("Analyze Documents"):
    prg = st.progress(0.0)
    for i, func in enumerate(apps):
        func()
        prg.progress((i + 1) * multiplier_val)

if 'combined_files_df' in st.session_state: # check for existence of processed documents
    # get the filenames from the processed docs dataframe so we can use for tab names
    uploaded_docs = [value for key, value in st.session_state.items() if key.startswith('filename_')]
    tab_titles = get_tabs(uploaded_docs)
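    # get_tabs presumably turns the filenames into display-friendly tab
    # titles, one per processed document.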

    if tab_titles:
        tabs = st.tabs(tab_titles)

        # Render the results (pie chart, summary and table) in individual tabs for each doc
        for tab, doc in zip(tabs, uploaded_docs):
            with tab:
                # Main app code
                with st.container():
                    st.write(' ')

                # Restrict the processed results to the current document
                df_vul = st.session_state['combined_files_df']
                df_vul = df_vul[df_vul['filename'] == doc]

                col1, col2 = st.columns([1,1])

                with col1:
                    # Header
                    st.subheader("Explore references to vulnerable groups:")

                    # Text 
                    num_paragraphs = len(df_vul['Vulnerability Label'])
                    num_references = len(df_vul[df_vul['Vulnerability Label'] != 'Other'])
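                    # Paragraphs labelled 'Other' are those without any reference
                    # to a vulnerable group, so everything else counts as a reference.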
                    
                    st.markdown(f"""<div style="text-align: justify;"> The document contains a
                            total of <span style="color: red;">{num_paragraphs}</span> paragraphs.
                            We identified <span style="color: red;">{num_references}</span>
                            references to vulnerable groups.</div>
                            <br>
                            In the pie chart on the right you can see the distribution of the different 
                            groups defined. For a more detailed view in the text, see the paragraphs and 
                            their respective labels in the table below.</div>""", unsafe_allow_html=True)
            
                with col2:
                    ### Pie chart

                    # Create a df that stores all the labels
                    df_labels = pd.DataFrame(list(label_dict.items()), columns=['Label ID', 'Label'])

                    # Count how often each label appears in the 'Vulnerability Label' column
                    label_counts = df_vul['Vulnerability Label'].value_counts().reset_index()
                    label_counts.columns = ['Label', 'Count']

                    # Merge the counts into df_labels; labels that never occur in this
                    # document get a count of 0 instead of NaN from the left join
                    df_labels = df_labels.merge(label_counts, on='Label', how='left').fillna({'Count': 0})
            
                    # Configure graph
                    fig = px.pie(df_labels,
                                 names='Label',
                                 values='Count',
                                 title='Label Counts',
                                 hover_name='Count',
                                 color_discrete_sequence=px.colors.qualitative.Plotly)

                    # Show plot
                    st.plotly_chart(fig, use_container_width=True)

                ### Document Summary
                st.markdown("----")
                st.markdown('**DOCUMENT FINDINGS SUMMARY:**')
                
                # Filter out 'Other': we don't want it in the table, and it would
                # make the summary far too long
                df_docs = df_vul[df_vul['Vulnerability Label'] != 'Other']
                # Construct the RAG query, send it to OpenAI and process the response
                run_query(df_docs)
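                # run_query's return value is not used, so it presumably renders
                # the generated summary to the page itself (inside appStore.rag).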
                
                st.markdown("----")
                
                with st.expander("ℹ️ - Document Text Classifications", expanded=False):
                    ### Table 
                    st.table(df_docs)