beki committed

Commit ac7d4be
1 Parent(s): a409919

Update app.py

Files changed (1)
  1. app.py  +79 -41
app.py CHANGED
@@ -1,34 +1,57 @@
-"""Streamlit app for Presidio."""
 
-import json
-from json import JSONEncoder
-from annotated_text import annotated_text
-import pandas as pd
-import streamlit as st
-from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
-from presidio_anonymizer import AnonymizerEngine
-
-from flair_recognizer import FlairRecognizer
+"""Streamlit app for Presidio + Privy-trained PII models."""
 
 import spacy
-spacy.cli.download("en_core_web_lg")
+from spacy_recognizer import CustomSpacyRecognizer
+from presidio_analyzer.nlp_engine import NlpEngineProvider
+from presidio_anonymizer import AnonymizerEngine
+from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+import pandas as pd
+from annotated_text import annotated_text
+from json import JSONEncoder
+import json
+import warnings
+import streamlit as st
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+warnings.filterwarnings('ignore')
+# from flair_recognizer import FlairRecognizer
 
 # Helper methods
 @st.cache(allow_output_mutation=True)
 def analyzer_engine():
     """Return AnalyzerEngine."""
 
-    flair_recognizer = FlairRecognizer()
-
+    spacy_recognizer = CustomSpacyRecognizer()
+
+    configuration = {
+        "nlp_engine_name": "spacy",
+        "models": [
+            {"lang_code": "en", "model_name": "en_spacy_pii_distilbert"}],
+    }
+
+    # Create NLP engine based on configuration
+    provider = NlpEngineProvider(nlp_configuration=configuration)
+    nlp_engine = provider.create_engine()
+
     registry = RecognizerRegistry()
-    registry.add_recognizer(flair_recognizer)
-    registry.load_predefined_recognizers()
+    # add rule-based recognizers
+    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
+    registry.add_recognizer(spacy_recognizer)
+    # remove the nlp engine we passed, to use custom label mappings
    registry.remove_recognizer("SpacyRecognizer")
-
-    analyzer = AnalyzerEngine(registry=registry)
+
+    analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
+                              registry=registry, supported_languages=["en"])
+
+    # uncomment for flair-based NLP recognizer
+    # flair_recognizer = FlairRecognizer()
+    # registry.load_predefined_recognizers()
+    # registry.add_recognizer(flair_recognizer)
+    # analyzer = AnalyzerEngine(registry=registry, supported_languages=["en"])
     return analyzer
 
-
+
 @st.cache(allow_output_mutation=True)
 def anonymizer_engine():
     """Return AnonymizerEngine."""
@@ -49,10 +72,12 @@ def analyze(**kwargs):
 
 def anonymize(text, analyze_results):
     """Anonymize identified input using Presidio Abonymizer."""
-
+    if not text:
+        return
     res = anonymizer_engine().anonymize(text, analyze_results)
     return res.text
 
+
 def annotate(text, st_analyze_results, st_entities):
     tokens = []
     # sort by start index
@@ -72,12 +97,14 @@ def annotate(text, st_analyze_results, st_entities):
     tokens.append(text[res.end:])
     return tokens
 
-st.set_page_config(page_title="Presidio demo (English)", layout="wide")
+
+st.set_page_config(page_title="Privy + Presidio demo (English)", layout="wide")
 
 # Side bar
 st.sidebar.markdown(
-    """
-    Detect and anonymize PII in text using an [NLP model](https://huggingface.co/beki/en_spacy_pii_distilbert) trained on protocol trace data generated by [privy](https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy) and rule-based classifiers from [presidio](https://aka.ms/presidio).
+    """
+    Detect and anonymize PII in text using an [NLP model](https://huggingface.co/beki/en_spacy_pii_distilbert) trained on protocol traces (JSON, SQL, XML etc.) generated by
+    [Privy](https://github.com/pixie-io/pixie/tree/main/src/datagen/pii/privy) and rule-based classifiers from [Presidio](https://aka.ms/presidio).
     """
 )
 
@@ -91,7 +118,8 @@ st_threshold = st.sidebar.slider(
     label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35
 )
 
-st_return_decision_process = st.sidebar.checkbox("Add analysis explanations in json")
+st_return_decision_process = st.sidebar.checkbox(
+    "Add analysis explanations in json")
 
 st.sidebar.info(
     "Privy is an open source framework for synthetic data generation in protocol trace formats (json, sql, html etc). Presidio is an open source framework for PII detection and anonymization. "
@@ -100,42 +128,49 @@ st.sidebar.info(
 
 
 # Main panel
-analyzer_load_state = st.info("Starting Presidio analyzer and loading Privy-trained model...")
+analyzer_load_state = st.info(
+    "Starting Presidio analyzer and loading Privy-trained PII model...")
 engine = analyzer_engine()
 analyzer_load_state.empty()
 
 
 st_text = st.text_area(
     label="Type in some text",
-    value=
-    "SELECT shipping FROM users WHERE shipping = '201 Thayer St Providence RI 02912'"
+    value="SELECT shipping FROM users WHERE shipping = '201 Thayer St Providence RI 02912'"
     "\n\n"
     "{user: Willie Porter, ip: 192.168.2.80, email: [email protected]}",
     height=200,
 )
 
+button = st.button("Detect PII")
+
+if 'first_load' not in st.session_state:
+    st.session_state['first_load'] = True
+
 # After
 st.subheader("Analyzed")
 with st.spinner("Analyzing..."):
-    st_analyze_results = analyze(
-        text=st_text,
-        entities=st_entities,
-        language="en",
-        score_threshold=st_threshold,
-        return_decision_process=st_return_decision_process,
-    )
-    annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
-    # annotated_tokens
-    annotated_text(*annotated_tokens)
-
+    if button or st.session_state.first_load:
+        st_analyze_results = analyze(
+            text=st_text,
+            entities=st_entities,
+            language="en",
+            score_threshold=st_threshold,
+            return_decision_process=st_return_decision_process,
+        )
+        annotated_tokens = annotate(st_text, st_analyze_results, st_entities)
+        # annotated_tokens
+        annotated_text(*annotated_tokens)
 # vertical space
 st.text("")
-
+
 st.subheader("Anonymized")
 
 with st.spinner("Anonymizing..."):
-    st_anonymize_results = anonymize(st_text, st_analyze_results)
-    st_anonymize_results
+    if button or st.session_state.first_load:
+        st_anonymize_results = anonymize(st_text, st_analyze_results)
+        st_anonymize_results
+
 
 # table result
 st.subheader("Detailed Findings")
@@ -155,11 +190,14 @@ if st_analyze_results:
     )
 
     st.dataframe(df, width=1000)
-    # table result
 else:
     st.text("No findings")
 
+st.session_state['first_load'] = True
+
 # json result
+
+
 class ToDictListEncoder(JSONEncoder):
     """Encode dict to json."""
 
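For anyone who wants to exercise the recognizer setup from this commit outside of Streamlit, the sketch below re-creates the engine wiring and runs one analyze/anonymize pass. It is a minimal sketch, not part of the commit: it assumes the en_spacy_pii_distilbert spaCy package is installed and that spacy_recognizer.py (defining CustomSpacyRecognizer) from this Space is importable; the sample text and the 0.35 threshold are illustrative defaults only.

# Minimal, Streamlit-free sketch of the analyzer/anonymizer wiring (assumptions noted above)
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from spacy_recognizer import CustomSpacyRecognizer  # local module from this Space (assumed importable)

# Same spaCy NLP engine configuration as analyzer_engine() in app.py
configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "en", "model_name": "en_spacy_pii_distilbert"}],
}
nlp_engine = NlpEngineProvider(nlp_configuration=configuration).create_engine()

# Rule-based recognizers plus the custom spaCy recognizer; drop the default SpacyRecognizer
registry = RecognizerRegistry()
registry.load_predefined_recognizers(nlp_engine=nlp_engine)
registry.add_recognizer(CustomSpacyRecognizer())
registry.remove_recognizer("SpacyRecognizer")

analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry, supported_languages=["en"])
anonymizer = AnonymizerEngine()

text = "{user: Willie Porter, ip: 192.168.2.80}"  # illustrative input
results = analyzer.analyze(text=text, language="en", score_threshold=0.35)
print(anonymizer.anonymize(text=text, analyzer_results=results).text)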