# Hugging Face Space demo (runs on CPU).
import argparse
import json
import re
import uuid
from pathlib import Path

import gensim  # NOTE(review): imported but unused in this view — confirm before removing
from concrete.ml.common.serialization.loaders import load
from transformers import AutoTokenizer, AutoModel

from utils_demo import get_batch_text_representation
def load_models():
    """Load the tokenizer, the embedding model, and the FHE NER classifier.

    Returns:
        A tuple ``(embeddings_model, tokenizer, fhe_ner_detection)`` where the
        first two come from the Hugging Face hub and the last is deserialized
        from the local ``models/cml_logreg.model`` file.
    """
    models_dir = Path(__file__).parent / "models"

    # Tokenizer and embedding model for token representations.
    model_name = "obi/deid_roberta_i2b2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    embeddings_model = AutoModel.from_pretrained(model_name)

    # Deserialize the Concrete ML logistic-regression classifier.
    with open(models_dir / "cml_logreg.model", "r") as model_file:
        fhe_ner_detection = load(file=model_file)

    return embeddings_model, tokenizer, fhe_ner_detection
def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
    """Replace tokens classified as named entities with short UUID placeholders.

    Args:
        text: Input document as a single string.
        embeddings_model: Transformer model used to embed each word token.
        tokenizer: Tokenizer matching ``embeddings_model``.
        fhe_ner_detection: Classifier whose ``predict_proba`` gives, per token,
            the probability of class 1 (named entity).

    Returns:
        A tuple ``(anonymized_text, uuid_map)``: the text with entity tokens
        replaced, and a dict mapping each replaced token to its 8-character
        UUID placeholder.
    """
    # Compile once instead of re-resolving the patterns on every token:
    # the first alternative captures word-like tokens, the second captures
    # runs of whitespace/punctuation that are preserved verbatim.
    token_pattern = re.compile(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)")
    word_pattern = re.compile(r"\w+")
    threshold = 0.5  # decision threshold on P(named entity)

    tokens = token_pattern.findall(text)
    uuid_map = {}
    processed_tokens = []
    for token in tokens:
        if token.strip() and word_pattern.match(token):  # word token
            x = get_batch_text_representation([token], embeddings_model, tokenizer)
            prediction_proba = fhe_ner_detection.predict_proba(x)
            probability = prediction_proba[0][1]  # P(class 1)
            if probability >= threshold:
                # Reuse the same placeholder for repeated mentions so the
                # anonymized text stays internally consistent.
                if token not in uuid_map:
                    uuid_map[token] = str(uuid.uuid4())[:8]
                processed_tokens.append(uuid_map[token])
            else:
                processed_tokens.append(token)
        else:
            processed_tokens.append(token)  # Preserve punctuation and spaces as is
    anonymized_text = ''.join(processed_tokens)
    return anonymized_text, uuid_map
def main():
    """CLI entry point: anonymize one text file and save the UUID mapping.

    Reads the file given on the command line, writes the original and the
    anonymized text under ``files/`` next to this script, and dumps the
    token -> UUID mapping to ``<input stem>_uuid_mapping.json`` in the
    current working directory.
    """
    parser = argparse.ArgumentParser(description="Anonymize named entities in a text file and save the mapping to a JSON file.")
    parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
    args = parser.parse_args()

    embeddings_model, tokenizer, fhe_ner_detection = load_models()

    # Read the input file
    with open(args.file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Robustness fix: make sure the output directory exists — the original
    # code assumed "files/" was already present and crashed otherwise.
    files_dir = Path(__file__).parent / "files"
    files_dir.mkdir(parents=True, exist_ok=True)

    # Save the original text to its specified file
    original_file_path = files_dir / "original_document.txt"
    with open(original_file_path, 'w', encoding='utf-8') as original_file:
        original_file.write(text)

    # Anonymize the text
    anonymized_text, uuid_map = anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection)

    # Save the anonymized text to its specified file
    anonymized_file_path = files_dir / "anonymized_document.txt"
    with open(anonymized_file_path, 'w', encoding='utf-8') as anonymized_file:
        anonymized_file.write(anonymized_text)

    # Save the UUID mapping to a JSON file (named after the input file's stem,
    # written to the current working directory).
    mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
    with open(mapping_path, 'w', encoding='utf-8') as file:
        json.dump(uuid_map, file, indent=4, sort_keys=True)

    print(f"Original text saved to {original_file_path}")
    print(f"Anonymized text saved to {anonymized_file_path}")
    print(f"UUID mapping saved to {mapping_path}")
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()