srinidhidevaraj committed on
Commit
cdca3a4
1 Parent(s): bd1c620

Update helpers.py

Browse files
Files changed (1) hide show
  1. helpers.py +215 -214
helpers.py CHANGED
@@ -1,214 +1,215 @@
1
- import re
2
- import os
3
- import simple_icd_10_cm as cm
4
- from transformers import AutoModelForCausalLM, AutoTokenizer
5
- # from openai import OpenAI
6
- from prompt_template import *
7
- from langchain_groq import ChatGroq
8
- from groq import Groq
9
- from dotenv import load_dotenv
10
- import csv
11
- import time
12
- load_dotenv()
13
-
14
- os.environ["LANGCHAIN_TRACING_V2"]="true"
15
- groq_api_key=os.environ.get('GROQ_API_KEY')
16
- os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
17
- LANGCHAIN_API_KEY=os.environ.get("LANGCHAIN_API_KEY")
18
-
19
- client = Groq()
20
-
21
- CHAPTER_LIST = cm.chapter_list
22
-
23
- def construct_translation_prompt(medical_note):
24
- """
25
- Construct a prompt template for translating spanish medical notes to english.
26
-
27
- Args:
28
- medical_note (str): The medical case note.
29
-
30
- Returns:
31
- str: A structured template ready to be used as input for a language model.
32
- """
33
- translation_prompt = """You are an expert Spanish-to-English translator. You are provided with a clinical note written in Spanish.
34
- You must translate the note into English. You must ensure that you properly translate the medical and technical terms from Spanish to English without any mistakes.
35
- Spanish Medical Note:
36
- {medical_note}"""
37
-
38
- return translation_prompt.format(medical_note = medical_note)
39
-
40
- def build_translation_prompt(input_note, system_prompt=""):
41
- """
42
- Build a zero-shot prompt for translating spanish medical notes to english.
43
-
44
- Args:
45
- input_note (str): The input note or query.
46
- system_prompt (str): Optional initial system prompt or instruction.
47
-
48
- Returns:
49
- list of dict: A structured list of dictionaries defining the role and content of each message.
50
- """
51
- input_prompt = construct_translation_prompt(input_note)
52
-
53
-
54
- return [{"role": "system", "content": system_prompt}, {"role": "user", "content": input_prompt}]
55
-
56
-
57
- def remove_extra_spaces(text):
58
- """
59
- Remove extra spaces from a given text.
60
-
61
- Args:
62
- text (str): The original text string.
63
-
64
- Returns:
65
- str: The cleaned text with extra spaces removed.
66
- """
67
- return re.sub(r'\s+', ' ', text).strip()
68
-
69
- def remove_last_parenthesis(text):
70
- """
71
- Removes the last occurrence of content within parentheses from the provided text.
72
-
73
- Args:
74
- text (str): The input string from which to remove the last parentheses and its content.
75
-
76
- Returns:
77
- str: The modified string with the last parentheses content removed.
78
- """
79
- pattern = r'\([^()]*\)(?!.*\([^()]*\))'
80
- cleaned_text = re.sub(pattern, '', text)
81
- return cleaned_text
82
-
83
- def format_code_descriptions(text, model_name):
84
- """
85
- Format the ICD-10 code descriptions by removing content inside brackets and extra spaces.
86
-
87
- Args:
88
- text (str): The original text containing ICD-10 code descriptions.
89
-
90
- Returns:
91
- str: The cleaned text with content in brackets removed and extra spaces cleaned up.
92
- """
93
- pattern = r'\([^()]*\)(?!.*\([^()]*\))'
94
- cleaned_text = remove_last_parenthesis(text)
95
- cleaned_text = remove_extra_spaces(cleaned_text)
96
-
97
- return cleaned_text
98
-
99
- def construct_prompt_template(case_note, code_descriptions, model_name):
100
- """
101
- Construct a prompt template for evaluating ICD-10 code descriptions against a given case note.
102
-
103
- Args:
104
- case_note (str): The medical case note.
105
- code_descriptions (str): The ICD-10 code descriptions formatted as a single string.
106
-
107
- Returns:
108
- str: A structured template ready to be used as input for a language model.
109
- """
110
- template = prompt_template_dict[model_name]
111
-
112
- return template.format(note=case_note, code_descriptions=code_descriptions)
113
-
114
- def build_zero_shot_prompt(input_note, descriptions, model_name, system_prompt=""):
115
- """
116
- Build a zero-shot classification prompt with system and user roles for a language model.
117
-
118
- Args:
119
- input_note (str): The input note or query.
120
- descriptions (list of str): List of ICD-10 code descriptions.
121
- system_prompt (str): Optional initial system prompt or instruction.
122
-
123
- Returns:
124
- list of dict: A structured list of dictionaries defining the role and content of each message.
125
- """
126
- if model_name == "llama3-70b-8192":
127
- code_descriptions = "\n".join(["* " + x for x in descriptions])
128
- else:
129
-
130
- code_descriptions = "\n".join(["* " + x for x in descriptions])
131
-
132
-
133
- input_prompt = construct_prompt_template(input_note, code_descriptions, model_name)
134
- return [{"role": "system", "content": system_prompt}, {"role": "user", "content": input_prompt}]
135
-
136
- def get_response(messages, model_name, temperature=0.0, max_tokens=500):
137
- """
138
- Obtain responses from a specified model via the chat-completions API.
139
-
140
- Args:
141
- messages (list of dict): List of messages structured for API input.
142
- model_name (str): Identifier for the model to query.
143
- temperature (float): Controls randomness of response, where 0 is deterministic.
144
- max_tokens (int): Limit on the number of tokens in the response.
145
-
146
- Returns:
147
- str: The content of the response message from the model.
148
- """
149
- response = client.chat.completions.create(
150
- model=model_name,
151
- messages=messages,
152
- temperature=temperature,
153
- max_tokens=max_tokens
154
- )
155
- return response.choices[0].message.content
156
-
157
- def remove_noisy_prefix(text):
158
- # Removing numbers or letters followed by a dot and optional space at the beginning of the string
159
- cleaned_text = text.replace("* ", "").strip()
160
- cleaned_text = re.sub(r"^\s*\w+\.\s*", "", cleaned_text)
161
- return cleaned_text.strip()
162
- def parse_outputs(output, code_description_map, model_name):
163
- """
164
- Parse model outputs to confirm ICD-10 codes based on a given description map.
165
-
166
- Args:
167
- output (str): The model output containing confirmations.
168
- code_description_map (dict): Mapping of descriptions to ICD-10 codes.
169
-
170
- Returns:
171
- list of dict: A list of confirmed codes and their descriptions.
172
- """
173
- confirmed_codes = []
174
- split_outputs = [x for x in output.split("\n") if x]
175
- for item in split_outputs:
176
- try:
177
- code_description, confirmation = item.split(":", 1)
178
- # print(confirmation)
179
- cnf,fact = confirmation.split(",", 1)
180
-
181
-
182
- if model_name == "llama3-70b-8192":
183
- code_description = remove_noisy_prefix(code_description)
184
- else:
185
- code_description = remove_noisy_prefix(code_description)
186
-
187
- if confirmation.lower().strip().startswith("yes"):
188
- try:
189
-
190
- code = code_description_map[code_description]
191
-
192
-
193
- confirmed_codes.append({"ICD Code": code, "Code Description": code_description,"Evidence From Notes":fact})
194
-
195
- except Exception as e:
196
- # print(str(e) + " Here")
197
- continue
198
- except:
199
- continue
200
- return confirmed_codes
201
-
202
- def get_name_and_description(code, model_name):
203
- """
204
- Retrieve the name and description of an ICD-10 code.
205
-
206
- Args:
207
- code (str): The ICD-10 code.
208
-
209
- Returns:
210
- tuple: A tuple containing the formatted description and the name of the code.
211
- """
212
- full_data = cm.get_full_data(code).split("\n")
213
- return format_code_descriptions(full_data[3], model_name), full_data[1]
214
-
 
 
1
+ import re
2
+ import os
3
+ import simple_icd_10_cm as cm
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
+ # from openai import OpenAI
6
+ from prompt_template import *
7
+ from langchain_groq import ChatGroq
8
+ from groq import Groq
9
+ from dotenv import load_dotenv
10
+ import csv
11
+ import time
12
# Load variables from a local .env file (if any) into the process environment
# before the lookups below read them.
load_dotenv()

# Tracing-related environment variables; NOTE(review): presumably consumed by
# the LangChain/LangSmith integration -- confirm against the caller setup.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY")

# Read the Groq key once and hand it to the client explicitly so the value
# loaded above is the one actually used. When it is None the client falls back
# to its own GROQ_API_KEY environment lookup, exactly as a bare Groq() does.
groq_api_key = os.getenv("GROQ_API_KEY")
client = Groq(api_key=groq_api_key)

# Chapter list exposed by the simple_icd_10_cm module.
CHAPTER_LIST = cm.chapter_list
23
+
24
def construct_translation_prompt(medical_note):
    """
    Fill the Spanish-to-English translation prompt with a clinical note.

    Args:
        medical_note (str): The medical case note to translate.

    Returns:
        str: The instruction text followed by the note, ready to send to a
            language model as the user message.
    """
    # The template is assembled from one literal per line so the exact line
    # breaks of the prompt are explicit.
    prompt_lines = (
        "You are an expert Spanish-to-English translator. You are provided with a clinical note written in Spanish.",
        "You must translate the note into English. You must ensure that you properly translate the medical and technical terms from Spanish to English without any mistakes.",
        "Spanish Medical Note:",
        "{medical_note}",
    )
    template = "\n".join(prompt_lines)
    return template.format(medical_note=medical_note)
40
+
41
def build_translation_prompt(input_note, system_prompt=""):
    """
    Build the chat messages for a zero-shot Spanish-to-English translation.

    Args:
        input_note (str): The Spanish note to translate.
        system_prompt (str): Optional system instruction; defaults to "".

    Returns:
        list of dict: Two messages, a "system" message carrying
            ``system_prompt`` followed by a "user" message carrying the
            translation prompt built from ``input_note``.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {
        "role": "user",
        "content": construct_translation_prompt(input_note),
    }
    return [system_message, user_message]
56
+
57
+
58
def remove_extra_spaces(text):
    """
    Collapse whitespace runs and trim the ends of a string.

    Args:
        text (str): The original text string.

    Returns:
        str: ``text`` with every run of whitespace (spaces, tabs, newlines)
            replaced by a single space and no leading/trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
69
+
70
def remove_last_parenthesis(text):
    """
    Drop the final non-nested "(...)" group from a string.

    Args:
        text (str): The input string.

    Returns:
        str: ``text`` with the last parenthesised group removed. Surrounding
            whitespace is left in place; text without parentheses is returned
            unchanged.
    """
    # A "(...)" group qualifies only when no further "(...)" group follows it
    # on the same line (the lookahead's "." does not cross newlines).
    last_group = re.compile(r'\([^()]*\)(?!.*\([^()]*\))')
    return last_group.sub('', text)
83
+
84
def format_code_descriptions(text, model_name):
    """
    Clean an ICD-10 code description for use in a prompt.

    The last parenthesised group is removed (via ``remove_last_parenthesis``)
    and the remaining whitespace is normalised (via ``remove_extra_spaces``).

    Args:
        text (str): The original ICD-10 code description.
        model_name (str): Currently unused; kept so existing callers that
            pass it keep working.

    Returns:
        str: The cleaned description.
    """
    without_trailing_group = remove_last_parenthesis(text)
    return remove_extra_spaces(without_trailing_group)
99
+
100
def construct_prompt_template(case_note, code_descriptions, model_name):
    """
    Render the model-specific prompt for checking code descriptions against a note.

    Args:
        case_note (str): The medical case note.
        code_descriptions (str): The ICD-10 code descriptions as one string.
        model_name (str): Key used to pick the template from
            ``prompt_template_dict``.

    Returns:
        str: The selected template with its ``note`` and
            ``code_descriptions`` fields filled in.

    Raises:
        KeyError: If ``model_name`` has no entry in ``prompt_template_dict``.
    """
    # prompt_template_dict is not defined in this module; presumably it comes
    # from the star import of prompt_template -- verify there.
    fields = {"note": case_note, "code_descriptions": code_descriptions}
    return prompt_template_dict[model_name].format(**fields)
114
+
115
def build_zero_shot_prompt(input_note, descriptions, model_name, system_prompt=""):
    """
    Build the chat messages for zero-shot ICD-10 description classification.

    Args:
        input_note (str): The input note or query.
        descriptions (list of str): ICD-10 code descriptions to evaluate.
        model_name (str): Model identifier, used to select the prompt template.
        system_prompt (str): Optional system instruction; defaults to "".

    Returns:
        list of dict: A "system" message followed by a "user" message whose
            content is the rendered prompt.
    """
    # Every model uses the same "* <description>" bullet layout, so no
    # per-model branching is needed here.
    code_descriptions = "\n".join("* " + description for description in descriptions)

    input_prompt = construct_prompt_template(input_note, code_descriptions, model_name)
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": input_prompt},
    ]
136
+
137
def get_response(messages, model_name, temperature=0.0, max_tokens=500):
    """
    Send chat messages to a model and return the text of its first choice.

    Args:
        messages (list of dict): Role/content messages for the API.
        model_name (str): Identifier of the model to query.
        temperature (float): Sampling temperature passed to the API.
        max_tokens (int): Token limit passed to the API.

    Returns:
        str: ``message.content`` of the first choice in the response.
    """
    request = {
        "model": model_name,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content
157
+
158
def remove_noisy_prefix(text):
    """
    Strip list markers from the start of a model-echoed code description.

    Leading "* " bullets and one leading enumeration token (letters/digits
    followed by a dot, e.g. "1." or "a.") are removed. Text after the prefix
    is left untouched, so a "* " sequence inside the description survives.

    Args:
        text (str): One description as echoed back by the model.

    Returns:
        str: The description without its leading bullet/numbering, trimmed of
            surrounding whitespace.
    """
    leading_bullets = r"^(?:\s*\*\s+)+"
    # Bullets may sit before or after the numbering ("* 1. x" or "1. * x"),
    # so strip them on both sides of the numbering pass.
    cleaned_text = re.sub(leading_bullets, "", text)
    cleaned_text = re.sub(r"^\s*\w+\.\s*", "", cleaned_text)
    cleaned_text = re.sub(leading_bullets, "", cleaned_text)
    return cleaned_text.strip()
163
def parse_outputs(output, code_description_map, model_name):
    """
    Extract the ICD-10 codes that the model confirmed with a "yes".

    Each non-empty line of ``output`` is expected to look like
    ``<description>: <yes/no>[, <evidence>]``. Lines without a colon, lines
    whose answer does not start with "yes" (case-insensitive), and lines
    whose cleaned description is not a key of ``code_description_map`` are
    skipped. A confirmed line with no comma is kept with empty evidence
    rather than being dropped.

    Args:
        output (str): The raw model output.
        code_description_map (dict): Mapping of descriptions to ICD-10 codes.
        model_name (str): Currently unused; every model is parsed the same
            way. Kept for interface compatibility.

    Returns:
        list of dict: One entry per confirmed code with the keys
            "ICD Code", "Code Description" and "Evidence From Notes".
    """
    confirmed_codes = []
    for line in output.split("\n"):
        if not line:
            continue

        code_description, colon, confirmation = line.partition(":")
        if not colon:
            # Not a "<description>: <answer>" line.
            continue
        if not confirmation.lower().strip().startswith("yes"):
            continue

        # Everything after the first comma is the supporting evidence; it is
        # "" when the model gave a bare "yes".
        _, _, evidence = confirmation.partition(",")

        code_description = remove_noisy_prefix(code_description)
        try:
            code = code_description_map[code_description]
        except KeyError:
            # The model echoed a description that was never offered.
            continue

        confirmed_codes.append(
            {
                "ICD Code": code,
                "Code Description": code_description,
                "Evidence From Notes": evidence,
            }
        )
    return confirmed_codes
202
+
203
def get_name_and_description(code, model_name):
    """
    Look up an ICD-10 code and return its cleaned description and its name.

    Args:
        code (str): The ICD-10 code.
        model_name (str): Forwarded to ``format_code_descriptions``.

    Returns:
        tuple: ``(formatted_description, name)``.
    """
    # The record text from cm.get_full_data is split on newlines; this code
    # reads the name from line index 1 and the description from line index 3.
    record_lines = cm.get_full_data(code).split("\n")
    name = record_lines[1]
    raw_description = record_lines[3]
    return format_code_descriptions(raw_description, model_name), name
215
+