import re
from typing import Dict, List

# Patterns are compiled once at import time instead of on every call.
_TOPIC_RE = re.compile(r"^\d+\.\s+(.*)$")
_SUB_TOPIC_RE = re.compile(r"^\*\s+(.*)$")
_SOURCE_BLOCK_RE = re.compile(
    r"Source \d+:(.*?)(?=Source \d+:|$)",
    re.DOTALL | re.IGNORECASE,
)
_SOURCE_LABEL_RE = re.compile(r"Source \d+:")
_REFERENCE_RE = re.compile(r"\[(\d+)\]")
_CAPS_RUN_RE = re.compile(r"([A-Z\s]+[.!?])")


def parse_topics_to_dict(text: str) -> Dict[str, List[str]]:
    """Parse a numbered outline into a ``{topic: [sub_topics]}`` mapping.

    A line of the form ``"<number>. <title>"`` starts a new topic, and a line
    of the form ``"* <title>"`` adds a sub-topic to the most recent topic.
    Lines are stripped before matching, so indentation is irrelevant.
    Bullets that appear before the first topic, and lines matching neither
    form, are ignored. A repeated topic title restarts that topic with an
    empty sub-topic list.

    Args:
        text: The outline text, one entry per line.

    Returns:
        A dict mapping each topic title to its sub-topic titles, in order of
        appearance.
    """
    topics: Dict[str, List[str]] = {}
    current_topic = None

    for raw_line in text.strip().split("\n"):
        line = raw_line.strip()

        topic_match = _TOPIC_RE.match(line)
        if topic_match:
            current_topic = topic_match.group(1)
            topics[current_topic] = []
            continue

        sub_topic_match = _SUB_TOPIC_RE.match(line)
        if sub_topic_match and current_topic:
            topics[current_topic].append(sub_topic_match.group(1))

    return topics


def remove_all_sources(text: str) -> str:
    """Remove every ``Source <n>: ...`` section from ``text``.

    Each section runs from its ``Source <n>:`` label up to the next label or
    the end of the text, so everything from the first label onwards is
    dropped. Labels are matched case-insensitively, and sections may span
    several lines.

    Args:
        text: Text that may contain source sections.

    Returns:
        The text without source sections, stripped of surrounding whitespace.
    """
    return _SOURCE_BLOCK_RE.sub("", text).strip()


def clean_text(text: str) -> str:
    """Normalise punctuation spacing and collapse whitespace in ``text``.

    NOTE: every run of whitespace, newlines included, becomes a single
    space, so the result is always one line.

    Args:
        text: The text to clean.

    Returns:
        The cleaned, single-line text without leading or trailing whitespace.
    """
    # Drop semicolons that are glued to the following character.
    text = re.sub(r";(?=\S)", "", text)
    # No whitespace before a comma/semicolon, and whitespace after it.
    text = re.sub(r"\s*([,;])\s*", r"\1 ", text)
    # Collapse all remaining whitespace runs into single spaces.
    return re.sub(r"\s+", " ", text).strip()


def update_response(text: str) -> str:
    """Renumber bracketed references such as ``[3]`` to be consecutive.

    The distinct reference numbers found in ``text`` are sorted and mapped to
    ``1, 2, 3, ...``; every occurrence is then rewritten in a single pass.

    Args:
        text: Text containing references of the form ``[<number>]``.

    Returns:
        The text with its references renumbered. Text without references is
        returned unchanged.
    """
    cited = sorted({int(number) for number in _REFERENCE_RE.findall(text)})
    renumbering = {old: new for new, old in enumerate(cited, start=1)}

    def _renumber(match):
        return f"[{renumbering[int(match.group(1))]}]"

    return _REFERENCE_RE.sub(_renumber, text)


def renumber_sources(source_list: List[str]) -> List[str]:
    """Relabel sources as ``"source 1: ..."``, ``"source 2: ..."`` and so on.

    The content of an entry is everything after its first ``": "``. An entry
    without that separator is kept whole as the content instead of raising
    ``IndexError``.

    Args:
        source_list: Source strings, normally of the form
            ``"<label>: <content>"``.

    Returns:
        A new list with the entries numbered from 1 in their original order.
    """
    renumbered = []
    for number, source in enumerate(source_list, start=1):
        _label, separator, content = source.partition(": ")
        if not separator:
            content = source
        renumbered.append(f"source {number}: {content}")
    return renumbered


def seperate_to_list(text: str) -> List[str]:
    """Split ``text`` into a flat list of stripped, non-empty fragments.

    Each line has its ``Source <n>:`` labels removed (case-sensitive) and is
    then split around every run of uppercase letters and/or whitespace that
    ends in ``.``, ``!`` or ``?``. The matched runs are kept as fragments of
    their own.

    NOTE: the split pattern is not limited to whole capitalised sentences; an
    abbreviation such as ``"U.S."`` is split into ``"U."`` and ``"S."``.

    Args:
        text: Text to split, with lines separated by ``"\\n"``.

    Returns:
        The fragments in order of appearance.
    """
    fragments = []
    for line in text.split("\n"):
        unlabelled = _SOURCE_LABEL_RE.sub("", line)
        for part in _CAPS_RUN_RE.split(unlabelled):
            part = part.strip()
            if part:
                fragments.append(part)
    return fragments