import re

from ftfy import fix_text

# All patterns are compiled once at import time so the helpers below do not
# rebuild or re-look-up them on every call.

# \label{...}, \ref{...} and \pageref{...}; applied one after another, in
# this order, by remove_labels().
_LABEL_PATTERNS = (
    re.compile(r'\\label\{[^}]*\}'),
    re.compile(r'\\ref\{[^}]*\}'),
    re.compile(r'\\pageref\{[^}]*\}'),
)

# Constructs rewritten by replace_katex_invalid(). Each argument is matched
# lazily up to the FIRST closing brace, and "." does not match newlines here.
_TAG_RE = re.compile(r'\\tag\{.*?\}')
_BIG_RE = re.compile(r'\\(?:Bigg?|bigg?)\{(.*?)\}')
_QUAD_MBOX_RE = re.compile(r'\\quad\\mbox\{(.*?)\}')
_MBOX_RE = re.compile(r'\\mbox\{(.*?)\}')

# A $$...$$ block; DOTALL lets the block span several lines.
_DISPLAY_MATH_RE = re.compile(r'\$\$(.*?)\$\$', re.DOTALL)

# Either a $$...$$ block or a $...$ span; the $$ alternative is tried first.
_MATH_SPAN_RE = re.compile(r'(\$\$.*?\$\$|\$.*?\$)', re.DOTALL)

# Anything is_latex() treats as evidence of LaTeX markup.
_LATEX_HINT_RE = re.compile(
    '|'.join([
        r'\\(?:begin|end)\{[a-zA-Z]*\}',
        r'\$.*?\$',
        r'\$\$.*?\$\$',
        r'\\[a-zA-Z]+',
        r'\\[^a-zA-Z]',
    ]),
    re.DOTALL,
)


def contains_math(text: str) -> bool:
    """Return True if *text* starts or ends with a ``$`` character.

    Only the first and last characters are inspected; a ``$`` in the middle
    of the text does not count.
    """
    return text.startswith("$") or text.endswith("$")


def fix_math(text: str) -> str:
    """Clean up a math string.

    The steps run in this order: ``fix_text`` from ftfy, removal of
    labels/references, the KaTeX-oriented replacements, and finally fence
    balancing.
    """
    # Fix any issues with the text
    text = fix_text(text)

    # Remove LaTeX labels and references
    text = remove_labels(text)
    text = replace_katex_invalid(text)
    text = fix_fences(text)
    return text


def remove_labels(text: str) -> str:
    """Delete every ``\\label{...}``, ``\\ref{...}`` and ``\\pageref{...}``.

    The argument is everything up to the first ``}``. The three kinds are
    removed in separate passes, in the order listed.
    """
    for pattern in _LABEL_PATTERNS:
        text = pattern.sub('', text)
    return text


def replace_katex_invalid(string: str) -> str:
    """Rewrite LaTeX constructs that KaTeX cannot render.

    * ``\\tag{...}`` is removed.
    * ``\\big{...}``, ``\\bigg{...}``, ``\\Big{...}`` and ``\\Bigg{...}`` are
      replaced by their argument.
    * ``\\mbox{...}`` is replaced by its argument; a ``\\quad`` directly in
      front of it is dropped as well.
    * Single ``$`` characters inside ``$$...$$`` blocks are removed.

    Arguments are matched up to the first closing brace, so an argument that
    itself contains braces is only partially consumed, and an argument that
    spans a line break is not matched at all.
    """
    # KaTeX cannot render all LaTeX, so we need to replace some things
    string = _TAG_RE.sub('', string)
    string = _BIG_RE.sub(r'\1', string)
    # Must run before the plain \mbox rule so the leading \quad goes too.
    string = _QUAD_MBOX_RE.sub(r'\1', string)
    string = _MBOX_RE.sub(r'\1', string)
    string = remove_inner_dollars(string)
    return string


def remove_inner_dollars(text: str) -> str:
    """Strip ``$`` characters found inside ``$$...$$`` blocks.

    The surrounding ``$$`` fences are kept; text outside such blocks is left
    untouched.
    """
    def replace_dollar(match):
        # Replace single $ with nothing, keep $$ intact
        return '$$' + match.group(1).replace('$', '') + '$$'

    return _DISPLAY_MATH_RE.sub(replace_dollar, text)


def extract_latex_with_positions(text: str) -> list:
    """Find every ``$$...$$`` / ``$...$`` span in *text*.

    Returns a list of ``(matched_text, start, end)`` tuples in order of
    appearance, where ``text[start:end] == matched_text``.
    """
    return [
        (match.group(), match.start(), match.end())
        for match in _MATH_SPAN_RE.finditer(text)
    ]


def slice_latex(text: str) -> list:
    """Split *text* into alternating plain-text and LaTeX chunks.

    Returns a list of dicts of the form ``{"text": ..., "type": ...}`` where
    ``type`` is ``"latex"`` for a span found by
    ``extract_latex_with_positions`` and ``"text"`` for everything between
    spans. Empty plain-text chunks are not emitted, so concatenating the
    ``"text"`` values reproduces the input.
    """
    chunks = []
    last_position = 0
    for block, start, end in extract_latex_with_positions(text):
        # Add text before the current LaTeX block, if any
        if start > last_position:
            chunks.append({"text": text[last_position:start], "type": "text"})

        # Add the LaTeX block
        chunks.append({"text": block, "type": "latex"})
        last_position = end

    # Add remaining text after the last LaTeX block, if any
    if last_position < len(text):
        chunks.append({"text": text[last_position:], "type": "text"})

    return chunks


def is_latex(text: str) -> bool:
    """Return True if *text* contains anything that looks like LaTeX.

    That means a ``\\begin{...}`` / ``\\end{...}``, a pair of ``$``
    characters, or a backslash followed by any character other than a
    newline-free restriction (the search runs with DOTALL).
    """
    return bool(_LATEX_HINT_RE.search(text))


def fix_fences(text: str) -> str:
    """Balance the ``$`` fences at the two ends of *text*.

    Text that already starts and ends with ``$$``, text wrapped in single
    ``$`` on both sides, and text with no fence at either end are returned
    unchanged. When only one end is fenced, or one end has ``$`` and the
    other ``$$``, dollars are added so that the result starts and ends with
    ``$$``.
    """
    # Opening $$ without a closing $$: top up the closing fence.
    if text.startswith("$$") and not text.endswith("$$"):
        if text.endswith("$"):
            text += "$"
        else:
            text += "$$"

    # Closing $$ without an opening $$: top up the opening fence.
    if text.endswith("$$") and not text.startswith("$$"):
        if text.startswith("$"):
            text = "$" + text
        else:
            text = "$$" + text

    # A lone opening $ with nothing at the end becomes $$...$$.
    if text.startswith("$") and not text.endswith("$"):
        text = "$" + text + "$$"

    # A lone closing $ with nothing at the start becomes $$...$$.
    if text.endswith("$") and not text.startswith("$"):
        text = "$$" + text + "$"

    return text


def strip_fences(text: str) -> str:
    """Remove every leading and trailing ``$`` from *text*."""
    return text.strip("$")