import gradio as gr |
import spacy |
import pandas as pd |
from docx import Document |
from io import BytesIO |
import tempfile |
import os |
import multiprocessing as mp |
# spaCy pipeline `zh_core_web_trf`, loaded once at import time and reused by
# every call to extract_names_from_text.
nlp = spacy.load("zh_core_web_trf")
def extract_names_from_text(text):
    """Return the PERSON entities that the spaCy pipeline finds in *text*.

    Args:
        text: Plain text to run through the module-level ``nlp`` pipeline.

    Returns:
        A list holding the text of every entity labelled ``PERSON``, in the
        order ``doc.ents`` yields them. Duplicates are kept.
    """
    # Debug trace: character count, a blank line, then the text itself.
    # The separator has to be the escape sequence "\n\n"; writing "/n/n"
    # prints literal slashes instead of line breaks.
    print(f'{len(text)}\n\n{text}')
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
def split_text(text, max_length=100000):
    """Split *text* on newlines and regroup the paragraphs into chunks.

    Paragraphs are packed greedily, in order, and re-joined with a newline.
    Each paragraph is budgeted as ``len(paragraph) + 1`` (one extra character
    for its separator), and the current chunk is closed as soon as the next
    paragraph would push that budget past *max_length*.

    Paragraphs are never cut: one that is too long for the budget on its own
    is returned as a single oversized chunk.

    Args:
        text: The text to split.
        max_length: Character budget per chunk.

    Returns:
        A list of chunk strings. Empty input yields ``['']``.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for paragraph in text.split('\n'):
        paragraph_length = len(paragraph) + 1  # +1 for the joining newline
        # Only flush a chunk that actually holds something; otherwise an
        # oversized first paragraph would emit a spurious empty chunk.
        if current_chunk and current_length + paragraph_length > max_length:
            chunks.append('\n'.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(paragraph)
        current_length += paragraph_length
    if current_chunk:
        chunks.append('\n'.join(current_chunk))
    return chunks
def extract_names_from_fragments(fragments, processes=4):
    """Run extract_names_from_text over every fragment.

    Args:
        fragments: Iterable of text fragments to analyse.
        processes: Upper bound on the number of worker processes. The pool
            never gets more workers than there are fragments.

    Returns:
        A list with one list of names per fragment, in the same order as
        *fragments*.
    """
    fragments = list(fragments)
    if not fragments:
        return []
    if len(fragments) == 1:
        # A single fragment gains nothing from a pool; process it in place
        # and skip the cost of starting worker processes.
        return [extract_names_from_text(fragments[0])]
    worker_count = min(processes, len(fragments))
    with mp.Pool(processes=worker_count) as pool:
        return pool.map(extract_names_from_text, fragments)
def extract_names_from_docx(docx_file):
    """Extract person names from a .docx file into an Excel workbook.

    Args:
        docx_file: The uploaded document; it is passed straight to
            ``docx.Document``.

    Returns:
        Path of the generated ``nombres_personas.xlsx`` file, created inside
        a fresh temporary directory. The directory is not removed here
        because the returned file has to outlive this call.
    """
    document = Document(docx_file)
    # Join paragraphs with newlines: split_text only breaks on '\n', so a
    # space-joined text would reach it as one unsplittable paragraph.
    text = '\n'.join(para.text for para in document.paragraphs)
    results = extract_names_from_fragments(split_text(text))

    all_persons = []
    for persons in results:
        all_persons.extend(persons)
    # Drop duplicates while keeping first-appearance order (dict keys keep
    # insertion order), so the spreadsheet rows are deterministic.
    unique_persons = list(dict.fromkeys(all_persons))

    df = pd.DataFrame(unique_persons, columns=['Nombres'])
    temp_dir = tempfile.mkdtemp()
    temp_file_path = os.path.join(temp_dir, "nombres_personas.xlsx")
    with pd.ExcelWriter(temp_file_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)
    return temp_file_path
# Web UI: one .docx upload in, one Excel download out.
iface = gr.Interface(
    fn=extract_names_from_docx,
    inputs=gr.File(file_types=[".docx"]),
    outputs=gr.File(),
    title="Extractor de Nombres",
    description="Sube un archivo .docx y extrae los nombres de las personas usando NLP con SpaCy. Descarga el resultado en un archivo Excel."
)

# Launch only when run as a script. extract_names_from_fragments uses
# multiprocessing, and with the "spawn" start method every worker re-imports
# this module; an unguarded launch() would then try to start another server
# inside each worker.
if __name__ == "__main__":
    iface.launch()