# Spacy / app.py — Hugging Face Space by lik07
# Commit 02010f9 (verified): "Update app.py"
# (Hugging Face file-viewer chrome — "raw / history blame / 3.84 kB" —
#  converted to comments so the file is runnable Python.)
import gradio as gr
import spacy
import pandas as pd
from docx import Document
from io import BytesIO
import tempfile
import os
import multiprocessing as mp
import psutil
import time
from datetime import datetime
# Load the spaCy NER pipeline once at module import (shared by all requests).
# NOTE(review): the original comment said "modelo ... en español", but
# 'zh_core_web_trf' is the *Chinese* transformer model — confirm which
# language the input documents actually use.
nlp = spacy.load('zh_core_web_trf')
def get_system_status():
    """Return a one-line CPU/RAM usage snapshot with a timestamp.

    Shown in the UI as a lightweight keepalive/status indicator.
    """
    cpu_usage = psutil.cpu_percent()
    memory = psutil.virtual_memory()
    # Fix: "脷ltimo" was mojibake (UTF-8 "Último" decoded as GBK).
    return f"CPU: {cpu_usage}% | RAM: {memory.percent}% | Último update: {datetime.now().strftime('%H:%M:%S')}"
def extract_names_from_text(text):
    """Run the module-level spaCy pipeline over *text* and return the
    text of every entity labelled PERSON (duplicates included)."""
    # Debug aid: log the fragment length. Fix: '/n/n' was a typo for '\n\n'.
    print(f'{len(text)}\n\n')
    doc = nlp(text)
    persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
    return persons
def split_text(text, max_length=100000):
    """Split *text* on '\\n' boundaries into chunks of at most ~max_length chars.

    Paragraphs are never cut in half: a single paragraph longer than
    max_length becomes its own (oversized) chunk.

    Args:
        text: the full input string.
        max_length: soft cap on chunk size in characters.

    Returns:
        List of chunk strings (paragraphs re-joined with '\\n').
    """
    result = []
    current_chunk = []
    current_length = 0
    for paragraph in text.split('\n'):
        paragraph_length = len(paragraph) + 1  # +1 accounts for the newline separator
        if current_length + paragraph_length <= max_length:
            current_chunk.append(paragraph)
            current_length += paragraph_length
        else:
            # Bug fix: only flush a non-empty chunk; previously, a first
            # paragraph longer than max_length produced a spurious ''
            # chunk at the start of the result.
            if current_chunk:
                result.append('\n'.join(current_chunk))
            current_chunk = [paragraph]
            current_length = paragraph_length
    if current_chunk:
        result.append('\n'.join(current_chunk))
    return result
def extract_names_from_fragments(fragments):
    """Map extract_names_from_text over *fragments* in parallel.

    Uses a fixed pool of 4 worker processes; returns one list of PERSON
    names per fragment, in the same order as *fragments*.
    """
    with mp.Pool(processes=4) as worker_pool:
        return worker_pool.map(extract_names_from_text, fragments)
def extract_names_from_docx(docx_file, progress=gr.Progress()):
    """Extract unique PERSON names from a .docx and export them to Excel.

    Args:
        docx_file: path to the uploaded .docx file (from gr.File).
        progress: Gradio progress tracker (injected by the framework).

    Returns:
        Path of a temporary .xlsx file with a single 'Nombres' column.
    """
    progress(0, desc="Iniciando procesamiento...")
    # Collect the raw text of every paragraph in the document.
    document = Document(docx_file)
    full_text = [para.text for para in document.paragraphs]
    progress(0.2, desc="Documento cargado, preparando texto...")
    # Bug fix: join with '\n' (was ' ') — split_text() splits on newlines,
    # so a space-joined string could never be chunked and the whole
    # document went to the pool as one giant fragment.
    text = '\n'.join(full_text)
    text_fragments = split_text(text)
    progress(0.3, desc=f"Texto dividido en {len(text_fragments)} fragmentos...")
    # Run NER over the fragments in parallel and flatten the results.
    all_persons = []
    for i, fragment_results in enumerate(extract_names_from_fragments(text_fragments)):
        all_persons.extend(fragment_results)
        progress((0.3 + (0.5 * (i + 1) / len(text_fragments))),
                 desc=f"Procesando fragmento {i+1} de {len(text_fragments)}...")
    # De-duplicate (set() does not preserve original order).
    all_persons = list(set(all_persons))
    progress(0.9, desc="Preparando resultados...")
    df = pd.DataFrame(all_persons, columns=['Nombres'])
    # Write the Excel file into a fresh temp directory and hand the path
    # back to Gradio for download.
    temp_dir = tempfile.mkdtemp()
    temp_file_path = os.path.join(temp_dir, "nombres_personas.xlsx")
    with pd.ExcelWriter(temp_file_path, engine='openpyxl') as writer:
        df.to_excel(writer, index=False)
    # Fix: "隆" was mojibake for the inverted exclamation mark "¡".
    progress(1.0, desc="¡Procesamiento completado!")
    return temp_file_path
# --- Gradio user interface ---------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Extractor de Nombres")
    gr.Markdown("Sube un archivo .docx y extrae los nombres de las personas usando NLP con SpaCy.")
    # System-status readout; the periodic refresh below doubles as a
    # keepalive so the Space stays warm.
    system_status = gr.Textbox(label="Estado del Sistema", value="Inicializando...")
    # Main widgets: .docx upload in, generated Excel file out.
    file_input = gr.File(file_types=[".docx"])
    output_file = gr.File(label="Archivo de resultados")
    # Button that triggers the full extraction pipeline.
    process_btn = gr.Button("Procesar Documento")
    process_btn.click(fn=extract_names_from_docx, inputs=file_input, outputs=output_file)
    # Refresh the status line every 5 seconds.
    # NOTE(review): the `every=` argument is deprecated in newer Gradio
    # releases in favour of gr.Timer — confirm the installed version.
    demo.load(get_system_status, None, system_status, every=5)
# Start the application.
demo.launch()