Update app.py
app.py CHANGED
@@ -5,77 +5,80 @@ from docx import Document
 from io import BytesIO
 import tempfile
 import os
-import multiprocessing as mp
+import multiprocessing as mp
+import psutil
+import time
+from datetime import datetime
 
 # Load the spaCy model (zh_core_web_trf, the Chinese transformer pipeline)
 nlp = spacy.load('zh_core_web_trf')
-#nlp.max_length = 15000000  # raise the limit to 15 million characters
 
-
+def get_system_status():
+    cpu_usage = psutil.cpu_percent()
+    memory = psutil.virtual_memory()
+    return f"CPU: {cpu_usage}% | RAM: {memory.percent}% | Último update: {datetime.now().strftime('%H:%M:%S')}"
+
 def extract_names_from_text(text):
     print(f'{len(text)}\n\n')
     doc = nlp(text)
-    # Extract the PERSON-type entities
     persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
     return persons
 
-# Function to split the text into smaller fragments
 def split_text(text, max_length=100000):
     result = []
     current_chunk = []
     current_length = 0
-
-    # Split on single line breaks instead of double ones
-    paragraphs = text.split('\n')  # we use '\n' because Chinese text does not use double line breaks
+    paragraphs = text.split('\n')
 
     for paragraph in paragraphs:
-        paragraph_length = len(paragraph) + 1
+        paragraph_length = len(paragraph) + 1
         if current_length + paragraph_length <= max_length:
             current_chunk.append(paragraph)
             current_length += paragraph_length
         else:
-            # Save the current fragment and start a new one
             result.append('\n'.join(current_chunk))
             current_chunk = [paragraph]
             current_length = paragraph_length
 
-    # Append the last fragment if it is not empty
     if current_chunk:
         result.append('\n'.join(current_chunk))
 
     return result
 
-# Function to parallelize the name extraction
 def extract_names_from_fragments(fragments):
-    # Use all available CPU cores
     with mp.Pool(processes=4) as pool:
         results = pool.map(extract_names_from_text, fragments)
     return results
 
-
-def extract_names_from_docx(docx_file):
+def extract_names_from_docx(docx_file, progress=gr.Progress()):
+    # Initialize the progress indicator
+    progress(0, desc="Iniciando procesamiento...")
+
     # Load the DOCX file
     document = Document(docx_file)
     full_text = []
     for para in document.paragraphs:
         full_text.append(para.text)
 
+    progress(0.2, desc="Documento cargado, preparando texto...")
+
     # Join all the text
     text = ' '.join(full_text)
 
-    # Split the text into fragments
+    # Split the text into fragments
     text_fragments = split_text(text)
+    progress(0.3, desc=f"Texto dividido en {len(text_fragments)} fragmentos...")
 
     # Extract the names from each fragment in parallel
-    results = extract_names_from_fragments(text_fragments)
-
-    # Merge all the name results into a single list
     all_persons = []
-    for result in results:
-        all_persons.extend(result)
+    for i, fragment_results in enumerate(extract_names_from_fragments(text_fragments)):
+        all_persons.extend(fragment_results)
+        progress((0.3 + (0.5 * (i+1)/len(text_fragments))),
+                 desc=f"Procesando fragmento {i+1} de {len(text_fragments)}...")
 
     # Remove duplicates
     all_persons = list(set(all_persons))
+    progress(0.9, desc="Preparando resultados...")
 
     # Create a DataFrame
    df = pd.DataFrame(all_persons, columns=['Nombres'])
@@ -88,16 +91,27 @@ def extract_names_from_docx(docx_file):
     with pd.ExcelWriter(temp_file_path, engine='openpyxl') as writer:
         df.to_excel(writer, index=False)
 
-    return temp_file_path
+    progress(1.0, desc="¡Procesamiento completado!")
+    return temp_file_path
 
 # Gradio interface
-iface = gr.Interface(
-    fn=extract_names_from_docx,
-    inputs=gr.File(file_types=[".docx"]),
-    outputs=gr.File(label="Archivo de resultados"),
-    title="Extractor de Nombres",
-    description="Sube un archivo .docx y extrae los nombres de las personas usando NLP con SpaCy."
-)
+with gr.Blocks() as demo:
+    gr.Markdown("# Extractor de Nombres")
+    gr.Markdown("Sube un archivo .docx y extrae los nombres de las personas usando NLP con SpaCy.")
+
+    # System status component (keepalive)
+    system_status = gr.Textbox(label="Estado del Sistema", value="Inicializando...")
+
+    # Main components
+    file_input = gr.File(file_types=[".docx"])
+    output_file = gr.File(label="Archivo de resultados")
+
+    # Process button
+    process_btn = gr.Button("Procesar Documento")
+    process_btn.click(fn=extract_names_from_docx, inputs=file_input, outputs=output_file)
+
+    # Periodic refresh of the system status
+    demo.load(get_system_status, None, system_status, every=5)
 
 # Start the application
-iface.launch()
+demo.launch()
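
A note on the new progress loop: `extract_names_from_fragments` uses `pool.map`, which blocks until every fragment has been processed, so the `progress(...)` calls inside the `for i, fragment_results in enumerate(...)` loop all fire at once, after the work is already done. If live per-fragment feedback is the goal, `Pool.imap` yields results as workers finish them. A minimal sketch of that variant, not part of this commit:

```python
import multiprocessing as mp

def extract_names_from_fragments(fragments):
    # Hypothetical rewrite: imap yields each fragment's result as soon as
    # a worker finishes it, so the caller's progress bar can advance
    # between fragments instead of jumping to 100% at the end.
    with mp.Pool(processes=4) as pool:
        for persons in pool.imap(extract_names_from_text, fragments):
            yield persons
```

The existing `enumerate(...)` loop in `extract_names_from_docx` would consume this generator unchanged.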
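
Separately, a pool of four forked workers that each hold the `zh_core_web_trf` transformer pipeline is memory-heavy, and combining `multiprocessing`'s default fork start method with PyTorch-backed models can hang on some platforms. spaCy's own `nlp.pipe` batches documents through the already-loaded pipeline without a pool; a sketch under that assumption, with an illustrative batch size:

```python
def extract_names_with_pipe(fragments):
    # Hypothetical alternative: stream the fragments through the loaded
    # `nlp` pipeline in batches; for transformer models a single process
    # with batching is usually safer than multiprocessing.
    all_persons = []
    for doc in nlp.pipe(fragments, batch_size=8):
        all_persons.extend(ent.text for ent in doc.ents if ent.label_ == 'PERSON')
    return all_persons
```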
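
`list(set(all_persons))` removes duplicates but scrambles the order in which names were found. If the spreadsheet should list names by first appearance, `dict.fromkeys` is the usual order-preserving alternative:

```python
# dicts preserve insertion order (Python 3.7+), so this dedupes while
# keeping each name at the position of its first occurrence.
all_persons = list(dict.fromkeys(all_persons))
```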
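
The second hunk writes to `temp_file_path`, whose creation falls between the two hunks and is not shown on this page. A common way to produce a path that the `gr.File` output can serve, offered here only as an assumption about the surrounding code:

```python
import tempfile

# Hypothetical reconstruction: delete=False keeps the file on disk after
# the handle closes, so Gradio can read it when the function returns.
with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
    temp_file_path = tmp.name
```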
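
Finally, a caveat on the keepalive status box: `psutil.cpu_percent()` called without an interval compares CPU times against the previous call, and psutil documents the very first reading as a meaningless 0.0. The `demo.load(..., every=5)` polling corrects this from the second tick onward; passing an interval gives a meaningful value immediately, at the cost of blocking:

```python
import psutil

# interval=0.5 blocks for half a second and measures usage over that
# window, so even the first call returns a real percentage.
cpu_usage = psutil.cpu_percent(interval=0.5)
```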