rmayormartins commited on
Commit
de44379
β€’
1 Parent(s): d5016ec

Subindo arquivos

Browse files
README.md CHANGED
@@ -1,13 +1,75 @@
1
  ---
2
- title: Speech Accent Es Classifier
3
- emoji: 💻
4
- colorFrom: red
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.40.0
8
  app_file: app.py
9
  pinned: false
10
- license: ecl-2.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Speech-accent-es-classifier
3
+ emoji: 🎙️🤖🇪🇸
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: "4.12.0"
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
+ # Clasificador de Acentos en Español
13
+
14
+ Este proyecto es un clasificador de acentos que distingue entre el acento español y otros acentos.
15
+
16
+ ## Resumen del Proyecto
17
+
18
+ Esta aplicación utiliza un modelo entrenado para clasificar acentos de habla en dos categorías:
19
+ 1. Español
20
+ 2. Otro
21
+
22
+ El modelo se basa en el trabajo del autor y utiliza la parte en español del conjunto de datos Common Voice (versión 11.0) de la Fundación Mozilla.
23
+
24
+ ## Conjunto de Datos
25
+
26
+ El proyecto utiliza el subconjunto en español del conjunto de datos Common Voice:
27
+ - Conjunto de datos: "mozilla-foundation/common_voice_11_0", "es"
28
+
29
+ Acentos del español incluidos en el conjunto de datos:
30
+ - España
31
+ - México
32
+ - Colombia
33
+ - Argentina
34
+ - Chile
35
+ - Perú
36
+ - Venezuela
37
+ - Cuba
38
+ - República Dominicana
39
+ - Uruguay
40
+ - Paraguay
41
+ - Bolivia
42
+ - Ecuador
43
+ - Guatemala
44
+ - Honduras
45
+ - El Salvador
46
+ - Nicaragua
47
+ - Costa Rica
48
+ - Panamá
49
+ - Puerto Rico
50
+
51
+ ## Detalles Técnicos
52
+
53
+ El proyecto utiliza el siguiente modelo y procesador:
54
+ - Modelo: "facebook/wav2vec2-base-960h"
55
+ - Procesador: Wav2Vec2Processor.from_pretrained
56
+
57
+ ## Licencia
58
+
59
+ ECL-2.0 (Educational Community License, versión 2.0)
60
+
61
+ ## Información del Desarrollador
62
+
63
+ Desarrollado por Ramon Mayor Martins, Ph.D. (2024)
64
+ - Correo electrónico: [email protected]
65
+ - Página web: https://rmayormartins.github.io/
66
+ - Twitter: @rmayormartins
67
+ - GitHub: https://github.com/rmayormartins
68
+
69
+ ## Agradecimientos
70
+
71
+ Agradecimientos especiales al Instituto Federal de Santa Catarina (IFSC), campus São José, Brasil.
72
+
73
+ ## Contacto
74
+
75
+ Para cualquier consulta o sugerencia, por favor contacte al desarrollador usando la información proporcionada arriba.
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import numpy as np
4
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
5
+
6
+ # Load the audio processor and the sequence-classification model once at import time from the local "results" directory, so every request reuses them.
7
+ model_name = "results"
8
+ processor = Wav2Vec2Processor.from_pretrained(model_name)
9
+ model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
10
+
def classify_accent(audio):
    """Classify an audio clip as Spanish-accented speech or another accent.

    Args:
        audio: ``(sample_rate, samples)`` pair as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was
            recorded or uploaded.

    Returns:
        The predicted label (``"Español"`` or ``"Otro"``), or a Spanish
        error message when the input is missing, empty, or cannot be
        processed. Errors are returned as text rather than raised so the
        Gradio UI always has something to display.
    """
    if audio is None:
        return "Error: No se recibió audio"

    # Debug output for the application logs: type and content of the input.
    print(f"Tipo de entrada de audio: {type(audio)}")
    print(f"Entrada de audio recibida: {audio}")

    try:
        sample_rate = audio[0]
        audio_array = np.asarray(audio[1])

        print(f"Forma del audio: {audio_array.shape}, Frecuencia de muestreo: {sample_rate}")

        # The processor expects floating-point samples.
        audio_array = audio_array.astype(np.float32)

        # Down-mix multi-channel recordings to mono. Without this, a 2-D
        # array is resampled along the wrong axis and treated as a batch.
        # Assumes the layout is (samples, channels) -- confirm against the
        # Gradio Audio documentation.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # An empty recording cannot be classified; report it the same way
        # as a missing one instead of surfacing a low-level model error.
        if audio_array.size == 0:
            return "Error: No se recibió audio"

        # The processor is called with a fixed 16 kHz sampling rate, so
        # anything else is resampled first. librosa is imported lazily
        # because it is only needed on this path.
        target_rate = 16000
        if sample_rate != target_rate:
            import librosa
            audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=target_rate)

        input_values = processor(audio_array, return_tensors="pt", sampling_rate=target_rate).input_values

        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_id = torch.argmax(logits, dim=-1).item()

        # Index -> label mapping; presumably matches the class order used
        # during training -- TODO confirm against the training script.
        labels = ["Español", "Otro"]
        return labels[predicted_id]

    except Exception as e:
        # Top-level UI boundary: surface the failure to the user as text.
        return f"Error al procesar el audio: {e}"
48
+
# HTML shown below the title: a short usage hint plus links to the author's pages.
description_html = """
<p>Prueba con grabación o cargando un archivo de audio. Para probar, recomiendo una palabra.</p>
<p>Ramon Mayor Martins, Ph.D.: <a href="https://rmayormartins.github.io/" target="_blank">Website</a> | <a href="https://huggingface.co/rmayormartins" target="_blank">Spaces</a></p>
"""

# Gradio UI: one audio input passed to classify_accent as a
# (sample_rate, numpy array) pair, and one label output.
interface = gr.Interface(
    fn=classify_accent,
    inputs=gr.Audio(type="numpy"),
    outputs="label",
    title="Clasificador de Sotaques (Español vs Otro)",
    description=description_html
)

# Start the web app when the script runs.
interface.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==4.29.0
2
+ torch==2.0.1
3
+ numpy==1.23.5
4
+ transformers==4.24.0
5
+ librosa==0.9.2
results/config.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-base-960h",
3
+ "activation_dropout": 0.1,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForSequenceClassification"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 256,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": false,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "sum",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": false,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "group",
53
+ "feat_proj_dropout": 0.1,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.1,
56
+ "gradient_checkpointing": false,
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.1,
59
+ "hidden_dropout_prob": 0.1,
60
+ "hidden_size": 768,
61
+ "initializer_range": 0.02,
62
+ "intermediate_size": 3072,
63
+ "layer_norm_eps": 1e-05,
64
+ "layerdrop": 0.1,
65
+ "mask_feature_length": 10,
66
+ "mask_feature_min_masks": 0,
67
+ "mask_feature_prob": 0.0,
68
+ "mask_time_length": 10,
69
+ "mask_time_min_masks": 2,
70
+ "mask_time_prob": 0.05,
71
+ "model_type": "wav2vec2",
72
+ "num_adapter_layers": 3,
73
+ "num_attention_heads": 12,
74
+ "num_codevector_groups": 2,
75
+ "num_codevectors_per_group": 320,
76
+ "num_conv_pos_embedding_groups": 16,
77
+ "num_conv_pos_embeddings": 128,
78
+ "num_feat_extract_layers": 7,
79
+ "num_hidden_layers": 12,
80
+ "num_negatives": 100,
81
+ "output_hidden_size": 768,
82
+ "pad_token_id": 0,
83
+ "proj_codevector_dim": 256,
84
+ "tdnn_dilation": [
85
+ 1,
86
+ 2,
87
+ 3,
88
+ 1,
89
+ 1
90
+ ],
91
+ "tdnn_dim": [
92
+ 512,
93
+ 512,
94
+ 512,
95
+ 512,
96
+ 1500
97
+ ],
98
+ "tdnn_kernel": [
99
+ 5,
100
+ 3,
101
+ 3,
102
+ 1,
103
+ 1
104
+ ],
105
+ "torch_dtype": "float32",
106
+ "transformers_version": "4.43.4",
107
+ "use_weighted_layer_sum": false,
108
+ "vocab_size": 32,
109
+ "xvector_output_dim": 512
110
+ }
results/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6632eb608edacf1e3977b987719741db6567b25d68d7e94a698090c1a05e4ca0
3
+ size 378302360
results/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": false,
9
+ "sampling_rate": 16000
10
+ }
results/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
results/tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ }
35
+ },
36
+ "bos_token": "<s>",
37
+ "clean_up_tokenization_spaces": true,
38
+ "do_lower_case": false,
39
+ "do_normalize": true,
40
+ "eos_token": "</s>",
41
+ "model_max_length": 1000000000000000019884624838656,
42
+ "pad_token": "<pad>",
43
+ "processor_class": "Wav2Vec2Processor",
44
+ "replace_word_delimiter_char": " ",
45
+ "return_attention_mask": false,
46
+ "target_lang": null,
47
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
48
+ "unk_token": "<unk>",
49
+ "word_delimiter_token": "|"
50
+ }
results/vocab.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "'": 27,
3
+ "</s>": 2,
4
+ "<pad>": 0,
5
+ "<s>": 1,
6
+ "<unk>": 3,
7
+ "A": 7,
8
+ "B": 24,
9
+ "C": 19,
10
+ "D": 14,
11
+ "E": 5,
12
+ "F": 20,
13
+ "G": 21,
14
+ "H": 11,
15
+ "I": 10,
16
+ "J": 29,
17
+ "K": 26,
18
+ "L": 15,
19
+ "M": 17,
20
+ "N": 9,
21
+ "O": 8,
22
+ "P": 23,
23
+ "Q": 30,
24
+ "R": 13,
25
+ "S": 12,
26
+ "T": 6,
27
+ "U": 16,
28
+ "V": 25,
29
+ "W": 18,
30
+ "X": 28,
31
+ "Y": 22,
32
+ "Z": 31,
33
+ "|": 4
34
+ }