chan4lk committed
Commit 4cd2ebc
1 Parent(s): 147b057

with voice clone

Files changed (5)
  1. .gitignore +165 -0
  2. app.py +85 -0
  3. requirements.txt +169 -0
  4. tts.py +14 -0
  5. whisper.py +20 -0
.gitignore ADDED
@@ -0,0 +1,165 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ *.wav
+ flagged/
app.py ADDED
@@ -0,0 +1,85 @@
+ import gradio as gr
+ import os
+ from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
+ from datasets import load_dataset
+ import torch
+ import soundfile as sf
+ from pdfminer.high_level import extract_text
+ from llama_cpp import Llama
+
+
+ # Check if MPS is available and set the device
+ if torch.backends.mps.is_available():
+     device = torch.device("mps")
+     print("Using MPS device")
+ else:
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"MPS not available, using {device}")
+ def toText(audio):
+     asr = pipeline(
+         "automatic-speech-recognition",
+         model="openai/whisper-tiny.en",
+         chunk_length_s=30,
+         device=device,
+     )
+     question = asr(audio, batch_size=8)["text"]
+     return question
+
+
+ def extract_answer(question, text):
+     # Load the LLaMA model
+     model_path = "/Users/chandima/.cache/lm-studio/models/lmstudio-community/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q3_K_L.gguf"
+     # Load the LLaMA model with MPS acceleration
+     llm = Llama(
+         model_path=model_path,
+         n_gpu_layers=-1,  # Use all available layers for GPU acceleration
+         n_ctx=2048,  # Adjust context size as needed
+         verbose=True,  # Optional: for debugging
+         use_mlock=True,  # Optional: for better memory management
+         n_threads=6,  # Adjust based on your CPU
+         use_mmap=True,  # Optional: for faster loading
+     )
+
+     # Use LLaMA to answer the question from the resume
+     prompt = f"""
+     Answer the question based on the Resume.
+
+     Question:
+     {question}
+
+     Resume:
+     {text}
+
+     Answer:
+     """
+
+     response = llm(prompt, max_tokens=800, stop=["Human:", "\n\n"])
+     answer = response['choices'][0]['text'].strip()
+     print(answer)
+     return answer
+
+ def toAudio(text):
+     synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
+     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+     speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+     speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
+     return speech
+
+ def clone(audio, file):
+     question = toText(audio=audio)
+     text = extract_text(file.name)
+     res = extract_answer(question, text)
+     print(res)
+     speech = toAudio(res)
+     sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
+     return "./speech.wav"
+
+ iface = gr.Interface(fn=clone,
+                      inputs=[gr.Audio(type='filepath', label='Voice reference audio file'), gr.File(label="Resume")],
+                      outputs=gr.Audio(label='Says'),
+                      title='Voice Clone',
+                      description="""
+                      Ask a question about the uploaded resume by voice; the answer is spoken back.
+                      """,
+                      theme=gr.themes.Base(primary_hue="teal", secondary_hue="teal", neutral_hue="slate"))
+ iface.launch()
requirements.txt ADDED
@@ -0,0 +1,169 @@
+ absl-py==2.1.0
+ aiofiles==23.2.1
+ aiohappyeyeballs==2.4.3
+ aiohttp==3.10.8
+ aiosignal==1.3.1
+ annotated-types==0.7.0
+ anyascii==0.3.2
+ anyio==4.6.0
+ async-timeout==4.0.3
+ attrs==24.2.0
+ audioread==3.0.1
+ babel==2.16.0
+ bangla==0.0.2
+ blinker==1.8.2
+ blis==0.7.11
+ bnnumerizer==0.0.2
+ bnunicodenormalizer==0.1.7
+ catalogue==2.0.10
+ certifi==2024.8.30
+ cffi==1.17.1
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.19.0
+ confection==0.1.5
+ contourpy==1.2.1
+ coqpit==0.0.17
+ cryptography==43.0.1
+ cycler==0.12.1
+ cymem==2.0.8
+ Cython==3.0.11
+ datasets==3.0.1
+ dateparser==1.1.8
+ decorator==5.1.1
+ dill==0.3.8
+ diskcache==5.6.3
+ docopt==0.6.2
+ einops==0.8.0
+ encodec==0.1.1
+ exceptiongroup==1.2.2
+ fastapi==0.115.0
+ ffmpy==0.4.0
+ filelock==3.16.1
+ Flask==3.0.3
+ fonttools==4.54.1
+ frozenlist==1.4.1
+ fsspec==2024.6.1
+ g2pkk==0.1.2
+ gradio==4.44.1
+ gradio_client==1.3.0
+ grpcio==1.66.2
+ gruut==2.2.3
+ gruut-ipa==0.13.0
+ gruut_lang_de==2.0.1
+ gruut_lang_en==2.0.1
+ gruut_lang_es==2.0.1
+ gruut_lang_fr==2.0.2
+ h11==0.14.0
+ hangul-romanize==0.1.0
+ httpcore==1.0.5
+ httpx==0.27.2
+ huggingface-hub==0.25.1
+ idna==3.10
+ importlib_resources==6.4.5
+ inflect==7.4.0
+ itsdangerous==2.2.0
+ jamo==0.4.1
+ jieba==0.42.1
+ Jinja2==3.1.4
+ joblib==1.4.2
+ jsonlines==1.2.0
+ kiwisolver==1.4.7
+ langcodes==3.4.1
+ language_data==1.2.0
+ lazy_loader==0.4
+ librosa==0.10.0
+ llama_cpp_python==0.3.1
+ llvmlite==0.43.0
+ marisa-trie==1.2.0
+ Markdown==3.7
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.8.4
+ mdurl==0.1.2
+ more-itertools==10.5.0
+ mpmath==1.3.0
+ msgpack==1.1.0
+ multidict==6.1.0
+ multiprocess==0.70.16
+ murmurhash==1.0.10
+ networkx==2.8.8
+ nltk==3.9.1
+ num2words==0.5.13
+ numba==0.60.0
+ numpy==1.22.0
+ orjson==3.10.7
+ packaging==24.1
+ pandas==1.5.3
+ pdfminer.six==20240706
+ pillow==10.4.0
+ platformdirs==4.3.6
+ pooch==1.8.2
+ preshed==3.0.9
+ protobuf==5.28.2
+ psutil==6.0.0
+ pyarrow==17.0.0
+ pycparser==2.22
+ pydantic==2.9.2
+ pydantic_core==2.23.4
+ pydub==0.25.1
+ Pygments==2.18.0
+ pynndescent==0.5.13
+ pyparsing==3.1.4
+ pypinyin==0.53.0
+ pysbd==0.3.4
+ python-crfsuite==0.9.10
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.12
+ pytz==2024.2
+ PyYAML==6.0.2
+ regex==2024.9.11
+ requests==2.32.3
+ rich==13.8.1
+ ruff==0.6.8
+ safetensors==0.4.5
+ scikit-learn==1.5.2
+ scipy==1.11.4
+ semantic-version==2.10.0
+ sentencepiece==0.2.0
+ shellingham==1.5.4
+ six==1.16.0
+ smart-open==7.0.4
+ sniffio==1.3.1
+ soundfile==0.12.1
+ soxr==0.5.0.post1
+ spacy==3.7.5
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ srsly==2.4.8
+ starlette==0.38.6
+ SudachiDict-core==20240716
+ SudachiPy==0.6.8
+ sympy==1.13.3
+ tensorboard==2.18.0
+ tensorboard-data-server==0.7.2
+ thinc==8.2.5
+ threadpoolctl==3.5.0
+ tokenizers==0.20.0
+ tomlkit==0.12.0
+ torch==2.4.1
+ torchaudio==2.4.1
+ tqdm==4.66.5
+ trainer==0.0.36
+ transformers==4.45.1
+ TTS==0.22.0
+ typeguard==4.3.0
+ typer==0.12.5
+ typing_extensions==4.12.2
+ tzlocal==5.2
+ umap-learn==0.5.6
+ Unidecode==1.3.8
+ urllib3==2.2.3
+ uvicorn==0.31.0
+ wasabi==1.1.3
+ weasel==0.4.1
+ websockets==12.0
+ Werkzeug==3.0.4
+ wrapt==1.16.0
+ xxhash==3.5.0
+ yarl==1.13.1
tts.py ADDED
@@ -0,0 +1,14 @@
+ from transformers import pipeline
+ from datasets import load_dataset
+ import soundfile as sf
+ import torch
+
+ synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")
+
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
+ # You can replace this embedding with your own as well.
+
+ speech = synthesiser("Hello, my dog is cooler than you!", forward_params={"speaker_embeddings": speaker_embedding})
+
+ sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
whisper.py ADDED
@@ -0,0 +1,20 @@
+ import torch
+ from transformers import pipeline
+ from datasets import load_dataset
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model="openai/whisper-tiny.en",
+     chunk_length_s=30,
+     device=device,
+ )
+
+ ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ sample = ds[0]["audio"]
+
+ prediction = pipe(sample.copy(), batch_size=8)["text"]
+
+ # we can also return timestamps for the predictions
+ prediction = pipe(sample.copy(), batch_size=8, return_timestamps=True)["chunks"]
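For reference, with return_timestamps=True the pipeline returns a list of chunk dicts instead of a single string, each pairing a text span with a (start, end) tuple in seconds. A small illustrative loop over that output:

# Illustrative only: print each transcribed span with its time window.
for chunk in prediction:
    start, end = chunk["timestamp"]
    print(f"[{start} - {end}] {chunk['text']}")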