Respair committed on
Commit
9c4b396
1 Parent(s): 69cf26e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +20 -2
README.md CHANGED
@@ -39,7 +39,8 @@ more accurate representation for Japanese.
39
 
40
  from datasets import Dataset, Audio
41
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
42
- import jaconv
 
43
 
44
  kana_mapper = dict([
45
  ("ゔぁ","ba"),
@@ -70,7 +71,24 @@ model = WhisperForConditionalGeneration.from_pretrained("Respair/Hibiki_ASR_Phon
70
  forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe", language='japanese')
71
 
72
 
73
- import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  sample = Dataset.from_dict({"audio": ["/content/kl_chunk1987.wav"]}).cast_column("audio", Audio(16000))
76
  sample = sample[0]['audio']
 
39
 
40
  from datasets import Dataset, Audio
41
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
42
+ import re
43
+ import pykakasi
44
 
45
  kana_mapper = dict([
46
  ("ゔぁ","ba"),
 
71
  forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe", language='japanese')
72
 
73
 
74
+
75
+
76
+ def convert_to_kana(text):
77
+ kks = pykakasi.kakasi()
78
+
79
+
80
+ def convert_word(word):
81
+ result = kks.convert(word)
82
+ return ''.join(item['hira'] for item in result)
83
+
84
+
85
+ parts = re.split(r'([^\u3000-\u30ff\u3400-\u4dbf\u4e00-\u9fff]+)', text)
86
+
87
+
88
+ converted_parts = [convert_word(part) if re.match(r'[\u3000-\u30ff\u3400-\u4dbf\u4e00-\u9fff]', part) else part for part in parts]
89
+
90
+ return ''.join(converted_parts)
91
+
92
 
93
  sample = Dataset.from_dict({"audio": ["/content/kl_chunk1987.wav"]}).cast_column("audio", Audio(16000))
94
  sample = sample[0]['audio']