faychu committed
Commit 7a028b5
1 Parent(s): 0de6955

Update README.md

Files changed (1)
  1. README.md +62 -51
README.md CHANGED
@@ -31,34 +31,38 @@ KeyError: 'qwen2-audio'
 
 ## Quickstart
 
-In the following, we demonstrate how to use `Qwen2-Audio-7B-Instrucct` for the inference, supporting both voice chat and audio analysis modes. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose.
+In the following, we demonstrate how to use `Qwen2-Audio-7B-Instruct` for the inference, supporting both voice chat and audio analysis modes. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose.
 
 ### Voice Chat Inference
 In the voice chat mode, users can freely engage in voice interactions with Qwen2-Audio without text input:
 ```python
+from io import BytesIO
+from urllib.request import urlopen
+import librosa
 from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
-from transformers.pipelines.audio_utils import ffmpeg_read
-import requests
 
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
 model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")
 
 conversation = [
     {"role": "user", "content": [
         {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
     ]},
     {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
     {"role": "user", "content": [
         {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
     ]},
 ]
 text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
 audios = []
 for message in conversation:
     if isinstance(message["content"], list):
         for ele in message["content"]:
             if ele["type"] == "audio":
-                audios.append(ffmpeg_read(requests.get(ele['audio_url']).content, sampling_rate=processor.feature_extractor.sampling_rate))
+                audios.append(librosa.load(
+                    BytesIO(urlopen(ele['audio_url']).read()),
+                    sr=processor.feature_extractor.sampling_rate)[0]
+                )
 
 inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
 inputs.input_ids = inputs.input_ids.to("cuda")
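
The substance of this hunk is the audio-loading path: the README drops `ffmpeg_read` + `requests` in favour of decoding the downloaded bytes in memory with `librosa`. Below is a minimal sketch of that loading step in isolation; the `load_audio` helper, the final `print`, and the choice of sample clip are illustrative additions, not part of the README.

```python
from io import BytesIO
from urllib.request import urlopen

import librosa
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")

def load_audio(url: str, sampling_rate: int):
    """Download a clip and decode it to a mono float waveform at the requested rate.

    Illustrative helper, not part of the README; it mirrors the librosa-based
    loading used inside the conversation loop above.
    """
    # librosa.load accepts a file-like object, so the downloaded bytes are wrapped
    # in BytesIO instead of being written to disk or piped through ffmpeg.
    waveform, _ = librosa.load(BytesIO(urlopen(url).read()), sr=sampling_rate)
    return waveform

# The feature extractor's sampling rate (typically 16 kHz for this model family)
# is the rate the processor expects the waveforms to be resampled to.
audio = load_audio(
    "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav",
    processor.feature_extractor.sampling_rate,
)
print(audio.shape, audio.dtype)
```
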
@@ -72,36 +76,41 @@ response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_
 ### Audio Analysis Inference
 In the audio analysis, users could provide both audio and text instructions for analysis:
 ```python
+from io import BytesIO
+from urllib.request import urlopen
+import librosa
 from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
-from transformers.pipelines.audio_utils import ffmpeg_read
-import requests
 
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
 model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")
 
 conversation = [
     {'role': 'system', 'content': 'You are a helpful assistant.'},
     {"role": "user", "content": [
         {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
         {"type": "text", "text": "What's that sound?"},
     ]},
     {"role": "assistant", "content": "It is the sound of glass shattering."},
     {"role": "user", "content": [
         {"type": "text", "text": "What can you do when you hear that?"},
     ]},
     {"role": "assistant", "content": "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property."},
     {"role": "user", "content": [
         {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
         {"type": "text", "text": "What does the person say?"},
     ]},
 ]
 text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
 audios = []
 for message in conversation:
     if isinstance(message["content"], list):
         for ele in message["content"]:
             if ele["type"] == "audio":
-                audios.append(ffmpeg_read(requests.get(ele['audio_url']).content, sampling_rate=processor.feature_extractor.sampling_rate))
+                audios.append(
+                    librosa.load(
+                        BytesIO(urlopen(ele['audio_url']).read()),
+                        sr=processor.feature_extractor.sampling_rate)[0]
+                )
 
 inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
 inputs.input_ids = inputs.input_ids.to("cuda")
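
The hunk stops at the point where the inputs are moved to the GPU; the generation and decoding lines appear only as context in the next hunk header. For reference, here is a compact, self-contained sketch of the full updated flow for a single audio question, reusing one of the URLs and prompts from the example above and the `generate`/`batch_decode` calls visible in the surrounding context lines.

```python
from io import BytesIO
from urllib.request import urlopen

import librosa
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-Audio-7B-Instruct", device_map="auto"
)

# One audio clip plus a text question, taken from the README example above.
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"
conversation = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": url},
        {"type": "text", "text": "What does the person say?"},
    ]},
]

# Render the ChatML prompt and decode the audio at the expected sampling rate.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audio, _ = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate)

inputs = processor(text=text, audios=[audio], return_tensors="pt", padding=True)
inputs.input_ids = inputs.input_ids.to("cuda")

# Generate, then strip the prompt tokens so only the reply is decoded.
generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]
response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)
```
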
@@ -110,39 +119,37 @@ generate_ids = model.generate(**inputs, max_length=256)
 generate_ids = generate_ids[:, inputs.input_ids.size(1):]
 
 response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-
-print("prompt:\n", text)
-print("response:\n", response)
 ```
 
 ### Batch Inference
 We also support batch inference:
 ```python
+from io import BytesIO
+from urllib.request import urlopen
+import librosa
 from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
-from transformers.pipelines.audio_utils import ffmpeg_read
-import requests
 
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
 model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")
 
 conversation1 = [
     {"role": "user", "content": [
         {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
         {"type": "text", "text": "What's that sound?"},
     ]},
     {"role": "assistant", "content": "It is the sound of glass shattering."},
     {"role": "user", "content": [
         {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
         {"type": "text", "text": "What can you hear?"},
     ]}
 ]
 
 conversation2 = [
     {"role": "user", "content": [
         {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
         {"type": "text", "text": "What does the person say?"},
     ]},
 ]
 
 conversations = [conversation1, conversation2]
 
@@ -154,7 +161,11 @@ for conversation in conversations:
         if isinstance(message["content"], list):
             for ele in message["content"]:
                 if ele["type"] == "audio":
-                    audios.append(ffmpeg_read(requests.get(ele['audio_url']).content, sampling_rate=processor.feature_extractor.sampling_rate))
+                    audios.append(
+                        librosa.load(
+                            BytesIO(urlopen(ele['audio_url']).read()),
+                            sr=processor.feature_extractor.sampling_rate)[0]
+                    )
 
 inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
 inputs['input_ids'] = inputs['input_ids'].to("cuda")
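
The batch hunk likewise ends at the `.to("cuda")` line; the generation and per-sample decoding steps fall outside the diff. Below is one plausible way to finish the batch example, sketched as a continuation of the snippet above (it assumes `model`, `processor`, and `inputs` exist as built there, and is not taken from the README itself).

```python
# Continuation of the batch snippet above (assumes `model`, `processor`, `inputs`).
generate_ids = model.generate(**inputs, max_length=256)
# Drop the (padded) prompt region so only newly generated tokens remain.
generate_ids = generate_ids[:, inputs['input_ids'].size(1):]

# Decode every sequence in the batch; entry i corresponds to conversations[i].
responses = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
for i, response in enumerate(responses):
    print(f"response {i}:", response)
```
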
 