百度語音識別api使用python進行調用
- 2019 年 11 月 28 日
- 筆記
百度語音現在是比較方便的介面,具體說明請看官方文檔,本文分兩個部分,先是使用python實現錄音,然後再使用百度語音api進行識別上傳。
首先是實現錄音功能。因為百度語音識別對錄音品質有一些要求,所以下文的程式碼會產生一個 GUI 介面,點擊按鈕後開始錄音,並以當前時間作為文件名生成錄音文件。
百度語音 REST API 支援整段錄音文件的識別,對錄音格式有一定的要求;另外還提供語音識別控制項:集成了提示音、音量回饋動效等整套交互的對話框控制項,方便開發者快速集成。
原始PCM的錄音參數必須符合8k/16k取樣率、16bit位深、單聲道,支援的壓縮格式有:pcm(不壓縮)、wav、opus、speex、amr、x-flac。
語音識別介面支援 POST 方式。目前 API 僅支援整段語音識別的模式,即需要上傳整段語音進行識別。語音數據上傳方式有兩種:隱式發送和顯式發送。原始語音的錄音格式目前只支援評測 8k/16k 取樣率、16bit 位深的單聲道語音;壓縮格式支援:pcm(不壓縮)、wav、opus、speex、amr、x-flac。系統支援的語言種類:中文(zh)、粵語(ct)、英文(en)。
Python
#!/usr/bin/env python
# coding=utf-8
"""Record a short mono WAV clip from the default microphone via a Tk button.

Each click on the button records ``TIME`` seconds of 16-bit audio at
``framerate`` Hz and writes it to a timestamp-named ``.wav`` file in the
current working directory.  Runs on both Python 2 and Python 3.
"""
from contextlib import closing
from datetime import datetime
import wave

import numpy as np  # not referenced below; kept from the original listing
from pyaudio import PyAudio, paInt16

try:  # Python 3 module name
    from tkinter import Button, Label, Tk
except ImportError:  # Python 2 module name
    from Tkinter import Button, Label, Tk

# Recording parameters.
NUM_SAMPLES = 2000  # frames read from the input stream per chunk
framerate = 8000    # sampling rate in Hz
channels = 1        # mono
sampwidth = 2       # bytes per sample (paInt16 is 16-bit)
# Length of one recording, in seconds.
TIME = 10


def save_wave_file(filename, data):
    """Write recorded PCM chunks to ``filename`` as a WAV file.

    Args:
        filename: Path of the WAV file to create.
        data: Iterable of byte strings holding the recorded frames.
    """
    # closing() makes sure the file is finalised even if a write fails, and
    # also works on Python 2, where wave objects are not context managers.
    with closing(wave.open(filename, 'wb')) as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sampwidth)
        wf.setframerate(framerate)
        # b"" so the join also works on Python 3, where chunks are bytes.
        wf.writeframes(b"".join(data))


def my_button(root, label_text, button_text, button_func):
    """Create a label and a button inside ``root`` and pack both.

    Args:
        root: Parent Tk widget.
        label_text: Text shown on the label.
        button_text: Text shown on the button.
        button_func: Callable invoked when the button is clicked.
    """
    # label details
    label = Label(root)
    label['text'] = label_text
    label.pack()
    # button details
    button = Button(root)
    button['text'] = button_text
    button['command'] = button_func
    button.pack()


def record_wave():
    """Record ``TIME`` seconds from the input device and save them to disk.

    The output file is named after the current local time
    (``YYYY-MM-DD_HH_MM_SS.wav``).

    NOTE: this runs synchronously inside the button callback, so the window
    does not process events until the recording has finished.
    """
    pa = PyAudio()
    try:
        # open the input stream
        stream = pa.open(format=paInt16, channels=channels, rate=framerate,
                         input=True, frames_per_buffer=NUM_SAMPLES)
        try:
            # Number of NUM_SAMPLES-sized chunks that add up to TIME seconds
            # (40 with the default parameters).
            chunk_count = TIME * framerate // NUM_SAMPLES
            save_buffer = []
            for _ in range(chunk_count):
                # read NUM_SAMPLES frames of audio data
                save_buffer.append(stream.read(NUM_SAMPLES))
                print('.')
        finally:
            # Release the audio device even if reading failed.
            stream.stop_stream()
            stream.close()
    finally:
        pa.terminate()

    filename = datetime.now().strftime("%Y-%m-%d_%H_%M_%S") + ".wav"
    save_wave_file(filename, save_buffer)
    print("%s saved" % filename)


def main():
    """Build the one-button window and enter the Tk event loop."""
    root = Tk()
    my_button(root, "Record a wave", "clik to record", record_wave)
    root.mainloop()


if __name__ == "__main__":
    main()
完成錄音後,看看文件目錄下是否已經出現一個 .wav 格式的文件。一次錄音大概是十秒鐘。然後將文件名修改為 1.wav。
執行下面的程式。有部分需要按照你的id和key進行修改噢。
Python
# encoding=utf-8
"""Send a recorded WAV file to the Baidu speech recognition REST API.

Runs on both Python 2 and Python 3.
"""
import base64  # not referenced below; kept from the original listing
from contextlib import closing
import json
import wave

import pycurl

try:  # Python 3 module names
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2 module names
    from urllib import urlencode
    from urllib2 import urlopen


## get access token by api key & secret key
## fill in your own API key and secret key below
def get_token():
    """Request an access token with the client-credentials grant.

    Returns:
        The value of the ``access_token`` field of the JSON response.

    Raises:
        KeyError: If the response carries no ``access_token`` field.
    """
    apiKey = "Ll0c53MSac6GBOtpg22ZSGAU**"
    secretKey = "44c8af396038a24e34936227d4a19dc2**"
    # urlencode() escapes the values, unlike plain string concatenation;
    # a list of pairs keeps the parameter order stable on every version.
    query = urlencode([
        ("grant_type", "client_credentials"),
        ("client_id", apiKey),
        ("client_secret", secretKey),
    ])
    auth_url = "https://openapi.baidu.com/oauth/2.0/token?" + query
    with closing(urlopen(auth_url)) as res:
        json_data = res.read()
    # read() returns bytes on Python 3; decode before parsing.
    return json.loads(json_data.decode("utf-8"))['access_token']


def dump_res(buf):
    """Print one chunk of the server response (pycurl write callback).

    Args:
        buf: Chunk of the response body handed over by pycurl.
    """
    # On Python 3 the chunk is bytes; decode it so the text is readable.
    # On Python 2 bytes is str, so the chunk is printed unchanged.
    if isinstance(buf, bytes) and not isinstance(buf, str):
        buf = buf.decode("utf-8", "replace")
    print(buf)


## post audio to server
def use_cloud(token):
    """Upload the frames of ``1.wav`` for recognition and print the reply.

    Args:
        token: Access token, as returned by ``get_token()``.
    """
    # name of the previously recorded audio file
    with closing(wave.open('1.wav', 'rb')) as fp:
        rate = fp.getframerate()
        audio_data = fp.readframes(fp.getnframes())
    # Use the real payload size rather than assuming 2 bytes per frame.
    f_len = len(audio_data)

    cuid = "7519663**"  # your product id
    srv_url = ('http://vop.baidu.com/server_api?'
               + urlencode([("cuid", cuid), ("token", token)]))
    http_header = [
        # Report the sampling rate stored in the file instead of a constant.
        'Content-Type: audio/pcm; rate=%d' % rate,
        'Content-Length: %d' % f_len,
    ]

    c = pycurl.Curl()
    try:
        c.setopt(pycurl.URL, str(srv_url))  # curl doesn't support unicode
        c.setopt(c.HTTPHEADER, http_header)  # must be list, not dict
        c.setopt(c.POST, 1)
        c.setopt(c.CONNECTTIMEOUT, 30)
        c.setopt(c.TIMEOUT, 30)
        c.setopt(c.WRITEFUNCTION, dump_res)
        c.setopt(c.POSTFIELDS, audio_data)
        c.setopt(c.POSTFIELDSIZE, f_len)
        c.perform()  # pycurl.perform() has no return val
    finally:
        # Free the curl handle even if the transfer failed.
        c.close()


if __name__ == "__main__":
    token = get_token()  # obtain the token
    use_cloud(token)  # upload the audio; the response is printed by dump_res
再執行 Python 程式,等待一小段時間就可以看到返回的識別結果。

就是這樣:
主要程式碼的思路流程很清晰的.
百度語音識別通過 REST API 的方式給開發者提供一個通用的 HTTP 介面,基於該介面,開發者可以輕鬆的獲取語音識別能力。SDK中只提供了PHP、C和JAVA的相關樣例,然而個人以為,使用Python開發難度更低,本文描述了簡單使用Python調用百度語音識別服務 REST API 的簡單樣例。
註冊開發者帳號和創建應用不再贅述,百度的REST API在調用過程基本分為三步:
- 獲取token
- 提交數據
- 處理JSON