Python语音识别折腾记

本文最后更新于:2020年4月9日 晚上

今天好不容易休息半天,但是又被英语作业给折服了,就上一次U校园半自动化造福大学生之后,就有这么一个想法,是什么呢?
就是能不能把听力的内容识别出来,变成文本。这样就可以看听力文本了,那答题岂不是很简单?
ok,废话不多说,下面就来介绍下自己的折腾日记和遇到的问题

抓包

我们第一步要做的就是拿到听力文件,怎么拿呢?——抓包
由于过于简单,我只说大概步骤,具体百度
登录——进入听力测验——按F12打开开发者工具——点击音频播放按钮——看工具里的最后一个资源——双击该资源进入——下载音频

音频格式转换

我们拿到的音频是MP3格式的,而按照百度语音识别的文档,它只支持pcm、wav、amr、m4a这几种音频格式。
我在这里卡住了:用格式工厂转码后进行识别会报错,而且格式工厂没有pcm格式,所以我一直在纠结怎么转换。当然,百度也给出了转换方法,
我这里还没研究明白,恳请大佬指点,下面我们只能用百度提供的示例音频做演示了QAQ

在百度智能云开通服务

1.登录百度智能云
2.2
3.3
4.4
5.5
6.6
7.创建成功后,我们就会获得api-key,secret-key(后面会用到)
7

开始在python构造请求

这里我们直接使用百度提供的DEMO,然后自己做一些修改

  • 安装依赖库(利用pip,不懂请自行百度)

  • pip install baidu-aip # 百度语音识别Python SDK

  • urllib # 网页请求库(Python自带的标准库,无需用pip安装)

  • 开始构造代码(注意看注释)

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    # coding=utf-8

    # 引入所需库
    import sys
    import json
    import base64
    import time

    # True when running under Python 3; selects the import locations and the
    # timer function used below.
    IS_PY3 = sys.version_info.major == 3

    if IS_PY3:
        from urllib.request import urlopen
        from urllib.request import Request
        from urllib.error import URLError
        from urllib.parse import urlencode
        timer = time.perf_counter
    else:
        from urllib2 import urlopen
        from urllib2 import Request
        from urllib2 import URLError
        from urllib import urlencode
        # The platform check belongs to the Python 2 branch only: time.clock
        # is never referenced when running under Python 3.
        if sys.platform == "win32":
            timer = time.clock
        else:
            # On most other platforms the best timer is time.time()
            timer = time.time

    # Credentials: the api-key / secret-key obtained when the application was
    # created in the Baidu console (see the earlier section of this article).
    API_KEY = '你的api-key'
    SECRET_KEY = '你的secret-key'

    # Path of the audio file to recognise. For a file on drive D: this would be
    # e.g. D:/audio/16k.pcm.
    # Only pcm/wav/amr are supported; the "speed" edition additionally accepts m4a.
    AUDIO_FILE = './audio/16k.pcm'
    # Audio format sent to the API: the text after the last dot of the file
    # name, lower-cased, so extensions such as ".PCM" also yield "pcm".
    FORMAT = AUDIO_FILE.rsplit('.', 1)[-1].lower()

    # Client identifier (e.g. the machine's MAC address); the default is fine.
    CUID = '123456PYTHON'
    # Sample rate in Hz (fixed value).
    RATE = 16000

    # ---- Standard edition ----

    # 1537 selects Mandarin with the input-method model. Pick the PID for the
    # desired language / model from the service documentation.
    DEV_PID = 1537
    ASR_URL = 'http://vop.baidu.com/server_api'
    # This scope means the application has ASR capability. If it is missing,
    # enable it on the web console; very old applications may not have it.
    SCOPE = 'audio_voice_assistant_get'

    # To test the self-training platform, enable the settings below. Once the
    # model is online the platform shows the dedicated parameters, e.g.
    # "pid:8001, modelid:1234", which map to dev_pid=8001, lm_id=1234.
    # DEV_PID = 8001 ;
    # LM_ID = 1234 ;

    # ---- Speed edition ----
    # If you enable these, fill in your own appkey / appSecret and activate the
    # speed edition on the web console (it may be billed once activated).

    # DEV_PID = 80001
    # ASR_URL = 'http://vop.baidu.com/pro_api'
    # SCOPE = 'brain_enhanced_asr' # this scope means the speed edition is enabled

    # Skip the scope check; very old applications may not have the scope.
    # SCOPE = False

    class DemoError(Exception):
        """Error raised by this demo script when it cannot continue."""


    """ TOKEN start """

    # NOTE(review): this endpoint is plain HTTP, so API_KEY / SECRET_KEY travel
    # unencrypted. Consider the HTTPS endpoint if the service offers one -- confirm
    # against the provider's documentation before changing.
    TOKEN_URL = 'http://openapi.baidu.com/oauth/2.0/token'


    def fetch_token():
        """Request an OAuth access token with API_KEY / SECRET_KEY.

        Posts a ``client_credentials`` grant to TOKEN_URL, prints the raw and
        parsed response, and validates it.

        Returns:
            The ``access_token`` value of the token response.

        Raises:
            DemoError: if the request fails without any HTTP response, if the
                response lacks ``access_token`` or ``scope``, or if SCOPE is
                set and is not among the granted scopes.
        """
        params = {'grant_type': 'client_credentials',
                  'client_id': API_KEY,
                  'client_secret': SECRET_KEY}
        post_data = urlencode(params)
        if IS_PY3:
            post_data = post_data.encode('utf-8')
        req = Request(TOKEN_URL, post_data)
        try:
            f = urlopen(req)
            try:
                result_str = f.read()
            finally:
                # Always release the connection, even if read() fails.
                f.close()
        except URLError as err:
            # Only HTTPError (a URLError subclass) carries a status code and a
            # readable body. A plain URLError (DNS failure, refused connection,
            # ...) has neither, so report it instead of failing on err.code.
            if not hasattr(err, 'code'):
                raise DemoError('token request failed: ' + str(err.reason))
            print('token http response http code : ' + str(err.code))
            result_str = err.read()
        if IS_PY3:
            result_str = result_str.decode()

        print(result_str)
        result = json.loads(result_str)
        print(result)
        if 'access_token' in result and 'scope' in result:
            print(SCOPE)
            # SCOPE = False skips the scope check.
            if SCOPE and SCOPE not in result['scope'].split(' '):
                raise DemoError('scope is not correct')
            print('SUCCESS WITH TOKEN: %s EXPIRES IN SECONDS: %s' % (result['access_token'], result['expires_in']))
            return result['access_token']
        raise DemoError('MAYBE API_KEY or SECRET_KEY not correct: access_token or scope not found in token response')

    """ TOKEN end """

    if __name__ == '__main__':
        # Script entry point: fetch a token, upload AUDIO_FILE to ASR_URL as a
        # JSON request, print the response and save it to result.txt.
        token = fetch_token()

        with open(AUDIO_FILE, 'rb') as speech_file:
            speech_data = speech_file.read()

        # 'len' in the request is the size of the raw audio, before base64.
        length = len(speech_data)
        if length == 0:
            raise DemoError('file %s length read 0 bytes' % AUDIO_FILE)
        speech = base64.b64encode(speech_data)
        if IS_PY3:
            # b64encode returns bytes on Python 3; json.dumps needs text.
            speech = str(speech, 'utf-8')
        params = {'dev_pid': DEV_PID,
                  # "lm_id": LM_ID,  # enable when testing the self-training platform
                  'format': FORMAT,
                  'rate': RATE,
                  'token': token,
                  'cuid': CUID,
                  'channel': 1,
                  'speech': speech,
                  'len': length
                  }
        post_data = json.dumps(params, sort_keys=False)
        req = Request(ASR_URL, post_data.encode('utf-8'))
        req.add_header('Content-Type', 'application/json')
        try:
            begin = timer()
            f = urlopen(req)
            try:
                result_bytes = f.read()
            finally:
                # Always release the connection, even if read() fails.
                f.close()
            print("Request time cost %f" % (timer() - begin))
        except URLError as err:
            # Only HTTPError (a URLError subclass) has a status code and body;
            # a plain URLError means no HTTP response was received at all.
            if not hasattr(err, 'code'):
                raise DemoError('asr request failed: ' + str(err.reason))
            print('asr http response http code : ' + str(err.code))
            result_bytes = err.read()

        if IS_PY3:
            result_str = str(result_bytes, 'utf-8')
        else:
            result_str = result_bytes
        print(result_str)
        # Write the response bytes unchanged so result.txt has the same
        # encoding as the response on every platform, instead of depending on
        # the locale's default text encoding.
        with open("result.txt", "wb") as out_file:
            out_file.write(result_bytes)

    运行程序

    我们运行程序后会返回以下信息

    1
    2
    3
    4
    5
    6
    {"access_token":"24.0ed58d5f890b6388b8e1fbb4f595e64e.2592000.1589017415.282335-19345000","session_key":"9mzdC33+ru+xjYD0Bs+vqaaCurlrknFs75w6pvM+ddSFc9hWFqSn0t8aW8gWU7qWGl21RSTcM4A1bevZRf5m2labdgfjvA==","scope":"audio_voice_assistant_get brain_enhanced_asr audio_tts_post public brain_all_scope picchain_test_picchain_api_scope wise_adapt lebo_resource_base lightservice_public hetu_basic lightcms_map_poi kaidian_kaidian ApsMisTest_Test\u6743\u9650 vis-classify_flower lpq_\u5f00\u653e cop_helloScope ApsMis_fangdi_permission smartapp_snsapi_base iop_autocar oauth_tp_app smartapp_smart_game_openapi oauth_sessionkey smartapp_swanid_verify smartapp_opensource_openapi smartapp_opensource_recapi qatest_scope1 fake_face_detect_\u5f00\u653eScope vis-ocr_\u865a\u62df\u4eba\u7269\u52a9\u7406 idl-video_\u865a\u62df\u4eba\u7269\u52a9\u7406","refresh_token":"25.ccc2be055d9e6273d3c0bad9b8bb2368.315360000.1901785415.282335-19345000","session_secret":"a0a53e0d2183b7145217373b71784075","expires_in":2592000}
    {'access_token': '24.0ed58d5f890b6388b8e1fbb4f595e64e.2592000.1589017415.282335-19345000', 'session_key': '9mzdC33+ru+xjYD0Bs+vqaaCurlrknFs75w6pvM+ddSFc9hWFqSn0t8aW8gWU7qWGl21RSTcM4A1bevZRf5m2labdgfjvA==', 'scope': 'audio_voice_assistant_get brain_enhanced_asr audio_tts_post public brain_all_scope picchain_test_picchain_api_scope wise_adapt lebo_resource_base lightservice_public hetu_basic lightcms_map_poi kaidian_kaidian ApsMisTest_Test权限 vis-classify_flower lpq_开放 cop_helloScope ApsMis_fangdi_permission smartapp_snsapi_base iop_autocar oauth_tp_app smartapp_smart_game_openapi oauth_sessionkey smartapp_swanid_verify smartapp_opensource_openapi smartapp_opensource_recapi qatest_scope1 fake_face_detect_开放Scope vis-ocr_虚拟人物助理 idl-video_虚拟人物助理', 'refresh_token': '25.ccc2be055d9e6273d3c0bad9b8bb2368.315360000.1901785415.282335-19345000', 'session_secret': 'a0a53e0d2183b7145217373b71784075', 'expires_in': 2592000}
    audio_voice_assistant_get
    SUCCESS WITH TOKEN: 24.0ed58d5f890b6388b8e1fbb4f595e64e.2592000.1589017415.282335-19345000 EXPIRES IN SECONDS: 2592000
    Request time cost 1.736482
    {"corpus_no":"6813645276203395866","err_msg":"success.","err_no":0,"result":["Beijing Cody girl"],"sn":"487908489041586425415"}

    8
    “result”:[“Beijing Cody girl”]中的内容就是识别出的内容;运行程序的当前目录下也会生成result.txt文件,同样可以查看。

ok,大概就是这个样子,我要去研究pcm这个鬼东西了。。。