语音转文字(用于prompt懒得输入)

做了个语音转文字的脚本,底层是有道智云的短语音识别,一次识别只要0.004元,基本可以用来解放双手,快速跟mini模型硬刚(不要token)
实测效果不错,接近100%的准确率,很适合多开agent。

识别脚本:

# -*- coding: utf-8 -*-
"""
Record from microphone and send WAV to the existing Demo.py recognizer.

Usage:
  python mic_to_youdao.py [--samplerate 16000] [--channels 1] [-o out.wav] [-p PROMPT]
  (recording runs until Enter is pressed)

Dependencies:
  pip install -r requirements.txt
"""
import argparse
import time
import uuid
import base64
import wave
import tempfile
import os
import json
import threading

import sounddevice as sd
import soundfile as sf

import Demo

def record_until_enter(path, samplerate=16000, channels=1):
    """Record from the default microphone until the user presses Enter.

    Audio is captured as 16-bit integers and written to a PCM_16 WAV file at
    `path` while the input stream is open.

    Args:
        path: Destination WAV file path; opened in write mode.
        samplerate: Sample rate in Hz, used for both the stream and the file.
        channels: Number of input channels to capture.
    """
    print(f"Recording — samplerate={samplerate}, channels={channels}. Press Enter to stop.")

    with sf.SoundFile(path, mode='w', samplerate=samplerate, channels=channels, subtype='PCM_16') as file:
        def callback(indata, frames, time_info, status):
            # Called by the input stream for each captured chunk.
            if status:
                print(f"Status: {status}")
            # Copy before writing: the buffer is only handed to us for the
            # duration of this call.
            file.write(indata.copy())

        with sd.InputStream(samplerate=samplerate, channels=channels, dtype='int16', callback=callback):
            try:
                input()
            except (KeyboardInterrupt, EOFError):
                # Ctrl+C or end-of-input (Ctrl+D / closed stdin) also stops
                # the recording instead of aborting with a traceback.
                pass
    print(f"Saved recording to {path}")

def send_wav_to_youdao(wav_path):
    """Submit the WAV file at `wav_path` through Demo.connect.

    Returns whatever Demo.connect returns for that file.
    """
    print('Sending audio to Youdao...')
    return Demo.connect(wav_path)

def extract_text_from_response(j):
    """Pull the recognized text out of a decoded JSON response.

    Lookup order:
      1. ``j['result']`` when it is a list (items joined) or a string.
      2. ``j['data']['result']`` under the same list/string rule.
      3. Otherwise every string found anywhere in `j`, concatenated in
         traversal order.

    Returns an empty string when `j` is None.
    """
    def as_text(value):
        # A list is joined item by item, a string is taken as-is; any other
        # type is reported as "no match" so the caller keeps looking.
        if isinstance(value, list):
            return ''.join(str(item) for item in value)
        if isinstance(value, str):
            return value
        return None

    if j is None:
        return ''

    if isinstance(j, dict):
        candidates = []
        if 'result' in j:
            candidates.append(j['result'])
        nested = j.get('data')
        if isinstance(nested, dict) and 'result' in nested:
            candidates.append(nested['result'])
        for candidate in candidates:
            text = as_text(candidate)
            if text is not None:
                return text

    # Fallback: depth-first walk collecting every string. Children are pushed
    # in reverse so they are popped in their original order.
    pieces = []
    pending = [j]
    while pending:
        node = pending.pop()
        if isinstance(node, str):
            pieces.append(node)
        elif isinstance(node, list):
            pending.extend(reversed(node))
        elif isinstance(node, dict):
            pending.extend(reversed(list(node.values())))
    return ''.join(pieces)

# Default value for the --prompt option; empty means the result is printed
# without a prefix.
default_prompt = ''
def main():
    """Record from the microphone, send the audio to Youdao and print the text.

    Command-line options:
      --samplerate  sample rate passed to the recorder (default 16000)
      --channels    channel count passed to the recorder (default 1)
      --out / -o    path for the WAV file; when omitted a temporary file is
                    created and removed once the run finishes
      --prompt / -p text printed in front of the recognized text
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--samplerate', type=int, default=16000)
    parser.add_argument('--channels', type=int, default=1)
    parser.add_argument('--out', '-o', help='输出 WAV 文件路径(可选)')
    parser.add_argument('--prompt', '-p', type=str, default=default_prompt, help='拼接到结果前的前置提示文本')
    args = parser.parse_args()

    if args.out:
        out_path = args.out
    else:
        fd, out_path = tempfile.mkstemp(suffix='.wav')
        # Only the path is needed; the recorder reopens the file itself.
        os.close(fd)

    try:
        print('开始录音,按 Enter 停止录音')
        record_until_enter(out_path, samplerate=args.samplerate, channels=args.channels)
        response = send_wav_to_youdao(out_path)

        # Print the raw JSON when the body can be decoded, raw bytes otherwise.
        try:
            j = response.json()
        except ValueError:
            # A body that is not valid JSON surfaces as a ValueError subclass.
            print('Raw response content:')
            print(response.content)
            j = None
        else:
            print('Raw JSON response:')
            print(json.dumps(j, ensure_ascii=False, indent=2))

        # The sample success response carries errorCode '0'. For any other
        # code, skip extraction: the string-collecting fallback would otherwise
        # present the error code and request id as recognized text.
        error_code = j.get('errorCode') if isinstance(j, dict) else None
        if error_code not in (None, '0', 0):
            print(f'Recognition failed (errorCode={error_code}); no text extracted.')
            return

        concatenated = extract_text_from_response(j)
        if args.prompt:
            # show the prompt for clarity
            print('Prompt:')
            print(args.prompt)
            if concatenated:
                combined = args.prompt.rstrip() + ' ' + concatenated
            else:
                combined = args.prompt
            print('Concatenated result (with prompt):')
            print(combined)
        else:
            print('Concatenated result:')
            print(concatenated)
    finally:
        # Clean up only the temporary file; a user-supplied --out is kept.
        if not args.out and os.path.exists(out_path):
            os.remove(out_path)

# Run the record-and-recognize flow only when executed as a script.
if __name__ == '__main__':
    main()
# Response:
# {'result': ['你好。你。'], 'requestId': '18112060-0fed-42c6-abfb-5143a5df810f', 'errorCode': '0', 'complexResult': [{'sentence': '你好。你。', 'encoutput': '', 'vad_id': 1, 'word_timestamps': [0, 480, 600], 'vcoutput': '', 'word_timestamps_eds': [60, 600, 660], 'encoutput_shape': '', 'vcoutput_shape': '', 'words': ['你', '好。', '你。'], 'partial': False}]}

有道原生Demo脚本:Demo.py

# -*- coding: utf-8 -*-
import sys
import uuid
import requests
import wave
import base64
import hashlib

try:
    from imp import reload
except ImportError:
    # The `imp` module was removed in Python 3.12; importlib provides the
    # same `reload` function.
    from importlib import reload

import time

# Leftover from the Python 2 version of the demo; kept so the module-level
# behaviour is unchanged.
reload(sys)

# Endpoint of the Youdao short-speech recognition API.
YOUDAO_URL = 'https://openapi.youdao.com/asrapi'
# Placeholders: replace with the application key/secret of your own account.
APP_KEY = 'your key'
APP_SECRET = 'your secret'

def truncate(q):
    """Shorten `q` for use in the request signature.

    Returns None for None, `q` unchanged when it has at most 20 items, and
    otherwise the first 10 items + the total length + the last 10 items.
    """
    if q is None:
        return None
    length = len(q)
    if length <= 20:
        return q
    head, tail = q[:10], q[-10:]
    return head + str(length) + tail

def encrypt(signStr):
    """Return the hexadecimal SHA-256 digest of `signStr` encoded as UTF-8."""
    payload = signStr.encode('utf-8')
    return hashlib.sha256(payload).hexdigest()

def do_request(data, timeout=30):
    """POST form-encoded `data` to YOUDAO_URL and return the response.

    Args:
        data: Mapping of form fields to send in the request body.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without one, requests waits indefinitely for an
            unresponsive server.

    Returns:
        The `requests` response object.
    """
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    return requests.post(YOUDAO_URL, data=data, headers=headers, timeout=timeout)

def connect(audio_file_path='音频的路径', lang_type='zh-CHS'):
    """Send a local WAV file to Youdao ASR and return the response.

    The sample rate and channel count are read from the WAV header, the whole
    file is base64-encoded, and the request is signed with SHA-256 over
    APP_KEY + truncate(q) + salt + curtime + APP_SECRET.

    Args:
        audio_file_path: Path to a .wav file (extension check is
            case-insensitive). The default is a placeholder, not a real path.
        lang_type: Language code sent as `langType` (default 'zh-CHS').

    Returns:
        The response object returned by do_request.

    Raises:
        ValueError: If the path does not end in a wav extension.
    """
    import os  # local import: this module does not import os at file level

    # splitext copes with paths that contain no dot at all, where rindex
    # would fail with an unrelated "substring not found" message.
    extension = os.path.splitext(audio_file_path)[1][1:]
    if extension.lower() != 'wav':
        raise ValueError('不支持的音频类型: %s' % extension)

    # Context manager guarantees the WAV handle is closed even if a getter
    # raises.
    with wave.open(audio_file_path, 'rb') as wav_info:
        sample_rate = wav_info.getframerate()
        nchannels = wav_info.getnchannels()
    with open(audio_file_path, 'rb') as file_wav:
        q = base64.b64encode(file_wav.read()).decode('utf-8')

    curtime = str(int(time.time()))
    salt = str(uuid.uuid1())
    sign = encrypt(APP_KEY + truncate(q) + salt + curtime + APP_SECRET)

    data = {
        'curtime': curtime,
        'appKey': APP_KEY,
        'q': q,
        'salt': salt,
        'sign': sign,
        'signType': "v2",
        'langType': lang_type,
        'rate': sample_rate,
        'format': 'wav',
        'channel': nchannels,
        'type': 1,
    }

    response = do_request(data)
    try:
        print(response.json())
    except ValueError:
        # Body was not valid JSON; show the raw bytes instead.
        print(response.content)
    return response

# When run as a script, call connect() with its default arguments.
# NOTE(review): the default audio_file_path is a placeholder string, so this
# presumably needs a real .wav path to be useful — confirm.
if __name__ == '__main__':
    connect()

pip list: sounddevice soundfile requests