做了一个语音转文字的脚本,底层调用的是有道智云的短语音识别接口,一次识别只要 0.004 元,基本可以用来解放双手,快速跟 mini 模型硬刚(不需要 token)。
实测效果不错,准确率接近 100%,很适合同时开多个 agent 时使用。
识别脚本(mic_to_youdao.py):
# -*- coding: utf-8 -*-
"""Record from the microphone and send the WAV to the Demo.py recognizer.

Recording runs until Enter is pressed. The audio is written as a 16-bit PCM
WAV file, handed to ``Demo.connect`` and the recognized text is printed,
optionally prefixed with a prompt.

Usage:
    python mic_to_youdao.py [--samplerate 16000] [--channels 1]
                            [--out recording.wav] [--prompt "text"]

Dependencies:
    pip install -r requirements.txt
"""
import argparse
import time
import uuid
import base64
import wave
import tempfile
import os
import json
import threading

# NOTE: sounddevice, soundfile and Demo are imported inside the functions that
# use them, so that argument parsing (``--help``) and the pure text-extraction
# helper keep working when the audio stack or Demo.py cannot be imported.


def record_until_enter(path, samplerate=16000, channels=1):
    """Record from the default microphone until the user presses Enter.

    Args:
        path: Destination file; it is written as a PCM_16 WAV.
        samplerate: Sampling rate passed to both the input stream and the file.
        channels: Number of input channels to capture.

    Ctrl-C and a closed stdin (EOF) also stop the recording cleanly instead of
    propagating an exception.
    """
    import sounddevice as sd
    import soundfile as sf

    print(f"Recording — samplerate={samplerate}, channels={channels}. Press Enter to stop.")

    with sf.SoundFile(path, mode='w', samplerate=samplerate,
                      channels=channels, subtype='PCM_16') as file:

        def callback(indata, frames, time_info, status):
            # Surface any stream status flags, then append the captured block.
            if status:
                print(f"Status: {status}")
            # Write a copy rather than the buffer handed to the callback.
            file.write(indata.copy())

        with sd.InputStream(samplerate=samplerate, channels=channels,
                            dtype='int16', callback=callback):
            try:
                input()
            except (KeyboardInterrupt, EOFError):
                # Either one simply means "stop recording".
                pass

    print(f"Saved recording to {path}")


def send_wav_to_youdao(wav_path):
    """Send the WAV file at ``wav_path`` through ``Demo.connect``.

    Returns:
        Whatever ``Demo.connect`` returns (the HTTP response object).
    """
    import Demo

    print('Sending audio to Youdao...')
    return Demo.connect(wav_path)


def extract_text_from_response(j):
    """Heuristically pull the recognized text out of a decoded JSON response.

    Lookup order:
      1. ``j['result']`` (list items are joined, a string is returned as is);
      2. ``j['data']['result']`` with the same rules;
      3. fallback: every string found anywhere in ``j``, concatenated.

    Args:
        j: Decoded JSON (dict / list / str) or ``None``.

    Returns:
        The extracted text. ``''`` is returned for ``None`` and for a dict
        whose ``errorCode`` is present and not ``'0'``; without that guard the
        fallback would report the request id and error code as transcription.
    """
    def recurse_collect(x, out):
        # Depth-first walk that gathers every string value into ``out``.
        if isinstance(x, str):
            out.append(x)
        elif isinstance(x, list):
            for e in x:
                recurse_collect(e, out)
        elif isinstance(x, dict):
            for v in x.values():
                recurse_collect(v, out)

    if j is None:
        return ''

    if isinstance(j, dict):
        code = j.get('errorCode')
        if code is not None and str(code) != '0':
            return ''

        # Common layouts: top-level 'result', then 'data' -> 'result'.
        for container in (j, j.get('data')):
            if isinstance(container, dict) and 'result' in container:
                r = container['result']
                if isinstance(r, list):
                    return ''.join(str(x) for x in r)
                if isinstance(r, str):
                    return r

    # Fallback: collect all strings.
    out = []
    recurse_collect(j, out)
    return ''.join(out)


default_prompt = ''


def main():
    """Command-line entry point: record, recognize, print the result.

    When ``--out`` is not given the audio goes to a temporary file that is
    removed afterwards; a file named with ``--out`` is kept.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--samplerate', type=int, default=16000)
    parser.add_argument('--channels', type=int, default=1)
    parser.add_argument('--out', '-o', help='输出 WAV 文件路径(可选)')
    parser.add_argument('--prompt', '-p', type=str, default=default_prompt,
                        help='拼接到结果前的前置提示文本')
    args = parser.parse_args()

    if args.out:
        out_path = args.out
    else:
        fd, out_path = tempfile.mkstemp(suffix='.wav')
        # Only the path is needed; the recorder reopens the file itself.
        os.close(fd)

    try:
        print('开始录音,按 Enter 停止录音')
        record_until_enter(out_path, samplerate=args.samplerate, channels=args.channels)

        response = send_wav_to_youdao(out_path)

        # Print the raw JSON if the body decodes, otherwise the raw bytes.
        try:
            j = response.json()
        except ValueError:
            print('Raw response content:')
            print(response.content)
            j = None
        else:
            print('Raw JSON response:')
            print(json.dumps(j, ensure_ascii=False, indent=2))

        if isinstance(j, dict) and str(j.get('errorCode', '0')) != '0':
            print(f"Recognition failed, errorCode={j['errorCode']}")

        concatenated = extract_text_from_response(j)

        if args.prompt:
            # Show the prompt for clarity.
            print('Prompt:')
            print(args.prompt)
            if concatenated:
                combined = args.prompt.rstrip() + ' ' + concatenated
            else:
                combined = args.prompt
            print('Concatenated result (with prompt):')
            print(combined)
        else:
            print('Concatenated result:')
            print(concatenated)
    finally:
        # Remove the recording only when it is our own temporary file.
        if not args.out and os.path.exists(out_path):
            os.remove(out_path)


if __name__ == '__main__':
    main()

# Sample response (one key was cut off in the captured output and is shown
# as "..."):
# {'result': ['你好。你。'],
#  'requestId': '18112060-0fed-42c6-abfb-5143a5df810f',
#  'errorCode': '0',
#  'complexResult': [{'sentence': '你好。你。', 'encoutput': '', 'vad_id': 1,
#                     'word_timestamps': [0, 480, 600], ...,
#                     'word_timestamps_eds': [60, 600, 660],
#                     'encoutput_shape': '', 'vcoutput_shape': '',
#                     'words': ['你', '好。', '你。'], 'partial': False}]}
#
# Youdao's stock demo script, Demo.py:
# -*- coding: utf-8 -*-
"""Youdao short-speech recognition demo.

Reads a local WAV file, signs the request (``signType`` "v2") and posts it
to ``YOUDAO_URL``.
"""
import sys
import uuid
import wave
import base64
import hashlib
import os
import time

# The original ``from imp import reload`` / ``reload(sys)`` lines were dropped:
# they are a Python 2 leftover, and the ``imp`` module no longer exists on
# Python 3.12+, so importing this file failed there.

YOUDAO_URL = 'https://openapi.youdao.com/asrapi'
# Credentials may be supplied through the environment; the placeholders are
# kept as fallbacks so editing the file in place still works.
APP_KEY = os.environ.get('YOUDAO_APP_KEY', 'your key')
APP_SECRET = os.environ.get('YOUDAO_APP_SECRET', 'your secret')


def truncate(q):
    """Shorten ``q`` for signing.

    Returns ``None`` for ``None``, ``q`` unchanged when it has at most 20
    characters, otherwise its first 10 characters + its length + its last 10
    characters.
    """
    if q is None:
        return None
    size = len(q)
    if size <= 20:
        return q
    return q[:10] + str(size) + q[size - 10:]


def encrypt(signStr):
    """Return the hexadecimal SHA-256 digest of ``signStr`` (UTF-8 encoded)."""
    return hashlib.sha256(signStr.encode('utf-8')).hexdigest()


def do_request(data, timeout=30):
    """POST ``data`` as a URL-encoded form to ``YOUDAO_URL``.

    Args:
        data: Mapping of form fields.
        timeout: Seconds to wait for the server; without a timeout a stalled
            connection would block forever.

    Returns:
        The ``requests`` response object.
    """
    # Imported lazily so the signing helpers work without the HTTP dependency.
    import requests

    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    return requests.post(YOUDAO_URL, data=data, headers=headers, timeout=timeout)


def connect(audio_file_path='音频的路径', lang_type='zh-CHS'):
    """Send a local WAV file to Youdao ASR and return the response.

    Args:
        audio_file_path: Path to a ``.wav`` file (extension checked
            case-insensitively).
        lang_type: Language code (default ``'zh-CHS'``).

    Returns:
        The HTTP response; its body is also printed (decoded JSON when
        possible, raw bytes otherwise).

    Raises:
        ValueError: If the path does not have a ``wav`` extension.
    """
    extension = os.path.splitext(audio_file_path)[1][1:]
    if extension.lower() != 'wav':
        raise ValueError('不支持的音频类型: %s' % extension)

    # The context manager closes the handle even if a getter raises.
    with wave.open(audio_file_path, 'rb') as wav_info:
        sample_rate = wav_info.getframerate()
        nchannels = wav_info.getnchannels()

    with open(audio_file_path, 'rb') as file_wav:
        q = base64.b64encode(file_wav.read()).decode('utf-8')

    curtime = str(int(time.time()))
    salt = str(uuid.uuid1())
    # Signature input: key + truncated payload + salt + timestamp + secret.
    sign = encrypt(APP_KEY + truncate(q) + salt + curtime + APP_SECRET)

    data = {
        'curtime': curtime,
        'appKey': APP_KEY,
        'q': q,
        'salt': salt,
        'sign': sign,
        'signType': "v2",
        'langType': lang_type,
        'rate': sample_rate,
        'format': 'wav',
        'channel': nchannels,
        'type': 1,  # NOTE(review): meaning not shown here; confirm against the API docs
    }

    response = do_request(data)
    try:
        print(response.json())
    except ValueError:
        # Body was not JSON; show it raw.
        print(response.content)
    return response


if __name__ == '__main__':
    # Optional first CLI argument: path of the WAV file to recognize.
    connect(*sys.argv[1:2])

# pip list (third-party packages used by the two scripts):
#   sounddevice soundfile requests