
As a practitioner and explorer of large models, I recently took on a challenging task: building a high-accuracy speech-to-text service for a client. After evaluating several technology options, we settled on OpenAI's Whisper-large-v3 model as the core engine. This was not a simple model-invocation project; it meant building a complete, reliable, and scalable production-grade API service from scratch.
During development we ran into plenty of challenges: how do you load a model with 1.5 billion parameters efficiently? How do you design an API that balances usability and performance? How do you handle audio input in all sorts of formats? And above all, how do you keep the service stable and maintainable under heavy load?
This article walks through how we built the speech recognition service with FastAPI, PyTorch, and the modern Python ecosystem. Whether you are an engineer evaluating speech recognition solutions or a backend developer interested in serving deep learning models, this hands-on case study should offer useful reference points.

The project is a speech recognition API service built on OpenAI's Whisper-large-v3 model. It uses the FastAPI framework to provide efficient, scalable speech-to-text, wraps the underlying deep learning model, supports multilingual recognition and translation, and exposes a simple RESTful API. The project emphasizes engineering practice, with complete error handling, logging, and resource management, making it suitable for production deployment.
import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
import base64
import tempfile
from typing import Optional, Dict, Any
import logging
import librosa
from modelscope import snapshot_download
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Create the FastAPI application
app = FastAPI(
    title="Whisper Speech Recognition API",
    description="Speech-to-text service based on Whisper-large-v3, with support for base64 audio input",
    version="1.0.0"
)

# Configure CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # restrict to specific origins in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
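A note on the wildcard above: browsers apply stricter rules to wildcard origins when requests carry credentials, so a production deployment should list its frontend origins explicitly. A minimal sketch, with a placeholder domain:

app.add_middleware(
    CORSMiddleware,
    allow_origins=["https://app.example.com"],  # placeholder: your real frontend origin(s)
    allow_credentials=True,
    allow_methods=["GET", "POST"],
    allow_headers=["*"],
)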
class WhisperASR:
    def __init__(self, model_path="iic/Whisper-large-v3"):
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        logger.info(f"Loading model: {model_path}")
        logger.info(f"Using device: {self.device}")
        # Download the model from ModelScope into the local cache on the first run;
        # subsequent runs resolve to the cached copy.
        cache_dir = "D:\\modelscope\\hub"
        local_model_path = snapshot_download(model_path, cache_dir=cache_dir)
        try:
            # Load the model and processor
            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
                local_model_path,
                torch_dtype=self.torch_dtype,
                low_cpu_mem_usage=True,
                use_safetensors=True
            )
            self.model.to(self.device)
            self.processor = AutoProcessor.from_pretrained(local_model_path)
            # Create the ASR pipeline
            self.pipe = pipeline(
                "automatic-speech-recognition",
                model=self.model,
                tokenizer=self.processor.tokenizer,
                feature_extractor=self.processor.feature_extractor,
                torch_dtype=self.torch_dtype,
                device=self.device,
            )
            logger.info("Model loaded")
        except Exception as e:
            logger.error(f"Model loading failed: {str(e)}")
            raise
    def transcribe_audio_file(self, audio_path: str, language: Optional[str] = None, task: str = "transcribe") -> Dict[str, Any]:
        """
        Transcribe speech from an audio file.
        Args:
            audio_path: path to the audio file
            language: language code
            task: "transcribe" or "translate"
        """
        try:
            # Configure generation parameters
            generate_kwargs = {"task": task}
            if language:
                generate_kwargs["language"] = language
            # Load the audio with librosa, resampled to Whisper's expected 16 kHz
            audio_array, sr = librosa.load(audio_path, sr=16000)
            # Run speech recognition
            result = self.pipe(
                audio_array,
                chunk_length_s=30,
                batch_size=8,
                generate_kwargs=generate_kwargs,
                return_timestamps=True
            )
            return {
                "success": True,
                "text": result["text"],
                "chunks": result.get("chunks", []),
                "language": result.get("language", "unknown")
            }
        except Exception as e:
            logger.error(f"Transcription failed: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }
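Before wiring the class into any endpoint, it is worth exercising it directly. A minimal sketch, assuming a local test file named sample.wav (the filename is a placeholder):

asr = WhisperASR()
result = asr.transcribe_audio_file("sample.wav", language="zh")
if result["success"]:
    print(result["text"])
    # return_timestamps=True yields per-segment timing information
    for chunk in result["chunks"]:
        print(chunk["timestamp"], chunk["text"])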
    def transcribe_audio_bytes(self, audio_bytes: bytes, language: Optional[str] = None, task: str = "transcribe") -> Dict[str, Any]:
        """
        Transcribe speech directly from raw bytes.
        """
        try:
            # Write the bytes to a temporary file
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
                temp_file.write(audio_bytes)
                temp_path = temp_file.name
            # Transcribe
            result = self.transcribe_audio_file(temp_path, language, task)
            # Clean up the temporary file
            try:
                os.unlink(temp_path)
            except OSError:
                pass
            return result
        except Exception as e:
            logger.error(f"Transcription from bytes failed: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }
# Initialize the model at import time
try:
    asr_model = WhisperASR()
    logger.info("ASR model initialized")
except Exception as e:
    logger.error(f"ASR model initialization failed: {e}")
    asr_model = None

# Supported languages
SUPPORTED_LANGUAGES = {
    "zh": "Chinese",
    "en": "English",
    "ja": "Japanese",
    "ko": "Korean",
    "fr": "French",
    "de": "German",
    "es": "Spanish",
    "ru": "Russian",
    "ar": "Arabic",
    "pt": "Portuguese",
    "it": "Italian"
}
def cleanup_temp_file(file_path: str):
    """Remove a temporary file."""
    try:
        if os.path.exists(file_path):
            os.unlink(file_path)
            logger.info(f"Removed temporary file: {file_path}")
    except Exception as e:
        logger.warning(f"Failed to remove temporary file: {e}")

def base64_to_audio_bytes(base64_string: str) -> bytes:
    """
    Decode a base64 string into raw audio bytes.
    Args:
        base64_string: base64-encoded audio data (may carry a data:audio/...;base64, prefix)
    """
    try:
        # Strip a possible data-URI prefix
        if base64_string.startswith('data:'):
            base64_string = base64_string.split(',', 1)[1]
        # Decode the base64 payload
        audio_bytes = base64.b64decode(base64_string)
        return audio_bytes
    except Exception as e:
        logger.error(f"Base64 decoding failed: {str(e)}")
        raise HTTPException(status_code=400, detail=f"Base64 decoding failed: {str(e)}")
@app.get("/")
async def root():
"""根端点"""
return {
"message": "Whisper 语音识别服务",
"version": "1.0.0",
"model": "Whisper-large-v3",
"status": "running"
}
@app.get("/health")
async def health_check():
"""健康检查端点"""
if asr_model is None:
raise HTTPException(status_code=500, detail="模型未加载")
return {
"status": "healthy",
"device": asr_model.device,
"model_loaded": True,
"supported_languages": len(SUPPORTED_LANGUAGES)
}
@app.get("/languages")
async def get_supported_languages():
"""获取支持的语言列表"""
return SUPPORTED_LANGUAGES详细解析:
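These three read-only endpoints are easy to smoke-test from a Python shell. A minimal client sketch, assuming the service is running locally on the default port used below (requests is an extra dependency):

import requests

BASE_URL = "http://localhost:8000"  # adjust to your deployment

print(requests.get(f"{BASE_URL}/health").json())
# e.g. {"status": "healthy", "device": "cuda:0", "model_loaded": True, "supported_languages": 11}
print(requests.get(f"{BASE_URL}/languages").json())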
@app.post("/transcribe/file")
async def transcribe_from_file(
background_tasks: BackgroundTasks,
file: UploadFile = File(...),
language: Optional[str] = Form(None),
task: str = Form("transcribe")
):
"""
通过文件上传进行语音转文字
Args:
file: 音频文件 (支持 wav, mp3, m4a, flac, ogg 等格式)
language: 语言代码 (可选)
task: 任务类型 - "transcribe"(转录) 或 "translate"(翻译)
"""
if asr_model is None:
raise HTTPException(status_code=500, detail="模型未加载")
# 验证文件类型
allowed_extensions = {'.wav', '.mp3', '.m4a', '.flac', '.ogg', '.aac'}
file_extension = os.path.splitext(file.filename)[1].lower()
if file_extension not in allowed_extensions:
raise HTTPException(
status_code=400,
detail=f"不支持的文件类型。支持的类型: {', '.join(allowed_extensions)}"
)
# 验证语言参数
if language and language not in SUPPORTED_LANGUAGES:
raise HTTPException(
status_code=400,
detail=f"不支持的语言: {language}"
)
# 验证任务参数
if task not in ["transcribe", "translate"]:
raise HTTPException(
status_code=400,
detail="任务参数必须是 'transcribe' 或 'translate'"
)
try:
# 创建临时文件
with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
content = await file.read()
temp_file.write(content)
temp_path = temp_file.name
# 添加清理任务
background_tasks.add_task(cleanup_temp_file, temp_path)
logger.info(f"开始处理文件: {file.filename}, 语言: {language}, 任务: {task}")
# 执行语音识别
result = asr_model.transcribe_audio_file(temp_path, language=language, task=task)
if result["success"]:
return result
else:
raise HTTPException(status_code=500, detail=result.get("error", "转录失败"))
except Exception as e:
logger.error(f"处理文件时出错: {str(e)}")
raise HTTPException(status_code=500, detail=f"处理失败: {str(e)}")详细解析:
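Calling this endpoint from Python looks like the following minimal sketch; meeting.wav is a placeholder filename and requests is assumed to be installed:

import requests

BASE_URL = "http://localhost:8000"

with open("meeting.wav", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/transcribe/file",
        files={"file": ("meeting.wav", f, "audio/wav")},
        data={"language": "zh", "task": "transcribe"},
    )
resp.raise_for_status()
print(resp.json()["text"])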
@app.post("/transcribe/base64")
async def transcribe_from_base64(
request_data: dict,
language: Optional[str] = None,
task: str = "transcribe"
):
"""
通过 base64 编码的音频数据进行语音转文字
Args (在请求体中):
audio_data: base64 编码的音频数据
language: 语言代码 (可选)
task: 任务类型 (可选)
"""
if asr_model is None:
raise HTTPException(status_code=500, detail="模型未加载")
# 从请求数据中获取参数
audio_data = request_data.get("audio_data")
language = request_data.get("language", language)
task = request_data.get("task", task)
if not audio_data:
raise HTTPException(status_code=400, detail="缺少 audio_data 参数")
# 验证语言参数
if language and language not in SUPPORTED_LANGUAGES:
raise HTTPException(
status_code=400,
detail=f"不支持的语言: {language}"
)
# 验证任务参数
if task not in ["transcribe", "translate"]:
raise HTTPException(
status_code=400,
detail="任务参数必须是 'transcribe' 或 'translate'"
)
try:
logger.info(f"开始处理 base64 音频数据, 语言: {language}, 任务: {task}")
# 解码 base64
audio_bytes = base64_to_audio_bytes(audio_data)
# 执行语音识别
result = asr_model.transcribe_audio_bytes(audio_bytes, language=language, task=task)
if result["success"]:
return result
else:
raise HTTPException(status_code=500, detail=result.get("error", "转录失败"))
except Exception as e:
logger.error(f"处理 base64 音频时出错: {str(e)}")
raise HTTPException(status_code=500, detail=f"处理失败: {str(e)}")详细解析:
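The base64 route takes a JSON body instead of multipart form data, which is convenient when the audio already lives in memory. A minimal sketch, reusing the same placeholder filename:

import base64
import requests

BASE_URL = "http://localhost:8000"

with open("meeting.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    f"{BASE_URL}/transcribe/base64",
    json={"audio_data": audio_b64, "language": "zh", "task": "transcribe"},
)
print(resp.json())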
if __name__ == "__main__":
    import uvicorn
    # Start the service
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=8000,
        log_level="info"
    )
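One structural caveat: the module above loads the model at import time, so anything that imports it (tests, workers) pays the full load cost. A hedged alternative sketch using FastAPI's lifespan hook (available in reasonably recent FastAPI versions), which defers loading until the application actually starts:

from contextlib import asynccontextmanager
from fastapi import FastAPI

asr_model = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Load the model once, when the server starts
    global asr_model
    asr_model = WhisperASR()
    yield
    # Release the reference on shutdown
    asr_model = None

app = FastAPI(lifespan=lifespan)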
This Whisper speech recognition API project demonstrates the complete architecture of a modern deep learning service:
On the first run, the model is downloaded up front before initialization:

Local model path and file structure:
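For reference, after snapshot_download completes, the ModelScope cache typically looks roughly like the sketch below (illustrative only; the exact layout depends on the modelscope version and the configured cache_dir):

D:\modelscope\hub\
    iic\
        Whisper-large-v3\
            config.json
            generation_config.json
            preprocessor_config.json
            tokenizer.json
            model.safetensors (possibly sharded into multiple .safetensors files)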

Click [Start Recording] to begin listening to the conversation and recording:

Click [Stop Recording] first, then [Start Recognition] to extract the text from the audio:

When designing the API, we followed a consistent set of engineering principles and performance optimization strategies, and on top of the current architecture we plan to extend the service in several directions.
This Whisper speech recognition API project builds out a fully functional speech-to-text service platform. From an architectural standpoint, it shows several clear strengths: first, it takes full advantage of Whisper-large-v3's capabilities, including recognition across 99 languages, strong noise robustness, and accurate timestamp alignment; second, through device detection and memory management, the system adapts its runtime strategy to different hardware, supporting GPU-accelerated inference while still running stably in CPU-only environments.
On the engineering side, the project uses FastAPI, a modern Python web framework with efficient asynchronous request handling, which helps the service sustain high concurrency. The API design follows RESTful principles and supports both file upload and Base64-encoded input, covering different integration scenarios. Comprehensive error handling and logging provide a solid foundation for stable operation and troubleshooting.
Next steps include chunked storage of audio files and real-time transcription. Whisper-large-v3 itself places no hard limit on input audio length, but in practice processing is constrained by memory and compute, so the audio should be split at sensible time points based on the available memory. Further out, we plan to combine the transcripts with other large models for summary extraction and content consolidation, ultimately producing conclusions and actionable recommendations.
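As a starting point for that chunked processing, here is a minimal sketch of fixed-window splitting (the 30-second window is an assumption to be tuned against available memory):

import librosa

def split_audio(path: str, chunk_seconds: int = 30, sr: int = 16000):
    """Yield fixed-length segments of an audio file for incremental transcription."""
    audio, _ = librosa.load(path, sr=sr)
    samples_per_chunk = chunk_seconds * sr
    for start in range(0, len(audio), samples_per_chunk):
        yield audio[start:start + samples_per_chunk]

# Each segment can be passed to asr_model.pipe(...) as it becomes available,
# and the partial transcripts accumulated for downstream summarization.

Finally, the complete single-page frontend used in the screenshots above: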
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Speech-to-Text Tool</title>
<style>
:root {
--primary-color: #4361ee;
--secondary-color: #3a0ca3;
--success-color: #4cc9f0;
--danger-color: #f72585;
--light-color: #f8f9fa;
--dark-color: #212529;
--border-radius: 8px;
--box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
* {
margin: 0;
padding: 0;
box-sizing: border-box;
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
body {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: var(--dark-color);
min-height: 100vh;
padding: 20px;
}
.container {
max-width: 800px;
margin: 0 auto;
background-color: white;
border-radius: var(--border-radius);
box-shadow: var(--box-shadow);
overflow: hidden;
}
header {
background: var(--primary-color);
color: white;
padding: 20px;
text-align: center;
}
h1 {
font-size: 2rem;
margin-bottom: 5px;
}
.subtitle {
opacity: 0.9;
font-weight: 300;
}
.content {
padding: 20px;
}
.card {
background: var(--light-color);
border-radius: var(--border-radius);
padding: 20px;
margin-bottom: 20px;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
}
.card-title {
font-size: 1.2rem;
margin-bottom: 15px;
color: var(--secondary-color);
display: flex;
align-items: center;
}
.card-title i {
margin-right: 10px;
}
.controls {
display: flex;
flex-wrap: wrap;
gap: 10px;
margin-bottom: 20px;
}
button {
padding: 10px 20px;
border: none;
border-radius: var(--border-radius);
cursor: pointer;
font-weight: 600;
transition: all 0.3s ease;
display: flex;
align-items: center;
justify-content: center;
}
button i {
margin-right: 8px;
}
.btn-primary {
background-color: var(--primary-color);
color: white;
}
.btn-primary:hover {
background-color: var(--secondary-color);
}
.btn-success {
background-color: var(--success-color);
color: white;
}
.btn-success:hover {
background-color: #3aa8d0;
}
.btn-danger {
background-color: var(--danger-color);
color: white;
}
.btn-danger:hover {
background-color: #d61a6e;
}
.btn:disabled {
background-color: #cccccc;
cursor: not-allowed;
}
select, input {
padding: 10px;
border: 1px solid #ddd;
border-radius: var(--border-radius);
font-size: 1rem;
}
.visualizer {
width: 100%;
height: 100px;
background-color: #f0f0f0;
border-radius: var(--border-radius);
margin: 20px 0;
overflow: hidden;
position: relative;
}
.waveform {
width: 100%;
height: 100%;
display: flex;
align-items: center;
justify-content: center;
}
.recording-indicator {
display: flex;
align-items: center;
margin-bottom: 15px;
opacity: 0;
transition: opacity 0.3s;
}
.recording-indicator.active {
opacity: 1;
}
.pulse {
width: 12px;
height: 12px;
background-color: var(--danger-color);
border-radius: 50%;
margin-right: 10px;
animation: pulse 1.5s infinite;
}
@keyframes pulse {
0% {
transform: scale(0.95);
box-shadow: 0 0 0 0 rgba(247, 37, 133, 0.7);
}
70% {
transform: scale(1);
box-shadow: 0 0 0 10px rgba(247, 37, 133, 0);
}
100% {
transform: scale(0.95);
box-shadow: 0 0 0 0 rgba(247, 37, 133, 0);
}
}
.timer {
font-size: 1.2rem;
font-weight: bold;
margin-left: 10px;
}
.result-area {
min-height: 150px;
border: 1px solid #ddd;
border-radius: var(--border-radius);
padding: 15px;
margin-top: 20px;
background-color: white;
white-space: pre-wrap;
overflow-y: auto;
max-height: 300px;
}
.status {
padding: 10px;
border-radius: var(--border-radius);
margin: 10px 0;
display: none;
}
.status.success {
background-color: #d4edda;
color: #155724;
border: 1px solid #c3e6cb;
display: block;
}
.status.error {
background-color: #f8d7da;
color: #721c24;
border: 1px solid #f5c6cb;
display: block;
}
.status.info {
background-color: #d1ecf1;
color: #0c5460;
border: 1px solid #bee5eb;
display: block;
}
.audio-player {
width: 100%;
margin: 15px 0;
}
.history-item {
padding: 10px;
border-bottom: 1px solid #eee;
cursor: pointer;
}
.history-item:hover {
background-color: #f5f5f5;
}
.history-item:last-child {
border-bottom: none;
}
.history-text {
font-size: 0.9rem;
color: #666;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.history-time {
font-size: 0.8rem;
color: #999;
}
.flex {
display: flex;
align-items: center;
}
.flex-between {
display: flex;
justify-content: space-between;
align-items: center;
}
.mt-10 {
margin-top: 10px;
}
.mb-10 {
margin-bottom: 10px;
}
.text-center {
text-align: center;
}
.hidden {
display: none;
}
@media (max-width: 600px) {
.controls {
flex-direction: column;
}
button, select, input {
width: 100%;
}
}
</style>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
</head>
<body>
<div class="container">
<header>
<h1><i class="fas fa-microphone-alt"></i> Speech-to-Text Tool</h1>
<p class="subtitle">Real-time speech recognition powered by the Whisper large model</p>
</header>
<div class="content">
<!-- Server settings -->
<div class="card">
<h2 class="card-title"><i class="fas fa-server"></i> Server Settings</h2>
<div class="controls">
<input type="text" id="apiUrl" placeholder="Backend API address" value="http://localhost:8000" style="flex: 1;">
<button id="testConnection" class="btn-primary"><i class="fas fa-plug"></i> Test Connection</button>
</div>
</div>
<!-- Recording controls -->
<div class="card">
<h2 class="card-title"><i class="fas fa-record-vinyl"></i> Recording Controls</h2>
<div class="recording-indicator" id="recordingIndicator">
<div class="pulse"></div>
<span>Recording</span>
<div class="timer" id="timer">00:00</div>
</div>
<div class="visualizer">
<canvas id="waveform" class="waveform"></canvas>
</div>
<div class="controls">
<button id="startRecord" class="btn-primary"><i class="fas fa-microphone"></i> 开始录音</button>
<button id="stopRecord" class="btn-danger" disabled><i class="fas fa-stop"></i> 停止录音</button>
<button id="playRecord" class="btn-success" disabled><i class="fas fa-play"></i> 播放录音</button>
</div>
<div class="audio-player">
<audio id="audioPlayback" controls style="width: 100%;" class="hidden"></audio>
</div>
</div>
<!-- Recognition settings -->
<div class="card">
<h2 class="card-title"><i class="fas fa-cogs"></i> Recognition Settings</h2>
<div class="controls">
<select id="language">
<option value="">自动检测语言</option>
<option value="zh">中文</option>
<option value="en">英语</option>
<option value="ja">日语</option>
<option value="ko">韩语</option>
<option value="fr">法语</option>
<option value="de">德语</option>
<option value="es">西班牙语</option>
</select>
<select id="task">
<option value="transcribe">转录</option>
<option value="translate">翻译</option>
</select>
<button id="transcribe" class="btn-primary" disabled><i class="fas fa-language"></i> 开始识别</button>
</div>
</div>
<!-- Status messages -->
<div id="status" class="status"></div>
<!-- Recognition result -->
<div class="card">
<h2 class="card-title"><i class="fas fa-file-alt"></i> Recognition Result</h2>
<div class="flex-between mb-10">
<span>Recognized text:</span>
<button id="copyText" class="btn-primary" disabled><i class="fas fa-copy"></i> Copy Text</button>
</div>
<div id="result" class="result-area">Record and run recognition; the result will appear here...</div>
<div class="flex-between mt-10">
<span>Recognition details:</span>
<button id="toggleDetails" class="btn-primary"><i class="fas fa-chevron-down"></i> Show Details</button>
</div>
<div id="details" class="result-area hidden"></div>
</div>
<!-- History -->
<div class="card">
<h2 class="card-title"><i class="fas fa-history"></i> History</h2>
<div id="historyList" class="result-area">
<div class="text-center" style="color: #999; padding: 20px;">
No history yet
</div>
</div>
</div>
</div>
</div>
<script>
// Global state
let mediaRecorder = null;
let audioChunks = [];
let audioBlob = null;
let audioUrl = null;
let recordingTimer = null;
let recordingStartTime = null;
let audioContext = null;
let analyser = null;
let canvasContext = null;
let isRecording = false;
// DOM elements
const startRecordBtn = document.getElementById('startRecord');
const stopRecordBtn = document.getElementById('stopRecord');
const playRecordBtn = document.getElementById('playRecord');
const transcribeBtn = document.getElementById('transcribe');
const testConnectionBtn = document.getElementById('testConnection');
const copyTextBtn = document.getElementById('copyText');
const toggleDetailsBtn = document.getElementById('toggleDetails');
const recordingIndicator = document.getElementById('recordingIndicator');
const timerElement = document.getElementById('timer');
const audioPlayback = document.getElementById('audioPlayback');
const waveformCanvas = document.getElementById('waveform');
const resultElement = document.getElementById('result');
const detailsElement = document.getElementById('details');
const statusElement = document.getElementById('status');
const historyList = document.getElementById('historyList');
const apiUrlInput = document.getElementById('apiUrl');
const languageSelect = document.getElementById('language');
const taskSelect = document.getElementById('task');
// Initialization
function init() {
// Set up the canvas context
canvasContext = waveformCanvas.getContext('2d');
waveformCanvas.width = waveformCanvas.offsetWidth;
waveformCanvas.height = waveformCanvas.offsetHeight;
// Load history
loadHistory();
// Check browser compatibility
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
showStatus('Your browser does not support recording. Please use a modern browser such as Chrome, Firefox, or Edge.', 'error');
startRecordBtn.disabled = true;
}
}
// Show a status message
function showStatus(message, type = 'info') {
statusElement.textContent = message;
statusElement.className = `status ${type}`;
// Auto-hide info messages after 5 seconds
if (type === 'info') {
setTimeout(() => {
statusElement.style.display = 'none';
}, 5000);
}
}
// Test the server connection
async function testConnection() {
const apiUrl = apiUrlInput.value;
if (!apiUrl) {
showStatus('Please enter the API address', 'error');
return;
}
try {
showStatus('Testing connection...', 'info');
const response = await fetch(`${apiUrl}/health`);
if (response.ok) {
const data = await response.json();
showStatus(`Connected! Device: ${data.device}, model: ${data.model_loaded ? 'loaded' : 'not loaded'}`, 'success');
} else {
showStatus('Connection failed. Please check the server address and status.', 'error');
}
} catch (error) {
showStatus(`Connection failed: ${error.message}`, 'error');
}
}
// Start recording
async function startRecording() {
try {
// Request microphone access
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
// Set up the audio context and spectrum analyser
audioContext = new (window.AudioContext || window.webkitAudioContext)();
analyser = audioContext.createAnalyser();
const source = audioContext.createMediaStreamSource(stream);
source.connect(analyser);
analyser.fftSize = 256;
// Create the MediaRecorder instance
mediaRecorder = new MediaRecorder(stream);
audioChunks = [];
// Collect recorded data as it becomes available
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 0) {
audioChunks.push(event.data);
}
};
// Handle recording stop
mediaRecorder.onstop = () => {
// Build the audio Blob and URL
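// Note: MediaRecorder typically produces webm/ogg (Opus) rather than true WAV;
// the 'audio/wav' label below is only a container hint, and the backend relies
// on librosa/ffmpeg being able to decode whatever format the browser emits.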
audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
audioUrl = URL.createObjectURL(audioBlob);
audioPlayback.src = audioUrl;
// Update the UI
playRecordBtn.disabled = false;
transcribeBtn.disabled = false;
audioPlayback.classList.remove('hidden');
// Stop the visualization
stopVisualization();
// Release the audio stream
stream.getTracks().forEach(track => track.stop());
};
// Start recording
mediaRecorder.start();
isRecording = true;
// Update the UI
startRecordBtn.disabled = true;
stopRecordBtn.disabled = false;
recordingIndicator.classList.add('active');
// Start the timer
recordingStartTime = Date.now();
updateTimer();
recordingTimer = setInterval(updateTimer, 1000);
// Start the visualization
startVisualization();
showStatus('Recording started...', 'info');
} catch (error) {
console.error('Recording error:', error);
showStatus(`Recording failed: ${error.message}`, 'error');
}
}
// Stop recording
function stopRecording() {
if (mediaRecorder && isRecording) {
mediaRecorder.stop();
isRecording = false;
// Update the UI
startRecordBtn.disabled = false;
stopRecordBtn.disabled = true;
recordingIndicator.classList.remove('active');
// Stop the timer
clearInterval(recordingTimer);
timerElement.textContent = '00:00';
showStatus('Recording stopped', 'success');
}
}
// Play the recording
function playRecording() {
if (audioUrl) {
audioPlayback.play();
}
}
// Update the timer
function updateTimer() {
const elapsedTime = Math.floor((Date.now() - recordingStartTime) / 1000);
const minutes = Math.floor(elapsedTime / 60).toString().padStart(2, '0');
const seconds = (elapsedTime % 60).toString().padStart(2, '0');
timerElement.textContent = `${minutes}:${seconds}`;
}
// Start the audio visualization
function startVisualization() {
if (!analyser) return;
const bufferLength = analyser.frequencyBinCount;
const dataArray = new Uint8Array(bufferLength);
function draw() {
if (!isRecording) return;
requestAnimationFrame(draw);
analyser.getByteFrequencyData(dataArray);
canvasContext.fillStyle = 'rgb(240, 240, 240)';
canvasContext.fillRect(0, 0, waveformCanvas.width, waveformCanvas.height);
const barWidth = (waveformCanvas.width / bufferLength) * 2.5;
let barHeight;
let x = 0;
for (let i = 0; i < bufferLength; i++) {
barHeight = dataArray[i] / 2;
canvasContext.fillStyle = `rgb(${barHeight + 100}, 50, 150)`;
canvasContext.fillRect(x, waveformCanvas.height - barHeight, barWidth, barHeight);
x += barWidth + 1;
}
}
draw();
}
// Stop the audio visualization
function stopVisualization() {
canvasContext.clearRect(0, 0, waveformCanvas.width, waveformCanvas.height);
}
// Speech-to-text
async function transcribeAudio() {
if (!audioBlob) {
showStatus('Please record some audio first', 'error');
return;
}
const apiUrl = apiUrlInput.value;
const language = languageSelect.value;
const task = taskSelect.value;
if (!apiUrl) {
showStatus('Please set the API address', 'error');
return;
}
try {
showStatus('Converting speech to text...', 'info');
transcribeBtn.disabled = true;
// Build the FormData
const formData = new FormData();
formData.append('file', audioBlob, 'recording.wav');
if (language) formData.append('language', language);
formData.append('task', task);
// Send the request
const response = await fetch(`${apiUrl}/transcribe/file`, {
method: 'POST',
body: formData
});
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
const data = await response.json();
if (data.success) {
resultElement.textContent = data.text;
detailsElement.textContent = JSON.stringify(data, null, 2);
copyTextBtn.disabled = false;
// Save to history
saveToHistory(data.text, data.language);
showStatus(`Recognition succeeded! Language: ${data.language}`, 'success');
} else {
throw new Error(data.error || 'Recognition failed');
}
} catch (error) {
console.error('Recognition error:', error);
showStatus(`Recognition failed: ${error.message}`, 'error');
} finally {
transcribeBtn.disabled = false;
}
}
// Copy the text
function copyText() {
const text = resultElement.textContent;
if (text && text !== 'Record and run recognition; the result will appear here...') {
navigator.clipboard.writeText(text)
.then(() => {
showStatus('Text copied to clipboard', 'success');
})
.catch(err => {
showStatus('Copy failed', 'error');
console.error('Copy error:', err);
});
}
}
// Toggle the details panel
function toggleDetails() {
const isHidden = detailsElement.classList.contains('hidden');
if (isHidden) {
detailsElement.classList.remove('hidden');
toggleDetailsBtn.innerHTML = '<i class="fas fa-chevron-up"></i> Hide Details';
} else {
detailsElement.classList.add('hidden');
toggleDetailsBtn.innerHTML = '<i class="fas fa-chevron-down"></i> Show Details';
}
}
// Save to history
function saveToHistory(text, language) {
const history = getHistory();
const historyItem = {
text: text,
language: language,
timestamp: new Date().toISOString(),
displayTime: new Date().toLocaleString()
};
history.unshift(historyItem);
// Keep only the 20 most recent entries
if (history.length > 20) {
history.pop();
}
localStorage.setItem('voiceToTextHistory', JSON.stringify(history));
loadHistory();
}
// Get history
function getHistory() {
const historyJson = localStorage.getItem('voiceToTextHistory');
return historyJson ? JSON.parse(historyJson) : [];
}
// Load history
function loadHistory() {
const history = getHistory();
if (history.length === 0) {
historyList.innerHTML = '<div class="text-center" style="color: #999; padding: 20px;">No history yet</div>';
return;
}
historyList.innerHTML = '';
history.forEach(item => {
const historyItem = document.createElement('div');
historyItem.className = 'history-item';
historyItem.innerHTML = `
<div class="flex-between">
<div class="history-time">${item.displayTime}</div>
<span style="font-size: 0.8rem; color: #666;">${item.language ? `语言: ${item.language}` : ''}</span>
</div>
<div class="history-text">${item.text}</div>
`;
historyItem.addEventListener('click', () => {
resultElement.textContent = item.text;
copyTextBtn.disabled = false;
});
historyList.appendChild(historyItem);
});
}
// Event listeners
startRecordBtn.addEventListener('click', startRecording);
stopRecordBtn.addEventListener('click', stopRecording);
playRecordBtn.addEventListener('click', playRecording);
transcribeBtn.addEventListener('click', transcribeAudio);
testConnectionBtn.addEventListener('click', testConnection);
copyTextBtn.addEventListener('click', copyText);
toggleDetailsBtn.addEventListener('click', toggleDetails);
// Resize the canvas when the window size changes
window.addEventListener('resize', () => {
waveformCanvas.width = waveformCanvas.offsetWidth;
waveformCanvas.height = waveformCanvas.offsetHeight;
});
// Initialize the app
init();
</script>
</body>
</html>

Original statement: this article is published on the Tencent Cloud Developer Community with the author's authorization and may not be reproduced without permission. For infringement concerns, please contact cloudcommunity@tencent.com for removal.