# Source: godot-psd-training/Communication/voice_communication.gd
# Commit: soul-walker e1cc2fb498 — "完善语音会话场景功能" (improve the voice-session
# scene; scene loading now obtains and applies a token parameter),
# 2024-05-20 10:28:59 +08:00.
# File stats at capture time: 260 lines, 7.4 KiB, GDScript.
extends Node
## Voice-communication controller: records the player's speech, detects phrase
## boundaries by volume, and submits the audio to a recognition service to
## verify it matches a target text.
## Emitted once the recognition service confirms the spoken audio matches
## targetText. (Name kept as-is for existing connections.)
signal speech_recognition_successed
## Recording effect (slot 0 of the "Record" bus); produces the audio takes.
var effect: AudioEffectRecord
## Capture effect (slot 1 of the "Record" bus); used to sample frames and
## estimate input volume (voice-activity detection).
var capture: AudioEffectCapture
## The text the spoken audio is expected to match.
var targetText: String
## Minimum per-channel amplitude treated as "voice present".
const VolumeMin = 0.05
## Seconds of continuous silence that ends a phrase / restarts the take.
const LongTimeNoVoice = 1
# Whether voice has been heard in the current take.
var hasVoice = false
# Accumulated seconds of silence since the last detected voice.
var novoiceTime = 0
## True while an HTTP recognition request is in flight.
var hasUnfinishedRequest = false
## Recordings queued because a request was already in flight.
var recordingQueue: Array = []
@onready var http_req = $HTTPRequest
# Recognition endpoint; %s is filled with the target text.
const URL = "http://192.168.33.233/rtss-server/api/voice/verify?text=%s"
var ConfigParams = preload("res://config_params.gd")
## Confirmation sound played when recognition succeeds.
var reply_correct = preload("res://Assets/training_speech/correct.mp3")
## Cache the record/capture effects from the "Record" audio bus.
func _ready():
	var idx = AudioServer.get_bus_index("Record")
	# get_bus_index() returns -1 when the bus does not exist; fail loudly
	# instead of crashing inside get_bus_effect() with an invalid index.
	if idx == -1:
		push_error("Audio bus \"Record\" not found; voice capture disabled.")
		return
	# Slot 0 must be configured as an AudioEffectRecord resource.
	effect = AudioServer.get_bus_effect(idx, 0)
	# Slot 1 is an AudioEffectCapture used to estimate input volume.
	capture = AudioServer.get_bus_effect(idx, 1)
## Begin capturing audio; a no-op when a recording is already in progress.
func startRecord():
	print("启动录音")
	if effect.is_recording_active():
		return
	effect.set_recording_active(true)
## Halt audio capture; a no-op when no recording is in progress.
func stopRecord():
	if not effect.is_recording_active():
		return
	effect.set_recording_active(false)
## Restart capture: deactivating drops the buffered take, reactivating
## immediately begins a fresh one.
func restartRecord():
	effect.set_recording_active(false)  # discard the current take
	effect.set_recording_active(true)   # start recording anew
## Play a reply audio stream to completion.
## Coroutine: callers may await it. Recording is stopped first so playback is
## not captured. Does nothing when reply is null.
func play_reply(reply):
	if reply == null:
		return
	stopRecord()
	assert(reply is AudioStream, "reply不是音频资源")
	# Force one-shot playback regardless of how the resource was imported.
	# The stream types are mutually exclusive, so an elif chain is equivalent
	# to the original independent checks.
	if reply is AudioStreamWAV:
		reply.loop_mode = AudioStreamWAV.LOOP_DISABLED
	elif reply is AudioStreamMP3 or reply is AudioStreamOggVorbis:
		reply.loop = false
	var player = $AudioStreamPlayer
	player.stream = reply
	player.play()
	await player.finished
## Record the user's voice and keep submitting audio for recognition until it
## matches `text`.
## Coroutine: await it if the caller needs to know when recognition finished.
func speech_record_check(text: String):
	assert(text != null and not text.is_empty(), "待识别的结果text不能为空")
	print("录音采样频率: ", AudioServer.get_mix_rate())
	targetText = text
	startRecord()
	# The timer drives _on_timer_timeout, which segments and submits the audio.
	$Timer.start()
	# Resumed when _on_http_request_request_completed emits the signal.
	await speech_recognition_successed
	print("识别成功,结束")
	_reset_record_state()
## Clear all recognition state and halt both the recorder and the poll timer.
## (All steps are independent, so their order does not matter.)
func _reset_record_state():
	$Timer.stop()
	stopRecord()
	novoiceTime = 0
	hasVoice = false
	targetText = ""
## Timer tick: sample the captured audio, track voice/silence, and when a
## spoken phrase is followed by enough silence, submit the take for
## recognition.
func _on_timer_timeout():
	if effect.is_recording_active():
		# Pull every frame buffered since the last tick; each frame is a
		# Vector2 of (left, right) amplitudes.
		var buf = capture.get_buffer(capture.get_frames_available())
		var soundDetected = false
		for vec in buf:
			if vec.x > VolumeMin or vec.y > VolumeMin:
				#print("Left channel volume = ", vec.x, ", Right volume = ", vec.y)
				soundDetected = true
		# Voice present this tick: remember it and reset the silence clock.
		if soundDetected:
			hasVoice = true
			novoiceTime = 0
		# Silence this tick: accumulate silence time.
		else:
			novoiceTime += $Timer.wait_time
			# Speech followed by a pause: the phrase is complete — submit it.
			if hasVoice and novoiceTime >= LongTimeNoVoice:
				var rcd = effect.get_recording()
				if rcd == null:
					return
				print("音频时长: ", rcd.get_length())
				# Restart so the next phrase starts from a clean take.
				restartRecord()
				#await play_reply(rcd)
				#startRecord()
				_request_speech_recognition(rcd)
				hasVoice = false
			# Prolonged silence with no speech: restart recording so the take
			# does not grow unboundedly (also resets the silence clock after
			# a submission above).
			if novoiceTime >= LongTimeNoVoice:
				print("长时间无声音,重启录音")
				restartRecord()
				novoiceTime = 0
## Submit a recording to the recognition service for matching against
## targetText. If a request is already in flight the recording is queued and
## picked up again from _on_http_request_request_completed.
func _request_speech_recognition(recording: AudioStreamWAV):
	if hasUnfinishedRequest:
		recordingQueue.append(recording)
		return
	hasUnfinishedRequest = true
	# uri_encode() keeps non-ASCII target text (e.g. Chinese) legal in the
	# URL query string.
	var url = URL % targetText.uri_encode()
	var headers = ["Content-Type: audio/wav"]
	headers.append("X-Token: %s" % ConfigParams.Token)
	var body = _build_wav(recording)
	var error = http_req.request_raw(url, headers, HTTPClient.METHOD_POST, body)
	if error != OK:
		# A failed dispatch produces no completion callback, so release the
		# in-flight latch here or no further request could ever be sent.
		hasUnfinishedRequest = false
		push_error("在HTTP请求语音识别时发生了一个错误。", error)
## HTTP callback: parse the recognition result and either finish (on a match)
## or retry with the most recent queued recording.
func _on_http_request_request_completed(result, response_code, headers, body):
	hasUnfinishedRequest = false
	var json = JSON.parse_string(body.get_string_from_utf8())
	print("语音识别结果: ", json)
	# JSON.parse_string() returns null for malformed input, and "data"/"match"
	# may be absent — guard before indexing instead of crashing.
	var data = json.get("data") if json is Dictionary else null
	# 验证成功,结束 (verification succeeded — finish)
	if data is Dictionary and data.get("match") == true:
		await play_reply(reply_correct)
		speech_recognition_successed.emit()
		# The target has been matched; queued takes for it are now stale.
		# Dropping them prevents re-submitting with a reset targetText.
		recordingQueue.clear()
		return
	# Not matched: retry with the most recent queued recording, if any.
	if recordingQueue.size() > 0:
		var next = recordingQueue.pop_back()
		_request_speech_recognition(next)
## Serialize a captured AudioStreamWAV into a complete RIFF/WAVE byte blob
## (44-byte canonical header + sample data) for the recognition endpoint.
## Only 8-bit and 16-bit PCM are emitted; IMA-ADPCM data is reported as an
## error and produces a header-only blob.
func _build_wav(recording: AudioStreamWAV) -> PackedByteArray:
	var data_bytes = recording.data.size()
	# Subchunk2Size = size of the sample data in bytes.
	var sub_chunk_2_size = data_bytes
	# Format code: 1 = integer PCM (8/16 bit), 3 = IEEE float.
	# NOTE(review): IMA-ADPCM is actually format 0x11, not 3, but the ADPCM
	# branch below never writes sample data anyway — confirm before relying
	# on ADPCM output.
	var format_code = 3 if (recording.format == AudioStreamWAV.FORMAT_IMA_ADPCM) else 1
	var n_channels = 2 if recording.stereo else 1
	#print("录音结果采样率: ", recording.mix_rate)
	# get_mix_rate() returns a float; the WAV header fields (and the bitwise
	# packing helpers below) require integers.
	var sample_rate = int(AudioServer.get_mix_rate())
	var byte_pr_sample = 0
	match recording.format:
		AudioStreamWAV.FORMAT_8_BITS:
			byte_pr_sample = 1
		AudioStreamWAV.FORMAT_16_BITS:
			byte_pr_sample = 2
		AudioStreamWAV.FORMAT_IMA_ADPCM:
			byte_pr_sample = 4
	var wav: PackedByteArray = []
	# --- RIFF/WAVE header ---
	store_string(wav, "RIFF")                                 # ChunkID
	store_32(wav, sub_chunk_2_size + 36)                      # ChunkSize = 36 + Subchunk2Size
	store_string(wav, "WAVE")                                 # Format
	store_string(wav, "fmt ")                                 # Subchunk1ID
	store_32(wav, 16)                                         # Subchunk1Size (PCM)
	store_16(wav, format_code)                                # AudioFormat
	store_16(wav, n_channels)                                 # NumChannels
	store_32(wav, sample_rate)                                # SampleRate
	store_32(wav, sample_rate * n_channels * byte_pr_sample)  # ByteRate
	store_16(wav, n_channels * byte_pr_sample)                # BlockAlign
	store_16(wav, byte_pr_sample * 8)                         # BitsPerSample
	store_string(wav, "data")                                 # Subchunk2ID
	store_32(wav, sub_chunk_2_size)                           # Subchunk2Size
	# --- sample data ---
	var stream_data = recording.get_data()
	match recording.format:
		AudioStreamWAV.FORMAT_8_BITS:
			for i in data_bytes:
				# Godot stores 8-bit audio signed; WAV 8-bit is unsigned.
				# Mask to one byte so inputs above 127 wrap instead of
				# overflowing PackedByteArray.append().
				var data_point = (stream_data[i] + 128) & 0xFF
				wav.append(data_point)
		AudioStreamWAV.FORMAT_16_BITS:
			for i in data_bytes / 2:
				# 16-bit samples are already little-endian; decode + store is
				# a byte-order-preserving round trip.
				var data_point = decode_uint16(stream_data[i * 2], stream_data[i * 2 + 1])
				store_16(wav, data_point)
		_:
			push_error("构建wav错误不支持的音频格式")
	return wav
## Append the UTF-8 bytes of s to buffer (no terminator, no length prefix).
func store_string(buffer: PackedByteArray, s: String):
	var encoded = s.to_utf8_buffer()
	buffer.append_array(encoded)
## Append a 16-bit integer to buffer, little-endian by default
## (big_endian = true swaps the byte order).
func store_16(buffer: PackedByteArray,p_dest: int, big_endian: bool = false):
	var low_byte = p_dest & 0xFF
	var high_byte = p_dest >> 8
	if big_endian:
		var swap = low_byte
		low_byte = high_byte
		high_byte = swap
	buffer.append(low_byte)
	buffer.append(high_byte)
## Append a 32-bit integer to buffer as two 16-bit halves, little-endian by
## default (big_endian = true reverses both half order and byte order).
func store_32(buffer: PackedByteArray, p_dest: int, big_endian: bool = false):
	var low_half = p_dest & 0xFFFF
	var high_half = p_dest >> 16
	if big_endian:
		var swap = low_half
		low_half = high_half
		high_half = swap
	store_16(buffer, low_half, big_endian)
	store_16(buffer, high_half, big_endian)
## Combine two bytes into an unsigned 16-bit value
## (v1 = low byte, v2 = high byte).
func decode_uint16(v1, v2) -> int:
	return v1 | (v2 << 8)