2024-04-16 10:10:18 +08:00
|
|
|
|
extends Node
|
|
|
|
|
|
2024-05-14 10:00:22 +08:00
|
|
|
|
## Emitted once the spoken audio has been matched against the target text.
signal speech_recognition_successed

## Recording effect sitting on the "Record" audio bus (slot 0); fetched in _ready().
var effect: AudioEffectRecord

## Capture effect on the "Record" bus (slot 1); its frame buffer is sampled
## to estimate input volume and decide whether the user is speaking.
var capture: AudioEffectCapture

## Text the recognition service should match the recording against.
var targetText: String

## Minimum per-sample amplitude treated as "voice present".
const VolumeMin = 0.03

## Seconds of continuous silence treated as "stopped talking".
const LongTimeNoVoice = 1

# True once any frame above VolumeMin has been seen since the last restart.
var hasVoice = false

# Accumulated seconds of silence (incremented by the Timer's wait_time).
var novoiceTime = 0

## Sound effect played back when speech recognition succeeds.
var reply_correct = preload("res://Assets/training_speech/correct.mp3")
|
2024-04-16 10:10:18 +08:00
|
|
|
|
|
|
|
|
|
func _ready():
	## Cache the effects configured on the "Record" audio bus.
	var record_bus = AudioServer.get_bus_index("Record")
	# Slot 0 holds the AudioEffectRecord resource set up in the project.
	effect = AudioServer.get_bus_effect(record_bus, 0)
	# Slot 1 holds the AudioEffectCapture used for input-volume detection.
	capture = AudioServer.get_bus_effect(record_bus, 1)
|
2024-04-16 10:10:18 +08:00
|
|
|
|
|
2024-05-08 16:56:02 +08:00
|
|
|
|
## Begin capturing audio, unless a recording is already in progress.
func startRecord():
	print("启动录音")
	if effect.is_recording_active():
		return
	effect.set_recording_active(true)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## Stop capturing audio if a recording is in progress.
func stopRecord():
	if not effect.is_recording_active():
		return
	effect.set_recording_active(false)
|
2024-05-08 16:56:02 +08:00
|
|
|
|
|
2024-05-14 10:00:22 +08:00
|
|
|
|
|
|
|
|
|
## Restart recording: toggling the effect off and back on discards the
## audio captured so far and begins a fresh take.
func restartRecord():
	for active in [false, true]:
		effect.set_recording_active(active)
|
2024-05-09 20:05:07 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## Play an audio reply through $AudioStreamPlayer.
## PS: coroutine — callers may await it. Recording is stopped first so the
## playback does not get captured.
func play_reply(reply):
	if reply == null:
		return
	stopRecord()
	assert(reply is AudioStream, "reply不是音频资源")
	# Make sure the stream plays exactly once, whatever its format.
	if reply is AudioStreamMP3 or reply is AudioStreamOggVorbis:
		reply.loop = false
	elif reply is AudioStreamWAV:
		reply.loop_mode = AudioStreamWAV.LOOP_DISABLED
	var player = $AudioStreamPlayer
	player.stream = reply
	player.play()
	await player.finished
|
|
|
|
|
|
|
|
|
|
## Record from the microphone and keep submitting speech recognition
## until the audio matches [param text].
## PS: coroutine — await it if the caller needs to know when it finishes.
func speech_record_check(text: String):
	assert(text != null and not text.is_empty(), "待识别的结果text不能为空")
	print("录音采样频率: ", AudioServer.get_mix_rate())
	targetText = text
	startRecord()
	# The Timer drives _on_timer_timeout, which checks volume and submits audio.
	$Timer.start()
	# Emitted from the HTTP completion callback once the text matches.
	await speech_recognition_successed
	print("识别成功,结束")
	_reset_record_state()
|
|
|
|
|
|
2024-05-11 20:06:02 +08:00
|
|
|
|
|
2024-05-15 11:18:41 +08:00
|
|
|
|
## Return the recorder to its idle state after a successful recognition.
func _reset_record_state():
	stopRecord()
	$Timer.stop()
	targetText = ""
	hasVoice = false
	novoiceTime = 0
|
2024-05-08 16:56:02 +08:00
|
|
|
|
|
2024-05-14 10:00:22 +08:00
|
|
|
|
## Periodic (Timer-driven) check: measures input volume and, once the user
## has spoken and then stayed silent for LongTimeNoVoice seconds, submits
## the recording for speech recognition.
func _on_timer_timeout():
	if not effect.is_recording_active():
		return
	# Pull all captured frames; each frame is a stereo Vector2 sample.
	var buf = capture.get_buffer(capture.get_frames_available())
	var soundDetected = false
	for vec in buf:
		# BUG FIX: compare magnitudes — the original only checked the positive
		# half-wave, so negative-going samples below -VolumeMin were ignored.
		if absf(vec.x) > VolumeMin or absf(vec.y) > VolumeMin:
			#print("Left channel volume = ", vec.x, ", Right volume = ", vec.y)
			soundDetected = true
			break  # one loud frame is enough; no need to scan the rest
	# Voice present this tick: remember it and reset the silence counter.
	if soundDetected:
		hasVoice = true
		novoiceTime = 0
		return
	# No voice this tick: accumulate silence time.
	novoiceTime += $Timer.wait_time
	if hasVoice and novoiceTime >= LongTimeNoVoice:
		# The user spoke and then went quiet: grab the take and send it off.
		var rcd = effect.get_recording()
		if rcd == null:
			return
		print("音频时长: ", rcd.get_length())
		restartRecord()
		#await play_reply(rcd)
		#startRecord()
		_request_speech_recognition(rcd)
		hasVoice = false
	# Long silence with no speech at all: restart to keep the buffer small.
	if novoiceTime >= LongTimeNoVoice:
		print("长时间无声音,重启录音")
		restartRecord()
		novoiceTime = 0
|
|
|
|
|
|
2024-05-15 11:18:41 +08:00
|
|
|
|
## True while an HTTP recognition request is in flight.
var hasUnfinishedRequest = false

## Recordings waiting to be recognized while a request is in flight.
var recordingQueue: Array = []

# HTTPRequest child node used for all recognition calls.
@onready var http_req = $HTTPRequest

## Recognition endpoint; %s is filled with the target text.
const URL = "http://192.168.33.233/rtss-server/api/voice/verify?text=%s"

# NOTE(review): hard-coded credential checked into source — consider moving
# it to configuration or an environment-specific secret store.
var token = "3cccd952d6596ea36c3e0f68fb763818"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## Submit a recording to the speech-recognition service as a WAV POST body.
## If a request is already running, the recording is queued and sent when
## the current request completes (see _on_http_request_request_completed).
func _request_speech_recognition(recording: AudioStreamWAV):
	if hasUnfinishedRequest:
		recordingQueue.append(recording)
		return
	hasUnfinishedRequest = true
	# BUG FIX: percent-encode the target text — it may contain non-ASCII
	# characters that are invalid in a raw query string.
	var url = URL % targetText.uri_encode()
	var headers = ["Content-Type: audio/wav"]
	headers.append("X-Token: %s" % token)
	var body = _build_wav(recording)
	var error = http_req.request_raw(url, headers, HTTPClient.METHOD_POST, body)
	if error != OK:
		# BUG FIX: clear the in-flight flag, otherwise one failed call would
		# permanently block every later recognition attempt.
		hasUnfinishedRequest = false
		push_error("在HTTP请求语音识别时发生了一个错误。", error)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## HTTPRequest completion callback: parses the recognition response, emits
## speech_recognition_successed on a match, then drains the pending queue.
func _on_http_request_request_completed(result, response_code, headers, body):
	hasUnfinishedRequest = false
	var json = JSON.parse_string(body.get_string_from_utf8())
	print("语音识别结果: ", json)
	# BUG FIX: parse_string returns null on invalid JSON, and the response
	# may lack "data" — guard before indexing instead of crashing.
	if json is Dictionary:
		var data = json.get("data")
		if data != null and data["match"] == true:
			await play_reply(reply_correct)
			speech_recognition_successed.emit()
	if recordingQueue.size() > 0:
		# BUG FIX: pop_front keeps the queue first-in-first-out; pop_back
		# serviced the newest recording first and starved older ones.
		var next = recordingQueue.pop_front()
		_request_speech_recognition(next)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## Serialize an AudioStreamWAV recording into a complete in-memory WAV file:
## a 44-byte RIFF header followed by the raw sample data.
func _build_wav(recording: AudioStreamWAV) -> PackedByteArray:
	var data_bytes = recording.data.size()
	#Subchunk2Size = Size of data in bytes
	var sub_chunk_2_size = data_bytes

	# Format code
	# 1:PCM format (for 8 or 16 bit)
	# 3:IEEE float format
	# NOTE(review): 3 is the IEEE-float tag, but IMA ADPCM data is not float —
	# confirm this is what the recognition server expects.
	var format_code = 3 if (recording.format == AudioStreamWAV.FORMAT_IMA_ADPCM) else 1

	var n_channels = 2 if recording.stereo else 1

	#print("录音结果采样率: ", recording.mix_rate)
	# Header advertises the mixer rate; assumes the capture ran at
	# AudioServer.get_mix_rate() — TODO confirm against recording.mix_rate.
	var sample_rate = AudioServer.get_mix_rate()

	# Bytes per sample for each supported storage format.
	var byte_pr_sample = 0
	match recording.format:
		AudioStreamWAV.FORMAT_8_BITS:
			byte_pr_sample = 1
		AudioStreamWAV.FORMAT_16_BITS:
			byte_pr_sample = 2
		AudioStreamWAV.FORMAT_IMA_ADPCM:
			byte_pr_sample = 4

	var wav: PackedByteArray = []

	# Create WAV Header
	store_string(wav, "RIFF") # ChunkID
	store_32(wav, sub_chunk_2_size + 36) # ChunkSize = 36 + SubChunk2Size (size of entire file minus the 8 bits for this and previous header)
	store_string(wav, "WAVE") # Format
	store_string(wav, "fmt ") # Subchunk1ID
	store_32(wav, 16) # Subchunk1Size = 16
	store_16(wav, format_code) # AudioFormat
	store_16(wav, n_channels) # Number of Channels
	store_32(wav, sample_rate) # SampleRate
	store_32(wav, sample_rate * n_channels * byte_pr_sample) # ByteRate
	store_16(wav, n_channels * byte_pr_sample) # BlockAlign = NumChannels * BytePrSample
	store_16(wav, byte_pr_sample * 8) # BitsPerSample
	store_string(wav, "data") # Subchunk2ID
	store_32(wav, sub_chunk_2_size) # Subchunk2Size

	# Add data
	var stream_data = recording.get_data()
	#print("formatCode=", format_code, ", n_channels=", n_channels, ", sample_rate=", sample_rate,
	#", byte_pr_sample=", byte_pr_sample, ", sub_chunk_2_size=", sub_chunk_2_size, ", data_size=", stream_data.size())
	match recording.format:
		AudioStreamWAV.FORMAT_8_BITS:
			# 8-bit WAV samples are unsigned, hence the +128 offset.
			for i in data_bytes:
				var data_point = stream_data[i] + 128
				wav.append(data_point)
		AudioStreamWAV.FORMAT_16_BITS:
			# Re-emit each 16-bit sample little-endian (decode + store round-trip).
			for i in data_bytes/2:
				var data_point = decode_uint16(stream_data[i*2], stream_data[i*2+1])
				store_16(wav, data_point)
		_:
			push_error("构建wav错误,不支持的音频格式")
	return wav
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## Append the UTF-8 bytes of [param s] to [param buffer].
func store_string(buffer: PackedByteArray, s: String):
	var bytes = s.to_utf8_buffer()
	buffer.append_array(bytes)
|
|
|
|
|
|
|
|
|
|
## Append a 16-bit value to [param buffer], little-endian by default.
func store_16(buffer: PackedByteArray,p_dest: int, big_endian: bool = false):
	var low = p_dest & 0xFF
	var high = p_dest >> 8
	if big_endian:
		buffer.append(high)
		buffer.append(low)
	else:
		buffer.append(low)
		buffer.append(high)
|
|
|
|
|
|
|
|
|
|
## Append a 32-bit value to [param buffer] as two 16-bit halves,
## little-endian by default (delegates byte order to store_16).
func store_32(buffer: PackedByteArray, p_dest: int, big_endian: bool = false):
	var low = p_dest & 0xFFFF
	var high = p_dest >> 16
	if big_endian:
		store_16(buffer, high, big_endian)
		store_16(buffer, low, big_endian)
	else:
		store_16(buffer, low, big_endian)
		store_16(buffer, high, big_endian)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## Combine two bytes (low byte first) into an unsigned 16-bit integer.
func decode_uint16(v1, v2) -> int:
	return v1 | (v2 << 8)
|