# godot-psd-training/Communication/voice_communication.gd
extends Node
## Voice-communication node: records microphone input on the "Record" audio
## bus, detects speech by sampled volume, and submits recordings to a
## speech-recognition HTTP service.

## Emitted when speech recognition matches the target text.
signal speech_recognition_successed

## Recording effect on the "Record" bus (slot 0, assigned in _ready).
var effect: AudioEffectRecord
## Capture effect (slot 1) used to sample frames and estimate input volume.
var capture: AudioEffectCapture
## Text the spoken audio is expected to match.
var targetText: String
## Minimum per-frame amplitude (either channel) treated as "voice present".
const VolumeMin = 0.03
## Seconds of silence after which the recording is submitted or restarted.
const LongTimeNoVoice = 1
# True once any voice has been detected in the current recording window.
var hasVoice = false
# Accumulated seconds without detected voice (in Timer ticks).
var novoiceTime = 0
## Sound effect played when recognition succeeds.
var reply_correct = preload("res://Assets/training_speech/correct.mp3")
## Cache the effects of the "Record" audio bus:
## slot 0 must be an AudioEffectRecord, slot 1 an AudioEffectCapture.
func _ready():
	var idx = AudioServer.get_bus_index("Record")
	# First effect: the recorder itself.
	effect = AudioServer.get_bus_effect(idx, 0)
	# Second effect: audio capture, used to measure input volume so we can
	# tell whether anything is being spoken.
	capture = AudioServer.get_bus_effect(idx, 1)
## Start recording if not already active.
func startRecord():
	print("启动录音")
	if not effect.is_recording_active():
		effect.set_recording_active(true)
## Stop recording; a no-op when recording is not active.
func stopRecord():
	if not effect.is_recording_active():
		return
	effect.set_recording_active(false)
## Restart recording: toggling the effect off/on discards the buffered
## audio and begins a fresh take.
func restartRecord():
	effect.set_recording_active(false)
	effect.set_recording_active(true)
## Play a reply audio stream and wait for it to finish.
## Coroutine: callers may `await` it. Recording is stopped first so the
## reply sound is not picked up by the microphone loop.
## reply: an AudioStream resource; null is ignored silently.
func play_reply(reply):
	if reply == null:
		return
	stopRecord()
	assert(reply is AudioStream, "reply不是音频资源")
	# Make sure the reply plays once and does not loop.
	if reply is AudioStreamMP3:
		reply.loop = false
	if reply is AudioStreamOggVorbis:
		reply.loop = false
	if reply is AudioStreamWAV:
		reply.loop_mode = AudioStreamWAV.LOOP_DISABLED
	$AudioStreamPlayer.stream = reply
	$AudioStreamPlayer.play()
	await $AudioStreamPlayer.finished
## Record the microphone and keep submitting audio until speech recognition
## matches `text`, then reset all recording state.
## Coroutine: `await` it if the caller cares about completion.
func speech_record_check(text: String):
	assert(text != null and not text.is_empty(), "待识别的结果text不能为空")
	print("录音采样频率: ", AudioServer.get_mix_rate())
	targetText = text
	startRecord()
	# The timer drives _on_timer_timeout, which polls volume and submits audio.
	$Timer.start()
	await speech_recognition_successed
	print("识别成功,结束")
	_reset_record_state()
## Clear recognition state and stop both recording and the polling timer.
func _reset_record_state():
	targetText = ""
	hasVoice = false
	novoiceTime = 0
	stopRecord()
	$Timer.stop()
## Timer callback: polls the captured audio frames, tracks voice activity,
## and decides when to submit the recording for recognition or restart it.
## NOTE(review): original indentation was lost in transit; the nesting below
## (silence handling inside the no-sound branch) follows the only reading
## consistent with how novoiceTime is incremented — confirm against history.
func _on_timer_timeout():
	if effect.is_recording_active():
		var buf = capture.get_buffer(capture.get_frames_available())
		var soundDetected = false
		for vec in buf:
			# Frames are stereo Vector2s: x = left channel, y = right channel.
			if vec.x > VolumeMin or vec.y > VolumeMin:
				soundDetected = true
				break
		if soundDetected:
			# Voice present this tick: remember it and reset the silence clock.
			hasVoice = true
			novoiceTime = 0
		else:
			novoiceTime += $Timer.wait_time
			if hasVoice and novoiceTime >= LongTimeNoVoice:
				# Speaker paused after talking: submit what was recorded.
				var rcd = effect.get_recording()
				if rcd == null:
					return
				print("音频时长: ", rcd.get_length())
				restartRecord()
				_request_speech_recognition(rcd)
				hasVoice = false
			if novoiceTime >= LongTimeNoVoice:
				# Long silence with no speech at all: restart so the buffer
				# does not grow unbounded while nobody is talking.
				print("长时间无声音,重启录音")
				restartRecord()
				novoiceTime = 0
## Whether an HTTP recognition request is still in flight.
var hasUnfinishedRequest = false
## Recordings queued while a request is in flight (drained on completion).
var recordingQueue: Array = []
# HTTPRequest child node used for recognition calls.
@onready var http_req = $HTTPRequest
# Recognition endpoint; %s is filled with the target text.
# NOTE(review): hard-coded LAN address — should be moved to configuration.
const URL = "http://192.168.33.233/rtss-server/api/voice/verify?text=%s"
# NOTE(review): hard-coded auth token — should not live in source control.
var token = "3cccd952d6596ea36c3e0f68fb763818"
## Send `recording` to the speech-recognition service to be matched against
## targetText. If a request is already in flight, the recording is queued
## and submitted from the completion callback instead.
func _request_speech_recognition(recording: AudioStreamWAV):
	if hasUnfinishedRequest:
		recordingQueue.append(recording)
		return
	hasUnfinishedRequest = true
	# URL-encode the target text so non-ASCII characters survive the query.
	var url = URL % targetText.uri_encode()
	var headers = ["Content-Type: audio/wav"]
	headers.append("X-Token: %s" % token)
	var body = _build_wav(recording)
	var error = http_req.request_raw(url, headers, HTTPClient.METHOD_POST, body)
	if error != OK:
		# Without this reset a failed dispatch would block all future
		# requests forever (the completion callback never fires).
		hasUnfinishedRequest = false
		push_error("在HTTP请求语音识别时发生了一个错误。", error)
## HTTPRequest completion callback: parses the JSON verdict, plays the
## success reply and emits speech_recognition_successed on a match, then
## dispatches the next queued recording (oldest first).
func _on_http_request_request_completed(result, response_code, headers, body):
	hasUnfinishedRequest = false
	var json = JSON.parse_string(body.get_string_from_utf8())
	print("语音识别结果: ", json)
	# parse_string returns null on malformed JSON; indexing it would crash.
	if json is Dictionary:
		var data = json.get("data")
		if data is Dictionary and data.get("match") == true:
			await play_reply(reply_correct)
			speech_recognition_successed.emit()
	if recordingQueue.size() > 0:
		# pop_front keeps submissions in arrival order (pop_back was LIFO).
		var next = recordingQueue.pop_front()
		_request_speech_recognition(next)
## Serialize `recording` into a complete RIFF/WAVE byte stream (canonical
## 44-byte header + sample data) suitable for HTTP upload.
## Only 8-bit and 16-bit PCM are supported; other formats get a header but
## push an error and append no sample data.
func _build_wav(recording: AudioStreamWAV) -> PackedByteArray:
	var data_bytes = recording.data.size()
	# Subchunk2Size = size of the raw sample data in bytes.
	var sub_chunk_2_size = data_bytes
	# WAV format code: 1 = integer PCM, 3 = IEEE float.
	# NOTE(review): IMA ADPCM is actually format code 0x11, and the ADPCM
	# data path below is unimplemented; value kept for compatibility.
	var format_code = 3 if (recording.format == AudioStreamWAV.FORMAT_IMA_ADPCM) else 1
	var n_channels = 2 if recording.stereo else 1
	# get_mix_rate() returns a float; the header math and store_32's bitwise
	# ops require an int, so cast explicitly.
	var sample_rate = int(AudioServer.get_mix_rate())
	var byte_pr_sample = 0
	match recording.format:
		AudioStreamWAV.FORMAT_8_BITS:
			byte_pr_sample = 1
		AudioStreamWAV.FORMAT_16_BITS:
			byte_pr_sample = 2
		AudioStreamWAV.FORMAT_IMA_ADPCM:
			byte_pr_sample = 4
	var wav: PackedByteArray = []
	# ---- RIFF/WAVE header ----
	store_string(wav, "RIFF")                                # ChunkID
	store_32(wav, sub_chunk_2_size + 36)                     # ChunkSize = 36 + Subchunk2Size
	store_string(wav, "WAVE")                                # Format
	store_string(wav, "fmt ")                                # Subchunk1ID
	store_32(wav, 16)                                        # Subchunk1Size (PCM)
	store_16(wav, format_code)                               # AudioFormat
	store_16(wav, n_channels)                                # NumChannels
	store_32(wav, sample_rate)                               # SampleRate
	store_32(wav, sample_rate * n_channels * byte_pr_sample) # ByteRate
	store_16(wav, n_channels * byte_pr_sample)               # BlockAlign
	store_16(wav, byte_pr_sample * 8)                        # BitsPerSample
	store_string(wav, "data")                                # Subchunk2ID
	store_32(wav, sub_chunk_2_size)                          # Subchunk2Size
	# ---- sample data ----
	var stream_data = recording.get_data()
	match recording.format:
		AudioStreamWAV.FORMAT_8_BITS:
			# Godot stores 8-bit samples signed; WAV expects unsigned (+128).
			for i in data_bytes:
				wav.append((stream_data[i] + 128) & 0xFF)
		AudioStreamWAV.FORMAT_16_BITS:
			# Samples are already little-endian 16-bit PCM; the original
			# decoded and re-encoded each sample to identical bytes — copy
			# the buffer verbatim instead.
			wav.append_array(stream_data)
		_:
			push_error("构建wav错误不支持的音频格式")
	return wav
## Append the UTF-8 bytes of `s` to `buffer`.
func store_string(buffer: PackedByteArray, s: String):
	var encoded = s.to_utf8_buffer()
	buffer.append_array(encoded)
## Append a 16-bit value to `buffer`, little-endian by default.
func store_16(buffer: PackedByteArray, p_dest: int, big_endian: bool = false):
	var low = p_dest & 0xFF
	var high = p_dest >> 8
	if big_endian:
		buffer.append(high)
		buffer.append(low)
	else:
		buffer.append(low)
		buffer.append(high)
## Append a 32-bit value to `buffer` as two 16-bit halves; word order and
## byte order within each word follow the `big_endian` flag.
func store_32(buffer: PackedByteArray, p_dest: int, big_endian: bool = false):
	var low_word = p_dest & 0xFFFF
	var high_word = p_dest >> 16
	if big_endian:
		store_16(buffer, high_word, big_endian)
		store_16(buffer, low_word, big_endian)
	else:
		store_16(buffer, low_word, big_endian)
		store_16(buffer, high_word, big_endian)
## Combine two bytes into an unsigned 16-bit value (v1 = low, v2 = high).
func decode_uint16(v1, v2) -> int:
	return v1 | (v2 << 8)