Integrating Alibaba's omni-modal large model Qwen2.5-Omni into Unity
1 Reference
This integration is built by modifying the open-source project by Bilibili uploader 阴沉的怪咖.
The "AI 2D waifu" open-source project (unity-AI-Chat-Toolkit):
GitHub: https://github.com/zhangliwei7758/unity-AI-Chat-Toolkit
Gitee: https://gitee.com/DammonSpace/unity-ai-chat-toolkit
2 Official documentation
The official docs have no C# sample, so I ported the Python script to C#.
Official page for Alibaba's omni-modal model: Alibaba Cloud Bailian (阿里云百炼)
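Since the model is served through an OpenAI-compatible endpoint, the request body is the same JSON the Python sample produces. Pieced together from the C# port below (values are illustrative and the Base64 payload is elided):

{
  "model": "qwen-omni-turbo-0119",
  "stream": true,
  "modalities": ["text", "audio"],
  "audio": { "voice": "Cherry", "format": "wav" },
  "stream_options": { "include_usage": true },
  "messages": [
    {
      "role": "user",
      "content": [
        { "type": "input_audio",
          "input_audio": { "data": "data:;base64,...", "format": "mp3" } },
        { "type": "text", "text": "当前为角色的人物设定:..." }
      ]
    }
  ]
}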
3 Voice input
Add the following code to the LLM base class:
// Conversation history for the audio pathway (counterpart of m_DataList; the methods below rely on it)
public List<SendDataAudio> m_DataAudioList = new List<SendDataAudio>();

public virtual void PostMsgAudio(string base64Audio, Action<string> _callback, Action<bool> _endCallBack = null, Action<AudioClip> _AudioCallBack = null)
{
    // Trim the context to the configured number of history entries
    CheckHistory();
    // Build the prompt text (persona + reply language)
    string message = "当前为角色的人物设定:" + m_Prompt + " 回答的语言:" + lan;
    // Cache the outgoing message: one audio part plus one text part
    Content content = new Content()
    {
        type = "input_audio",
        input_audio = new Input_audio()
        {
            data = string.Format("data:;base64,{0}", base64Audio),
            format = "mp3" // declared input format; note that the test in section 5 actually sends WAV bytes
        }
    };
    Content content2 = new Content()
    {
        type = "text",
        text = message
    };
    Content[] contents = new Content[] { content, content2 };
    m_DataAudioList.Add(new SendDataAudio("user", contents));
    StartCoroutine(RequestAudio(message, _callback, _endCallBack, _AudioCallBack));
}

// Overridden by concrete models; the base implementation is a no-op
public virtual IEnumerator RequestAudio(string _postWord, System.Action<string> _callback, Action<bool> _endCallBack = null, Action<AudioClip> _AudioCallBack = null)
{
    yield return new WaitForEndOfFrame();
}

[Serializable]
public class SendDataAudio
{
    [SerializeField] public string role;
    [SerializeField] public Content[] content;

    public SendDataAudio() { }
    public SendDataAudio(string _role, Content[] _content)
    {
        role = _role;
        content = _content;
    }
}

[Serializable]
public class Content
{
    [SerializeField] public string type;
    [SerializeField] public Input_audio input_audio;
    [SerializeField] public string text;
}

[Serializable]
public class Input_audio
{
    [SerializeField] public string data;
    [SerializeField] public string format;
}
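A minimal caller sketch, assuming a recorded AudioClip clip, an AudioSource audioSource, and an LLM reference llm (section 5 shows the project's actual call site in AcceptClip):

byte[] wavBytes = WavUtility.FromAudioClip(clip);       // AudioClip -> WAV bytes
string base64Audio = Convert.ToBase64String(wavBytes);  // WAV bytes -> Base64
llm.PostMsgAudio(base64Audio,
    text => Debug.Log("Transcript: " + text),           // text part of the reply
    finished => Debug.Log("Round finished: " + finished),
    replyClip => audioSource.PlayOneShot(replyClip));   // spoken part of the reply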
4 Audio response parsing
Add a new class AliQwenOmniChat that inherits from LLM:
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
using System;
using UnityEngine;
using UnityEngine.Networking;
using System.Collections;
using static ALiChat;

public class AliQwenOmniChat : LLM
{
    public AliQwenOmniChat()
    {
        url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions";
    }

    /// <summary>
    /// api key
    /// </summary>
    [SerializeField] private string api_key;

    /// <summary>
    /// AI persona (system prompt)
    /// </summary>
    public string m_SystemSetting = string.Empty;

    /// <summary>
    /// qwen-omni-turbo-0119
    /// </summary>
    public string m_gptModel = "qwen-omni-turbo-0119";

    [Header("设置说话的声音")] public SpeekerRole per = SpeekerRole.Cherry;

    private void Start()
    {
        // Add the AI persona at runtime
        m_DataList.Add(new SendData("system", m_SystemSetting));
    }

    /// <summary>
    /// Send a message
    /// </summary>
    public override void PostMsgAudio(string _msg, Action<string> _callback, Action<bool> endAction, Action<AudioClip> AudioAction)
    {
        base.PostMsgAudio(_msg, _callback, endAction, AudioAction);
    }

    public override IEnumerator RequestAudio(string requestData, Action<string> callback, Action<bool> EndAction, Action<AudioClip> AudioAction)
    {
        using (var request = new UnityWebRequest(url, "POST"))
        {
            PostDataAudio _postData = new PostDataAudio
            {
                model = m_gptModel,
                stream = this.stream,
                messages = m_DataAudioList,
                temperature = 1,
                top_p = 0.7f,
                modalities = new string[] { "text", "audio" },
                audio = new Audio { voice = SetSpeeker(per), format = "wav" },
                stream_options = new Stream_options { include_usage = true },
            };

            string _jsonText = JsonUtility.ToJson(_postData).Trim();
            Debug.Log(_jsonText);
            byte[] data = System.Text.Encoding.UTF8.GetBytes(_jsonText);
            request.uploadHandler = (UploadHandler)new UploadHandlerRaw(data);
            request.downloadHandler = (DownloadHandler)new DownloadHandlerBuffer();
            request.SetRequestHeader("Content-Type", "application/json");
            request.SetRequestHeader("Authorization", string.Format("Bearer {0}", api_key));

            yield return request.SendWebRequest();

            if (request.result == UnityWebRequest.Result.ConnectionError || request.result == UnityWebRequest.Result.ProtocolError)
            {
                Debug.LogError("阿里Error: " + request.error);
                callback?.Invoke("阿里大模型出现点问题");
                yield break;
            }

            // The reply is an SSE stream; here the whole body is buffered
            // once the request completes and split on the "data:" prefix
            string temp = request.downloadHandler.text;
            var datas = temp.Split("data:");
            string textStr = "";
            string audioStr = "";
            foreach (var requestJson in datas)
            {
                if (string.IsNullOrWhiteSpace(requestJson))
                    continue;
                string chunk = requestJson.Trim();
                if (chunk == "[DONE]") // terminator chunk, not JSON
                    continue;
                Debug.Log(chunk);
                var jsonP = JToken.Parse(chunk);
                var choices = jsonP["choices"];
                if (choices == null || !choices.HasValues) // the final usage chunk carries no choices
                    continue;
                var item = choices[0];
                var audio = item["delta"].SelectToken("audio");
                if (audio != null)
                {
                    if (audio.SelectToken("transcript") != null)
                    {
                        // Text part: transcript of the spoken reply
                        var tt = audio.SelectToken("transcript")?.ToString();
                        if (!string.IsNullOrEmpty(tt))
                        {
                            tt = tt.Trim();
                            textStr += tt;
                        }
                        var finish = item.SelectToken("finish_reason");
                        if (finish != null && finish.ToString() == "stop")
                        {
                            break;
                        }
                    }
                    else
                    {
                        // Audio part: Base64-encoded audio data
                        audioStr += audio.SelectToken("data")?.ToString();
                    }
                }
            }
            if (!string.IsNullOrEmpty(textStr))
            {
                callback.Invoke(textStr);
            }
            if (!string.IsNullOrEmpty(audioStr))
            {
                AudioAction(PlayAudio(audioStr));
            }
            EndAction.Invoke(true);
        }
    }

    // Decode the Base64-encoded audio data into an AudioClip
    AudioClip PlayAudio(string audioString)
    {
        if (!string.IsNullOrEmpty(audioString))
        {
            byte[] audioBytes = Convert.FromBase64String(audioString);
            AudioClip audioClip = WavUtility.ConvertBytesToAudioClip(audioBytes, 24000);
            return audioClip;
        }
        return null;
    }

    // The four voices supported by the Ali model
    private string SetSpeeker(SpeekerRole _role)
    {
        if (_role == SpeekerRole.Cherry) return "Cherry";
        if (_role == SpeekerRole.Serena) return "Serena";
        if (_role == SpeekerRole.Ethan) return "Ethan";
        if (_role == SpeekerRole.Chelsie) return "Chelsie";
        return "Cherry"; // default voice
    }

    #region Request payload
    [Serializable]
    public class PostDataAudio
    {
        [SerializeField] public string model;
        [SerializeField] public bool stream;
        [SerializeField] public List<SendDataAudio> messages;
        [SerializeField] public float temperature = 0.7f;
        [SerializeField] public float top_p;
        [SerializeField] public string[] modalities;
        [SerializeField] public Audio audio;
        [SerializeField] public Stream_options stream_options;
    }

    [Serializable]
    public class Audio
    {
        public string voice;
        public string format;
    }

    [Serializable]
    public class Stream_options
    {
        public bool include_usage;
    }
    #endregion

    public enum SpeekerRole
    {
        Cherry,
        Serena,
        Ethan,
        Chelsie
    }
}
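For reference, the response is a server-sent event stream in which every chunk is prefixed with data:, which is why RequestAudio splits the buffered body on that token. Abridged, the chunks this parser cares about look roughly like this (field layout follows the OpenAI-compatible format; exact contents may vary by API version):

data: {"choices":[{"delta":{"audio":{"transcript":"你好"}},"finish_reason":null}]}
data: {"choices":[{"delta":{"audio":{"data":"<Base64 PCM>"}},"finish_reason":null}]}
data: {"choices":[],"usage":{...}}
data: [DONE]

The audio deltas concatenate into Base64-encoded raw PCM, which the code treats as 16-bit mono at 24 kHz and hands to WavUtility.ConvertBytesToAudioClip. That method is not part of the stock WavUtility script, so if your copy lacks it, a minimal sketch (class and method names here are assumptions) could be:

using System;
using UnityEngine;

public static class WavUtilityExtra
{
    // Assumed helper: little-endian 16-bit mono PCM bytes -> AudioClip.
    public static AudioClip ConvertBytesToAudioClip(byte[] pcmBytes, int sampleRate)
    {
        int sampleCount = pcmBytes.Length / 2;              // 2 bytes per 16-bit sample
        float[] samples = new float[sampleCount];
        for (int i = 0; i < sampleCount; i++)
        {
            short s = BitConverter.ToInt16(pcmBytes, i * 2);
            samples[i] = s / 32768f;                        // normalize to [-1, 1]
        }
        AudioClip clip = AudioClip.Create("OmniReply", sampleCount, 1, sampleRate, false);
        clip.SetData(samples, 0);
        return clip;
    }
}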
5 Testing
For voice input, find the AcceptClip method (the recording-finished handler) in the open-source project and change it to:
public bool AliQwenOmniChat = false; // toggles the Ali omni-modal path
private Queue<string> strDatas = new Queue<string>();      // queued text replies
private Queue<AudioClip> clipDatas = new Queue<AudioClip>(); // queued audio replies
private bool end = true;

private void AcceptClip(AudioClip _audioClip)
{
    if (m_ChatSettings.m_SpeechToText == null)
        return;
    if (AliQwenOmniChat) // Ali omni-modal voice input path
    {
        byte[] _audioData = WavUtility.FromAudioClip(_audioClip);
        string base64String = Convert.ToBase64String(_audioData);
        m_ChatSettings.m_ChatModel.PostMsgAudio(base64String, CallBack, EndCallBack, AudioCallBack); // send the audio straight to the model
        m_InputWord.text = "阿里语音输入完成";
    }
    else
    {
        m_ChatSettings.m_SpeechToText.SpeechToText(_audioClip, DealingTextCallback);
    }
}

private void EndCallBack(bool isComplete)
{
    Debug.Log("是否回到结束:" + isComplete);
    this.end = isComplete;
}

private void CallBack(string _response) // text callback
{
    _response = _response.Trim();
    //m_TextBack.text = "";
    //Debug.Log("收到AI回复:" + _response);
    if (GetMesssgeIndex == 0)
    {
        m_TextBack2.text = "";
        // switch to the talking animation
        Debug.Log("播放声音******");
        m_TextBack.text = "";
        SetAnimator("state", 2);
    }
    GetMesssgeIndex++;
    if (!string.IsNullOrEmpty(_response))
    {
        if (AliQwenOmniChat) // the omni model already returns speech; just queue the text
            strDatas.Enqueue(_response);
        else
            m_ChatSettings.m_TextToSpeech.Speak(_response, PlayAudio);
    }
}

// audio callback
private void AudioCallBack(AudioClip clip)
{
    clipDatas.Enqueue(clip);
}

private void Update()
{
    if (AliQwenOmniChat)
    {
        if (strDatas.Count > 0 && m_WriteState == false)
        {
            StartTypeWords(strDatas.Dequeue());
        }
        if (clipDatas.Count > 0 && m_AudioSource.isPlaying == false)
        {
            m_AudioSource.clip = clipDatas.Dequeue();
            m_AudioSource.Play(); // play the returned speech
            isEnd = false;
        }
        else if (m_AudioSource.isPlaying == false && this.end)
        {
            if (isEnd)
            {
                return;
            }
            isEnd = true;
            m_ChatHistory.Add(m_TextBack.text);
            m_AudioSource.Stop();
            resultDatas.Clear();
            GetMesssgeIndex = 0;
            // switch back to the idle animation
            Debug.Log("切换到等待动作");
            SetAnimator("state", 0);
        }
    }
}
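The two queues decouple the network callbacks from presentation: CallBack and AudioCallBack only enqueue, and Update drains strDatas into the typewriter effect and clipDatas into the AudioSource one item at a time, so the on-screen text and the spoken reply stay in step. StartTypeWords and m_WriteState come from the original project; the contract Update relies on is roughly this (a sketch of the assumed behavior, not the project's actual implementation):

// Sketch: m_WriteState stays true while a typing animation is running,
// so Update() will not dequeue the next text chunk mid-animation.
private void StartTypeWords(string text)
{
    StartCoroutine(TypeWords(text));
}

private IEnumerator TypeWords(string text)
{
    m_WriteState = true;
    foreach (char c in text)
    {
        m_TextBack.text += c;                   // reveal one character at a time
        yield return new WaitForSeconds(0.05f); // assumed typing interval
    }
    m_WriteState = false;
}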
Text input works the same way as in the original open-source project.
Voice input test:
I asked by voice: "你叫什么名字?" (What's your name?)
Console log of the input:
After the voice input, the model returned both text and audio. Console log of the response: