Daniel Liu
Created July 31, 2024

Understanding Gesture Knowledge Virtual Character Assistant

In the game, NPC conversations give you hints about the game content or an introduction to it.


Things used in this project

Hardware components

AMD Radeon Pro W7900 GPU
×1

Software apps and online services

TensorFlow
Training the neural network
Jupyter Notebook
Amazon Web Services (AWS) Polly
Voice interaction
MediaPipe
Gesture recognition
Hugging Face
Downloading the trained model
ROCm
Platform for training the model

Hand tools and fabrication machines

Logitech C310 camera

Story


Schematics

No circuit boards or related equipment are used.

User interaction diagram

Code

MediaPipe code and sending data to Unity via UDP

Python
Data collection, model training, evaluation, and real-time inference are combined in this one script; comment or uncomment the relevant sections depending on which part you want to run.
import os
import mediapipe as mp
from matplotlib import pyplot as plt
import time
import cv2
import socket    
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import LSTM,Dense,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import multilabel_confusion_matrix,accuracy_score
from tensorflow.keras.models import load_model
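# Optional sanity check before training (assumes a ROCm build of TensorFlow on the Radeon Pro W7900):
# import tensorflow as tf
# print(tf.config.list_physical_devices('GPU'))   # the GPU should appear in this list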


mediapipe_holistic = mp.solutions.holistic
mediapipe_drawing = mp.solutions.drawing_utils
holistic = mediapipe_holistic.Holistic(static_image_mode = False, smooth_landmarks=True,min_detection_confidence=0.5,min_tracking_confidence=0.5)

socket_ = socket.socket(socket.AF_INET,socket.SOCK_DGRAM)  # UDP
serverAddressport = ("127.0.0.1",5015)
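# Port 5015 must match the port opened by the Unity UDP_hand receiver shown further below.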

def mp_detection(image,model):
    image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image,cv2.COLOR_RGB2BGR)
    return image,results


def draw_landmarks(image,results):
    mediapipe_drawing.draw_landmarks(image,results.face_landmarks,mediapipe_holistic.FACEMESH_TESSELATION)
    mediapipe_drawing.draw_landmarks(image,results.pose_landmarks,mediapipe_holistic.POSE_CONNECTIONS)
    mediapipe_drawing.draw_landmarks(image,results.left_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS)
    mediapipe_drawing.draw_landmarks(image,results.right_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS)
    
def draw_customerized_landmarks(image,results):
    #mediapipe_drawing.draw_landmarks(image,results.face_landmarks,mediapipe_holistic.FACEMESH_TESSELATION,mediapipe_drawing.DrawingSpec(color=(50,110,60),thickness=1,circle_radius=2))
    mediapipe_drawing.draw_landmarks(image,results.pose_landmarks,mediapipe_holistic.POSE_CONNECTIONS,mediapipe_drawing.DrawingSpec(color=(110,120,70),thickness=1,circle_radius=2))
    mediapipe_drawing.draw_landmarks(image,results.left_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS,mediapipe_drawing.DrawingSpec(color=(110,120,70),thickness=1,circle_radius=2))
    mediapipe_drawing.draw_landmarks(image,results.right_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS,mediapipe_drawing.DrawingSpec(color=(110,120,70),thickness=1,circle_radius=2))

def extract_keypoints(results):
    right_hand = np.array([[res.x,res.y,res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    left_hand = np.array([[res.x,res.y,res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    face = np.array([[res.x,res.y,res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    pose = np.array([[res.x,res.y,res.z,res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
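    # Concatenated length per frame: 33*4 + 468*3 + 21*3 + 21*3 = 1662 values,
    # which matches the LSTM input_shape of (30, 1662) used in the model below.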
    
    return np.concatenate([pose,face,right_hand,left_hand])   

actions = np.array(['hi','ok'])   
fragment_seq = 30
sequence_len = 30
dataset_path = r'C:\dataset_path_2'   # folder where the collected .npy keypoint sequences are stored

# Data processing after collection
label_mp = {'hi':0,'ok':1}

sequence,labels = [],[]

for action in (actions):
    for seq in range (fragment_seq):
        window = []
        for num in range (sequence_len) :
            res = np.load(os.path.join(dataset_path,action,str(seq),"{}.npy".format(num)))
            window.append(res)
        sequence.append(window)
        labels.append(label_mp[action])

x = np.array(sequence)
print(x.shape)

y = to_categorical(labels).astype(int)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.03)



#Construct neural network and save model

# model = Sequential([
#     LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1662)),
#     #Dropout(0.2),  
#     LSTM(128, return_sequences=True, activation='relu'),
#     #Dropout(0.2),
#     #LSTM(64, return_sequences=True, activation='relu'),
#     #Dropout(0.2),
#     LSTM(64, return_sequences=False, activation='relu'),
#     #Dropout(0.2),
#     Dense(64, activation='relu'),
#     #Dropout(0.2),
#     Dense(32, activation='relu'),
#     Dropout(0.1),
#     Dense(actions.shape[0], activation='softmax')
# ])
# optimizer = Adam(learning_rate=1e-4)
# model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
# model.fit(x_train,y_train,epochs=250)
#model.save('action_res.h5')





#Evaluate model
#del model
#model = load_model(r'C:\mediapipe_test\action_res.h5')
# answer = model.predict(x_test)
# true_answer = np.argmax(y_test,axis=1).tolist()
# answer = np.argmax(answer,axis=1).tolist()
# confusion_matrix = multilabel_confusion_matrix(true_answer, answer)
# print("Confusion Matrix:")
# print(confusion_matrix)
# accuracy = accuracy_score(true_answer,answer)
# print("Accuracy:", accuracy)

# res = model.predict(x_test)
# print(actions)




#test train model
model = load_model(r'C:\mediapipe_test\action_res.h5')
sequence = []
sentence = []
threshold = 0.6

cap = cv2.VideoCapture(0)
pre_time = 0
cur_time = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    image, results = mp_detection(frame, holistic)
    draw_customerized_landmarks(image, results)
    
    cur_time = time.time()
    fps = 1/ (cur_time-pre_time)
    pre_time = cur_time
    
    cv2.putText(image,str(int(fps))+" FPS",(10,70),cv2.FONT_HERSHEY_COMPLEX,1,(0,255,0),2)
    
    kps = extract_keypoints(results)
    
    sequence.append(kps)
    sequence = sequence[-30:]
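    # sequence now holds a sliding window of the most recent 30 frames,
    # the sequence length the LSTM was trained on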
    
    if len(sequence) == 30:
        res = model.predict(np.expand_dims(sequence, axis=0))[0]
        print(res)
        print(actions[np.argmax(res)])
        print(res.shape)

        # Only keep the prediction when its top-class probability exceeds the confidence threshold.
        if res[np.argmax(res)] >= threshold:
            sentence.append(actions[np.argmax(res)])
            if len(sentence) > 5:
                sentence = sentence[-5:]

            #image = prob_viz(res,actions,image,colors)

            cv2.rectangle(image, (0, 0), (640, 40), (245, 110, 160), -1)
            text = " ".join(sentence)
            cv2.putText(image, text, (3, 30), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            socket_.sendto(text.encode(), serverAddressport)   # send the recognized gestures to Unity
        else:
            cv2.rectangle(image, (0, 0), (640, 40), (140, 60, 160), -1)
            cv2.putText(image, "Null!", (3, 30), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
    cv2.imshow('Read camera',image)
        
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    

cap.release()
cv2.destroyAllWindows()









#collect  data


# cap = cv2.VideoCapture(0)
# pre_time = 0
# cur_time = 0


# for  action in actions:
#     for seq in range(fragment_seq):
#         try:
#             os.makedirs(os.path.join(dataset_path,action,str(seq)))
#         except:
#             pass





# while cap.isOpened():
    
#     for action in actions:
#         for seq in range (fragment_seq):
#             for num in range (sequence_len):
#                 ret,frame = cap.read()
#                 image,results = mp_detection(frame,holistic)
#                 draw_customerized_landmarks(image, results)
                
#                 if num ==0:
#                     cv2.putText(image,"Begin Collection",(120,200),cv2.FONT_HERSHEY_COMPLEX,1,(0,255,0),2,cv2.LINE_AA)
#                     cv2.putText(image,"Collecting frames for {} video num{}".format(action,seq),(50,70),cv2.FONT_HERSHEY_COMPLEX,0.5,(130,130,0),1,cv2.LINE_AA)
#                     cv2.imshow('Read camera',image)
#                     cv2.waitKey(1000)
#                 else:
#                     cv2.putText(image,"Collecting frames for {} video num{}".format(action,seq),(50,70),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,0,0),1,cv2.LINE_AA)
#                     cv2.imshow('Read camera',image)
                    
                
#                 kps = extract_keypoints(results)
#                 np_path = os.path.join(dataset_path,action,str(seq),str(num))
#                 np.save(np_path,kps)
                
#                 if cv2.waitKey(1) & 0xFF == ord('q'):
#                     break
                
                


#     cap.release()
#     cv2.destroyAllWindows()
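
To check the UDP messages without launching Unity, a small standalone listener can be run alongside the recognizer. This is a minimal sketch (not part of the project code), assuming the script above is sending to 127.0.0.1:5015:

import socket

# Bind to the same address/port that the recognizer sends to.
receiver = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
receiver.bind(("127.0.0.1", 5015))
print("Waiting for gesture messages... (Ctrl+C to stop)")

while True:
    data, addr = receiver.recvfrom(1024)   # one datagram per recognized gesture window
    print(addr, data.decode("utf-8"))      # e.g. "hi ok hi"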
    
    

LLM interaction in Unity

C#
Based on LLMUnity (https://github.com/undreamai/LLMUnity); the code was modified to implement the character's dialogue and to control when the reply text is displayed.
using UnityEngine;
using LLMUnity;
using UnityEngine.UI;
using UnityEngine.EventSystems;
using UnityEngine.InputSystem;
//using static System.Net.Mime.MediaTypeNames;

namespace LLMUnitySamples
{
    public class SimpleInteraction : MonoBehaviour
    {
        [SerializeField] private GameObject Maincamera;
        [SerializeField] private GameObject NPC_camera;
        public Transform avatar;
        public LLMCharacter llmCharacter;
        public InputField playerText;
        public Text AIText;
        private TexttoSpeech texttospeech;
        private string aitext;

        void Start()
        {
            playerText.onSubmit.AddListener(onInputFieldSubmit);
            playerText.Select();
            texttospeech = FindObjectOfType<TexttoSpeech>();
        }
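        // The LLM reply is buffered in aitext and only written to AIText once
        // OnTextToSpeechPlaybackStarted fires, so the on-screen text appears together with the voice.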

        void onInputFieldSubmit(string message)
        {
            playerText.interactable = false;
            AIText.text = "...";
            _ = llmCharacter.Chat(message, SetAIText, AIReplyComplete);
        }

        public void SetAIText(string text)
        {
            //AIText.text = text;
            aitext = text;
        }
        public void OnTextToSpeechPlaybackStarted()
        {
            AIText.text = aitext;
        }

        public void AIReplyComplete()
        {
            playerText.interactable = true;
            playerText.Select();
            playerText.text = "";
            if (texttospeech != null)
            {
                texttospeech.SynthesizeSpeech(aitext);
            }
        }

        public void CancelRequests()
        {
            llmCharacter.CancelRequests();
            AIReplyComplete();
        }

        public void ExitGame()
        {
            Debug.Log("Exit button clicked");
            Application.Quit();
            avatar.GetComponent<PlayerInput>().enabled = true;
            Maincamera.SetActive(true);
            NPC_camera.SetActive(false);
            Cursor.visible = false;
            Cursor.lockState = CursorLockMode.Locked;
        }
        bool onValidateWarning = true;
        void OnValidate()
        {
            if (onValidateWarning && !llmCharacter.remote && llmCharacter.llm != null && llmCharacter.llm.model == "")
            {
                Debug.LogWarning($"Please select a model in the {llmCharacter.llm.gameObject.name} GameObject!");
                onValidateWarning = false;
            }
        }
    }
}

Unity UDP code

C#
Receives the socket data in Unity. If the same gesture string is received three or more times, a specific prompt is displayed.
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System;
using System.Text;
using System.Net;
using System.Net.Sockets;
using System.Threading;
using TMPro;

public class UDP_hand : MonoBehaviour
{
    private Thread receiveThread;
    private UdpClient client;
    public int port = 5015;
    public bool startReceiving = true;
    public bool printToConsole = false;
    public TextMeshProUGUI textMeshPro;

    private Queue<string> dataQueue = new Queue<string>();
    private readonly int maxDataCount = 5;
    private bool newDataAvailable = false;

    private void Start()
    {
        receiveThread = new Thread(new ThreadStart(ReceiveData));
        receiveThread.IsBackground = true;
        receiveThread.Start();
    }

    private void OnDestroy()
    {
        startReceiving = false;
        if (receiveThread != null && receiveThread.IsAlive)
        {
            receiveThread.Abort();
        }
        if (client != null)
        {
            client.Close();
        }
    }

    private void ReceiveData()
    {
        client = new UdpClient(port);
        while (startReceiving)
        {
            try
            {
                IPEndPoint anyIP = new IPEndPoint(IPAddress.Any, 0);
                byte[] dataByte = client.Receive(ref anyIP);
                string data = Encoding.UTF8.GetString(dataByte);

                lock (dataQueue)
                {
                    if (dataQueue.Count >= maxDataCount)
                    {
                        dataQueue.Dequeue(); 
                    }
                    dataQueue.Enqueue(data); 
                }

                newDataAvailable = true;

                if (printToConsole)
                {
                    print(data);
                }
            }
            catch (Exception err)
            {
                print(err.ToString());
            }
        }
    }

    private void Update()
    {
        if (newDataAvailable)
        {
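            // Count how many of the buffered messages (at most the last 5) contain the
            // gestures "ok" or "hi"; three or more matches selects the corresponding question.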
            int okCount = 0;
            int hiCount = 0;
            lock (dataQueue)
            {
                foreach (string data in dataQueue)
                {
                    string[] words = data.Split(new char[] { ' ', '\t', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries);
                    okCount += Array.FindAll(words, word => word.Trim().ToLower() == "ok").Length;
                    hiCount += Array.FindAll(words, word => word.Trim().ToLower() == "hi").Length;
                }
            }

            if (okCount >= 3)
            {
                textMeshPro.text = "How to get rare Gwent cards?";
            }
            else if (hiCount >= 3)
            {
                textMeshPro.text = "How to get Netral Card:Xili?";
            }
            else
            {
                textMeshPro.text = "You got another one?";
            }

            newDataAvailable = false;
        }
        //else
        //{
            //textMeshPro.text = "No gesture recognized!";
        //}
    }
}

AWS Polly text-to-speech

C#
Uses AWS Polly to convert the NPC's generated text into a voice file and play it back.
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Amazon.Polly;
using Amazon.Runtime;
using Amazon;
using Amazon.Polly.Model;
using System.IO;
using UnityEngine.Networking;
using System.Threading.Tasks;
using UnityEngine.Events;
public class TexttoSpeech : MonoBehaviour
{
    [SerializeField] private AudioSource audiosource;
    private string textToSynthesize;
    public string accessKey;
    public string secretKey;
    public UnityEvent OnreplyReceived;
    public UnityEvent OnPlaybackStarted;
    private void Start()
    {

    }
    public async void SynthesizeSpeech(string text)
    {
        var credentials = new BasicAWSCredentials(accessKey, secretKey);
        var client = new AmazonPollyClient(credentials, RegionEndpoint.EUCentral1);
        textToSynthesize = text;

        var request = new SynthesizeSpeechRequest()
        {
            Text = textToSynthesize,
            Engine = Engine.Neural,
            VoiceId = VoiceId.Aria,
            //VoiceId.Zhiyu,
            OutputFormat = OutputFormat.Mp3
        };

        var response = await client.SynthesizeSpeechAsync(request);

        WriteIntoFile(response.AudioStream);
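        // Polly returns an MP3 stream; it is saved to persistentDataPath and loaded back
        // as an AudioClip through UnityWebRequestMultimedia so the AudioSource can play it.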

        using (var www = UnityWebRequestMultimedia.GetAudioClip($"{Application.persistentDataPath}/audio.mp3", AudioType.MPEG))
        {
            var op = www.SendWebRequest();
            while (!op.isDone) await Task.Yield();

            var clip = DownloadHandlerAudioClip.GetContent(www);

            audiosource.clip = clip;
            audiosource.Play();
            OnreplyReceived.Invoke();
            OnPlaybackStarted.Invoke();
        }
    }




    private void WriteIntoFile(Stream stream)
    {
        using (var filestream = new FileStream($"{Application.persistentDataPath}/audio.mp3", FileMode.Create))
        {
            byte[] buffer = new byte[8 * 1024];
            int bytesRead;

            while ((bytesRead = stream.Read(buffer, 0, buffer.Length)) > 0)
            {
                filestream.Write(buffer, 0, bytesRead);

            }
        }
    }

}

Chat settings

C#
Sets the character's designated location and the actions triggered when the player approaches, such as opening a conversation and unlocking the cursor.
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.Windows;
using UnityEngine.InputSystem;
using System.Threading.Tasks;

public class Dialog_with_NPC : MonoBehaviour
{
    [SerializeField] private GameObject Maincamera;
    [SerializeField] private GameObject NPC_camera;

    [SerializeField] private GameObject Standing_Point;

    private Transform avatar;
 
    private async void OnTriggerEnter(Collider other)
    {
        if (other.CompareTag("Player"))
        {
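            // Disable player input, wait briefly, then snap the avatar to the standing point,
            // switch to the NPC camera, and release the cursor for the dialogue UI.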
            avatar = other.transform;
            avatar.GetComponent<PlayerInput>().enabled = false;
            await Task.Delay(20);
            avatar.position = Standing_Point.transform.position;
            avatar.rotation = Standing_Point.transform.rotation;

            Maincamera.SetActive(false);
            NPC_camera.SetActive(true);
            Cursor.visible = true;
            Cursor.lockState = CursorLockMode.None;
        }
    }
}

Credits

Daniel Liu
