Daniel Liu
Created July 31, 2024

Understanding Gesture Knowledge Virtual Character Assistant

In the game, NPC conversations give you hints about the game content or an introduction to it.


Things used in this project

Hardware components

AMD Radeon Pro W7900 GPU
×1

Software apps and online services

TensorFlow
Training the neural network
Jupyter Notebook
Amazon Web Services (AWS) Polly
Voice interaction
MediaPipe
Gesture recognition
Hugging Face
Downloading the trained model
ROCm
Platform for training the model

Hand tools and fabrication machines

Logitech C310 camera

Story


Schematics

No circuit boards or related equipment are used.

User interaction diagram

Code

MediaPipe code and sending data to Unity via UDP

Python
Data collection, model training, evaluation, and real-time inference are combined in this one script; comment or uncomment the relevant sections depending on which part you want to run.
import os
import mediapipe as mp
from matplotlib import pyplot as plt
import time
import cv2
import socket    
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import LSTM,Dense,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import multilabel_confusion_matrix,accuracy_score
from tensorflow.keras.models import load_model
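# Optional sanity check before training (assumes a ROCm build of TensorFlow on the Radeon Pro W7900):
# import tensorflow as tf
# print(tf.config.list_physical_devices('GPU'))   # the GPU should appear in this list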


mediapipe_holistic = mp.solutions.holistic
mediapipe_drawing = mp.solutions.drawing_utils
holistic = mediapipe_holistic.Holistic(static_image_mode = False, smooth_landmarks=True,min_detection_confidence=0.5,min_tracking_confidence=0.5)

socket_ = socket.socket(socket.AF_INET,socket.SOCK_DGRAM)  # UDP
serverAddressport = ("127.0.0.1",5015)
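# Port 5015 must match the port opened by the Unity UDP_hand receiver shown further below.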

def mp_detection(image,model):
    image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image,cv2.COLOR_RGB2BGR)
    return image,results


def draw_landmarks(image,results):
    mediapipe_drawing.draw_landmarks(image,results.face_landmarks,mediapipe_holistic.FACEMESH_TESSELATION)
    mediapipe_drawing.draw_landmarks(image,results.pose_landmarks,mediapipe_holistic.POSE_CONNECTIONS)
    mediapipe_drawing.draw_landmarks(image,results.left_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS)
    mediapipe_drawing.draw_landmarks(image,results.right_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS)
    
def draw_customerized_landmarks(image,results):
    #mediapipe_drawing.draw_landmarks(image,results.face_landmarks,mediapipe_holistic.FACEMESH_TESSELATION,mediapipe_drawing.DrawingSpec(color=(50,110,60),thickness=1,circle_radius=2))
    mediapipe_drawing.draw_landmarks(image,results.pose_landmarks,mediapipe_holistic.POSE_CONNECTIONS,mediapipe_drawing.DrawingSpec(color=(110,120,70),thickness=1,circle_radius=2))
    mediapipe_drawing.draw_landmarks(image,results.left_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS,mediapipe_drawing.DrawingSpec(color=(110,120,70),thickness=1,circle_radius=2))
    mediapipe_drawing.draw_landmarks(image,results.right_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS,mediapipe_drawing.DrawingSpec(color=(110,120,70),thickness=1,circle_radius=2))

def extract_keypoints(results):
    right_hand = np.array([[res.x,res.y,res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    left_hand = np.array([[res.x,res.y,res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    face = np.array([[res.x,res.y,res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    pose = np.array([[res.x,res.y,res.z,res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
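    # Concatenated length per frame: 33*4 + 468*3 + 21*3 + 21*3 = 1662 values,
    # which matches the LSTM input_shape of (30, 1662) used in the model below.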
    
    return np.concatenate([pose,face,right_hand,left_hand])   

actions = np.array(['hi','ok'])   
fragment_seq = 30
sequence_len = 30
dataset_path = r'C:\dataset_path_2'   # folder where the collected .npy keypoint sequences are stored

# Data processing after collection
label_mp = {'hi':0,'ok':1}

sequence,labels = [],[]

for action in (actions):
    for seq in range (fragment_seq):
        window = []
        for num in range (sequence_len) :
            res = np.load(os.path.join(dataset_path,action,str(seq),"{}.npy".format(num)))
            window.append(res)
        sequence.append(window)
        labels.append(label_mp[action])

x = np.array(sequence)
print(x.shape)

y = to_categorical(labels).astype(int)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.03)



#Construct neural network and save model

# model = Sequential([
#     LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1662)),
#     #Dropout(0.2),  
#     LSTM(128, return_sequences=True, activation='relu'),
#     #Dropout(0.2),
#     #LSTM(64, return_sequences=True, activation='relu'),
#     #Dropout(0.2),
#     LSTM(64, return_sequences=False, activation='relu'),
#     #Dropout(0.2),
#     Dense(64, activation='relu'),
#     #Dropout(0.2),
#     Dense(32, activation='relu'),
#     Dropout(0.1),
#     Dense(actions.shape[0], activation='softmax')
# ])
# optimizer = Adam(learning_rate=1e-4)
# model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
# model.fit(x_train,y_train,epochs=250)
#model.save('action_res.h5')





#Evaluate model
#del model
#model = load_model(r'C:\mediapipe_test\action_res.h5')
# answer = model.predict(x_test)
# true_answer = np.argmax(y_test,axis=1).tolist()
# answer = np.argmax(answer,axis=1).tolist()
# confusion_matrix = multilabel_confusion_matrix(true_answer, answer)
# print("Confusion Matrix:")
# print(confusion_matrix)
# accuracy = accuracy_score(true_answer,answer)
# print("Accuracy:", accuracy)

# res = model.predict(x_test)
# print(actions)




#test train model
model = load_model(r'C:\mediapipe_test\action_res.h5')
sequence = []
sentence = []
threshold = 0.6

cap = cv2.VideoCapture(0)
pre_time = 0
cur_time = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    image, results = mp_detection(frame, holistic)
    draw_customerized_landmarks(image, results)
    
    cur_time = time.time()
    fps = 1/ (cur_time-pre_time)
    pre_time = cur_time
    
    cv2.putText(image,str(int(fps))+" FPS",(10,70),cv2.FONT_HERSHEY_COMPLEX,1,(0,255,0),2)
    
    kps = extract_keypoints(results)
    
    sequence.append(kps)
    sequence = sequence[-30:]
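    # sequence now holds a sliding window of the most recent 30 frames,
    # the sequence length the LSTM was trained on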
    
    if len(sequence) == 30:
        res = model.predict(np.expand_dims(sequence, axis=0))[0]
        print(res)
        print(actions[np.argmax(res)])
        print(res.shape)

        # Only keep the prediction when its top-class probability exceeds the confidence threshold.
        if res[np.argmax(res)] >= threshold:
            sentence.append(actions[np.argmax(res)])
            if len(sentence) > 5:
                sentence = sentence[-5:]

            #image = prob_viz(res,actions,image,colors)

            cv2.rectangle(image, (0, 0), (640, 40), (245, 110, 160), -1)
            text = " ".join(sentence)
            cv2.putText(image, text, (3, 30), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
            socket_.sendto(text.encode(), serverAddressport)   # send the recognized gestures to Unity
        else:
            cv2.rectangle(image, (0, 0), (640, 40), (140, 60, 160), -1)
            cv2.putText(image, "Null!", (3, 30), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
    cv2.imshow('Read camera',image)
        
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    

cap.release()
cv2.destroyAllWindows()









#collect  data


# cap = cv2.VideoCapture(0)
# pre_time = 0
# cur_time = 0


# for  action in actions:
#     for seq in range(fragment_seq):
#         try:
#             os.makedirs(os.path.join(dataset_path,action,str(seq)))
#         except:
#             pass





# while cap.isOpened():
    
#     for action in actions:
#         for seq in range (fragment_seq):
#             for num in range (sequence_len):
#                 ret,frame = cap.read()
#                 image,results = mp_detection(frame,holistic)
#                 draw_customerized_landmarks(image, results)
                
#                 if num ==0:
#                     cv2.putText(image,"Begin Collection",(120,200),cv2.FONT_HERSHEY_COMPLEX,1,(0,255,0),2,cv2.LINE_AA)
#                     cv2.putText(image,"Collecting frames for {} video num{}".format(action,seq),(50,70),cv2.FONT_HERSHEY_COMPLEX,0.5,(130,130,0),1,cv2.LINE_AA)
#                     cv2.imshow('Read camera',image)
#                     cv2.waitKey(1000)
#                 else:
#                     cv2.putText(image,"Collecting frames for {} video num{}".format(action,seq),(50,70),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,0,0),1,cv2.LINE_AA)
#                     cv2.imshow('Read camera',image)
                    
                
#                 kps = extract_keypoints(results)
#                 np_path = os.path.join(dataset_path,action,str(seq),str(num))
#                 np.save(np_path,kps)
                
#                 if cv2.waitKey(1) & 0xFF == ord('q'):
#                     break
                
                


#     cap.release()
#     cv2.destroyAllWindows()
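
To check the UDP messages without launching Unity, a small standalone listener can be run alongside the recognizer. This is a minimal sketch (not part of the project code), assuming the script above is sending to 127.0.0.1:5015:

import socket

# Bind to the same address/port that the recognizer sends to.
receiver = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
receiver.bind(("127.0.0.1", 5015))
print("Waiting for gesture messages... (Ctrl+C to stop)")

while True:
    data, addr = receiver.recvfrom(1024)   # one datagram per recognized gesture window
    print(addr, data.decode("utf-8"))      # e.g. "hi ok hi"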
    
    

LLM interaction in Unity

C#
Based on LLMUnity (https://github.com/undreamai/LLMUnity); the code was modified to implement the character's dialogue and to control when the reply text is displayed.
using UnityEngine;
using LLMUnity;
using UnityEngine.UI;
using UnityEngine.EventSystems;
using UnityEngine.InputSystem;
//using static System.Net.Mime.MediaTypeNames;

namespace LLMUnitySamples
{
    public class SimpleInteraction : MonoBehaviour
    {
        [SerializeField] private GameObject Maincamera;
        [SerializeField] private GameObject NPC_camera;
        public Transform avatar;
        public LLMCharacter llmCharacter;
        public InputField playerText;
        public Text AIText;
        private TexttoSpeech texttospeech;
        private string aitext;

        void Start()
        {
            playerText.onSubmit.AddListener(onInputFieldSubmit);
            playerText.Select();
            texttospeech = FindObjectOfType<TexttoSpeech>();
        }
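        // The LLM reply is buffered in aitext and only written to AIText once
        // OnTextToSpeechPlaybackStarted fires, so the on-screen text appears together with the voice.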

        void onInputFieldSubmit(string message)
        {
            playerText.interactable = false;
            AIText.text = "...";
            _ = llmCharacter.Chat(message, SetAIText, AIReplyComplete);
        }

        public void SetAIText(string text)
        {
            //AIText.text = text;
            aitext = text;
        }
        public void OnTextToSpeechPlaybackStarted()
        {
            AIText.text = aitext;
        }

        public void AIReplyComplete()
        {
            playerText.interactable = true;
            playerText.Select();
            playerText.text = "";
            if (texttospeech != null)
            {
                texttospeech.SynthesizeSpeech(aitext);
            }
        }

        public void CancelRequests()
        {
            llmCharacter.CancelRequests();
            AIReplyComplete();
        }

        public void ExitGame()
        {
            Debug.Log("Exit button clicked");
            Application.Quit();
            avatar.GetComponent<PlayerInput>().enabled = true;
            Maincamera.SetActive(true);
            NPC_camera.SetActive(false);
            Cursor.visible = false;
            Cursor.lockState = CursorLockMode.Locked;
        }
        bool onValidateWarning = true;
        void OnValidate()
        {
            if (onValidateWarning && !llmCharacter.remote && llmCharacter.llm != null && llmCharacter.llm.model == "")
            {
                Debug.LogWarning($"Please select a model in the {llmCharacter.llm.gameObject.name} GameObject!");
                onValidateWarning = false;
            }
        }
    }
}

Unity UDP code

C#
Receives the socket data in Unity. If the same gesture string is received three or more times, a specific prompt is displayed.
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System;
using System.Text;
using System.Net;
using System.Net.Sockets;
using System.Threading;
using TMPro;

public class UDP_hand : MonoBehaviour
{
    private Thread receiveThread;
    private UdpClient client;
    public int port = 5015;
    public bool startReceiving = true;
    public bool printToConsole = false;
    public TextMeshProUGUI textMeshPro;

    private Queue<string> dataQueue = new Queue<string>();
    private readonly int maxDataCount = 5;
    private bool newDataAvailable = false;

    private void Start()
    {
        receiveThread = new Thread(new ThreadStart(ReceiveData));
        receiveThread.IsBackground = true;
        receiveThread.Start();
    }

    private void OnDestroy()
    {
        startReceiving = false;
        if (receiveThread != null && receiveThread.IsAlive)
        {
            receiveThread.Abort();
        }
        if (client != null)
        {
            client.Close();
        }
    }

    private void ReceiveData()
    {
        client = new UdpClient(port);
        while (startReceiving)
        {
            try
            {
                IPEndPoint anyIP = new IPEndPoint(IPAddress.Any, 0);
                byte[] dataByte = client.Receive(ref anyIP);
                string data = Encoding.UTF8.GetString(dataByte);

                lock (dataQueue)
                {
                    if (dataQueue.Count >= maxDataCount)
                    {
                        dataQueue.Dequeue(); 
                    }
                    dataQueue.Enqueue(data); 
                }

                newDataAvailable = true;

                if (printToConsole)
                {
                    print(data);
                }
            }
            catch (Exception err)
            {
                print(err.ToString());
            }
        }
    }

    private void Update()
    {
        if (newDataAvailable)
        {
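            // Count how many of the buffered messages (at most the last 5) contain the
            // gestures "ok" or "hi"; three or more matches selects the corresponding question.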
            int okCount = 0;
            int hiCount = 0;
            lock (dataQueue)
            {
                foreach (string data in dataQueue)
                {
                    string[] words = data.Split(new char[] { ' ', '\t', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries);
                    okCount += Array.FindAll(words, word => word.Trim().ToLower() == "ok").Length;
                    hiCount += Array.FindAll(words, word => word.Trim().ToLower() == "hi").Length;
                }
            }

            if (okCount >= 3)
            {
                textMeshPro.text = "How to get rare Gwent cards?";
            }
            else if (hiCount >= 3)
            {
                textMeshPro.text = "How to get Netral Card:Xili?";
            }
            else
            {
                textMeshPro.text = "You got another one?";
            }

            newDataAvailable = false;
        }
        //else
        //{
            //textMeshPro.text = "No gesture recognized!";
        //}
    }
}

AWS Polly text-to-speech

C#
Uses AWS Polly to convert the NPC's generated text into a voice file and play it back.
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Amazon.Polly;
using Amazon.Runtime;
using Amazon;
using Amazon.Polly.Model;
using System.IO;
using UnityEngine.Networking;
using System.Threading.Tasks;
using UnityEngine.Events;
public class TexttoSpeech : MonoBehaviour
{
    [SerializeField] private AudioSource audiosource;
    private string textToSynthesize;
    public string accessKey;
    public string secretKey;
    public UnityEvent OnreplyReceived;
    public UnityEvent OnPlaybackStarted;
    private void Start()
    {

    }
    public async void SynthesizeSpeech(string text)
    {
        var credentials = new BasicAWSCredentials(accessKey, secretKey);
        var client = new AmazonPollyClient(credentials, RegionEndpoint.EUCentral1);
        textToSynthesize = text;

        var request = new SynthesizeSpeechRequest()
        {
            Text = textToSynthesize,
            Engine = Engine.Neural,
            VoiceId = VoiceId.Aria,
            //VoiceId.Zhiyu,
            OutputFormat = OutputFormat.Mp3
        };

        var response = await client.SynthesizeSpeechAsync(request);

        WriteIntoFile(response.AudioStream);
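        // Polly returns an MP3 stream; it is saved to persistentDataPath and loaded back
        // as an AudioClip through UnityWebRequestMultimedia so the AudioSource can play it.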

        using (var www = UnityWebRequestMultimedia.GetAudioClip($"{Application.persistentDataPath}/audio.mp3", AudioType.MPEG))
        {
            var op = www.SendWebRequest();
            while (!op.isDone) await Task.Yield();

            var clip = DownloadHandlerAudioClip.GetContent(www);

            audiosource.clip = clip;
            audiosource.Play();
            OnreplyReceived.Invoke();
            OnPlaybackStarted.Invoke();
        }
    }




    private void WriteIntoFile(Stream stream)
    {
        using (var filestream = new FileStream($"{Application.persistentDataPath}/audio.mp3", FileMode.Create))
        {
            byte[] buffer = new byte[8 * 1024];
            int bytesRead;

            while ((bytesRead = stream.Read(buffer, 0, buffer.Length)) > 0)
            {
                filestream.Write(buffer, 0, bytesRead);

            }
        }
    }

}

Chat settings

C#
Sets the character's designated location and the actions triggered when the player approaches, such as opening a conversation and unlocking the cursor.
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.Windows;
using UnityEngine.InputSystem;
using System.Threading.Tasks;

public class Dialog_with_NPC : MonoBehaviour
{
    [SerializeField] private GameObject Maincamera;
    [SerializeField] private GameObject NPC_camera;

    [SerializeField] private GameObject Standing_Point;

    private Transform avatar;
 
    private async void OnTriggerEnter(Collider other)
    {
        if (other.CompareTag("Player"))
        {
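            // Disable player input, wait briefly, then snap the avatar to the standing point,
            // switch to the NPC camera, and release the cursor for the dialogue UI.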
            avatar = other.transform;
            avatar.GetComponent<PlayerInput>().enabled = false;
            await Task.Delay(20);
            avatar.position = Standing_Point.transform.position;
            avatar.rotation = Standing_Point.transform.rotation;

            Maincamera.SetActive(false);
            NPC_camera.SetActive(true);
            Cursor.visible = true;
            Cursor.lockState = CursorLockMode.None;
        }
    }
}

Credits

Daniel Liu
