1. What is your project about?
My project gives relevant game introductions and tips after recognizing hand gestures. As an experiment, it uses the story of the well-known game The Witcher 3.
2. Why did you decide to make it?
The purpose of this project is to give users an experience that goes beyond the usual mouse-and-keyboard input.
3. How does it work?
The player enters the game through Unity and chats with an NPC. MediaPipe and a trained model recognize gestures from the webcam; when recognition succeeds, a prompt appears that the player can then ask the NPC in-game. A minimal sketch of this hand-off follows the Q&A below.
4. Images and videos
The white box in the upper-left corner shows the prompt text generated after gesture recognition.
The Stop button in the dialog box stops text generation, and the icon in the upper-right corner exits the dialog.
Once the prompt is displayed, you can ask the NPC about things that players would otherwise rarely come across.
Entering a conversation message.
The NPC replies to the conversation message.
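To make the gesture-to-prompt hand-off concrete, here is a minimal sketch (it is not part of the project code) that pretends the recognizer has just seen the 'ok' gesture and pushes the label to Unity the same way the MediaPipe script below does. It assumes the Unity scene with the UDP_hand receiver (shown later) is running and listening on 127.0.0.1:5015, and that 'hi' and 'ok' are the two trained gesture labels.
# Minimal hand-off sketch: send a recognized gesture label to the Unity UDP receiver.
# Assumption: the Unity scene with UDP_hand is running and listening on port 5015.
import socket
import time

UNITY_ADDRESS = ("127.0.0.1", 5015)
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)

for _ in range(5):                      # repeated labels make UDP_hand switch the prompt
    sock.sendto(b"ok", UNITY_ADDRESS)   # 'ok' maps to the Gwent-card question in UDP_hand
    time.sleep(0.2)

sock.close()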
MediaPipe code and sending data to Unity via UDP
Python: This is the combined code for data collection, training, evaluation, and live recognition. The training, evaluation, and data-collection sections are commented out; comment or uncomment the relevant parts before running.
import os
import mediapipe as mp
from matplotlib import pyplot as plt
import time
import cv2
import socket
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import LSTM,Dense,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import multilabel_confusion_matrix,accuracy_score
from tensorflow.keras.models import load_model
mediapipe_holistic = mp.solutions.holistic
mediapipe_drawing = mp.solutions.drawing_utils
holistic = mediapipe_holistic.Holistic(static_image_mode = False, smooth_landmarks=True,min_detection_confidence=0.5,min_tracking_confidence=0.5)
socket_ = socket.socket(socket.AF_INET,socket.SOCK_DGRAM) # UDP
serverAddressport = ("127.0.0.1",5015)
def mp_detection(image,model):
    # Run MediaPipe on an RGB copy of the frame, then convert back to BGR for OpenCV drawing
    image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image,cv2.COLOR_RGB2BGR)
    return image,results
def draw_landmarks(image,results):
    mediapipe_drawing.draw_landmarks(image,results.face_landmarks,mediapipe_holistic.FACEMESH_TESSELATION)
    mediapipe_drawing.draw_landmarks(image,results.pose_landmarks,mediapipe_holistic.POSE_CONNECTIONS)
    mediapipe_drawing.draw_landmarks(image,results.left_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS)
    mediapipe_drawing.draw_landmarks(image,results.right_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS)
def draw_customerized_landmarks(image,results):
    #mediapipe_drawing.draw_landmarks(image,results.face_landmarks,mediapipe_holistic.FACEMESH_TESSELATION,mediapipe_drawing.DrawingSpec(color=(50,110,60),thickness=1,circle_radius=2))
    mediapipe_drawing.draw_landmarks(image,results.pose_landmarks,mediapipe_holistic.POSE_CONNECTIONS,mediapipe_drawing.DrawingSpec(color=(110,120,70),thickness=1,circle_radius=2))
    mediapipe_drawing.draw_landmarks(image,results.left_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS,mediapipe_drawing.DrawingSpec(color=(110,120,70),thickness=1,circle_radius=2))
    mediapipe_drawing.draw_landmarks(image,results.right_hand_landmarks,mediapipe_holistic.HAND_CONNECTIONS,mediapipe_drawing.DrawingSpec(color=(110,120,70),thickness=1,circle_radius=2))
def extract_keypoints(results):
    # Flatten all landmarks into one 1662-value feature vector
    # (pose 33*4 + face 468*3 + 21*3 per hand); zero-fill any part that was not detected
    right_hand = np.array([[res.x,res.y,res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    left_hand = np.array([[res.x,res.y,res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    face = np.array([[res.x,res.y,res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    pose = np.array([[res.x,res.y,res.z,res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    return np.concatenate([pose,face,right_hand,left_hand])
actions = np.array(['hi','ok'])        # gesture classes
fragment_seq = 30                      # number of recorded sequences per action
sequence_len = 30                      # frames per sequence
dataset_path = r'C:\dataset_path_2'    # raw string so the backslash is not treated as an escape
# Data processing after collection
label_mp = {'hi':0,'ok':1}
sequence,labels = [],[]
for action in actions:
    for seq in range(fragment_seq):
        window = []
        for num in range(sequence_len):
            res = np.load(os.path.join(dataset_path,action,str(seq),"{}.npy".format(num)))
            window.append(res)
        sequence.append(window)
        labels.append(label_mp[action])
x = np.array(sequence)
print(x.shape)
y = to_categorical(labels).astype(int)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.03)
# Construct the neural network and save the model
# model = Sequential([
#     LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1662)),
#     #Dropout(0.2),
#     LSTM(128, return_sequences=True, activation='relu'),
#     #Dropout(0.2),
#     #LSTM(64, return_sequences=True, activation='relu'),
#     #Dropout(0.2),
#     LSTM(64, return_sequences=False, activation='relu'),
#     #Dropout(0.2),
#     Dense(64, activation='relu'),
#     #Dropout(0.2),
#     Dense(32, activation='relu'),
#     Dropout(0.1),
#     Dense(actions.shape[0], activation='softmax')
# ])
# optimizer = Adam(learning_rate=1e-4)
# model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
# model.fit(x_train, y_train, epochs=250)
# model.save('action_res.h5')
#Evaluate model
#del model
#model = load_model(r'C:\mediapipe_test\action_res.h5')
# answer = model.predict(x_test)
# true_answer = np.argmax(y_test,axis=1).tolist()
# answer = np.argmax(answer,axis=1).tolist()
# confusion_matrix = multilabel_confusion_matrix(true_answer, answer)
# print("Confusion Matrix:")
# print(confusion_matrix)
# accuracy = accuracy_score(true_answer,answer)
# print("Accuracy:", accuracy)
# res = model.predict(x_test)
# print(actions)
# Test the trained model on the live camera feed
model = load_model(r'C:\mediapipe_test\action_res.h5')
sequence = []
sentence = []
threshold = 0.6
cap = cv2.VideoCapture(0)
pre_time = 0
cur_time = 0
while cap.isOpened():
    ret,frame = cap.read()
    image,results = mp_detection(frame, holistic)
    draw_customerized_landmarks(image, results)
    # FPS overlay
    cur_time = time.time()
    fps = 1 / (cur_time - pre_time)
    pre_time = cur_time
    cv2.putText(image,str(int(fps))+" FPS",(10,70),cv2.FONT_HERSHEY_COMPLEX,1,(0,255,0),2)
    # Keep a rolling window of the last 30 keypoint frames for the LSTM
    kps = extract_keypoints(results)
    sequence.append(kps)
    sequence = sequence[-30:]
    if len(sequence) == 30:
        res = model.predict(np.expand_dims(sequence, axis=0))[0]
        print(res)
        print(actions[np.argmax(res)])
        print(res.shape)
        if res[np.argmax(res)] >= threshold:
            # Append the label only when it changes, so the banner is not flooded
            if len(sentence) > 0:
                if actions[np.argmax(res)] != sentence[-1]:
                    sentence.append(actions[np.argmax(res)])
            else:
                sentence.append(actions[np.argmax(res)])
            if len(sentence) > 5:
                sentence = sentence[-5:]
            #image = prob_viz(res,actions,image,colors)
            cv2.rectangle(image,(0,0),(640,40),(245,110,160),-1)
            text = " ".join(sentence)
            cv2.putText(image,text,(3,30),cv2.FONT_HERSHEY_COMPLEX,1,(255,255,255),2,cv2.LINE_AA)
            socket_.sendto(text.encode(),serverAddressport)  # send the recognized labels to Unity
        else:
            cv2.rectangle(image,(0,0),(640,40),(140,60,160),-1)
            cv2.putText(image,"Null! ",(3,30),cv2.FONT_HERSHEY_COMPLEX,1,(255,255,255),2,cv2.LINE_AA)
    cv2.imshow('Read camera',image)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()
# Collect data
# cap = cv2.VideoCapture(0)
# pre_time = 0
# cur_time = 0
# for action in actions:
#     for seq in range(fragment_seq):
#         try:
#             os.makedirs(os.path.join(dataset_path,action,str(seq)))
#         except:
#             pass
# while cap.isOpened():
#     for action in actions:
#         for seq in range(fragment_seq):
#             for num in range(sequence_len):
#                 ret,frame = cap.read()
#                 image,results = mp_detection(frame,holistic)
#                 draw_customerized_landmarks(image, results)
#                 if num == 0:
#                     cv2.putText(image,"Begin Collection",(120,200),cv2.FONT_HERSHEY_COMPLEX,1,(0,255,0),2,cv2.LINE_AA)
#                     cv2.putText(image,"Collecting frames for {} video num{}".format(action,seq),(50,70),cv2.FONT_HERSHEY_COMPLEX,0.5,(130,130,0),1,cv2.LINE_AA)
#                     cv2.imshow('Read camera',image)
#                     cv2.waitKey(1000)
#                 else:
#                     cv2.putText(image,"Collecting frames for {} video num{}".format(action,seq),(50,70),cv2.FONT_HERSHEY_COMPLEX,0.5,(255,0,0),1,cv2.LINE_AA)
#                     cv2.imshow('Read camera',image)
#                 kps = extract_keypoints(results)
#                 np_path = os.path.join(dataset_path,action,str(seq),str(num))
#                 np.save(np_path,kps)
#                 if cv2.waitKey(1) & 0xFF == ord('q'):
#                     break
# cap.release()
# cv2.destroyAllWindows()
LLM interaction in Unity
C#: Based on the GitHub project https://github.com/undreamai/LLMUnity, modified to implement the character's dialogue and to control when the reply text is displayed.
using UnityEngine;
using LLMUnity;
using UnityEngine.UI;
using UnityEngine.EventSystems;
using UnityEngine.InputSystem;
//using static System.Net.Mime.MediaTypeNames;
namespace LLMUnitySamples
{
    public class SimpleInteraction : MonoBehaviour
    {
        [SerializeField] private GameObject Maincamera;
        [SerializeField] private GameObject NPC_camera;
        public Transform avatar;
        public LLMCharacter llmCharacter;
        public InputField playerText;
        public Text AIText;
        private TexttoSpeech texttospeech;
        private string aitext;

        void Start()
        {
            playerText.onSubmit.AddListener(onInputFieldSubmit);
            playerText.Select();
            texttospeech = FindObjectOfType<TexttoSpeech>();
        }

        void onInputFieldSubmit(string message)
        {
            playerText.interactable = false;
            AIText.text = "...";
            _ = llmCharacter.Chat(message, SetAIText, AIReplyComplete);
        }

        // Cache the streamed reply; it is shown once speech playback starts
        public void SetAIText(string text)
        {
            //AIText.text = text;
            aitext = text;
        }

        public void OnTextToSpeechPlaybackStarted()
        {
            AIText.text = aitext;
        }

        public void AIReplyComplete()
        {
            playerText.interactable = true;
            playerText.Select();
            playerText.text = "";
            if (texttospeech != null)
            {
                texttospeech.SynthesizeSpeech(aitext);
            }
        }

        public void CancelRequests()
        {
            llmCharacter.CancelRequests();
            AIReplyComplete();
        }

        // Exit button: quit the application (ignored in the editor) and hand control back to the player
        public void ExitGame()
        {
            Debug.Log("Exit button clicked");
            Application.Quit();
            avatar.GetComponent<PlayerInput>().enabled = true;
            Maincamera.SetActive(true);
            NPC_camera.SetActive(false);
            Cursor.visible = false;
            Cursor.lockState = CursorLockMode.Locked;
        }

        bool onValidateWarning = true;

        void OnValidate()
        {
            if (onValidateWarning && !llmCharacter.remote && llmCharacter.llm != null && llmCharacter.llm.model == "")
            {
                Debug.LogWarning($"Please select a model in the {llmCharacter.llm.gameObject.name} GameObject!");
                onValidateWarning = false;
            }
        }
    }
}
Unity UDP code
C#: Receives the gesture labels that the Python script sends over UDP. If the same label appears at least three times among the recent messages, a matching question prompt is displayed. A stand-in test listener for the Python side is sketched after this code.
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System;
using System.Text;
using System.Net;
using System.Net.Sockets;
using System.Threading;
using TMPro;
public class UDP_hand : MonoBehaviour
{
    private Thread receiveThread;
    private UdpClient client;
    public int port = 5015;
    public bool startReceiving = true;
    public bool printToConsole = false;
    public TextMeshProUGUI textMeshPro;

    // Keep only the most recent messages received from the Python recognizer
    private Queue<string> dataQueue = new Queue<string>();
    private readonly int maxDataCount = 5;
    private bool newDataAvailable = false;

    private void Start()
    {
        receiveThread = new Thread(new ThreadStart(ReceiveData));
        receiveThread.IsBackground = true;
        receiveThread.Start();
    }

    private void OnDestroy()
    {
        startReceiving = false;
        if (receiveThread != null && receiveThread.IsAlive)
        {
            receiveThread.Abort();
        }
        if (client != null)
        {
            client.Close();
        }
    }

    // Background thread: receive gesture labels sent by the Python recognizer
    private void ReceiveData()
    {
        client = new UdpClient(port);
        while (startReceiving)
        {
            try
            {
                IPEndPoint anyIP = new IPEndPoint(IPAddress.Any, 0);
                byte[] dataByte = client.Receive(ref anyIP);
                string data = Encoding.UTF8.GetString(dataByte);
                lock (dataQueue)
                {
                    if (dataQueue.Count >= maxDataCount)
                    {
                        dataQueue.Dequeue();
                    }
                    dataQueue.Enqueue(data);
                }
                newDataAvailable = true;
                if (printToConsole)
                {
                    print(data);
                }
            }
            catch (Exception err)
            {
                print(err.ToString());
            }
        }
    }

    private void Update()
    {
        if (newDataAvailable)
        {
            // Count how often each gesture label appears in the recent messages
            int okCount = 0;
            int hiCount = 0;
            lock (dataQueue)
            {
                foreach (string data in dataQueue)
                {
                    string[] words = data.Split(new char[] { ' ', '\t', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries);
                    okCount += Array.FindAll(words, word => word.Trim().ToLower() == "ok").Length;
                    hiCount += Array.FindAll(words, word => word.Trim().ToLower() == "hi").Length;
                }
            }
            // A label seen at least three times switches the on-screen prompt
            if (okCount >= 3)
            {
                textMeshPro.text = "How to get rare Gwent cards?";
            }
            else if (hiCount >= 3)
            {
                textMeshPro.text = "How to get the neutral card Ciri?";
            }
            else
            {
                textMeshPro.text = "You got another one?";
            }
            newDataAvailable = false;
        }
        //else
        //{
        //    textMeshPro.text = "No gesture recognized!";
        //}
    }
}
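As noted above, a quick way to check what the Python recognizer is sending without launching Unity is to bind a small stand-in listener to the same port and print the incoming labels. This is only a debugging sketch, not part of the project; since only one process can bind port 5015 at a time, close Unity (or stop Play mode) before running it.
# Stand-in for the Unity UDP_hand receiver: print whatever arrives on port 5015.
# Run only while Unity is not listening, since only one process can bind the port.
import socket

sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
sock.bind(("127.0.0.1", 5015))
print("Listening on 127.0.0.1:5015 (Ctrl+C to stop)")
while True:
    data, addr = sock.recvfrom(1024)
    print(addr, data.decode("utf-8", errors="replace"))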
AWS Polly text to speech
C#: Uses AWS Polly to convert the NPC's generated reply into speech, save it as an audio file, and play it back.
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Amazon.Polly;
using Amazon.Runtime;
using Amazon;
using Amazon.Polly.Model;
using System.IO;
using UnityEngine.Networking;
using System.Threading.Tasks;
using UnityEngine.Events;
public class TexttoSpeech : MonoBehaviour
{
    [SerializeField] private AudioSource audiosource;
    private string textTosynthhesize;
    public string accessKey;
    public string secretKey;
    public UnityEvent OnreplyReceived;
    public UnityEvent OnPlaybackStarted;

    private async void Start()
    {
    }

    public async void SynthesizeSpeech(string text)
    {
        var credentials = new BasicAWSCredentials(accessKey, secretKey);
        var client = new AmazonPollyClient(credentials, RegionEndpoint.EUCentral1);
        textTosynthhesize = text;
        var request = new SynthesizeSpeechRequest()
        {
            Text = textTosynthhesize,
            Engine = Engine.Neural,
            VoiceId = VoiceId.Aria,
            //VoiceId.Zhiyu,
            OutputFormat = OutputFormat.Mp3
        };
        var response = await client.SynthesizeSpeechAsync(request);
        WriteIntoFile(response.AudioStream);
        using (var www = UnityWebRequestMultimedia.GetAudioClip($"{Application.persistentDataPath}/audio.mp3", AudioType.MPEG))
        {
            var op = www.SendWebRequest();
            while (!op.isDone) await Task.Yield();
            var clip = DownloadHandlerAudioClip.GetContent(www);
            audiosource.clip = clip;
            audiosource.Play();
            OnreplyReceived.Invoke();
            OnPlaybackStarted.Invoke();
        }
    }

    private void WriteIntoFile(Stream stream)
    {
        using (var filestream = new FileStream($"{Application.persistentDataPath}/audio.mp3", FileMode.Create))
        {
            byte[] buffer = new byte[8 * 1024];
            int bytesRead;
            while ((bytesRead = stream.Read(buffer, 0, buffer.Length)) > 0)
            {
                filestream.Write(buffer, 0, bytesRead);
            }
        }
    }
}
Chat settings
C#: Moves the player to a designated spot in front of the character and triggers the actions for starting a conversation, such as switching cameras and unlocking the cursor.
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.Windows;
using UnityEngine.InputSystem;
using System.Threading.Tasks;
public class Dialog_with_NPC : MonoBehaviour
{
    [SerializeField] private GameObject Maincamera;
    [SerializeField] private GameObject NPC_camera;
    [SerializeField] private GameObject Standing_Point;
    private Transform avatar;

    private async void OnTriggerEnter(Collider other)
    {
        if (other.CompareTag("Player"))
        {
            // Lock player input, move the avatar to the standing point,
            // switch to the NPC camera and free the cursor for the chat UI
            avatar = other.transform;
            avatar.GetComponent<PlayerInput>().enabled = false;
            await Task.Delay(20);
            avatar.position = Standing_Point.transform.position;
            avatar.rotation = Standing_Point.transform.rotation;
            Maincamera.SetActive(false);
            NPC_camera.SetActive(true);
            Cursor.visible = true;
            Cursor.lockState = CursorLockMode.None;
        }
    }
}