import csv
import copy
import argparse
import itertools
from collections import Counter
from collections import deque
import serial
import serial.tools.list_ports
import cv2 as cv
import numpy as np
import mediapipe as mp
from utils import CvFpsCalc
from model import KeyPointClassifier
from model import PointHistoryClassifier
def find_working_serial_port():
    """Probe the available serial ports and report the first one that opens.

    Each port listed by ``serial.tools.list_ports.comports()`` is opened at
    9600 baud with a one-second timeout and closed again immediately; progress
    is printed along the way.

    Returns:
        The ``device`` name of the first port that opened, or ``None`` (after
        printing a notice) when none did.
    """
    found = None
    for candidate in serial.tools.list_ports.comports():
        try:
            print(f"Trying port: {candidate.device}")
            with serial.Serial(candidate.device, baudrate=9600,
                               timeout=1) as probe:
                if probe.is_open:
                    found = candidate.device
                    print(f"Found working port: {found}")
                    break
        except (serial.SerialException, OSError) as e:
            # A port that cannot be opened is reported and skipped.
            print(f"Port {candidate.device} not working: {e}")

    if not found:
        print("No working serial port found.")
        return None
    return found
def get_args(argv=None):
    """Parse the command-line options of the detection script.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]`` as before.

    Returns:
        argparse.Namespace with ``device``, ``width``, ``height``,
        ``use_static_image_mode``, ``min_detection_confidence`` and
        ``min_tracking_confidence``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--device", type=int, default=1)
    parser.add_argument("--width", help='cap width', type=int, default=960)
    parser.add_argument("--height", help='cap height', type=int, default=540)

    parser.add_argument('--use_static_image_mode', action='store_true')
    parser.add_argument("--min_detection_confidence",
                        help='min_detection_confidence',
                        type=float,
                        default=0.7)
    # Must be float like the detection threshold: with type=int argparse
    # rejected fractional values such as "0.6" given on the command line.
    parser.add_argument("--min_tracking_confidence",
                        help='min_tracking_confidence',
                        type=float,
                        default=0.5)

    return parser.parse_args(argv)
def _estimate_head_pose(face_landmarks, img_w, img_h):
    """Estimate the head rotation of one face from six face-mesh landmarks.

    Args:
        face_landmarks: One entry of ``multi_face_landmarks``; its
            ``landmark`` items expose normalised ``x``, ``y`` and ``z``.
        img_w: Frame width in pixels.
        img_h: Frame height in pixels.

    Returns:
        ``(x, y, z, nose_2d)`` where x/y/z are the decomposed rotation angles
        multiplied by 360 and ``nose_2d`` is the pixel position of landmark 1,
        or ``None`` when the nose landmark is missing or solvePnP fails.
    """
    reference_ids = (33, 263, 1, 61, 291, 199)

    # Fresh lists per face: they are rebound to arrays below, so sharing them
    # between faces would break on the second face.
    face_2d = []
    face_3d = []
    nose_2d = None
    for idx, lm in enumerate(face_landmarks.landmark):
        if idx not in reference_ids:
            continue
        if idx == 1:
            nose_2d = (lm.x * img_w, lm.y * img_h)
        px, py = int(lm.x * img_w), int(lm.y * img_h)
        face_2d.append([px, py])
        face_3d.append([px, py, lm.z])

    if nose_2d is None:
        return None

    face_2d = np.array(face_2d, dtype=np.float64)
    face_3d = np.array(face_3d, dtype=np.float64)

    focal_length = 1 * img_w
    # Intrinsic matrix layout is [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]:
    # cx belongs to the width and cy to the height (they were swapped).
    cam_matrix = np.array([[focal_length, 0, img_w / 2],
                           [0, focal_length, img_h / 2],
                           [0, 0, 1]])
    distortion_matrix = np.zeros((4, 1), dtype=np.float64)

    success, rotation_vec, _ = cv.solvePnP(face_3d, face_2d, cam_matrix,
                                           distortion_matrix)
    if not success:
        return None

    rmat, _ = cv.Rodrigues(rotation_vec)
    angles = cv.RQDecomp3x3(rmat)[0]
    # Scale factor kept as-is; the direction thresholds compare against
    # these scaled values.
    return angles[0] * 360, angles[1] * 360, angles[2] * 360, nose_2d


def _classify_head_direction(x, y):
    """Map the scaled x/y rotation to a display label and a serial message.

    ``y`` is tested before ``x``; values within +/-10 on both count as
    facing forward.
    """
    if y < -10:
        return "Looking Left", b'5: Left\n'
    if y > 10:
        return "Looking Right", b'4: Right\n'
    if x < -10:
        return "Looking Down", b'3: Down\n'
    if x > 10:
        return "Looking Up", b'2: Up\n'
    return "Forward", b'1: Forward\n'


def main():
    """Run the capture loop and report gestures and head direction over serial.

    Opens the camera selected on the command line and a fixed serial port,
    loads the MediaPipe hand and face-mesh solutions plus the two gesture
    classifiers, then processes frames until ESC is pressed or the camera
    stops delivering frames. For every frame it:

    * prints any line waiting on the serial port,
    * writes ``Gesture: <label>`` for each detected hand,
    * writes one numbered head-direction message for each detected face,
    * shows the annotated frame in two windows.

    Key presses are passed to ``select_mode`` and the resulting mode/number
    to ``logging_csv``. The camera, serial port, MediaPipe graphs and windows
    are released on every exit path.
    """
    # Argument parsing
    args = get_args()
    use_brect = True

    # Camera preparation
    cap = cv.VideoCapture(args.device)
    cap.set(cv.CAP_PROP_FRAME_WIDTH, args.width)
    cap.set(cv.CAP_PROP_FRAME_HEIGHT, args.height)

    # Serial port. To auto-detect instead of hard-coding, use:
    #   serial_port = find_working_serial_port()
    serial_port = '/dev/cu.usbmodem1101'  # Replace this with your port
    if not serial_port:
        print("No available serial port. Exiting.")
        cap.release()
        return
    try:
        ser = serial.Serial(port=serial_port, baudrate=9600, timeout=1)
    except serial.SerialException:
        # Do not leave the camera claimed when the port cannot be opened.
        cap.release()
        raise
    print(f"Connected to {serial_port}")

    hands = None
    face_mesh = None
    try:
        # Model load
        # 1. Hand gesture model
        mp_hands = mp.solutions.hands
        hands = mp_hands.Hands(
            static_image_mode=args.use_static_image_mode,
            max_num_hands=1,
            min_detection_confidence=args.min_detection_confidence,
            min_tracking_confidence=args.min_tracking_confidence,
        )
        keypoint_classifier = KeyPointClassifier()
        point_history_classifier = PointHistoryClassifier()

        # Read labels (first column of each CSV)
        with open('model/keypoint_classifier/keypoint_classifier_label.csv',
                  encoding='utf-8-sig') as f:
            keypoint_classifier_labels = [row[0] for row in csv.reader(f)]
        with open(
                'model/point_history_classifier/point_history_classifier_label.csv',
                encoding='utf-8-sig') as f:
            point_history_classifier_labels = [
                row[0] for row in csv.reader(f)
            ]

        # Coordinate history and finger gesture history
        history_length = 16
        point_history = deque(maxlen=history_length)
        finger_gesture_history = deque(maxlen=history_length)
        mode = 0

        # 2. Face model
        mp_face_mesh = mp.solutions.face_mesh
        face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.5,
                                          min_tracking_confidence=0.5)
        mp_drawing = mp.solutions.drawing_utils
        drawing_spec = mp_drawing.DrawingSpec(color=(128, 0, 128),
                                              thickness=2,
                                              circle_radius=1)

        # FPS Measurement
        cvFpsCalc = CvFpsCalc(buffer_len=10)

        while cap.isOpened():
            fps = cvFpsCalc.get()

            # Process Key (ESC: end)
            key = cv.waitKey(10)
            if key == 27:  # ESC
                break
            number, mode = select_mode(key, mode)

            # Camera capture
            ret, image = cap.read()
            if not ret:
                break
            image = cv.flip(image, 1)  # Mirror display
            hand_debug_image = copy.deepcopy(image)

            # Detection implementation
            image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
            image.flags.writeable = False
            hand_results = hands.process(image)
            face_results = face_mesh.process(image)
            image.flags.writeable = True

            if ser.in_waiting > 0:  # Check if data is available
                # errors='replace': stray non-UTF-8 bytes on the line must not
                # abort the whole capture loop.
                data = ser.readline().decode('utf-8',
                                             errors='replace').strip()
                print(f"Received from serial: {data}")

            img_h, img_w = image.shape[:2]

            # 1. Hand gesture processing
            if hand_results.multi_hand_landmarks is not None:
                for hand_landmarks, handedness in zip(
                        hand_results.multi_hand_landmarks,
                        hand_results.multi_handedness):
                    # Bounding box calculation
                    brect = calc_bounding_rect(hand_debug_image,
                                               hand_landmarks)
                    # Landmark calculation
                    landmark_list = calc_landmark_list(hand_debug_image,
                                                       hand_landmarks)

                    # Conversion to relative / normalized coordinates
                    pre_processed_landmark_list = pre_process_landmark(
                        landmark_list)
                    pre_processed_point_history_list = (
                        pre_process_point_history(hand_debug_image,
                                                  point_history))

                    # Write to the dataset file
                    logging_csv(number, mode, pre_processed_landmark_list,
                                pre_processed_point_history_list)

                    # Hand sign classification
                    hand_sign_id = keypoint_classifier(
                        pre_processed_landmark_list)
                    if hand_sign_id == 2:  # Point gesture
                        point_history.append(landmark_list[8])
                    else:
                        point_history.append([0, 0])

                    # Finger gesture classification (needs a full history)
                    finger_gesture_id = 0
                    if (len(pre_processed_point_history_list) ==
                            history_length * 2):
                        finger_gesture_id = point_history_classifier(
                            pre_processed_point_history_list)

                    # Most frequent gesture ID over the recent detections
                    finger_gesture_history.append(finger_gesture_id)
                    most_common_fg_id = Counter(
                        finger_gesture_history).most_common()

                    # Report the gesture over serial, then draw
                    gesture_label = keypoint_classifier_labels[hand_sign_id]
                    ser.write(f"Gesture: {gesture_label}\n".encode('utf-8'))

                    hand_debug_image = draw_bounding_rect(
                        use_brect, hand_debug_image, brect)
                    hand_debug_image = draw_landmarks(hand_debug_image,
                                                      landmark_list)
                    hand_debug_image = draw_info_text(
                        hand_debug_image,
                        brect,
                        handedness,
                        gesture_label,
                        point_history_classifier_labels[
                            most_common_fg_id[0][0]],
                    )
            else:
                point_history.append([0, 0])

            hand_debug_image = draw_point_history(hand_debug_image,
                                                  point_history)
            hand_debug_image = draw_info(hand_debug_image, fps, mode, number)

            # Screen reflection
            cv.imshow('Hand Gesture Recognition', hand_debug_image)

            # 2. Face recognition
            if face_results.multi_face_landmarks:
                for face_landmarks in face_results.multi_face_landmarks:
                    pose = _estimate_head_pose(face_landmarks, img_w, img_h)
                    if pose is not None:
                        x, y, z, nose_2d = pose
                        text, message = _classify_head_direction(x, y)
                        ser.write(message)

                        # Line from the nose in the direction the head turns
                        p1 = (int(nose_2d[0]), int(nose_2d[1]))
                        p2 = (int(nose_2d[0] + y * 10),
                              int(nose_2d[1] - x * 10))
                        cv.line(hand_debug_image, p1, p2, (255, 0, 0), 3)

                        cv.putText(hand_debug_image, text, (20, 50),
                                   cv.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 2)
                        cv.putText(hand_debug_image,
                                   "x: " + str(np.round(x, 2)), (500, 50),
                                   cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                        cv.putText(hand_debug_image,
                                   "y: " + str(np.round(y, 2)), (500, 100),
                                   cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                        cv.putText(hand_debug_image,
                                   "z: " + str(np.round(z, 2)), (500, 150),
                                   cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

                    mp_drawing.draw_landmarks(
                        image=hand_debug_image,
                        landmark_list=face_landmarks,
                        connections=mp_face_mesh.FACEMESH_CONTOURS,
                        landmark_drawing_spec=drawing_spec,
                        connection_drawing_spec=drawing_spec)

            # Same buffer as the first window, now also carrying the face
            # overlay drawn above.
            cv.imshow('Head Pose Detection', hand_debug_image)
    finally:
        # Release everything on normal exit and on errors alike.
        for solution in (hands, face_mesh):
            if solution is not None:
                solution.close()
        ser.close()
        cap.release()
        cv.destroyAllWindows()
def select_mode(key, mode):
    """Translate a key code into a digit selection and a logging mode.

    Args:
        key: Key code as returned by ``cv.waitKey``.
        mode: Mode currently in effect.

    Returns:
        ``(number, mode)``: ``number`` is 0-9 for the digit keys and -1 for
        anything else; ``mode`` becomes 0 for ``n``, 1 for ``k``, 2 for ``h``
        and is otherwise returned unchanged.
    """
    number = key - ord('0') if ord('0') <= key <= ord('9') else -1

    mode_by_key = {ord('n'): 0, ord('k'): 1, ord('h'): 2}
    return number, mode_by_key.get(key, mode)
def calc_bounding_rect(image, landmarks):
    """Compute the pixel bounding box around a set of normalised landmarks.

    Args:
        image: Frame whose ``shape`` gives (height, width, ...).
        landmarks: Object whose ``landmark`` items expose normalised ``x``
            and ``y``.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` built from ``cv.boundingRect``
        (``x + w`` and ``y + h`` for the far corner).
    """
    image_height, image_width = image.shape[0], image.shape[1]

    # Scale to pixels, capping at the last valid column/row.
    pixel_points = [
        [min(int(lm.x * image_width), image_width - 1),
         min(int(lm.y * image_height), image_height - 1)]
        for lm in landmarks.landmark
    ]
    landmark_array = np.array(pixel_points, dtype=int).reshape(-1, 2)

    x, y, w, h = cv.boundingRect(landmark_array)
    return [x, y, x + w, y + h]
def calc_landmark_list(image, landmarks):
    """Convert normalised landmarks into a list of ``[x, y]`` pixel pairs.

    Args:
        image: Frame whose ``shape`` gives (height, width, ...).
        landmarks: Object whose ``landmark`` items expose normalised ``x``
            and ``y``.

    Returns:
        One ``[x, y]`` list per landmark, each coordinate truncated to int
        and capped at ``width - 1`` / ``height - 1``.
    """
    image_height, image_width = image.shape[0], image.shape[1]
    max_x, max_y = image_width - 1, image_height - 1

    return [
        [min(int(lm.x * image_width), max_x),
         min(int(lm.y * image_height), max_y)]
        for lm in landmarks.landmark
    ]
def pre_process_landmark(landmark_list):
    """Turn pixel landmarks into a flat, scale-normalised feature list.

    Coordinates are made relative to the first landmark, flattened to
    ``[x0, y0, x1, y1, ...]`` and divided by the largest absolute value so
    the result lies in [-1, 1]. The input is not modified.

    Args:
        landmark_list: Sequence of ``[x, y]`` points.

    Returns:
        Flat list of floats. An empty input yields ``[]``; if every point
        coincides with the first one the result is all zeros (previously
        these two cases raised ValueError / ZeroDivisionError).
    """
    if not landmark_list:
        return []

    base_x, base_y = landmark_list[0][0], landmark_list[0][1]

    # Shift x/y relative to the first point; any further components of a
    # point are carried through unchanged, as before.
    flattened = []
    for point in landmark_list:
        flattened.extend([point[0] - base_x, point[1] - base_y, *point[2:]])

    max_value = max(map(abs, flattened))
    if max_value == 0:
        # Degenerate hand (all points identical): nothing to scale by.
        return [0.0] * len(flattened)

    return [value / max_value for value in flattened]
def pre_process_point_history(image, point_history):
    """Flatten the point history into offsets relative to its first entry.

    Args:
        image: Frame whose ``shape`` gives (height, width, ...).
        point_history: Iterable of ``[x, y]`` pixel points.

    Returns:
        Flat list ``[dx0, dy0, dx1, dy1, ...]`` where each offset from the
        first point is divided by the frame width (x) or height (y). The
        input is left untouched; an empty history gives ``[]``.
    """
    frame_height, frame_width = image.shape[0], image.shape[1]

    points = list(point_history)
    origin_x, origin_y = (points[0][0], points[0][1]) if points else (0, 0)

    scaled = [
        [(point[0] - origin_x) / frame_width,
         (point[1] - origin_y) / frame_height,
         *point[2:]]
        for point in points
    ]
    return list(itertools.chain.from_iterable(scaled))
def logging_csv(number, mode, landmark_list, point_history_list):
    """Append one labelled training row to the dataset CSV for ``mode``.

    Mode 1 appends ``[number, *landmark_list]`` to the key-point dataset and
    mode 2 appends ``[number, *point_history_list]`` to the point-history
    dataset. Nothing is written for any other mode, or when ``number`` is
    outside 0-9.

    Returns:
        None.
    """
    if mode == 1:
        csv_path = 'model/keypoint_classifier/keypoint.csv'
        features = landmark_list
    elif mode == 2:
        csv_path = 'model/point_history_classifier/point_history.csv'
        features = point_history_list
    else:
        return

    if not 0 <= number <= 9:
        return

    with open(csv_path, 'a', newline="") as f:
        csv.writer(f).writerow([number, *features])
    return
def draw_landmarks(image, landmark_point):
    """Draw the hand skeleton and its key points onto ``image``.

    Each segment is drawn as a thick black line with a thin white line on
    top; each of the first 21 points is drawn as a filled white circle with
    a black outline (radius 8 for indices 4, 8, 12, 16 and 20, radius 5 for
    the rest).

    Args:
        image: Frame to draw on (modified in place).
        landmark_point: List of ``[x, y]`` pixel points; an empty list draws
            nothing.

    Returns:
        The same ``image`` object.
    """
    black, white = (0, 0, 0), (255, 255, 255)

    # Segments in drawing order: thumb, index, middle, ring, little, palm.
    segments = (
        (2, 3), (3, 4),
        (5, 6), (6, 7), (7, 8),
        (9, 10), (10, 11), (11, 12),
        (13, 14), (14, 15), (15, 16),
        (17, 18), (18, 19), (19, 20),
        (0, 1), (1, 2), (2, 5), (5, 9), (9, 13), (13, 17), (17, 0),
    )
    large_points = (4, 8, 12, 16, 20)

    if len(landmark_point) > 0:
        for start, end in segments:
            start_pt = tuple(landmark_point[start])
            end_pt = tuple(landmark_point[end])
            cv.line(image, start_pt, end_pt, black, 6)
            cv.line(image, start_pt, end_pt, white, 2)

    # Key Points
    for index, landmark in enumerate(landmark_point):
        if index > 20:
            break
        radius = 8 if index in large_points else 5
        center = (landmark[0], landmark[1])
        cv.circle(image, center, radius, white, -1)
        cv.circle(image, center, radius, black, 1)

    return image
def draw_bounding_rect(use_brect, image, brect):
    """Outline ``brect`` on ``image`` when ``use_brect`` is truthy.

    Args:
        use_brect: Whether to draw at all.
        image: Frame to draw on (modified in place).
        brect: ``[x_min, y_min, x_max, y_max]`` in pixels.

    Returns:
        The same ``image`` object.
    """
    if not use_brect:
        return image

    # Outer rectangle: 1-pixel black outline
    top_left = (brect[0], brect[1])
    bottom_right = (brect[2], brect[3])
    cv.rectangle(image, top_left, bottom_right, (0, 0, 0), 1)
    return image
def draw_info_text(image, brect, handedness, hand_sign_text,
                   finger_gesture_text):
    """Label the hand's bounding box and show the finger-gesture caption.

    A filled black strip is drawn above the box and holds the handedness
    label, followed by ``:<hand_sign_text>`` when that text is non-empty.
    A non-empty ``finger_gesture_text`` is drawn at (10, 60) as white text
    over a thicker black copy.

    Returns:
        The same ``image`` object.
    """
    left, top, right = brect[0], brect[1], brect[2]
    font = cv.FONT_HERSHEY_SIMPLEX

    cv.rectangle(image, (left, top), (right, top - 22), (0, 0, 0), -1)

    info_text = handedness.classification[0].label[0:]
    if hand_sign_text != "":
        info_text = info_text + ':' + hand_sign_text
    cv.putText(image, info_text, (left + 5, top - 4), font, 0.6,
               (255, 255, 255), 1, cv.LINE_AA)

    if finger_gesture_text != "":
        caption = "Finger Gesture:" + finger_gesture_text
        # Black pass first, then white on top of it.
        for color, thickness in (((0, 0, 0), 4), ((255, 255, 255), 2)):
            cv.putText(image, caption, (10, 60), font, 1.0, color,
                       thickness, cv.LINE_AA)

    return image
def draw_point_history(image, point_history):
    """Draw the tracked point trail as circles that grow along the history.

    Points with a zero x or y coordinate are skipped; the circle radius is
    ``1 + index // 2`` for the point at position ``index``.

    Returns:
        The same ``image`` object.
    """
    trail_color = (152, 251, 152)
    for position, point in enumerate(point_history):
        if point[0] == 0 or point[1] == 0:
            continue
        cv.circle(image, (point[0], point[1]), 1 + position // 2,
                  trail_color, 2)
    return image
def draw_info(image, fps, mode, number):
    """Overlay the FPS counter and, in a logging mode, the mode and digit.

    The FPS text is drawn at (10, 30) as white over a thicker black copy.
    For mode 1 or 2 the mode name is drawn at (10, 90), and a ``number`` in
    0-9 is added at (10, 110).

    Returns:
        The same ``image`` object.
    """
    font = cv.FONT_HERSHEY_SIMPLEX
    white = (255, 255, 255)

    fps_text = "FPS:" + str(fps)
    # Black pass first, then white on top of it.
    for color, thickness in (((0, 0, 0), 4), (white, 2)):
        cv.putText(image, fps_text, (10, 30), font, 1.0, color, thickness,
                   cv.LINE_AA)

    mode_string = ['Logging Key Point', 'Logging Point History']
    if 1 <= mode <= 2:
        cv.putText(image, "MODE:" + mode_string[mode - 1], (10, 90), font,
                   0.6, white, 1, cv.LINE_AA)
        if 0 <= number <= 9:
            cv.putText(image, "NUM:" + str(number), (10, 110), font, 0.6,
                       white, 1, cv.LINE_AA)
    return image
# Start the capture loop only when this file is executed as a script.
if __name__ == '__main__':
    main()
# Comments
# Please log in or sign up to comment.