Daksh Sambhare
Created August 30, 2024

Haptic-Navigator and Visual Information Scanner

A device that assists navigation using binocular vision and reads visual information aloud through audio

Advanced · Full instructions provided · Over 1 day

Things used in this project

Hardware components

DFRobot UNIHIKER - IoT Python Programming Single Board Computer with Touchscreen
×1
Raspberry Pi 5
×1
Seeed Studio XIAO ESP32S3 Sense
×1
Adafruit STM32F411 "BlackPill" Development Board
×1
DFRobot PAM8403 3W Mini Audio Stereo Amplifier
×1
Seeed Studio Grove - Ultrasonic Ranger
×5
Webcam, Logitech® HD Pro
×3
Speaker: 0.25W, 8 ohms
×1
200HB - 4 PORT USB HUB
×1
Brown Dog Gadgets Solar Cockroach Vibrating Disc Motor
×6

Software apps and online services

Microsoft VS Code
Arduino IDE
Raspberry Pi Raspbian
STMicroelectronics STM32CUBEPROG
Edge Impulse Studio
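
The parts list above pairs five Grove ultrasonic rangers with six vibrating disc motors, which points to proximity-driven haptic feedback for the wearer. The project's own haptic firmware is not reproduced on this page, so the following is only a minimal sketch, assuming a Raspberry Pi with HC-SR04-style trigger/echo wiring (the Grove ranger's single-wire SIG interface would need a small adaptation) and a disc motor driven through a transistor on a PWM-capable GPIO pin:

from gpiozero import DistanceSensor, PWMOutputDevice
from time import sleep

# Illustrative pin numbers; adjust to the actual wiring
sensor = DistanceSensor(echo=24, trigger=23, max_distance=2.0)  # reports metres
motor = PWMOutputDevice(18)                                     # duty cycle 0.0-1.0

while True:
    distance = sensor.distance            # metres, capped at max_distance
    if distance < 1.5:
        # Vibrate harder as the obstacle gets closer
        motor.value = 1.0 - (distance / 1.5)
    else:
        motor.value = 0.0
    sleep(0.1)

Repeating one such loop per sensor/motor pair would give directional feedback; the exact sensor-to-motor mapping is a design choice of the original build.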

Story

Schematics

Circuit Diagrams

Code

Depth estimation

Python
Raw format of the metric depth estimation script (two webcams, with object detection)
import argparse
import cv2
import numpy as np
import os
import torch
import time
import matplotlib

from depth_anything_v2.dpt import DepthAnythingV2

# Load the COCO class names and the SSD MobileNet model used for object detection
classNames = []
classFile = "/home/dsay/Documents/hackster/depthanything/Depth-Anything-V2/metric_depth/cocoobject/Object_Detection_Files/coco.names"
with open(classFile, "rt") as f:
    classNames = f.read().rstrip("\n").split("\n")

configPath = "/home/dsay/Documents/hackster/depthanything/Depth-Anything-V2/metric_depth/cocoobject/Object_Detection_Files/ssd_mobilenet_v3_large_coco_2020_01_14.pbtxt"
weightsPath = "/home/dsay/Documents/hackster/depthanything/Depth-Anything-V2/metric_depth/cocoobject/Object_Detection_Files/frozen_inference_graph.pb"

net = cv2.dnn.DetectionModel(weightsPath, configPath)
net.setInputSize(320, 320)
net.setInputScale(1.0 / 127.5)
net.setInputMean((127.5, 127.5, 127.5))
net.setInputSwapRB(True)

def getObjects(img, thres, nms, draw=True, objects=[]):
    classIds, confs, bbox = net.detect(img, confThreshold=thres, nmsThreshold=nms)
    objectInfo = []
    if len(classIds) != 0:
        for classId, confidence, box in zip(classIds.flatten(), confs.flatten(), bbox):
            className = classNames[classId - 1]
            if className in objects:
                objectInfo.append([box, className])
                if draw:
                    cv2.rectangle(img, box, color=(0, 255, 0), thickness=2)
                    cv2.putText(img, classNames[classId-1].upper(), (box[0]-10, box[1]-30), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)
                    cv2.putText(img, str(round(confidence*100, 2)), (box[0]-200, box[1]-30), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 2)
    return img, objectInfo

def process_frame(frame, depth_anything, args, frame_count):
    raw_image = frame
    
    # Perform object detection
    result, objectInfo = getObjects(raw_image, 0.45, 0.2, objects=['person', 'chair', 'cup', 'cell phone', 'box', 'bottle'])
    
    # Run depth estimation once per frame and reuse it for every detected object
    depth = depth_anything.infer_image(raw_image, args.input_size)
    
    # Print metric depth only where objects are detected
    if len(objectInfo) > 0:
        print(f'Frame {frame_count}: Metric depth (in meters) at object locations:')
        for box, className in objectInfo:
            x, y, w, h = box
            center_x = x + w // 2
            center_y = y + h // 2
            
            # Ensure center coordinates are within image bounds
            if center_y < 0 or center_y >= raw_image.shape[0] or center_x < 0 or center_x >= raw_image.shape[1]:
                continue
            
            # Look up the depth at the center of the detected object
            depth_meter = depth[center_y, center_x] * args.max_depth / 255.0
            print(f'Object: {className}, Depth: {depth_meter:.2f} meters, center x: {center_x}')
            
        print()
    
    # Normalize and convert depth to visual representation
    depth_visual = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
    depth_visual = depth_visual.astype(np.uint8)
    
    # Apply colormap or grayscale
    if args.grayscale:
        depth_visual = cv2.cvtColor(depth_visual, cv2.COLOR_GRAY2BGR)
    else:
        cmap = matplotlib.cm.get_cmap('Spectral')
        depth_visual = (cmap(depth_visual)[:, :, :3] * 255).astype(np.uint8)
    
    # Combine original frame with depth visualization
    split_region = np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255
    combined_result = np.hstack([raw_image, split_region, depth_visual])
    
    return combined_result


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Depth Anything V2 Metric Depth Estimation on Webcam')
    parser.add_argument('--video-path1', type=int, default=2, help='Webcam index for video capture (camera 1)')
    parser.add_argument('--video-path2', type=int, default=1, help='Webcam index for video capture (camera 2)')
    parser.add_argument('--input-size', type=int, default=518, help='Input size for image processing')
    parser.add_argument('--outdir', type=str, default='./vis_depth', help='Output directory')
    
    parser.add_argument('--encoder', type=str, default='vits', choices=['vits', 'vitb', 'vitl', 'vitg'], help='Encoder type')
    parser.add_argument('--load-from', type=str, default='/home/dsay/Documents/hackster/depthanything/Depth-Anything-V2/metric_depth/checkpoints/depth_anything_v2_metric_hypersim_vits.pth', help='Path to model checkpoint')
    parser.add_argument('--max-depth', type=float, default=50, help='Maximum depth value')
    
    parser.add_argument('--save-numpy', dest='save_numpy', action='store_true', help='Save the model raw output')
    parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='Only display the depth prediction')
    parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='Do not apply colorful palette')
    
    args = parser.parse_args()
    
    # Initialize device
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # Initialize model configuration based on chosen encoder
    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
    }
    
    # Initialize DepthAnythingV2 model
    depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth})
    depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu'))
    depth_anything = depth_anything.to(DEVICE).eval()
    
    # Open webcam captures for both cameras
    cap1 = cv2.VideoCapture(args.video_path1)
    cap2 = cv2.VideoCapture(args.video_path2)
    frame_count = 0
    
    # Create output directory if not exists
    os.makedirs(args.outdir, exist_ok=True)
    
    while True:
        # Read frames from both cameras
        ret1, frame1 = cap1.read()
        ret2, frame2 = cap2.read()
        
        if not ret1 or not ret2:
            break
        
        frame_count += 1
        
        # Process frames from both cameras
        combined_result1 = process_frame(frame1, depth_anything, args, frame_count)
        combined_result2 = process_frame(frame2, depth_anything, args, frame_count)
        
        # Display the processed frames from both cameras
        cv2.imshow('Camera 1 Depth Estimation', combined_result1)
        cv2.imshow('Camera 2 Depth Estimation', combined_result2)
        
        # Press 'q' to quit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    
    # Release webcam captures and close all windows
    cap1.release()
    cap2.release()
    cv2.destroyAllWindows()
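
To run the script above, point --load-from at the metric Depth Anything V2 checkpoint and pass the webcam indices expected by --video-path1 and --video-path2; the filename below is only a placeholder since it is not given on this page:

python3 depth_webcam.py --video-path1 2 --video-path2 1 --encoder vits --load-from checkpoints/depth_anything_v2_metric_hypersim_vits.pth --max-depth 50

The script only prints each object's class and estimated depth to the console, while the speaker and PAM8403 amplifier in the parts list suggest those readings are spoken aloud on the device. As a hedged sketch of that step (assuming the pyttsx3 text-to-speech library, which the code above does not use), the announcement could look like:

import pyttsx3

engine = pyttsx3.init()

def announce(object_name, depth_m):
    # Read out one detection, e.g. "person, 2.3 meters ahead"
    engine.say(f"{object_name}, {depth_m:.1f} meters ahead")
    engine.runAndWait()

# For example, called inside process_frame() right after depth_meter is computed:
# announce(className, depth_meter)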

Credits

Daksh Sambhare
