As we start to rely on more and more video conferencing software, it is crucial to be able to run AI and AR on top of the video itself. Since MixPose is a yoga live streaming app with AI pose detection, this guide focuses on how to overlay AI inference on top of WebRTC video. You can try out a live demo of a yoga class at https://mixpose.com
What you will need:
- Node.js
- Firebase Functions
- Agora Web SDK 3.1.0 or above
- Firebase Hosting
Step 1: Sign up for an Agora account
Agora offers 10,000 free minutes per month of SDK usage, which is plenty for testing out the application. To sign up for an account, go to https://sso.agora.io/v2/signup and you can get started there.
After the account is created, simply create an App ID (and App Certificate) for your project.
Step 2: Run the Agora Web SDK example
In this example we will focus on the JavaScript SDK for the web; for more detail you can follow the guide at https://docs.agora.io/en/Video/start_call_web?platform=Web
I've used Firebase Functions as the backend that serves the token needed to start the class; the client calls the generateRtcToken function below, which builds the token from your App ID and App Certificate. Any Node.js server would do.
const functions = require('firebase-functions');
const RtcTokenBuilder = require('./src/RtcTokenBuilder').RtcTokenBuilder;
const RtcRole = require('./src/RtcTokenBuilder').Role;
const RtmTokenBuilder = require('./src/RtmTokenBuilder').RtmTokenBuilder;
const RtmRole = require('./src/RtmTokenBuilder').Role;
const generateRtcToken = (className, accountName, roleValue) => {
  // Rtc Examples
  const appID = 'test';            // replace with your Agora App ID
  const appCertificate = 'test';   // replace with your Agora App Certificate
  const channelName = className;
  //const uid = 100;
  const account = accountName;
  const role = roleValue;
  const expirationTimeInSeconds = 7200;
  const currentTimestamp = Math.floor(Date.now() / 1000);
  const privilegeExpiredTs = currentTimestamp + expirationTimeInSeconds;
  // IMPORTANT! Build the token with either the uid or with the user account.
  // Comment out the option you do not want to use below.
  // Build token with uid
  //const tokenA = RtcTokenBuilder.buildTokenWithUid(appID, appCertificate, channelName, uid, role, privilegeExpiredTs);
  //console.log("Token With Integer Number Uid: " + tokenA);
  // Build token with user account
  const token = RtcTokenBuilder.buildTokenWithAccount(appID, appCertificate, channelName, account, role, privilegeExpiredTs);
  return token;
};
exports.generateRtcToken = functions.https.onCall((data, context) => {
  var token = generateRtcToken(data.className, data.userName, RtcRole.PUBLISHER);
  return token;
});
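On the client side, once the function is deployed, the token can be fetched before joining a class. Here is a minimal sketch, assuming the Firebase v8 client SDK is initialized on the page; the className and userName values are placeholders.
// Fetch an RTC token from the callable Cloud Function defined above
const generateRtcToken = firebase.functions().httpsCallable('generateRtcToken');
async function fetchToken(className, userName) {
  // result.data is the token string returned by the Cloud Function
  const result = await generateRtcToken({ className: className, userName: userName });
  return result.data;
}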
Alternatively, if you are building a quick POC, you can also use a temporary token for the audio/video call generated from the Agora dashboard, but this is not recommended for production.
There is also a step-by-step guide that can help you build your first video chat web app on Agora at https://www.agora.io/en/blog/building-a-group-video-chat-web-app/?_ga=2.184722720.864757430.1597829478-674087565.1593581567
The video joining process is fairly standard across providers.
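For reference, a typical join-and-publish flow with the Agora Web SDK 3.x looks roughly like the sketch below; the element ids ('local-player', 'remote-player'), the channel name, and the token variable are placeholders rather than MixPose code.
// Rough sketch of the standard Agora Web SDK 3.x join flow
const client = AgoraRTC.createClient({ mode: 'rtc', codec: 'vp8' });
client.init('YOUR_APP_ID', () => {
  // token comes from the generateRtcToken function above
  client.join(token, 'yoga-class', null, (uid) => {
    const localStream = AgoraRTC.createStream({ streamID: uid, audio: true, video: true });
    localStream.init(() => {
      localStream.play('local-player');   // renders into <div id="local-player">
      client.publish(localStream);
    });
  });
});
// Subscribe to and play remote streams as they arrive
client.on('stream-added', (evt) => client.subscribe(evt.stream));
client.on('stream-subscribed', (evt) => evt.stream.play('remote-player'));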
Step 3: Redraw the video element into a canvas
We can't really run TensorFlow.js on the video element from the Agora SDK directly, but we can create two canvas elements and redraw the live video feed into them:
<canvas id="output" >
<canvas id="image" />
Then we can transfer the live video feed into the canvas with the following code:
// imagectx is the 2D context of the #image canvas; video is the <video>
// element showing the live stream
imagectx.clearRect(0, 0, $('#output').width(), $('#output').height());
imagectx.save();
// mirror the frame horizontally so it matches what the user sees
imagectx.scale(-1, 1);
imagectx.translate(-$('#output').width(), 0);
imagectx.drawImage(video, 0, 0, $('#output').width(), $('#output').height());
imagectx.restore();
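The video element and the 2D contexts used above are not shown in the snippet; one way to obtain them, assuming the Agora stream is played into a container div with a placeholder id of agora-player, is:
// Hypothetical setup: grab the canvases, their 2D contexts, and the <video>
// element that the Agora SDK injects when stream.play('agora-player') is called
const image = document.getElementById('image');
const imagectx = image.getContext('2d');
const ctx = document.getElementById('output').getContext('2d');
const video = document.querySelector('#agora-player video');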
Step 4: Run TensorFlow.js on the canvas itself
Now that we have an element we can run AI inference on, we can finally use TFJS. At the time of writing we are doing this via TensorFlow.js 2.0:
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow-models/posenet"></script>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-backend-wasm@2.0.0/dist/tf-backend-wasm.js"></script>
And we can now run the inference on top of the output canvas
async function poseDetectionFrame() {
  // Skip inference if AI is turned off for this page
  if (!JSON.parse(getMeta("useai"))) {
    return;
  }
  let poses = [];
  let minPoseConfidence;
  let minPartConfidence;
  // Redraw the current video frame (mirrored) into the #image canvas
  imagectx.clearRect(0, 0, $('#output').width(), $('#output').height());
  imagectx.save();
  imagectx.scale(-1, 1);
  imagectx.translate(-$('#output').width(), 0);
  imagectx.drawImage(video, 0, 0, $('#output').width(), $('#output').height());
  imagectx.restore();
  // Run PoseNet on the #image canvas
  const pose = await net.estimatePoses(image, {
    flipHorizontal: false,
    decodingMethod: 'single-person'
  });
  // Redraw the same frame into the #output canvas that the user sees
  ctx.clearRect(0, 0, $('#output').width(), $('#output').height());
  ctx.save();
  ctx.scale(-1, 1);
  ctx.translate(-$('#output').width(), 0);
  ctx.drawImage(video, 0, 0, $('#output').width(), $('#output').height());
  ctx.restore();
  poses = poses.concat(pose);
  minPoseConfidence = 0.15;
  minPartConfidence = 0.1;
  // For each pose (i.e. person) detected in the frame, draw the resulting
  // skeleton and keypoints if over the confidence thresholds -- see Step 5
  requestAnimationFrame(poseDetectionFrame);
}
poseDetectionFrame();
setTimeout(function () {
  // refresh the page carousel once the video is running (site-specific UI)
  $owlCarouselNew.trigger('refresh.owl.carousel');
}, 1500);
Step 5: Getting the AR stick figure
AR is actually quite easy once you have the keypoints: we can draw stick figures that represent the real-time inference. We now have AR on top of the AI data to give users a better representation of their poses.
poses.forEach(({score, keypoints}) => {
  if (score >= minPoseConfidence) {
    drawKeypoints(keypoints, minPartConfidence, ctx);
    drawSkeleton(keypoints, minPartConfidence, ctx);
    /*
    if (guiState.output.showBoundingBox) {
      drawBoundingBox(keypoints, ctx);
    }*/
  }
});
via the following helper functions:
// Drawing constants (the values here are illustrative; they are not shown
// in the original snippet)
const color = 'aqua';
const boundingBoxColor = 'red';
const lineWidth = 2;

function drawPoint(ctx, y, x, r, color) {
  ctx.beginPath();
  ctx.arc(x, y, r, 0, 2 * Math.PI);
  ctx.fillStyle = color;
  ctx.fill();
}
/**
 * Draw the bounding box of a pose. For example, for a whole person standing
 * in an image, the bounding box will begin at the nose and extend to one of
 * the ankles.
 */
function drawBoundingBox(keypoints, ctx) {
  const boundingBox = posenet.getBoundingBox(keypoints);
  ctx.rect(
      boundingBox.minX, boundingBox.minY, boundingBox.maxX - boundingBox.minX,
      boundingBox.maxY - boundingBox.minY);
  ctx.strokeStyle = boundingBoxColor;
  ctx.stroke();
}
/**
 * Draws a line on a canvas, i.e. a joint
 */
function drawSegment([ay, ax], [by, bx], color, scale, ctx) {
  ctx.beginPath();
  ctx.moveTo(ax * scale, ay * scale);
  ctx.lineTo(bx * scale, by * scale);
  ctx.lineWidth = lineWidth;
  ctx.strokeStyle = color;
  ctx.stroke();
}
/**
 * Draws a pose skeleton by looking up all adjacent keypoints/joints
 */
function drawSkeleton(keypoints, minConfidence, ctx, scale = 1) {
  const adjacentKeyPoints =
      posenet.getAdjacentKeyPoints(keypoints, minConfidence);
  adjacentKeyPoints.forEach((keypoints) => {
    drawSegment(
        toTuple(keypoints[0].position), toTuple(keypoints[1].position), color,
        scale, ctx);
  });
}
/**
 * Draw pose keypoints onto a canvas
 */
function drawKeypoints(keypoints, minConfidence, ctx, scale = 1) {
  for (let i = 0; i < keypoints.length; i++) {
    const keypoint = keypoints[i];
    if (keypoint.score < minConfidence) {
      continue;
    }
    const {y, x} = keypoint.position;
    drawPoint(ctx, y * scale, x * scale, 3, color);
  }
}
function toTuple({y, x}) {
  return [y, x];
}
Final Step: Test it live and integrate it into production
After all the hard work is done, test it through, then you can integrate it into a similar type of application of your own.
To try out this live in a yoga class visit https://mixpose.com