What're we building?
In a previous tutorial, I went over how you can connect an ESP32 to the Google Gemini API via REST to send an audio file and have it transcribed. In this tutorial I will expand on that to actually do something with that recorded audio instead of transcribing it - in this case turning on and off an LED ring, plus changing the LEDs' colors based on spoken commands. While I'm only manipulating an LED ring, you can do anything with the technique used here, called function calling, from controlling motors to making additional requests to online services.
In the previous tutorial I set up a device using these connections:
MAX9814 -> ESP32
GND -> GND
VDD+GAIN connected together, attached to 3.3v
OUT -> IO34
---
SD card adapter -> ESP32
CS -> IO5
SCK -> IO18
MOSI -> IO23
MISO -> IO19
VCC -> 3.3v
GND -> GND
---
Button -> ESP32
One end to GND
One end split with 10k resistor to 3.3V and IO32
---
LED
Connect to GND and IO33
The only difference now is that I have added a 24 neopixel ring from Adafruit with the power line on the same 3.3v rail, GND to GND, and the data line on the ESP32 pin D15.
To keep things easier, this is also the code that the last tutorial ended with, which is what you can use as a starting place for this walkthrough.
#include <Arduino.h>
#include <WiFi.h>
#include <FS.h>
#include <SD.h>
#include <HTTPClient.h>
#include <WiFiClientSecure.h>
#include <ArduinoJson.h>
#include "soc/soc.h"
#include "soc/rtc_cntl_reg.h"
// Pins
const int SD_CS = 5;           // SD card chip-select (SPI CS)
const int AUDIO_PIN = 34;      // MAX9814 microphone output (ADC input)
const int BUTTON_PIN = 32;     // Record button; pressed reads LOW (see loop())
const int LED_PIN = 33;        // Status LED, lit while recording
// Configuration for audio recording
const int SAMPLE_RATE = 8000;      // Samples per second (8 kHz mono)
const int BIT_DEPTH = 16;          // Bits per sample (signed 16-bit PCM)
const int RECORD_DURATION = 2;     // Recording length in seconds
// WIFI connection -- fill in your network credentials before flashing
String SSID = "";
String PASSWORD = "";
// Gemini API key -- fill in your key before flashing
String API_KEY = "";
// Join the configured WiFi network, blocking until an IP address is assigned.
// Prints a progress marker once per second while waiting.
void setupWifi() {
  WiFi.begin(SSID, PASSWORD);
  while (WiFi.status() != WL_CONNECTED) {
    delay(1000);
    Serial.print("...");
  }
  Serial.print("IP address: ");
  Serial.println(WiFi.localIP());
}
void recordAudio() {
if (!SD.begin(SD_CS, SPI, 1000000)) {
Serial.println("SD card initialization failed!");
while (1);
} else {
Serial.println("SD card initialized!");
}
if (SD.exists("/tmp.wav")) {
if (SD.remove("/tmp.wav")) {
Serial.println("Previous audio file deleted.");
} else {
Serial.println("Failed to delete previous audio file.");
return;
}
} else {
Serial.println("No previous audio file detected, starting new");
}
File audioFile = SD.open("/tmp.wav", FILE_WRITE);
if (!audioFile) {
Serial.println("Failed to create audio file.");
return;
}
Serial.println("Start recording");
writeWavHeader(audioFile, SAMPLE_RATE, BIT_DEPTH, 1);
int numSamples = SAMPLE_RATE * RECORD_DURATION;
for (int i = 0; i < numSamples; i++) {
int rawValue = analogRead(AUDIO_PIN);
int16_t sample = map(rawValue, 0, 4095, -32768, 32767);
audioFile.write((uint8_t*)&sample, 2);
delayMicroseconds(1000000 / SAMPLE_RATE);
}
audioFile.close();
Serial.println("Audio recorded to /tmp.wav");
}
// Write a canonical 44-byte PCM WAV header for a recording of
// RECORD_DURATION seconds at the given sample rate / bit depth / channel
// count. Fields are little-endian per the RIFF/WAVE specification.
void writeWavHeader(File& file, int sampleRate, int bitDepth, int channels) {
  uint32_t byteRate = (uint32_t)sampleRate * channels * bitDepth / 8;
  uint16_t blockAlign = channels * bitDepth / 8;
  uint32_t dataSize = RECORD_DURATION * byteRate;   // size of the PCM payload
  // Fix: the original wrote 2 bytes straight out of 4-byte `int` variables
  // (channels, bitDepth) via raw casts. That only happens to emit the right
  // bytes on a little-endian MCU; use fixed-width temporaries so every field
  // has exactly the size the format requires.
  uint16_t audioFormat = 1;                         // 1 = uncompressed PCM
  uint16_t numChannels = (uint16_t)channels;
  uint32_t rate = (uint32_t)sampleRate;
  uint16_t bitsPerSample = (uint16_t)bitDepth;
  uint32_t subchunk1Size = 16;                      // fmt chunk is 16 bytes for PCM
  uint32_t fileSize = 36 + dataSize;                // RIFF size = rest of header + data
  file.write((const uint8_t*)"RIFF", 4);
  file.write((uint8_t*)&fileSize, 4);
  file.write((const uint8_t*)"WAVE", 4);
  file.write((const uint8_t*)"fmt ", 4);
  file.write((uint8_t*)&subchunk1Size, 4);
  file.write((uint8_t*)&audioFormat, 2);
  file.write((uint8_t*)&numChannels, 2);
  file.write((uint8_t*)&rate, 4);
  file.write((uint8_t*)&byteRate, 4);
  file.write((uint8_t*)&blockAlign, 2);
  file.write((uint8_t*)&bitsPerSample, 2);
  file.write((const uint8_t*)"data", 4);
  file.write((uint8_t*)&dataSize, 4);
}
// Encode `length` bytes of `data` as standard base64 (RFC 4648 alphabet,
// '=' padding) and return the encoded text as an Arduino String.
String base64Encode(const uint8_t* data, size_t length) {
  const char* b64_alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  String encodedString = "";
  // Fix: reserve the exact output size up front (4 output chars per 3-byte
  // input group). Appending char-by-char to a multi-kilobyte String without
  // a reserve forces repeated reallocation and copying of the whole buffer.
  encodedString.reserve(((length + 2) / 3) * 4);
  size_t i = 0;
  uint8_t b1, b2, b3;
  while (i < length) {
    b1 = data[i++];
    encodedString += b64_alphabet[b1 >> 2];
    if (i < length) {
      b2 = data[i++];
      encodedString += b64_alphabet[((b1 & 0x03) << 4) | (b2 >> 4)];
    } else {
      // One leftover byte: emit its low bits and pad with "==".
      encodedString += b64_alphabet[(b1 & 0x03) << 4];
      encodedString += "==";
      break;
    }
    if (i < length) {
      b3 = data[i++];
      encodedString += b64_alphabet[((b2 & 0x0F) << 2) | (b3 >> 6)];
      encodedString += b64_alphabet[b3 & 0x3F];
    } else {
      // Two leftover bytes: emit their low bits and pad with "=".
      encodedString += b64_alphabet[(b2 & 0x0F) << 2];
      encodedString += '=';
      break;
    }
  }
  return encodedString;
}
// Build the Gemini generateContent request body (transcription prompt plus
// the base64-encoded audio from /audiostring.txt) and save it to
// /request-tmp.json on the SD card for transcribeAudio() to POST.
void createAudioJsonRequest() {
// Remove any stale request file from a previous run.
if (SD.exists("/request-tmp.json")) {
if (SD.remove("/request-tmp.json")) {
Serial.println("Previous request file deleted.");
} else {
Serial.println("Failed to delete previous request file.");
return;
}
} else {
Serial.println("No previous request file detected, starting new");
}
File stringFile = SD.open("/audiostring.txt", FILE_READ);
if (!stringFile) {
Serial.println("Failed to open audiostring.txt for reading");
return;
}
// Read the base64 encoded audio data from the file
String base64EncodedData = stringFile.readString();
stringFile.close();
// Create the JSON document
const size_t jsonBufferSize = 1024 * 48; // Adjust as needed
DynamicJsonDocument doc(jsonBufferSize);
// contents[0].parts holds two parts: the text prompt and the inline audio.
JsonArray contents = doc.createNestedArray("contents");
JsonObject content = contents.createNestedObject();
JsonArray parts = content.createNestedArray("parts");
JsonObject textPart = parts.createNestedObject();
textPart["text"] = "Provide a transcript of this audio clip. Only include words said in the audio.";
JsonObject audioPart = parts.createNestedObject();
JsonObject inlineData = audioPart.createNestedObject("inline_data");
inlineData["mime_type"] = "audio/x-wav";
inlineData["data"] = base64EncodedData; // Use the data read from the file
// Open a file on the SD card for writing the JSON request
File jsonFile = SD.open("/request-tmp.json", FILE_WRITE);
if (!jsonFile) {
Serial.println("Failed to open JSON file for writing");
return;
}
// Serialize the JSON document to the file
serializeJson(doc, jsonFile);
jsonFile.close();
Serial.println("JSON request saved to /request-tmp.json");
}
void transcribeAudio() {
WiFiClientSecure client;
client.setInsecure();
HTTPClient http;
if (http.begin(client, "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=" + API_KEY)) {
http.addHeader("Content-Type", "application/json");
File file = SD.open("/request-tmp.json", FILE_READ);
if (!file) {
Serial.println("Failed to open file for reading from SD card");
return;
}
const int BUFFER_SIZE = 64;
uint8_t fileBuffer[BUFFER_SIZE];
const int JSON_STRING_SIZE = 65536; // Allocate 64kb for the audio file request. Likely smaller.
char *jsonString = (char *)malloc(JSON_STRING_SIZE);
if (jsonString == NULL) {
Serial.println("Failed to allocate memory for JSON string");
file.close();
return;
}
int jsonStringIndex = 0;
while (file.available()) {
int bytesRead = file.read(fileBuffer, BUFFER_SIZE);
for (int i = 0; i < bytesRead && jsonStringIndex < JSON_STRING_SIZE - 1; i++) {
jsonString[jsonStringIndex++] = fileBuffer[i];
}
}
jsonString[jsonStringIndex] = '\0';
file.close();
SD.end(); // Close the SD connection after reading the file
// Serial.println(jsonString);
int httpCode = http.POST(jsonString);
free(jsonString);
Serial.print(F("Http code: "));
Serial.println(httpCode);
if (httpCode == HTTP_CODE_OK) {
String payload = http.getString();
DynamicJsonDocument doc(1024);
deserializeJson(doc, payload);
String responseText = doc["candidates"][0]["content"]["parts"][0]["text"];
Serial.print("Response: ");
Serial.println(responseText);
}
}
}
void saveAudioString() {
File audioFile = SD.open("/tmp.wav", FILE_READ);
if (!audioFile) {
Serial.println("Failed to open audio file for reading");
return;
}
size_t fileSize = audioFile.size();
uint8_t* audioData = (uint8_t*)malloc(fileSize);
if (audioData == NULL) {
Serial.println("Failed to allocate memory for audio data");
audioFile.close();
return;
}
audioFile.read(audioData, fileSize);
audioFile.close();
String base64AudioData = base64Encode(audioData, fileSize);
free(audioData);
File stringFile = SD.open("/audiostring.txt", FILE_WRITE);
if (!stringFile) {
Serial.println("Failed to open audiostring.txt for writing");
return;
}
stringFile.print(base64AudioData);
stringFile.close();
Serial.println("Audio base64 string saved to /audiostring.txt");
}
// One-time hardware and network initialization.
void setup() {
// Disable the brown-out detector. NOTE(review): presumably to avoid spurious
// resets from the current spikes of WiFi + SD activity -- confirm intended.
WRITE_PERI_REG(RTC_CNTL_BROWN_OUT_REG, 0);
// Disable the RTC watchdog. NOTE(review): presumably so the long blocking
// record/upload sequence does not trip it -- confirm intended.
WRITE_PERI_REG(RTC_CNTL_WDTCONFIG0_REG, 0);
pinMode(BUTTON_PIN, INPUT_PULLUP);
pinMode(LED_PIN, OUTPUT);
Serial.begin(115200);
// Station mode; drop any previously persisted connection before joining.
WiFi.mode(WIFI_STA);
WiFi.disconnect();
while (!Serial);
setupWifi();
// Initialize the SD card at a conservative 1 MHz SPI clock; halt on failure.
if (!SD.begin(SD_CS, SPI, 1000000)) {
Serial.println("SD card initialization failed!");
while (1);
} else {
Serial.println("SD card initialized!");
}
}
// Main loop: on each button press, run the record -> encode -> build-request
// -> transcribe pipeline once, lighting the status LED while recording.
void loop() {
  if (digitalRead(BUTTON_PIN) != LOW) {
    return;  // Button not pressed; nothing to do this pass.
  }
  digitalWrite(LED_PIN, HIGH);
  // This delay is to debounce the button and allow time to speak
  delay(500);
  recordAudio();
  digitalWrite(LED_PIN, LOW);
  saveAudioString();
  createAudioJsonRequest();
  transcribeAudio();
}
What is Function Calling?
Great, so before we dive into what's new, let's talk about function calling. Essentially, this is a way for you to tell the Gemini API about some custom function definitions that you have in your program, and then Gemini can decide if any of them should be called based on the input/request from the user. You can do this by adding new properties to the JSON that you POST to the API. This is what that JSON looks like for this lights example.
{
"contents": [
{
"parts": [
{
"text": "Trigger a function based on this audio input."
},
{
"inline_data": {
"mime_type": "audio/x-wav",
"data": "$DATA"
}
}
]
}
],
"tools": [
{
"function_declarations": [
{
"name": "changeColor",
"description": "Change the default color for the lights in an RGB format. Example: Green would be 0 255 0",
"parameters": {
"type": "object",
"properties": {
"red": {
"type": "integer",
"description": "A value from 0 to 255 for the color RED in an RGB color code"
},
"green": {
"type": "integer",
"description": "A value from 0 to 255 for the color GREEN in an RGB color code"
},
"blue": {
"type": "integer",
"description": "A value from 0 to 255 for the color BLUE in an RGB color code"
}
},
"required": [
"red",
"green",
"blue"
]
}
},
{
"name": "toggleLights",
"description": "Turn on or off the lights",
"parameters": {
"type": "object",
"properties": {
"toggle": {
"type": "boolean",
"description": "Determine if the lights should be turned on or off."
}
},
"required": [
"toggle"
]
}
}
]
}
]
}
You can see here that you need to send the name of the function that will be called, a description of it so that the Gemini API knows when it's appropriate to call, and optionally a list of properties that can be passed to the function for customizing how it works. You can also require that those properties be sent when a function is called.
New Code
Alright, now that we have the basic concepts out of the way, let's update the audio recording code to work with function calling! Let's start by adding the Adafruit NeoPixel library to the top of the class, as well as define the pin used by the LED ring, how many NeoPixels are on that ring, and some default values for the red/green/blue colors used by the LEDs. We'll also need to define the NeoPixel controller object.
#include <Adafruit_NeoPixel.h>
...
// NeoPixel ring configuration
const int NEOPIXEL_PIN = 15;   // Fix: the original line was missing its semicolon
const int NEOPIXEL_COUNT = 24; // Number of LEDs on the ring
// Current LED color (defaults to white); updated by the changeColor function call.
int red = 255;
int green = 255;
int blue = 255;
Adafruit_NeoPixel pixels(NEOPIXEL_COUNT, NEOPIXEL_PIN, NEO_GRB + NEO_KHZ800);
Moving into the setup() function, we just need to initialize the pixels object and set the LEDs to a default off state.
pixels.begin(); // Initialize the NeoPixel driver
pixels.show(); // Push the initial (all-off) state to the ring
We'll also add a new function named toggleLights(bool on) to toggle the lights on or off when triggered by the Gemini API.
// Turn the whole NeoPixel ring on (using the current red/green/blue color)
// or off, depending on `on`.
void toggleLights(bool on) {
  if (on) {
    Serial.println("Turning on lights");
    // Brightness is a strip-wide setting, so set it once up front rather
    // than redundantly on every iteration as the original did.
    pixels.setBrightness(255);
    for (int i = 0; i < NEOPIXEL_COUNT; i++) {
      pixels.setPixelColor(i, pixels.Color(red, green, blue));
    }
    pixels.show();
  } else {
    Serial.println("Turning off lights");
    pixels.clear();
    pixels.show();
  }
}
And now we can get into the core code for function calling. Go into the createAudioJsonRequest() function and replace it with this code:
void createAudioJsonRequest() {
if (SD.exists("/request-tmp.json")) {
if (SD.remove("/request-tmp.json")) {
Serial.println("Previous request file deleted.");
} else {
Serial.println("Failed to delete previous request file.");
return;
}
} else {
Serial.println("No previous request file detected, starting new");
}
File stringFile = SD.open("/audiostring.txt", FILE_READ);
if (!stringFile) {
Serial.println("Failed to open audiostring.txt for reading");
return;
}
// Read the base64 encoded audio data from the file
String base64EncodedData = stringFile.readString();
stringFile.close();
// Create the JSON document
const size_t jsonBufferSize = 1024 * 64; // Adjust as needed
DynamicJsonDocument doc(jsonBufferSize);
// Set up REST call to call custom functions based on sent audio clip
JsonArray contents = doc.createNestedArray("contents");
JsonObject content = contents.createNestedObject();
JsonArray parts = content.createNestedArray("parts");
JsonObject textPart = parts.createNestedObject();
// Core instructions about the audio file.
textPart["text"] = "Trigger a function based on this audio input.";
JsonObject audioPart = parts.createNestedObject();
JsonObject inlineData = audioPart.createNestedObject("inline_data");
inlineData["mime_type"] = "audio/x-wav";
inlineData["data"] = base64EncodedData; // Use the data read from the file
// Establish the tools node. There's multiple tools available for the Gemini API, but that's a larger topic for another time.
JsonArray tools = doc.createNestedArray("tools");
JsonObject tool = tools.createNestedObject();
JsonArray functionDeclarations = tool.createNestedArray("function_declarations");
// Change color function
JsonObject changeColor = functionDeclarations.createNestedObject();
changeColor["name"] = "changeColor";
changeColor["description"] = "Change the default color for the lights in an RGB format. Example: Green would be 0 255 0.";
JsonObject parametersChangeColor = changeColor.createNestedObject("parameters");
parametersChangeColor["type"] = "object";
JsonObject propertiesChangeColor = parametersChangeColor.createNestedObject("properties");
JsonObject red = propertiesChangeColor.createNestedObject("red");
red["type"] = "integer";
red["description"] = "A value from 0 to 255 for the color RED in an RGB color code";
JsonObject green = propertiesChangeColor.createNestedObject("green");
green["type"] = "integer";
green["description"] = "A value from 0 to 255 for the color GREEN in an RGB color code";
JsonObject blue = propertiesChangeColor.createNestedObject("blue");
blue["type"] = "integer";
blue["description"] = "A value from 0 to 255 for the color BLUE in an RGB color code";
JsonArray requiredChangeColor = parametersChangeColor.createNestedArray("required");
requiredChangeColor.add("red");
requiredChangeColor.add("green");
requiredChangeColor.add("blue");
// Toggle lights function definition
JsonObject toggleLights = functionDeclarations.createNestedObject();
toggleLights["name"] = "toggleLights";
toggleLights["description"] = "Turn on or off the lights";
JsonObject parametersToggleLights = toggleLights.createNestedObject("parameters");
parametersToggleLights["type"] = "object";
JsonObject propertiesToggleLights = parametersToggleLights.createNestedObject("properties");
JsonObject toggle = propertiesToggleLights.createNestedObject("toggle");
toggle["type"] = "boolean";
toggle["description"] = "Determine if the lights should be turned on or off.";
JsonArray requiredToggleLights = parametersToggleLights.createNestedArray("required");
requiredToggleLights.add("toggle");
// Open a file on the SD card for writing the JSON request
File jsonFile = SD.open("/request-tmp.json", FILE_WRITE);
if (!jsonFile) {
Serial.println("Failed to open JSON file for writing");
return;
}
// Serialize the JSON document to the file
serializeJson(doc, jsonFile);
jsonFile.close();
Serial.println("JSON request saved to /request-tmp.json");
}
While it might look like there's a lot going on here, it's mostly just reading the audio data from the SD card, and then building out the JSON structure that you can see earlier in the tutorial.
Finally, I've renamed the transcribeAudio() function to sendAudio(). Rather than printing out the returned transcript of the audio, now we can check to see if the functionCall parameter exists within the response, figure out which function is being called, and then pull the arguments for that function to do something with them. In this case I'm checking for toggleLights, which will include a boolean value for whether or not the lights should be on, and I'll pass that value to the toggleLights() function. In addition to that, I'm listening for the Gemini API to send back "changeColor" as a functionCall value. If that comes through, I extract the red, green, and blue values from the arguments and save those, then toggle the lights on with those saved values. You can see the entire function here:
void sendAudio() {
WiFiClientSecure client;
client.setInsecure();
HTTPClient http;
if (http.begin(client, "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key=" + API_KEY)) {
http.addHeader("Content-Type", "application/json");
File file = SD.open("/request-tmp.json", FILE_READ);
if (!file) {
Serial.println("Failed to open file for reading from SD card");
return;
}
const int BUFFER_SIZE = 64;
uint8_t fileBuffer[BUFFER_SIZE];
const int JSON_STRING_SIZE = 65536; // Allocate 64kb for the audio file request. Likely smaller.
char *jsonString = (char *)malloc(JSON_STRING_SIZE);
if (jsonString == NULL) {
Serial.println("Failed to allocate memory for JSON string");
file.close();
return;
}
int jsonStringIndex = 0;
while (file.available()) {
int bytesRead = file.read(fileBuffer, BUFFER_SIZE);
for (int i = 0; i < bytesRead && jsonStringIndex < JSON_STRING_SIZE - 1; i++) {
jsonString[jsonStringIndex++] = fileBuffer[i];
}
}
jsonString[jsonStringIndex] = '\0';
file.close();
SD.end(); // Close the SD connection after reading the file
int httpCode = http.POST(jsonString);
free(jsonString);
Serial.print(F("Http code: "));
Serial.println(httpCode);
if (httpCode == HTTP_CODE_OK) {
String payload = http.getString();
DynamicJsonDocument doc(1024);
DeserializationError error = deserializeJson(doc, payload);
if (error) {
Serial.print(F("deserializeJson() failed: "));
Serial.println(error.c_str());
return;
}
if (doc["candidates"][0]["content"]["parts"][0].containsKey("functionCall") &&
doc["candidates"][0]["content"]["parts"][0]["functionCall"].is<JsonObject>()) {
JsonObject functionCall =
doc["candidates"][0]["content"]["parts"][0]["functionCall"].as<JsonObject>();
if (functionCall.containsKey("name")) {
String functionName = functionCall["name"].as<String>();
if( functionName == "toggleLights") {
if (functionCall.containsKey("args") && functionCall["args"].is<JsonObject>()) {
JsonObject args = functionCall["args"].as<JsonObject>();
if (args.containsKey("toggle")) {
bool toggleValue = args["toggle"].as<bool>();
toggleLights(toggleValue);
} else {
Serial.println("Toggle argument not found.");
}
} else {
Serial.println("Args not found in function call.");
}
} else if( functionName == "changeColor") {
if (functionCall.containsKey("args") && functionCall["args"].is<JsonObject>()) {
JsonObject args = functionCall["args"].as<JsonObject>();
red = args["red"].as<int>();
green = args["green"].as<int>();
blue = args["blue"].as<int>();
toggleLights(true);
} else {
Serial.println("Args not found in function call.");
}
}
} else {
Serial.println("Function name not found.");
}
} else {
Serial.println("Function call not found.");
}
} else {
Serial.println("HTTP POST request failed");
}
http.end();
} else {
Serial.println("HTTP begin failed");
}
}
In addition, the Gemini API can understand audio recordings in multiple languages without you needing to define those languages beforehand. For example, I used this exact code with Spanish to turn on the LED ring and change the color to green (mind you, I don't actually speak Spanish, so I used Google Translate and the text-to-speech feature, but it works well enough!).
Conclusion
And that's it for the additions to the audio recording example to be able to control your device using full sentences to do complex operations with speech. I'd love to see the really cool projects you all end up making using this information, so please leave a comment with links to them and I'll see folks in later tutorials!
Comments
Please log in or sign up to comment.