Bharath Ram
Published

Reading Aid for the Visually Impaired Using XIAO ESP32S3

A compact, real-time reading aid for the visually impaired that converts printed text to speech using the Seeed Studio XIAO ESP32S3.

AdvancedProtip157
Reading Aid for the Visually Impaired Using XIAO ESP32S3

Things used in this project

Hardware components

Seeed Studio XIAO ESP32S3 Sense
Seeed Studio XIAO ESP32S3 Sense
×1
Adafruit MAX98357A Amplifier
×1
DIY Speaker
×1

Software apps and online services

Arduino IDE
Arduino IDE
Node.js
Tesseract.js
Google Text-to-Speech (TTS)

Story

Read more

Schematics

Circuit diagram

Code

Arduino IDE code

C/C++
#include <WiFi.h>
#include <WiFiClientSecure.h>
#include <ArduinoJson.h>
#include "esp_camera.h"
#include "soc/soc.h"
#include "soc/rtc_cntl_reg.h"
#include <Arduino.h>
#include <HTTPClient.h>
#include "Audio.h"

// WiFi credentials
// Both are left empty in this listing and must be filled in before flashing;
// setup() passes them to WiFi.begin().
const char* ssid = "";
const char* password = "";

// Board selection for camera_pins.h. The define precedes the include, which
// presumably uses it to pick the *_GPIO_NUM macros that setup() copies into
// the camera configuration — confirm against camera_pins.h.
#define CAMERA_MODEL_XIAO_ESP32S3
#include "camera_pins.h"

// Server URL for uploading images and receiving text
// Left empty in this listing; captureAndUploadImage() POSTs each captured
// JPEG to this address as multipart/form-data.
const char* serverUrl = "";

// I2S output pins, handed to audio.setPinout(I2S_BCLK, I2S_LRC, I2S_DOUT)
// in setup().
#define I2S_DOUT   D1  
#define I2S_BCLK   D2  
#define I2S_LRC    D0  

// Shared audio object; playDetectedText() uses it for text-to-speech playback.
Audio audio;

void setup() {
  Serial.begin(115200);
  Serial.setDebugOutput(true);
  Serial.println();

  camera_config_t config;
  config.ledc_channel = LEDC_CHANNEL_0;
  config.ledc_timer = LEDC_TIMER_0;
  config.pin_d0 = Y2_GPIO_NUM;
  config.pin_d1 = Y3_GPIO_NUM;
  config.pin_d2 = Y4_GPIO_NUM;
  config.pin_d3 = Y5_GPIO_NUM;
  config.pin_d4 = Y6_GPIO_NUM;
  config.pin_d5 = Y7_GPIO_NUM;
  config.pin_d6 = Y8_GPIO_NUM;
  config.pin_d7 = Y9_GPIO_NUM;
  config.pin_xclk = XCLK_GPIO_NUM;
  config.pin_pclk = PCLK_GPIO_NUM;
  config.pin_vsync = VSYNC_GPIO_NUM;
  config.pin_href = HREF_GPIO_NUM;
  config.pin_sccb_sda = SIOD_GPIO_NUM;
  config.pin_sccb_scl = SIOC_GPIO_NUM;
  config.pin_pwdn = PWDN_GPIO_NUM;
  config.pin_reset = RESET_GPIO_NUM;
  config.xclk_freq_hz = 20000000;
  config.frame_size = FRAMESIZE_UXGA;
  config.pixel_format = PIXFORMAT_JPEG;
  config.grab_mode = CAMERA_GRAB_WHEN_EMPTY;
  config.fb_location = CAMERA_FB_IN_PSRAM;
  config.jpeg_quality = 12;
  config.fb_count = 1;

  if (config.pixel_format == PIXFORMAT_JPEG) {
    if (psramFound()) {
      Serial.println("Using PSRAM");
      config.jpeg_quality = 10;
      config.fb_count = 2;
      config.grab_mode = CAMERA_GRAB_LATEST;
    } else {
      config.frame_size = FRAMESIZE_UXGA;
      config.fb_location = CAMERA_FB_IN_DRAM;
    }
  } else {
    config.frame_size = FRAMESIZE_240X240;
#if CONFIG_IDF_TARGET_ESP32S3
    config.fb_count = 2;
#endif
  }

  esp_err_t err = esp_camera_init(&config);
  if (err != ESP_OK) {
    Serial.printf("Camera init failed with error 0x%x", err);
    return;
  }

  WiFi.mode(WIFI_STA);
  WiFi.begin(ssid, password);
  delay(1000);
  Serial.println("Connecting to WiFi...");
  long int StartTime = millis();
  while (WiFi.status() != WL_CONNECTED) {
    delay(500);
    if ((StartTime + 10000) < millis()) break;
  }
  if (WiFi.status() == WL_CONNECTED) {
    Serial.println("Connected to WiFi");
    Serial.print("IP address: ");
    Serial.println(WiFi.localIP());
  } else {
    Serial.println("Failed to connect to WiFi");
    return;
  }

  audio.setPinout(I2S_BCLK, I2S_LRC, I2S_DOUT);
  audio.setVolume(100);

  playDetectedText("Welcome to the reader for the blind.");
}

void loop() {
  String detectedText = captureAndUploadImage();
  if (detectedText.length() > 0) {
    playDetectedText(detectedText);
  } else {
    playDetectedText("Please keep the device at least 15 cm above the text.");
  }
  delay(5000); 
}

/**
 * Captures one camera frame, uploads it to serverUrl as a multipart/form-data
 * POST (form field "file", filename "image.jpg") and returns the text the
 * server recognised.
 *
 * @return The text extracted from the server's JSON reply, or "" when the
 *         capture, the memory allocation or the HTTP exchange fails, or when
 *         the server answers with anything other than HTTP 200.
 */
String captureAndUploadImage() {
  camera_fb_t *fb = esp_camera_fb_get();
  if (!fb) {
    Serial.println("Failed to capture image");
    return "";
  }

  Serial.printf("Captured image of size: %u bytes\n", (unsigned)fb->len);
  Serial.printf("Free heap before sending: %u bytes\n", (unsigned)ESP.getFreeHeap());

  const String boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW";

  String bodyStart = "--" + boundary + "\r\n";
  bodyStart += "Content-Disposition: form-data; name=\"file\"; filename=\"image.jpg\"\r\n";
  bodyStart += "Content-Type: image/jpeg\r\n\r\n";

  const String bodyEnd = "\r\n--" + boundary + "--\r\n";

  const size_t bodySize = bodyStart.length();
  const size_t bodyEndSize = bodyEnd.length();
  const size_t totalSize = bodySize + fb->len + bodyEndSize;

  // The request is sent as one contiguous buffer: preamble + JPEG + trailer.
  uint8_t* bodyBuffer = (uint8_t*)malloc(totalSize);
  if (bodyBuffer == NULL) {
    Serial.println("Failed to allocate memory for body buffer");
    esp_camera_fb_return(fb);
    return "";
  }

  memcpy(bodyBuffer, bodyStart.c_str(), bodySize);
  memcpy(bodyBuffer + bodySize, fb->buf, fb->len);
  memcpy(bodyBuffer + bodySize + fb->len, bodyEnd.c_str(), bodyEndSize);

  // The JPEG has been copied, so hand the frame buffer back to the driver
  // before the network round trip (which may take up to the 15 s timeout).
  esp_camera_fb_return(fb);

  HTTPClient http;
  if (!http.begin(serverUrl)) {
    Serial.println("Failed to start HTTP request: invalid server URL");
    free(bodyBuffer);
    return "";
  }
  http.setTimeout(15000);
  http.addHeader("Content-Type", "multipart/form-data; boundary=" + boundary);

  const int httpResponseCode = http.sendRequest("POST", bodyBuffer, totalSize);
  free(bodyBuffer);

  String response = "";
  bool succeeded = false;
  if (httpResponseCode == HTTP_CODE_OK) {
    response = http.getString();
    Serial.println("Server response: " + response);
    succeeded = true;
  } else if (httpResponseCode > 0) {
    // The server replied, but with an error status; its body is an error
    // description, not an OCR result, so it must not be parsed as one.
    Serial.printf("Server returned HTTP %d: %s\n", httpResponseCode, http.getString().c_str());
  } else {
    Serial.printf("Error on HTTP request: %s\n", http.errorToString(httpResponseCode).c_str());
  }

  http.end();
  Serial.printf("Free heap after sending: %u bytes\n", (unsigned)ESP.getFreeHeap());

  if (!succeeded) {
    return "";
  }
  return extractTextFromResponse(response);
}

/**
 * Pulls the recognised text out of the server's JSON reply.
 *
 * The "detectedText" key is preferred. "recognizedText" is accepted as a
 * fallback because that is the key the accompanying server code sends.
 *
 * @param response Raw JSON body returned by the OCR server.
 * @return The recognised text with surrounding whitespace removed, or "" when
 *         the body is empty, is not valid JSON, or carries neither key as a
 *         string.
 */
String extractTextFromResponse(String response) {
  if (response.length() == 0) {
    return "";
  }

  // Size the document from the payload: a page of OCR text does not fit a
  // fixed 1 KB document. The extra bytes cover the JSON object bookkeeping.
  DynamicJsonDocument doc(response.length() + 512);
  const DeserializationError parseError = deserializeJson(doc, response);
  if (parseError) {
    Serial.printf("Failed to parse server response: %s\n", parseError.c_str());
    return "";
  }

  // as<const char*>() yields nullptr when the key is absent or not a string,
  // so a missing key can never be turned into spoken text.
  const char* text = doc["detectedText"].as<const char*>();
  if (text == nullptr) {
    text = doc["recognizedText"].as<const char*>();
  }
  if (text == nullptr) {
    return "";
  }

  String detectedText(text);
  // Whitespace-only OCR output counts as "no text", so the caller falls back
  // to its positioning hint instead of speaking nothing.
  detectedText.trim();
  return detectedText;
}

/**
 * Speaks the given text through the I2S audio output and blocks until
 * playback has finished.
 *
 * @param text Text to speak; it is sent to the text-to-speech service with
 *             the language code "en".
 *
 * NOTE(review): the online TTS endpoint presumably limits the length of a
 * single request; long OCR results may need to be split — confirm.
 */
void playDetectedText(String text) {
  Serial.println("Playing detected text: " + text);
  if (!audio.connecttospeech(text.c_str(), "en")) { // Google TTS
    // Report the failure instead of silently skipping playback.
    Serial.println("Failed to start text-to-speech playback");
    return;
  }
  // audio.loop() has to be called continuously to keep playback going.
  while (audio.isRunning()) {
    audio.loop();
  }
}

/**
 * Writes one line to the serial console: the fixed prefix "audio_info: "
 * followed by the supplied message.
 * NOTE(review): presumably a status hook invoked by the audio library — confirm.
 *
 * @param info Message to log.
 */
void audio_info(const char *info) {
  static const char kLogPrefix[] = "audio_info: ";
  Serial.print(kLogPrefix);
  Serial.println(info);
}

Server code

JavaScript
const express = require('express');
const multer = require('multer');
const path = require('path');
const fs = require('fs');
const Tesseract = require('tesseract.js');

// Express application and the port it listens on.
const app = express();
const port = 3000;

// Make sure the upload directory exists before any request is handled.
if (!fs.existsSync('uploads')) {
    fs.mkdirSync('uploads');
}

// Uploads are written to uploads/ and named "<epoch milliseconds><original extension>".
const storage = multer.diskStorage({
    destination(req, file, cb) {
        cb(null, 'uploads/');
    },
    filename(req, file, cb) {
        const extension = path.extname(file.originalname);
        cb(null, `${Date.now()}${extension}`);
    },
});

const upload = multer({ storage });

// Stored uploads are also served back under the /uploads URL prefix.
app.use('/uploads', express.static(path.join(__dirname, 'uploads')));

/**
 * POST /upload — runs English OCR on the image uploaded in the multipart
 * field "file" and replies with the recognised text as JSON.
 *
 * Responses:
 *   200 { detectedText, recognizedText }  both keys carry the same text;
 *       "detectedText" is the key the ESP32 sketch reads, and
 *       "recognizedText" is kept for existing consumers.
 *   400 { error }  no file was uploaded.
 *   500 { error }  OCR failed.
 *
 * The uploaded file is deleted once OCR has finished, whether or not it
 * succeeded.
 */
app.post('/upload', upload.single('file'), async (req, res) => {
    if (!req.file) {
        console.error('No file uploaded');
        return res.status(400).json({ error: 'No file uploaded' });
    }

    // The upload destination is the relative 'uploads/', so multer's path is
    // relative to the working directory, not to __dirname.
    const filePath = path.resolve(req.file.path);

    try {
        const { data: { text } } = await Tesseract.recognize(filePath, 'eng', {
            logger: (info) => console.log(info),
        });
        console.log('Recognized text:', text);
        res.json({ detectedText: text, recognizedText: text });
    } catch (error) {
        console.error('Error during OCR:', error);
        // Never attempt a second response if one has already been started.
        if (!res.headersSent) {
            res.status(500).json({ error: 'OCR processing failed' });
        }
    } finally {
        // Clean up on success and on failure; a failed delete is only logged.
        fs.unlink(filePath, (unlinkError) => {
            if (unlinkError) {
                console.error('Failed to delete uploaded file:', unlinkError);
            }
        });
    }
});

// Logs the local address once the server has started listening.
function announceListening() {
    console.log(`Server is running at http://localhost:${port}`);
}

app.listen(port, announceListening);

Credits

Bharath Ram

Bharath Ram

1 project • 1 follower

Comments