Skip to main content

1. Networking and Communications

Project Overview

My final project is an ESP32-based smart glasses system requiring internet connectivity for data transmission and interactive capabilities. The implementation focuses on two core functionalities: Speech-to-Text (STT) and
Text-to-Speech (TTS).

1.1 System Architecture

The smart glasses system consists of:

  • ESP32 microcontroller with WiFi capabilities
  • MEMS microphone for voice input
  • Speaker for audio output
  • Custom PCB for peripheral integration
  • Cloud-based speech processing services

Connection Diagram

1.2 Baidu Cloud Speech Service

Key Steps

  1. Service Setup
    • Register at the Baidu AI Open Platform
    • Create an application → enable Speech Recognition (STT) and Text-to-Speech (TTS)
    • Get credentials: App ID, API Key, Secret Key

  2. Authentication

    # Get access token (refresh every 30 days)
    GET https://aip.baidubce.com/oauth/2.0/token?
    grant_type=client_credentials&
    client_id=YOUR_API_KEY&
    client_secret=YOUR_SECRET_KEY

API Specifications

| Service | Endpoint | Key Parameters |
|---|---|---|
| STT | vop.baidu.com/server_api | Format, Sample Rate, Channel |
| TTS | tsn.baidu.com/text2audio | Voice, Speed, Audio Format |

Basic Configuration

STT Requirements:

- Audio: PCM/WAV, 8k/16k sample rate
- Single channel
- Max 60s duration

TTS Defaults:

- Voice: Standard female (ID 0)
- Speed: Normal (5)
- Format: MP3

Error Handling

| Code | Meaning | Solution |
|---|---|---|
| 3300 | Invalid input | Check audio parameters |
| 3301 | Poor audio quality | Improve recording |
| 3302 | Auth failure | Renew access token |
| 3303 | Server timeout | Retry request |

Best Practices

  1. Security
    Store credentials using ESP32's encrypted NVS storage

  2. Audio Processing

    - Noise filtering
    - Auto gain control
    - Silence removal
  3. Network
    Implement retry logic with exponential backoff

1.3 Speech-to-Text (STT) Implementation

Functional Requirements

  1. Device wake-up detection
  2. Voice recording through onboard microphone
  3. Audio transmission to cloud service
  4. Text conversion processing
  5. Response playback via TTS

Technical Implementation

1. Audio Capture (I2S PDM Interface):

  • 8kHz sampling rate
  • 16-bit PCM format
  • Single channel configuration

2. Base64 Encoding

3. Cloud Communication

  • HTTP POST with JSON payload
  • SSL/TLS encryption
  • 60s timeout handling
#include <stdio.h>
#include <string.h>
#include "sdkconfig.h"
#include "driver/i2s_pdm.h"
#include "esp_http_client.h"
#include "esp_log.h"
#include "console_stt.h"
#include "console_tts.h"
#include "mbedtls/base64.h"

#include "cJSON.h"

/* Log tag passed to ESP_LOGx by everything in this file. */
static const char *TAG = "console_stt";

/* Number of bytes requested from the I2S driver per i2s_channel_read() call. */
#define SAMPLE_SIZE (16 * 1024)
/* Bytes of PCM produced per second: 8000 samples/s * (16 bits / 8) bytes per sample. */
#define BYTE_RATE (8000 * (16 / 8))

/* I2S receive channel for the microphone; created by init_microphone(). */
static i2s_chan_handle_t rx_handle = NULL;

/* Raw HTTP response body written by _http_event_handler() and its current length. */
static char gResult[2048] = { 0 };
static size_t gResultLen = 0;
/* Recognized text extracted from gResult by stt_rec_op(). */
static char gContent[2048] = { 0 };
/* Set by _http_event_handler() when HTTP_EVENT_ON_FINISH is delivered. */
static bool gIsFinish = false;

/* Speech-recognition endpoint. NOTE(review): console_cmd_stt_rec() repeats this literal instead of using the variable. */
static const char *url = "https://vop.baidu.com/server_api";

/* Placeholders: server certificate (PEM) and API credentials must be supplied before use. */
static const char *cert = "your_cert";
static const char *cuid = "your_cuid";
static const char *token = "your_token";

/* Plugin descriptor placed in the ".console_cmd_desc" section; names console_cmd_stt_register() as the registration hook. */
static const console_cmd_plugin_desc_t __attribute__((section(".console_cmd_desc"), used)) PLUGIN = { .name = "console_cmd_stt", .plugin_regd_fn = &console_cmd_stt_register };

/* Describes one "stt" sub-command in the dispatch table used by do_cmd_stt(). */
typedef struct stt
{
char *name; /* sub-command word, compared against argv[start_index] */
esp_err_t (*operation)(struct stt *self, int argc, char *argv[]); /* handler run on a match; may be NULL */
int arg_cnt; /* exact argc required for the handler to be invoked */
int start_index; /* argv position at which the sub-command word is expected */
char *help; /* usage line printed by "stt help" and when the handler fails */
} stt_op_t;

/* Handlers are declared up front because the table below refers to them before their definitions. */
static esp_err_t stt_help_op(stt_op_t *self, int argc, char *argv[]);
static esp_err_t stt_rec_op(stt_op_t *self, int argc, char *argv[]);

/* Dispatch table for the "stt" console command; do_cmd_stt() scans it in order. */
static stt_op_t cmd_list[] = {
{ .name = "help", .operation = stt_help_op, .arg_cnt = 2, .start_index = 1, .help = "stt help: Prints the help text for all stt commands" },
{ .name = "rec", .operation = stt_rec_op, .arg_cnt = 3, .start_index = 1, .help = "stt rec <duration>: speech to text" },
};

/*
 * Handler for "stt help": prints the help line of every table entry that
 * has a non-empty one. The arguments are not used.
 *
 * Returns ESP_OK unconditionally.
 */
static esp_err_t stt_help_op(stt_op_t *self, int argc, char *argv[])
{
    const stt_op_t *const table_end = cmd_list + sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (const stt_op_t *entry = cmd_list; entry != table_end; entry++)
    {
        /* Skip entries without any help text. */
        if (entry->help == NULL || entry->help[0] == '\0')
        {
            continue;
        }
        printf(" %s\n", entry->help);
    }

    return ESP_OK;
}

/*
 * Console entry point for the "stt" command.
 *
 * Walks cmd_list and runs the first entry whose name equals
 * argv[start_index] and whose arg_cnt equals argc. If the handler fails,
 * the entry's usage text is logged. If nothing matches, an error is logged.
 *
 * Returns ESP_OK (0) in every case.
 */
static esp_err_t do_cmd_stt(int argc, char **argv)
{
    const int table_len = sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (int idx = 0; idx < table_len; idx++)
    {
        /* The handler is given a pointer to this local copy of the entry. */
        stt_op_t entry = cmd_list[idx];

        /* argv is too short to hold the sub-command word for this entry. */
        if (argc <= entry.start_index)
        {
            continue;
        }
        if (strcmp(entry.name, argv[entry.start_index]) != 0)
        {
            continue;
        }
        /* Name matches but the argument count does not: keep scanning. */
        if (entry.arg_cnt != argc)
        {
            continue;
        }

        if (entry.operation != NULL && entry.operation(&entry, argc, argv) != ESP_OK)
        {
            ESP_LOGE(TAG, "Usage:\n%s", entry.help);
        }
        return ESP_OK;
    }

    ESP_LOGE(TAG, "Command not available");

    return ESP_OK;
}

esp_err_t console_cmd_stt_get_result(char *ret, int max_len)
{
cJSON *root = cJSON_Parse(gResult);
if (root)
{
cJSON *result = cJSON_GetObjectItem(root, "result");
if (result == NULL)
{
ESP_LOGE(TAG, "Missing 'result' field");
printf("%s", gResult);
goto ret;
}
if (!cJSON_IsArray(result))
{
ESP_LOGE(TAG, "'result' field is not an array");
goto ret;
}
cJSON *item = cJSON_GetArrayItem(result, 0);
if (item == NULL)
{
ESP_LOGE(TAG, "Missing array item");
goto ret;
}
strncpy(ret, item->valuestring, max_len);
}
else
{
ESP_LOGE(TAG, "Invalid response format");
return ESP_FAIL;
}

ret:
cJSON_Delete(root);
return ESP_OK;
}

/*
 * esp_http_client event callback for the STT request.
 *
 * Appends every received body chunk to gResult, keeping it NUL-terminated,
 * and sets gIsFinish when the client reports HTTP_EVENT_ON_FINISH. Data
 * that does not fit in gResult is dropped with a warning instead of being
 * written past the end of the buffer.
 *
 * Always returns ESP_OK.
 */
static esp_err_t _http_event_handler(esp_http_client_event_t *evt)
{
    switch (evt->event_id)
    {
    case HTTP_EVENT_ON_DATA:
        if (evt->data != NULL && evt->data_len > 0)
        {
            /* One byte is reserved for the terminator. */
            const size_t limit = sizeof(gResult) - 1;
            size_t room = (gResultLen < limit) ? (limit - gResultLen) : 0;
            size_t chunk = (size_t)evt->data_len;

            if (chunk > room)
            {
                ESP_LOGW(TAG, "Response too large, dropping %u byte(s)", (unsigned)(chunk - room));
                chunk = room;
            }
            if (chunk > 0)
            {
                memcpy(gResult + gResultLen, evt->data, chunk);
                /* Accumulate: the body may arrive in several chunks. */
                gResultLen += chunk;
                gResult[gResultLen] = '\0';
            }
        }
        break;
    case HTTP_EVENT_ON_FINISH:
        gIsFinish = true;
        break;
    default:
        break;
    }
    return ESP_OK;
}

esp_err_t console_cmd_stt_rec(int duration)
{
static int16_t i2s_readraw_buff[SAMPLE_SIZE];
uint32_t flash_rec_time = BYTE_RATE * duration;
uint8_t *pcm_buff = malloc(256 * 1024);
size_t bytes_read;
int flash_wr_size = 0;
ESP_ERROR_CHECK(i2s_channel_enable(rx_handle));
query();
printf("Recording for %d seconds...\n", duration);
while (flash_wr_size < flash_rec_time)
{
if (i2s_channel_read(rx_handle, (char *)(i2s_readraw_buff), SAMPLE_SIZE, &bytes_read, 1000) == ESP_OK)
{
memcpy(pcm_buff + flash_wr_size, i2s_readraw_buff, bytes_read);
flash_wr_size += bytes_read;
}
}
printf("\n");
ESP_ERROR_CHECK(i2s_channel_disable(rx_handle));
uint8_t *base64_pcm = malloc(flash_wr_size * 2);
size_t base64_size = 0;
// Base64 encode the PCM data
mbedtls_base64_encode(base64_pcm, flash_wr_size * 2, &base64_size, pcm_buff, flash_wr_size);
base64_pcm[base64_size] = '\0';

esp_http_client_config_t config = {
.url = "https://vop.baidu.com/server_api",
.event_handler = _http_event_handler,
.cert_pem = cert,
.buffer_size = 10240,
};

esp_http_client_handle_t client = esp_http_client_init(&config);

// 构造请求体
cJSON *root = cJSON_CreateObject();
cJSON *format = cJSON_CreateString("pcm");
cJSON *rate = cJSON_CreateNumber(8000);
cJSON *channel = cJSON_CreateNumber(1);
cJSON *qcuid = cJSON_CreateString(cuid);
cJSON *qtoken = cJSON_CreateString(token);
cJSON *speech = cJSON_CreateString((const char *)base64_pcm);
cJSON *len = cJSON_CreateNumber(flash_wr_size);
cJSON_AddItemToObject(root, "format", format);
cJSON_AddItemToObject(root, "rate", rate);
cJSON_AddItemToObject(root, "channel", channel);
cJSON_AddItemToObject(root, "cuid", qcuid);
cJSON_AddItemToObject(root, "token", qtoken);
cJSON_AddItemToObject(root, "speech", speech);
cJSON_AddItemToObject(root, "len", len);

char *json_data = cJSON_PrintUnformatted(root);

esp_http_client_set_header(client, "Content-Type", "application/json");
esp_http_client_set_method(client, HTTP_METHOD_POST);
esp_http_client_set_post_field(client, json_data, strlen(json_data));
esp_http_client_set_timeout_ms(client, 60000);

gIsFinish = false;
gResultLen = 0;
esp_err_t err = esp_http_client_perform(client);
if (err != ESP_OK)
{
ESP_LOGE(TAG, "HTTP request failed: %s", esp_err_to_name(err));
}
else
{
while (!gIsFinish)
{
vTaskDelay(10 / portTICK_PERIOD_MS);
}
}

esp_http_client_cleanup(client);
free(base64_pcm);
free(pcm_buff);
cJSON_Delete(root);

return ESP_OK;
}

/*
 * Handler for "stt rec <duration>": records, sends the audio for
 * recognition and prints the recognized text.
 *
 * The duration argument is parsed with atoi(); values <= 0 (including
 * unparsable input) fall back to 10 seconds and values above 15 are
 * clamped to 15.
 *
 * @param self  table entry this handler was dispatched from
 * @return ESP_FAIL when the duration argument is missing, else ESP_OK.
 */
static esp_err_t stt_rec_op(stt_op_t *self, int argc, char *argv[])
{
    /* Use the entry we were dispatched from. start_index is an argv
     * position and must not be used to index cmd_list. */
    if (argc < self->arg_cnt || argc <= self->start_index + 1)
    {
        ESP_LOGE(TAG, "Error: Invalid command\n");
        ESP_LOGE(TAG, "%s\n", self->help);
        return ESP_FAIL;
    }

    int duration = atoi(argv[self->start_index + 1]);
    if (duration <= 0)
    {
        duration = 10;
    }
    if (duration > 15)
    {
        duration = 15;
    }

    console_cmd_stt_rec(duration);
    /* Start from an empty string so a failed lookup prints nothing stale. */
    memset(gContent, 0, sizeof(gContent));
    console_cmd_stt_get_result(gContent, sizeof(gContent));
    printf("Result: %s\n", gContent);
    return ESP_OK;
}

/*
 * Creates the I2S receive channel for the PDM microphone and stores it in
 * rx_handle. The channel is configured for 8000 Hz, 16-bit, mono, with the
 * clock on GPIO 42 and data-in on GPIO 41. Any driver error aborts via
 * ESP_ERROR_CHECK.
 */
void init_microphone(void)
{
    i2s_chan_config_t rx_chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_AUTO, I2S_ROLE_MASTER);
    i2s_pdm_rx_config_t mic_cfg = {
        .clk_cfg = I2S_PDM_RX_CLK_DEFAULT_CONFIG(8000),
        /* The default mono slot is the left slot (whose 'select pin' of the PDM microphone is pulled down) */
        .slot_cfg = I2S_PDM_RX_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT, I2S_SLOT_MODE_MONO),
        .gpio_cfg = {
            .clk = 42,
            .din = 41,
            .invert_flags = {
                .clk_inv = false,
            },
        },
    };

    /* Receive only: no TX handle is requested. */
    ESP_ERROR_CHECK(i2s_new_channel(&rx_chan_cfg, NULL, &rx_handle));
    ESP_ERROR_CHECK(i2s_channel_init_pdm_rx_mode(rx_handle, &mic_cfg));
}
esp_err_t console_cmd_stt_register(void)
{
init_microphone();

esp_err_t ret;
esp_console_cmd_t command = { .command = "stt", .help = "stt help", .hint = NULL, .func = &do_cmd_stt, .argtable = NULL };

ret = esp_console_cmd_register(&command);
if (ret)
{
ESP_LOGE(TAG, "Unable to register stt");
}

return ret;
}

Key Components

  1. Audio Capture: Uses I2S PDM interface for high-quality audio sampling
  2. Base64 Encoding: Converts raw PCM data for API compatibility
  3. HTTP Client: Handles secure communication with Baidu Voice API
  4. JSON Parsing: Processes API responses using cJSON library
Security Note: The authentication tokens shown are placeholders. Never hard-code real credentials; supply them through build-time configuration (Kconfig/sdkconfig options, as below) or encrypted NVS storage:

// Replace with secure credential storage
static const char *cuid = CONFIG_VOICE_API_CUID;
static const char *token = CONFIG_VOICE_API_TOKEN;

Testing

  1. Input stt rec 5 to start recording for 5 seconds
stt rec 5
  2. After recording, the system will display the recognized text
Result: How are you

1.4 Text-to-Speech (TTS) Implementation

Functional Flow

  1. Text input via serial interface
  2. Cloud API request generation
  3. Audio stream reception
  4. Real-time audio playback

Technical Implementation

1. Text Processing:

  • URL encoding for special characters
  • Voice parameter customization (speed/pitch)

2. Stream Handling
#include <stdio.h>
#include <string.h>
#include "sdkconfig.h"
#include "driver/i2s_std.h"
#include "esp_http_client.h"
#include "esp_log.h"
#include "console_tts.h"
#include "audio.h"

/* Log tag passed to ESP_LOGx by everything in this file. */
static const char *TAG = "console_tts";

/* I2S transmit channel for the speaker; created by init_speaker(). */
static i2s_chan_handle_t tx_handle = NULL;

/* Text-to-speech endpoint and the server certificate (PEM) used to verify it. */
static const char *url = "https://tsn.baidu.com/text2audio";
static const char *cert = "your_cert";

/*
 * API credentials. These are placeholders: real device IDs and access
 * tokens must not be committed to source. Supply them from build-time
 * configuration or secure storage before use.
 */
static const char *cuid = "your_cuid";
static const char *token = "your_token";

/* Plugin descriptor placed in the ".console_cmd_desc" section; names console_cmd_tts_register() as the registration hook. */
static const console_cmd_plugin_desc_t __attribute__((section(".console_cmd_desc"), used)) PLUGIN = { .name = "console_cmd_tts", .plugin_regd_fn = &console_cmd_tts_register };

/* Describes one "tts" sub-command in the dispatch table used by do_cmd_tts(). */
typedef struct tts
{
char *name; /* sub-command word, compared against argv[start_index] */
esp_err_t (*operation)(struct tts *self, int argc, char *argv[]); /* handler run on a match; may be NULL */
int arg_cnt; /* exact argc required for the handler to be invoked */
int start_index; /* argv position at which the sub-command word is expected */
char *help; /* usage line printed by "tts help" and when the handler fails */
} tts_op_t;

/* Handlers are declared up front because the table below refers to them before their definitions. */
static esp_err_t tts_help_op(tts_op_t *self, int argc, char *argv[]);
static esp_err_t tts_play_op(tts_op_t *self, int argc, char *argv[]);

/* Dispatch table for the "tts" console command; do_cmd_tts() scans it in order. */
static tts_op_t cmd_list[] = {
{ .name = "help", .operation = tts_help_op, .arg_cnt = 2, .start_index = 1, .help = "tts help: Prints the help text for all tts commands" },
{ .name = "play", .operation = tts_play_op, .arg_cnt = 3, .start_index = 1, .help = "tts play <text>: play tts text." },
};

/*
 * Handler for "tts help": prints the help line of every table entry that
 * has a non-empty one. The arguments are not used.
 *
 * Returns ESP_OK unconditionally.
 */
static esp_err_t tts_help_op(tts_op_t *self, int argc, char *argv[])
{
    const tts_op_t *const table_end = cmd_list + sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (const tts_op_t *entry = cmd_list; entry != table_end; entry++)
    {
        /* Skip entries without any help text. */
        if (entry->help == NULL || entry->help[0] == '\0')
        {
            continue;
        }
        printf(" %s\n", entry->help);
    }

    return ESP_OK;
}

/*
 * Console entry point for the "tts" command.
 *
 * Walks cmd_list and runs the first entry whose name equals
 * argv[start_index] and whose arg_cnt equals argc. If the handler fails,
 * the entry's usage text is logged. If nothing matches, an error is logged.
 *
 * Returns ESP_OK (0) in every case.
 */
static esp_err_t do_cmd_tts(int argc, char **argv)
{
    const int table_len = sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (int idx = 0; idx < table_len; idx++)
    {
        /* The handler is given a pointer to this local copy of the entry. */
        tts_op_t entry = cmd_list[idx];

        /* argv is too short to hold the sub-command word for this entry. */
        if (argc <= entry.start_index)
        {
            continue;
        }
        if (strcmp(entry.name, argv[entry.start_index]) != 0)
        {
            continue;
        }
        /* Name matches but the argument count does not: keep scanning. */
        if (entry.arg_cnt != argc)
        {
            continue;
        }

        if (entry.operation != NULL && entry.operation(&entry, argc, argv) != ESP_OK)
        {
            ESP_LOGE(TAG, "Usage:\n%s", entry.help);
        }
        return ESP_OK;
    }

    ESP_LOGE(TAG, "Command not available");

    return ESP_OK;
}

/*
 * esp_http_client event callback for the TTS request.
 *
 * Each received body chunk is written straight to the speaker channel; the
 * channel is enabled when the first chunk arrives. Any other event that
 * arrives while playback is active disables the channel again.
 *
 * Always returns ESP_OK.
 */
esp_err_t _tts_event_handler(esp_http_client_event_t *evt)
{
    /* Remembers across calls whether tx_handle is currently enabled. */
    static bool is_playing = false;

    if (evt->event_id == HTTP_EVENT_ON_DATA)
    {
        size_t written = 0;

        if (!is_playing)
        {
            is_playing = true;
            i2s_channel_enable(tx_handle);
        }
        i2s_channel_write(tx_handle, evt->data, evt->data_len, &written, 1000);
    }
    else if (is_playing)
    {
        is_playing = false;
        i2s_channel_disable(tx_handle);
    }

    return ESP_OK;
}

/*
 * Plays a caller-supplied buffer on the speaker channel: enable, write
 * `len` bytes from `data` with a 1000 ms timeout, disable.
 *
 * Driver return codes are not inspected; always returns ESP_OK.
 */
esp_err_t console_cmd_tts_write(uint8_t *data, int len)
{
    size_t sent = 0;

    i2s_channel_enable(tx_handle);
    i2s_channel_write(tx_handle, data, len, &sent, 1000);
    i2s_channel_disable(tx_handle);

    return ESP_OK;
}

/*
 * Plays the built-in clip _audio1 on the speaker channel.
 * NOTE(review): presumably the prompt tone heard before recording starts;
 * confirm against audio.h.
 *
 * Driver return codes are not inspected; always returns ESP_OK.
 */
esp_err_t query(void)
{
    const size_t clip_len = sizeof(_audio1);

    i2s_channel_enable(tx_handle);
    i2s_channel_write(tx_handle, (uint8_t *)_audio1, clip_len, NULL, 1000);
    i2s_channel_disable(tx_handle);

    return ESP_OK;
}

/*
 * Plays the built-in clip _audio2 on the speaker channel.
 * NOTE(review): presumably a "please wait" tone; confirm against audio.h.
 *
 * Driver return codes are not inspected; always returns ESP_OK.
 */
esp_err_t wait(void)
{
    const size_t clip_len = sizeof(_audio2);

    i2s_channel_enable(tx_handle);
    i2s_channel_write(tx_handle, (uint8_t *)_audio2, clip_len, NULL, 1000);
    i2s_channel_disable(tx_handle);

    return ESP_OK;
}

esp_err_t console_cmd_tts_play(const char *text)
{
esp_http_client_config_t config = {
.url = url,
.event_handler = _tts_event_handler,
.cert_pem = cert,
.buffer_size = 10240,
.disable_auto_redirect = true,
};

char *payload = malloc(4096);
snprintf(payload, 4096, "tex=%s&tok=%s&cuid=%s&ctp=1&lan=en&spd=5&pit=5&vol=10&per=0&aue=5", text, token, cuid);
esp_http_client_handle_t client = esp_http_client_init(&config);
esp_http_client_set_header(client, "Content-Type", "application/x-www-form-urlencoded");
esp_http_client_set_header(client, "Accept", "*/*");
esp_http_client_set_method(client, HTTP_METHOD_POST);
esp_http_client_set_post_field(client, payload, strlen(payload));
esp_http_client_set_timeout_ms(client, 60000);

esp_err_t err = esp_http_client_perform(client);
if (err != ESP_OK)
{
ESP_LOGE(TAG, "HTTP request failed: %s", esp_err_to_name(err));
}

esp_http_client_cleanup(client);
free(payload);
return ESP_OK;
}

/*
 * Handler for "tts play <text>": speaks the text argument.
 *
 * @param self  table entry this handler was dispatched from
 * @return ESP_FAIL when the text argument is missing or empty, else ESP_OK.
 */
static esp_err_t tts_play_op(tts_op_t *self, int argc, char *argv[])
{
    /* Use the entry we were dispatched from. start_index is an argv
     * position and must not be used to index cmd_list. */
    if (argc < self->arg_cnt || argc <= self->start_index + 1)
    {
        ESP_LOGE(TAG, "Error: Invalid command\n");
        ESP_LOGE(TAG, "%s\n", self->help);
        return ESP_FAIL;
    }

    const char *text = argv[self->start_index + 1];
    if (text == NULL || text[0] == '\0')
    {
        ESP_LOGE(TAG, "Error: Invalid text\n");
        return ESP_FAIL;
    }

    console_cmd_tts_play(text);
    return ESP_OK;
}

/*
 * Creates the I2S transmit channel for the speaker and stores it in
 * tx_handle. The channel is configured in standard mode for 8000 Hz,
 * 16-bit, mono, with BCLK on GPIO 2, WS on GPIO 3 and data-out on GPIO 1.
 * Any driver error aborts via ESP_ERROR_CHECK.
 */
void init_speaker(void)
{
    i2s_chan_config_t spk_chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_AUTO, I2S_ROLE_MASTER);
    i2s_std_config_t spk_cfg = {
        .clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(8000),
        .slot_cfg = I2S_STD_MSB_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT, I2S_SLOT_MODE_MONO),
        .gpio_cfg = {
            .mclk = I2S_GPIO_UNUSED, // some codecs may require mclk signal, this example doesn't need it
            .bclk = 2,
            .ws = 3,
            .dout = 1,
            .din = I2S_GPIO_UNUSED,
            .invert_flags = {
                .mclk_inv = false,
                .bclk_inv = false,
                .ws_inv = false,
            },
        },
    };

    /* Transmit only: no RX handle is requested. */
    ESP_ERROR_CHECK(i2s_new_channel(&spk_chan_cfg, &tx_handle, NULL));
    ESP_ERROR_CHECK(i2s_channel_init_std_mode(tx_handle, &spk_cfg));
}

esp_err_t console_cmd_tts_register(void)
{
init_speaker();

esp_err_t ret;
esp_console_cmd_t command = { .command = "tts", .help = "tts help", .hint = NULL, .func = &do_cmd_tts, .argtable = NULL };

ret = esp_console_cmd_register(&command);
if (ret)
{
ESP_LOGE(TAG, "Unable to register tts");
}

return ret;
}

Security Note: The authentication tokens shown are placeholders. Never hard-code real credentials; supply them through build-time configuration (Kconfig/sdkconfig options, as below) or encrypted NVS storage:

// Replace with secure credential storage
static const char *cuid = CONFIG_VOICE_API_CUID;
static const char *token = CONFIG_VOICE_API_TOKEN;

Testing

  1. Input tts play "how are you" to play the audio
tts play "how are you"
  2. The system will play the audio

Here's a condensed version of challenges & solutions:

1.5 Key Challenges & Solutions

| Challenge | Solution |
|---|---|
| Audio quality issues | IIR noise filtering; auto gain control |
| Network instability | Exponential backoff retry; dual WiFi/ESP-NOW mode |
| Memory constraints | Stream buffering with RING_BUF_SIZE=8KB; NVS credential storage |
| API limitations | Request batching; local voice cache |
| Real-time sync | Double buffering; DMA-driven I2S |

1.6 Development Resources

Code Repository

  1. SmartGlasses Project Codebase

  2. PCB