1. Networking and Communications
Project Overview
My final project is an ESP32-based smart glasses system requiring internet
connectivity for data transmission and interactive capabilities. The
implementation focuses on two core functionalities: Speech-to-Text (STT) and
Text-to-Speech (TTS).
1.1 System Architecture
The smart glasses system consists of:
- ESP32 microcontroller with WiFi capabilities
- MEMS microphone for voice input
- Speaker for audio output
- Custom PCB for peripheral integration
- Cloud-based speech processing services
Connection Diagram
1.2 Baidu Cloud Speech Service
Key Steps
-
Service Setup
• Register at Baidu AI Open Platform
• Create application → Enable Speech Recognition (STT) and Text-to-Speech (TTS)
• Get credentials: App ID, API Key, Secret Key
-
Authentication
# Get access token (refresh every 30 days)
GET https://aip.baidubce.com/oauth/2.0/token?
grant_type=client_credentials&
client_id=YOUR_API_KEY&
client_secret=YOUR_SECRET_KEY
API Specifications
Service | Endpoint | Key Parameters |
---|---|---|
STT | vop.baidu.com/server_api | Format, Sample Rate, Channel |
TTS | tsn.baidu.com/text2audio | Voice, Speed, Audio Format |
Basic Configuration
• STT Requirements:
- Audio: PCM/WAV, 8k/16k sample rate
- Single channel
- Max 60s duration
• TTS Defaults:
- Voice: Standard female (ID 0)
- Speed: Normal (5)
- Format: MP3
Error Handling
Code | Meaning | Solution |
---|---|---|
3300 | Invalid input | Check audio parameters |
3301 | Poor audio quality | Improve recording |
3302 | Auth failure | Renew access token |
3303 | Server timeout | Retry request |
Best Practices
- Security
  - Store credentials using ESP32's encrypted NVS storage
- Audio Processing
  - Noise filtering
  - Auto gain control
  - Silence removal
- Network
  - Implement retry logic with exponential backoff
1.3 Speech-to-Text (STT) Implementation
Functional Requirements
- Device wake-up detection
- Voice recording through onboard microphone
- Audio transmission to cloud service
- Text conversion processing
- Response playback via TTS
Technical Implementation
1. Audio Capture (I2S PDM Interface):
- 8kHz sampling rate
- 16-bit PCM format
- Single channel configuration
2. Base64 Encoding
3. Cloud Communication
- HTTP POST with JSON payload
- SSL/TLS encryption
- 60s timeout handling
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "sdkconfig.h"
#include "driver/i2s_pdm.h"
#include "esp_http_client.h"
#include "esp_log.h"
#include "console_stt.h"
#include "console_tts.h"
#include "mbedtls/base64.h"
#include "cJSON.h"
/* Size in bytes requested from the I2S driver per read; also the element
 * count of the capture scratch buffer. */
#define SAMPLE_SIZE (16 * 1024)
/* PCM bytes per second: 8000 samples/s times 2 bytes per 16-bit sample. */
#define BYTE_RATE (8000 * (16 / 8))

/* Tag passed to the ESP_LOG* macros in this file. */
static const char *TAG = "console_stt";

/* I2S receive channel handle; assigned by init_microphone(). */
static i2s_chan_handle_t rx_handle = NULL;

/* Raw HTTP response body filled in by the HTTP event handler, and the number
 * of bytes currently stored in it. */
static char gResult[2048] = { 0 };
static size_t gResultLen = 0;
/* Text extracted from gResult for printing. */
static char gContent[2048] = { 0 };
/* Set by the HTTP event handler when HTTP_EVENT_ON_FINISH is delivered. */
static bool gIsFinish = false;

/* Service endpoint, server certificate and API credentials (placeholders). */
static const char *url = "https://vop.baidu.com/server_api";
static const char *cert = "your_cert";
static const char *cuid = "your_cuid";
static const char *token = "your_token";

/* Plugin descriptor placed in the ".console_cmd_desc" section; it names the
 * registration function of this command. */
static const console_cmd_plugin_desc_t __attribute__((section(".console_cmd_desc"), used)) PLUGIN = {
    .name = "console_cmd_stt",
    .plugin_regd_fn = &console_cmd_stt_register,
};
typedef struct stt
{
char *name;
esp_err_t (*operation)(struct stt *self, int argc, char *argv[]);
int arg_cnt;
int start_index;
char *help;
} stt_op_t;
static esp_err_t stt_help_op(stt_op_t *self, int argc, char *argv[]);
static esp_err_t stt_rec_op(stt_op_t *self, int argc, char *argv[]);
static stt_op_t cmd_list[] = {
{ .name = "help", .operation = stt_help_op, .arg_cnt = 2, .start_index = 1, .help = "stt help: Prints the help text for all stt commands" },
{ .name = "rec", .operation = stt_rec_op, .arg_cnt = 3, .start_index = 1, .help = "stt rec <duration>: speech to text" },
};
/**
 * Prints the help line of every entry in cmd_list that has a non-empty
 * help text. The arguments are not used.
 *
 * @return always ESP_OK
 */
static esp_err_t stt_help_op(stt_op_t *self, int argc, char *argv[])
{
    const int total = sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (int idx = 0; idx < total; ++idx)
    {
        const char *text = cmd_list[idx].help;

        /* Skip entries without usable help text. */
        if (text == NULL || text[0] == '\0')
        {
            continue;
        }
        printf(" %s\n", text);
    }
    return ESP_OK;
}
/**
 * Console entry point for "stt": looks up the sub-command named in argv and
 * runs its handler when the argument count matches exactly.
 *
 * @return always ESP_OK (0); failures are only reported through the log.
 */
static esp_err_t do_cmd_stt(int argc, char **argv)
{
    const int total = sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (int idx = 0; idx < total; ++idx)
    {
        /* Work on a copy so the handler receives a mutable descriptor. */
        stt_op_t entry = cmd_list[idx];

        /* Not enough arguments to even hold this entry's keyword. */
        if (argc <= entry.start_index)
        {
            continue;
        }
        if (strcmp(entry.name, argv[entry.start_index]) != 0)
        {
            continue;
        }
        /* Keyword matches but the argument count does not: keep searching. */
        if (entry.arg_cnt != argc)
        {
            continue;
        }

        if (entry.operation != NULL && entry.operation(&entry, argc, argv) != ESP_OK)
        {
            ESP_LOGE(TAG, "Usage:\n%s", entry.help);
        }
        return ESP_OK;
    }

    ESP_LOGE(TAG, "Command not available");
    return ESP_OK;
}
esp_err_t console_cmd_stt_get_result(char *ret, int max_len)
{
cJSON *root = cJSON_Parse(gResult);
if (root)
{
cJSON *result = cJSON_GetObjectItem(root, "result");
if (result == NULL)
{
ESP_LOGE(TAG, "Missing 'result' field");
printf("%s", gResult);
goto ret;
}
if (!cJSON_IsArray(result))
{
ESP_LOGE(TAG, "'result' field is not an array");
goto ret;
}
cJSON *item = cJSON_GetArrayItem(result, 0);
if (item == NULL)
{
ESP_LOGE(TAG, "Missing array item");
goto ret;
}
strncpy(ret, item->valuestring, max_len);
}
else
{
ESP_LOGE(TAG, "Invalid response format");
return ESP_FAIL;
}
ret:
cJSON_Delete(root);
return ESP_OK;
}
/**
 * HTTP client callback: appends each received body chunk to gResult and
 * flags completion in gIsFinish.
 *
 * The stored text is kept NUL-terminated. Data that does not fit in gResult
 * is dropped (with a warning) instead of being written past the buffer.
 *
 * @return always ESP_OK
 */
static esp_err_t _http_event_handler(esp_http_client_event_t *evt)
{
    switch (evt->event_id)
    {
    case HTTP_EVENT_ON_DATA:
        if (evt->data != NULL && evt->data_len > 0)
        {
            const size_t capacity = sizeof(gResult) - 1; /* keep room for '\0' */
            size_t room = (gResultLen < capacity) ? (capacity - gResultLen) : 0;
            size_t chunk = (size_t)evt->data_len;

            if (chunk > room)
            {
                ESP_LOGW(TAG, "Response larger than %u bytes, truncating", (unsigned)capacity);
                chunk = room;
            }
            if (chunk > 0)
            {
                memcpy(gResult + gResultLen, evt->data, chunk);
                /* Accumulate: a response may arrive in several chunks. */
                gResultLen += chunk;
                gResult[gResultLen] = '\0';
            }
        }
        break;
    case HTTP_EVENT_ON_FINISH:
        gIsFinish = true;
        break;
    default:
        break;
    }
    return ESP_OK;
}
esp_err_t console_cmd_stt_rec(int duration)
{
static int16_t i2s_readraw_buff[SAMPLE_SIZE];
uint32_t flash_rec_time = BYTE_RATE * duration;
uint8_t *pcm_buff = malloc(256 * 1024);
size_t bytes_read;
int flash_wr_size = 0;
ESP_ERROR_CHECK(i2s_channel_enable(rx_handle));
query();
printf("Recording for %d seconds...\n", duration);
while (flash_wr_size < flash_rec_time)
{
if (i2s_channel_read(rx_handle, (char *)(i2s_readraw_buff), SAMPLE_SIZE, &bytes_read, 1000) == ESP_OK)
{
memcpy(pcm_buff + flash_wr_size, i2s_readraw_buff, bytes_read);
flash_wr_size += bytes_read;
}
}
printf("\n");
ESP_ERROR_CHECK(i2s_channel_disable(rx_handle));
uint8_t *base64_pcm = malloc(flash_wr_size * 2);
size_t base64_size = 0;
// Base64 encode the PCM data
mbedtls_base64_encode(base64_pcm, flash_wr_size * 2, &base64_size, pcm_buff, flash_wr_size);
base64_pcm[base64_size] = '\0';
esp_http_client_config_t config = {
.url = "https://vop.baidu.com/server_api",
.event_handler = _http_event_handler,
.cert_pem = cert,
.buffer_size = 10240,
};
esp_http_client_handle_t client = esp_http_client_init(&config);
// 构造请求体
cJSON *root = cJSON_CreateObject();
cJSON *format = cJSON_CreateString("pcm");
cJSON *rate = cJSON_CreateNumber(8000);
cJSON *channel = cJSON_CreateNumber(1);
cJSON *qcuid = cJSON_CreateString(cuid);
cJSON *qtoken = cJSON_CreateString(token);
cJSON *speech = cJSON_CreateString((const char *)base64_pcm);
cJSON *len = cJSON_CreateNumber(flash_wr_size);
cJSON_AddItemToObject(root, "format", format);
cJSON_AddItemToObject(root, "rate", rate);
cJSON_AddItemToObject(root, "channel", channel);
cJSON_AddItemToObject(root, "cuid", qcuid);
cJSON_AddItemToObject(root, "token", qtoken);
cJSON_AddItemToObject(root, "speech", speech);
cJSON_AddItemToObject(root, "len", len);
char *json_data = cJSON_PrintUnformatted(root);
esp_http_client_set_header(client, "Content-Type", "application/json");
esp_http_client_set_method(client, HTTP_METHOD_POST);
esp_http_client_set_post_field(client, json_data, strlen(json_data));
esp_http_client_set_timeout_ms(client, 60000);
gIsFinish = false;
gResultLen = 0;
esp_err_t err = esp_http_client_perform(client);
if (err != ESP_OK)
{
ESP_LOGE(TAG, "HTTP request failed: %s", esp_err_to_name(err));
}
else
{
while (!gIsFinish)
{
vTaskDelay(10 / portTICK_PERIOD_MS);
}
}
esp_http_client_cleanup(client);
free(base64_pcm);
free(pcm_buff);
cJSON_Delete(root);
return ESP_OK;
}
/**
 * Handler for "stt rec <duration>": records, sends the audio for recognition
 * and prints the recognised text.
 *
 * The duration argument is parsed as a decimal number of seconds. Missing,
 * non-numeric or non-positive values fall back to 10 s, and values above
 * 15 s are clamped to 15 s.
 *
 * @return ESP_FAIL when too few arguments were supplied (the caller then
 *         prints the usage text), ESP_OK otherwise. Runtime failures are
 *         logged here and do not trigger the usage text.
 */
static esp_err_t stt_rec_op(stt_op_t *self, int argc, char *argv[])
{
    enum { DEFAULT_DURATION_S = 10, MAX_DURATION_S = 15 };
    const char *arg = NULL;
    char *end = NULL;
    long requested = 0;
    int duration = DEFAULT_DURATION_S;

    /* Validate against this command's own descriptor rather than indexing
     * cmd_list with start_index, which is an argv position. */
    if (argc < self->arg_cnt)
    {
        ESP_LOGE(TAG, "Error: Invalid command\n");
        ESP_LOGE(TAG, "%s\n", self->help);
        return ESP_FAIL;
    }

    arg = argv[self->start_index + 1];
    /* strtol saturates on overflow, so huge inputs clamp instead of
     * invoking undefined behaviour as atoi would. */
    requested = strtol(arg, &end, 10);
    if (end == arg || requested <= 0)
    {
        duration = DEFAULT_DURATION_S;
    }
    else if (requested > MAX_DURATION_S)
    {
        duration = MAX_DURATION_S;
    }
    else
    {
        duration = (int)requested;
    }

    memset(gContent, 0, sizeof(gContent));
    if (console_cmd_stt_rec(duration) != ESP_OK)
    {
        ESP_LOGE(TAG, "Recording or recognition request failed");
        return ESP_OK;
    }
    if (console_cmd_stt_get_result(gContent, sizeof(gContent)) != ESP_OK)
    {
        ESP_LOGE(TAG, "No recognition result available");
        return ESP_OK;
    }
    printf("Result: %s\n", gContent);
    return ESP_OK;
}
/**
 * Creates the I2S receive channel for the microphone and configures it in
 * PDM RX mode: 8000 Hz, 16-bit, mono, clock on GPIO 42 and data in on
 * GPIO 41. The handle is stored in rx_handle; any driver error aborts via
 * ESP_ERROR_CHECK.
 */
void init_microphone(void)
{
    i2s_chan_config_t rx_chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_AUTO, I2S_ROLE_MASTER);
    i2s_pdm_rx_config_t mic_cfg = {
        .clk_cfg = I2S_PDM_RX_CLK_DEFAULT_CONFIG(8000),
        /* The default mono slot is the left slot (whose 'select pin' of the PDM microphone is pulled down) */
        .slot_cfg = I2S_PDM_RX_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT, I2S_SLOT_MODE_MONO),
        .gpio_cfg = {
            .clk = 42,
            .din = 41,
            .invert_flags = {
                .clk_inv = false,
            },
        },
    };

    /* Only an RX handle is requested; the TX slot of the pair stays unused. */
    ESP_ERROR_CHECK(i2s_new_channel(&rx_chan_cfg, NULL, &rx_handle));
    ESP_ERROR_CHECK(i2s_channel_init_pdm_rx_mode(rx_handle, &mic_cfg));
}
esp_err_t console_cmd_stt_register(void)
{
init_microphone();
esp_err_t ret;
esp_console_cmd_t command = { .command = "stt", .help = "stt help", .hint = NULL, .func = &do_cmd_stt, .argtable = NULL };
ret = esp_console_cmd_register(&command);
if (ret)
{
ESP_LOGE(TAG, "Unable to register stt");
}
return ret;
}
Key Components
- Audio Capture: Uses I2S PDM interface for high-quality audio sampling
- Base64 Encoding: Converts raw PCM data for API compatibility
- HTTP Client: Handles secure communication with Baidu Voice API
- JSON Parsing: Processes API responses using cJSON library
Use build-time configuration values (e.g. Kconfig options exposed through sdkconfig.h) instead of hard-coding actual credentials:
// Replace with secure credential storage
static const char *cuid = CONFIG_VOICE_API_CUID;
static const char *token = CONFIG_VOICE_API_TOKEN;
Testing
- Input
stt rec 5
to start recording for 5 seconds
stt rec 5
- After recording, the system will display the recognized text
Result: How are you
1.4 Text-to-Speech (TTS) Implementation
Functional Flow
- Text input via serial interface
- Cloud API request generation
- Audio stream reception
- Real-time audio playback
Technical Implementation
#### 1. Text Processing:
- URL encoding for special characters
- Voice parameter customization (speed/pitch)
#### 2. Stream Handling
#include <stdio.h>
#include <string.h>
#include "sdkconfig.h"
#include "driver/i2s_std.h"
#include "esp_http_client.h"
#include "esp_log.h"
#include "console_tts.h"
#include "audio.h"
/* Tag passed to the ESP_LOG* macros in this file. */
static const char *TAG = "console_tts";
/* I2S transmit channel handle; assigned by init_speaker(). */
static i2s_chan_handle_t tx_handle = NULL;
/* Text-to-speech endpoint and server certificate (placeholder). */
static const char *url = "https://tsn.baidu.com/text2audio";
static const char *cert = "your_cert";

/*
 * API credentials must not be committed to source control. They are taken
 * from the project configuration when CONFIG_VOICE_API_CUID /
 * CONFIG_VOICE_API_TOKEN are defined and fall back to placeholders otherwise.
 * NOTE(review): the values previously hard-coded here were published with the
 * source and should be treated as compromised; revoke and re-issue them.
 */
#ifdef CONFIG_VOICE_API_CUID
#define TTS_API_CUID CONFIG_VOICE_API_CUID
#else
#define TTS_API_CUID "your_cuid"
#endif
#ifdef CONFIG_VOICE_API_TOKEN
#define TTS_API_TOKEN CONFIG_VOICE_API_TOKEN
#else
#define TTS_API_TOKEN "your_token"
#endif
static const char *cuid = TTS_API_CUID;
static const char *token = TTS_API_TOKEN;

/* Plugin descriptor placed in the ".console_cmd_desc" section; it names the
 * registration function of this command. */
static const console_cmd_plugin_desc_t __attribute__((section(".console_cmd_desc"), used)) PLUGIN = {
    .name = "console_cmd_tts",
    .plugin_regd_fn = &console_cmd_tts_register,
};
typedef struct tts
{
char *name;
esp_err_t (*operation)(struct tts *self, int argc, char *argv[]);
int arg_cnt;
int start_index;
char *help;
} tts_op_t;
static esp_err_t tts_help_op(tts_op_t *self, int argc, char *argv[]);
static esp_err_t tts_play_op(tts_op_t *self, int argc, char *argv[]);
static tts_op_t cmd_list[] = {
{ .name = "help", .operation = tts_help_op, .arg_cnt = 2, .start_index = 1, .help = "tts help: Prints the help text for all tts commands" },
{ .name = "play", .operation = tts_play_op, .arg_cnt = 3, .start_index = 1, .help = "tts play <text>: play tts text." },
};
/**
 * Prints the help line of every entry in cmd_list that has a non-empty
 * help text. The arguments are not used.
 *
 * @return always ESP_OK
 */
static esp_err_t tts_help_op(tts_op_t *self, int argc, char *argv[])
{
    const int total = sizeof(cmd_list) / sizeof(cmd_list[0]);
    int idx = 0;

    while (idx < total)
    {
        const char *text = cmd_list[idx].help;

        if (text != NULL && text[0] != '\0')
        {
            printf(" %s\n", text);
        }
        idx++;
    }
    return ESP_OK;
}
/**
 * Console entry point for "tts": looks up the sub-command named in argv and
 * runs its handler when the argument count matches exactly.
 *
 * @return always ESP_OK (0); failures are only reported through the log.
 */
static esp_err_t do_cmd_tts(int argc, char **argv)
{
    const int total = sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (int idx = 0; idx < total; ++idx)
    {
        /* Work on a copy so the handler receives a mutable descriptor. */
        tts_op_t entry = cmd_list[idx];

        /* Not enough arguments to even hold this entry's keyword. */
        if (argc <= entry.start_index)
        {
            continue;
        }
        if (strcmp(entry.name, argv[entry.start_index]) != 0)
        {
            continue;
        }
        /* Keyword matches but the argument count does not: keep searching. */
        if (entry.arg_cnt != argc)
        {
            continue;
        }

        if (entry.operation != NULL && entry.operation(&entry, argc, argv) != ESP_OK)
        {
            ESP_LOGE(TAG, "Usage:\n%s", entry.help);
        }
        return ESP_OK;
    }

    ESP_LOGE(TAG, "Command not available");
    return ESP_OK;
}
/**
 * HTTP client callback used for playback: every received body chunk is
 * written straight to the I2S transmit channel. The channel is enabled on
 * the first data event and disabled again on the next event of any other
 * kind.
 *
 * @return always ESP_OK
 */
esp_err_t _tts_event_handler(esp_http_client_event_t *evt)
{
    /* Remembers across calls whether the transmit channel is enabled. */
    static bool streaming = false;
    size_t sent = 0;

    if (evt->event_id == HTTP_EVENT_ON_DATA)
    {
        if (!streaming)
        {
            streaming = true;
            i2s_channel_enable(tx_handle);
        }
        i2s_channel_write(tx_handle, evt->data, evt->data_len, &sent, 1000);
        return ESP_OK;
    }

    /* Any other event ends an active playback. */
    if (streaming)
    {
        streaming = false;
        i2s_channel_disable(tx_handle);
    }
    return ESP_OK;
}
esp_err_t console_cmd_tts_write(uint8_t *data, int len)
{
size_t bytes_written = 0;
i2s_channel_enable(tx_handle);
i2s_channel_write(tx_handle, data, len, &bytes_written, 1000);
i2s_channel_disable(tx_handle);
return ESP_OK;
}
/**
 * Plays the built-in audio clip _audio1 on the speaker.
 *
 * Delegates to console_cmd_tts_write() so the clip goes through the same
 * enable/write/disable sequence and a valid bytes_written pointer is always
 * handed to the I2S driver instead of NULL.
 *
 * @return the result of console_cmd_tts_write()
 */
esp_err_t query(void)
{
    return console_cmd_tts_write((uint8_t *)_audio1, (int)sizeof(_audio1));
}
/**
 * Plays the built-in audio clip _audio2 on the speaker.
 *
 * Delegates to console_cmd_tts_write() so the clip goes through the same
 * enable/write/disable sequence and a valid bytes_written pointer is always
 * handed to the I2S driver instead of NULL.
 *
 * @return the result of console_cmd_tts_write()
 */
esp_err_t wait(void)
{
    return console_cmd_tts_write((uint8_t *)_audio2, (int)sizeof(_audio2));
}
esp_err_t console_cmd_tts_play(const char *text)
{
esp_http_client_config_t config = {
.url = url,
.event_handler = _tts_event_handler,
.cert_pem = cert,
.buffer_size = 10240,
.disable_auto_redirect = true,
};
char *payload = malloc(4096);
snprintf(payload, 4096, "tex=%s&tok=%s&cuid=%s&ctp=1&lan=en&spd=5&pit=5&vol=10&per=0&aue=5", text, token, cuid);
esp_http_client_handle_t client = esp_http_client_init(&config);
esp_http_client_set_header(client, "Content-Type", "application/x-www-form-urlencoded");
esp_http_client_set_header(client, "Accept", "*/*");
esp_http_client_set_method(client, HTTP_METHOD_POST);
esp_http_client_set_post_field(client, payload, strlen(payload));
esp_http_client_set_timeout_ms(client, 60000);
esp_err_t err = esp_http_client_perform(client);
if (err != ESP_OK)
{
ESP_LOGE(TAG, "HTTP request failed: %s", esp_err_to_name(err));
}
esp_http_client_cleanup(client);
free(payload);
return ESP_OK;
}
/**
 * Handler for "tts play <text>": sends the text for synthesis and plays the
 * returned audio.
 *
 * @return ESP_FAIL when too few arguments were supplied or the text is empty
 *         (the caller then prints the usage text), ESP_OK otherwise. A
 *         failed playback is logged here and does not trigger the usage text.
 */
static esp_err_t tts_play_op(tts_op_t *self, int argc, char *argv[])
{
    const char *text = NULL;

    /* Validate against this command's own descriptor rather than indexing
     * cmd_list with start_index, which is an argv position. */
    if (argc < self->arg_cnt)
    {
        ESP_LOGE(TAG, "Error: Invalid command\n");
        ESP_LOGE(TAG, "%s\n", self->help);
        return ESP_FAIL;
    }

    text = argv[self->start_index + 1];
    if (text[0] == '\0')
    {
        ESP_LOGE(TAG, "Error: Invalid text\n");
        return ESP_FAIL;
    }

    if (console_cmd_tts_play(text) != ESP_OK)
    {
        ESP_LOGE(TAG, "Playback request failed");
    }
    return ESP_OK;
}
/**
 * Creates the I2S transmit channel for the speaker and configures it in
 * standard mode: 8000 Hz, 16-bit, mono, MSB slot format, BCLK on GPIO 2,
 * WS on GPIO 3 and data out on GPIO 1. The handle is stored in tx_handle;
 * any driver error aborts via ESP_ERROR_CHECK.
 */
void init_speaker(void)
{
    i2s_chan_config_t speaker_chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_AUTO, I2S_ROLE_MASTER);
    i2s_std_config_t speaker_cfg = {
        .clk_cfg = I2S_STD_CLK_DEFAULT_CONFIG(8000),
        .slot_cfg = I2S_STD_MSB_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT, I2S_SLOT_MODE_MONO),
        .gpio_cfg = {
            .mclk = I2S_GPIO_UNUSED, /* some codecs may require mclk signal, this example doesn't need it */
            .bclk = 2,
            .ws = 3,
            .dout = 1,
            .din = I2S_GPIO_UNUSED,
            .invert_flags = {
                .mclk_inv = false,
                .bclk_inv = false,
                .ws_inv = false,
            },
        },
    };

    /* Only a TX handle is requested; the RX slot of the pair stays unused. */
    ESP_ERROR_CHECK(i2s_new_channel(&speaker_chan_cfg, &tx_handle, NULL));
    ESP_ERROR_CHECK(i2s_channel_init_std_mode(tx_handle, &speaker_cfg));
}
esp_err_t console_cmd_tts_register(void)
{
init_speaker();
esp_err_t ret;
esp_console_cmd_t command = { .command = "tts", .help = "tts help", .hint = NULL, .func = &do_cmd_tts, .argtable = NULL };
ret = esp_console_cmd_register(&command);
if (ret)
{
ESP_LOGE(TAG, "Unable to register tts");
}
return ret;
}
Use build-time configuration values (e.g. Kconfig options exposed through sdkconfig.h) instead of hard-coding actual credentials:
// Replace with secure credential storage
static const char *cuid = CONFIG_VOICE_API_CUID;
static const char *token = CONFIG_VOICE_API_TOKEN;
Testing
- Input
tts play "how are you"
to play the audio
tts play "how are you"
- The system will play the audio
Here's a condensed version of challenges & solutions:
1.5 Key Challenges & Solutions
Challenge | Solution |
---|---|
Audio quality issues | IIR noise filtering; auto gain control |
Network instability | Exponential backoff retry; dual WiFi/ESP-NOW mode |
Memory constraints | Stream buffering with RING_BUF_SIZE=8KB; NVS credential storage |
API limitations | Request batching; local voice cache |
Real-time sync | Double buffering; DMA-driven I2S |