1. Networking and Communications
Project Overview
My final project is an ESP32-based smart glasses system requiring internet
connectivity for data transmission and interactive capabilities. The
implementation focuses on two core functionalities: Speech-to-Text (STT) and
Text-to-Speech (TTS).
1.1 System Architecture
The smart glasses system consists of:
- ESP32 microcontroller with WiFi capabilities
- MEMS microphone for voice input
- Speaker for audio output
- Custom PCB for peripheral integration
- Cloud-based speech processing services

Connection Diagram

1.2 Baidu Cloud Speech Service
Key Steps
- 
Service Setup 
 • Register at Baidu AI Open Platform • Create application → Enable Speech Recognition (STT) and Text-to-Speech (TTS) • Get credentials: App ID, API Key, Secret Key
- 
Authentication # Get access token (refresh every 30 days)
 GET https://aip.baidubce.com/oauth/2.0/token?
 grant_type=client_credentials&
 client_id=YOUR_API_KEY&
 client_secret=YOUR_SECRET_KEY

API Specifications
| Service | Endpoint | Key Parameters | 
|---|---|---|
| STT | vop.baidu.com/server_api | Format, Sample Rate, Channel | 
| TTS | tsn.baidu.com/text2audio | Voice, Speed, Audio Format | 
Basic Configuration
• STT Requirements:
- Audio: PCM/WAV, 8k/16k sample rate
- Single channel
- Max 60s duration
• TTS Defaults:
- Voice: Standard female (ID 0)
- Speed: Normal (5)
- Format: MP3

Error Handling
| Code | Meaning | Solution | 
|---|---|---|
| 3300 | Invalid input | Check audio parameters | 
| 3301 | Poor audio quality | Improve recording | 
| 3302 | Auth failure | Renew access token | 
| 3303 | Server timeout | Retry request | 
Best Practices
- 
Security 
 Store credentials using ESP32's encrypted NVS storage
- 
Audio Processing 
 - Noise filtering
 - Auto gain control
 - Silence removal
- 
Network 
 Implement retry logic with exponential backoff
1.3 Speech-to-Text (STT) Implementation
Functional Requirements
- Device wake-up detection
- Voice recording through onboard microphone
- Audio transmission to cloud service
- Text conversion processing
- Response playback via TTS

Technical Implementation
1. Audio Capture (I2S PDM Interface):
- 8kHz sampling rate
- 16-bit PCM format
- Single channel configuration
2. Base64 Encoding

3. Cloud Communication
- HTTP POST with JSON payload
- SSL/TLS encryption
- 60s timeout handling
#include <stdio.h>
#include <string.h>
#include "sdkconfig.h"
#include "driver/i2s_pdm.h"
#include "esp_http_client.h"
#include "esp_log.h"
#include "console_stt.h"
#include "console_tts.h"
#include "mbedtls/base64.h"
#include "cJSON.h"
/*
 * SAMPLE_SIZE is used both as the element count of the capture scratch buffer
 * and as the byte count passed to each I2S read.
 */
#define SAMPLE_SIZE (16 * 1024)
/* Bytes of PCM per second: 8000 samples/s, 16 bits (2 bytes) per sample. */
#define BYTE_RATE   (8000 * (16 / 8))

/* Tag attached to every log line emitted by this module. */
static const char *TAG = "console_stt";

/* Microphone (RX) channel handle; filled in by init_microphone(). */
static i2s_chan_handle_t rx_handle = NULL;

/* Raw HTTP response body, written by the HTTP event handler. */
static char gResult[2048] = "";
/* Number of valid bytes currently held in gResult. */
static size_t gResultLen = 0;
/* Recognised text extracted from gResult. */
static char gContent[2048] = "";
/* Set to true by the HTTP event handler on HTTP_EVENT_ON_FINISH. */
static bool gIsFinish = false;

/* Service endpoint and placeholder credentials. */
static const char *url = "https://vop.baidu.com/server_api";
static const char *cert = "your_cert";
static const char *cuid = "your_cuid";
static const char *token = "your_token";

/* Plugin descriptor placed in the ".console_cmd_desc" section. */
static const console_cmd_plugin_desc_t
    __attribute__((section(".console_cmd_desc"), used)) PLUGIN = {
        .name = "console_cmd_stt",
        .plugin_regd_fn = &console_cmd_stt_register,
    };
/* One entry of the `stt` sub-command table. */
typedef struct stt stt_op_t;
struct stt
{
    /* Sub-command name, compared against argv[start_index]. */
    char *name;
    /* Handler invoked when the name and argument count both match. */
    esp_err_t (*operation)(stt_op_t *self, int argc, char *argv[]);
    /* Exact argc the dispatcher requires for this sub-command. */
    int arg_cnt;
    /* Index in argv at which the sub-command name is expected. */
    int start_index;
    /* Help/usage text printed by `stt help` and on handler failure. */
    char *help;
};
/* Handlers referenced by the table below; defined later in this file. */
static esp_err_t stt_help_op(stt_op_t *self, int argc, char *argv[]);
static esp_err_t stt_rec_op(stt_op_t *self, int argc, char *argv[]);

/* Table of `stt` sub-commands scanned by the dispatcher. */
static stt_op_t cmd_list[] = {
    {
        .name = "help",
        .operation = stt_help_op,
        .arg_cnt = 2,
        .start_index = 1,
        .help = "stt help: Prints the help text for all stt commands",
    },
    {
        .name = "rec",
        .operation = stt_rec_op,
        .arg_cnt = 3,
        .start_index = 1,
        .help = "stt rec <duration>: speech to text",
    },
};
/*
 * Print the help line of every `stt` sub-command that has non-empty help text.
 * The arguments are unused; the function always returns ESP_OK.
 */
static esp_err_t stt_help_op(stt_op_t *self, int argc, char *argv[])
{
    const stt_op_t *const table_end = cmd_list + sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (const stt_op_t *entry = cmd_list; entry != table_end; entry++)
    {
        const char *text = entry->help;
        if (text == NULL || text[0] == '\0')
        {
            continue;
        }
        printf(" %s\n", text);
    }
    return ESP_OK;
}
/*
 * Console entry point for `stt`: find the sub-command whose name equals
 * argv[start_index] and whose arg_cnt equals argc, then run its handler.
 * A failing handler gets its usage text logged. Every path returns ESP_OK,
 * including the "Command not available" case.
 */
static esp_err_t do_cmd_stt(int argc, char **argv)
{
    const int total = sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (int idx = 0; idx < total; idx++)
    {
        /* Work on a copy so the handler never receives a pointer into the table. */
        stt_op_t entry = cmd_list[idx];

        /* argv must be long enough to contain the sub-command name. */
        if (!(argc > entry.start_index))
        {
            continue;
        }
        if (strcmp(entry.name, argv[entry.start_index]) != 0)
        {
            continue;
        }
        if (entry.arg_cnt != argc)
        {
            continue;
        }

        if (entry.operation != NULL && entry.operation(&entry, argc, argv) != ESP_OK)
        {
            ESP_LOGE(TAG, "Usage:\n%s", entry.help);
        }
        return ESP_OK;
    }

    ESP_LOGE(TAG, "Command not available");
    return ESP_OK;
}
esp_err_t console_cmd_stt_get_result(char *ret, int max_len)
{
    cJSON *root = cJSON_Parse(gResult);
    if (root)
    {
        cJSON *result = cJSON_GetObjectItem(root, "result");
        if (result == NULL)
        {
            ESP_LOGE(TAG, "Missing 'result' field");
            printf("%s", gResult);
            goto ret;
        }
        if (!cJSON_IsArray(result))
        {
            ESP_LOGE(TAG, "'result' field is not an array");
            goto ret;
        }
        cJSON *item = cJSON_GetArrayItem(result, 0);
        if (item == NULL)
        {
            ESP_LOGE(TAG, "Missing array item");
            goto ret;
        }
        strncpy(ret, item->valuestring, max_len);
    }
    else
    {
        ESP_LOGE(TAG, "Invalid response format");
        return ESP_FAIL;
    }
ret:
    cJSON_Delete(root);
    return ESP_OK;
}
/*
 * HTTP client event callback for the STT request.
 *
 * Appends each received body chunk to gResult (keeping it NUL-terminated) and
 * sets gIsFinish when the client reports HTTP_EVENT_ON_FINISH. Data that does
 * not fit in gResult is dropped with a warning instead of overflowing it.
 *
 * @return Always ESP_OK.
 */
static esp_err_t _http_event_handler(esp_http_client_event_t *evt)
{
    switch (evt->event_id)
    {
        case HTTP_EVENT_ON_DATA:
            if (evt->data != NULL && evt->data_len > 0)
            {
                /* Keep one byte in reserve for the terminating NUL. */
                const size_t limit = sizeof(gResult) - 1;
                const size_t room = (gResultLen < limit) ? (limit - gResultLen) : 0;
                size_t chunk = (size_t)evt->data_len;

                if (chunk > room)
                {
                    ESP_LOGW(TAG, "Response truncated: dropping %u byte(s)", (unsigned)(chunk - room));
                    chunk = room;
                }
                if (chunk > 0)
                {
                    memcpy(gResult + gResultLen, evt->data, chunk);
                    /* Accumulate: the body may arrive in several chunks. */
                    gResultLen += chunk;
                    gResult[gResultLen] = '\0';
                }
            }
            break;
        case HTTP_EVENT_ON_FINISH:
            gIsFinish = true;
            break;
        default:
            break;
    }
    return ESP_OK;
}
esp_err_t console_cmd_stt_rec(int duration)
{
    static int16_t i2s_readraw_buff[SAMPLE_SIZE];
    uint32_t flash_rec_time = BYTE_RATE * duration;
    uint8_t *pcm_buff = malloc(256 * 1024);
    size_t bytes_read;
    int flash_wr_size = 0;
    ESP_ERROR_CHECK(i2s_channel_enable(rx_handle));
    query();
    printf("Recording for %d seconds...\n", duration);
    while (flash_wr_size < flash_rec_time)
    {
        if (i2s_channel_read(rx_handle, (char *)(i2s_readraw_buff), SAMPLE_SIZE, &bytes_read, 1000) == ESP_OK)
        {
            memcpy(pcm_buff + flash_wr_size, i2s_readraw_buff, bytes_read);
            flash_wr_size += bytes_read;
        }
    }
    printf("\n");
    ESP_ERROR_CHECK(i2s_channel_disable(rx_handle));
    uint8_t *base64_pcm = malloc(flash_wr_size * 2);
    size_t base64_size = 0;
    // Base64 encode the PCM data
    mbedtls_base64_encode(base64_pcm, flash_wr_size * 2, &base64_size, pcm_buff, flash_wr_size);
    base64_pcm[base64_size] = '\0';
    esp_http_client_config_t config = {
        .url = "https://vop.baidu.com/server_api",
        .event_handler = _http_event_handler,
        .cert_pem = cert,
        .buffer_size = 10240,
    };
    esp_http_client_handle_t client = esp_http_client_init(&config);
    // 构造请求体
    cJSON *root = cJSON_CreateObject();
    cJSON *format = cJSON_CreateString("pcm");
    cJSON *rate = cJSON_CreateNumber(8000);
    cJSON *channel = cJSON_CreateNumber(1);
    cJSON *qcuid = cJSON_CreateString(cuid);
    cJSON *qtoken = cJSON_CreateString(token);
    cJSON *speech = cJSON_CreateString((const char *)base64_pcm);
    cJSON *len = cJSON_CreateNumber(flash_wr_size);
    cJSON_AddItemToObject(root, "format", format);
    cJSON_AddItemToObject(root, "rate", rate);
    cJSON_AddItemToObject(root, "channel", channel);
    cJSON_AddItemToObject(root, "cuid", qcuid);
    cJSON_AddItemToObject(root, "token", qtoken);
    cJSON_AddItemToObject(root, "speech", speech);
    cJSON_AddItemToObject(root, "len", len);
    char *json_data = cJSON_PrintUnformatted(root);
    esp_http_client_set_header(client, "Content-Type", "application/json");
    esp_http_client_set_method(client, HTTP_METHOD_POST);
    esp_http_client_set_post_field(client, json_data, strlen(json_data));
    esp_http_client_set_timeout_ms(client, 60000);
    gIsFinish = false;
    gResultLen = 0;
    esp_err_t err = esp_http_client_perform(client);
    if (err != ESP_OK)
    {
        ESP_LOGE(TAG, "HTTP request failed: %s", esp_err_to_name(err));
    }
    else
    {
        while (!gIsFinish)
        {
            vTaskDelay(10 / portTICK_PERIOD_MS);
        }
    }
    esp_http_client_cleanup(client);
    free(base64_pcm);
    free(pcm_buff);
    cJSON_Delete(root);
    return ESP_OK;
}
/*
 * Handler for `stt rec <duration>`: record, send to the service and print
 * the recognised text.
 *
 * @param self  Table entry that matched; supplies arg_cnt, start_index and
 *              help. It is used directly rather than re-indexing cmd_list
 *              with start_index, which is an argv index and not a table index.
 * @param argc  Argument count from the console.
 * @param argv  Arguments; argv[start_index + 1] is the duration in seconds.
 * @return ESP_FAIL for a malformed command line, which makes the dispatcher
 *         print the usage text. ESP_OK otherwise: runtime failures are logged
 *         here and are not usage errors.
 */
static esp_err_t stt_rec_op(stt_op_t *self, int argc, char *argv[])
{
    enum { DEFAULT_SECONDS = 10, MAX_SECONDS = 15 };

    if (argc < self->arg_cnt)
    {
        ESP_LOGE(TAG, "Error: Invalid command\n");
        ESP_LOGE(TAG, "%s\n", self->help);
        return ESP_FAIL;
    }

    int duration = atoi(argv[self->start_index + 1]);
    if (duration <= 0)
    {
        /* Unparsable or non-positive input falls back to the default. */
        duration = DEFAULT_SECONDS;
    }
    if (duration > MAX_SECONDS)
    {
        duration = MAX_SECONDS;
    }

    memset(gContent, 0, sizeof(gContent));
    if (console_cmd_stt_rec(duration) != ESP_OK)
    {
        ESP_LOGE(TAG, "Speech recognition request failed");
        return ESP_OK;
    }
    if (console_cmd_stt_get_result(gContent, sizeof(gContent)) != ESP_OK)
    {
        ESP_LOGE(TAG, "No recognition result available");
        return ESP_OK;
    }
    printf("Result: %s\n", gContent);
    return ESP_OK;
}
/*
 * Create the microphone RX channel (stored in rx_handle) and initialise it in
 * PDM RX mode: 8000 Hz clock config, 16-bit mono slot, clock on GPIO 42 and
 * data-in on GPIO 41. Any driver error aborts via ESP_ERROR_CHECK.
 */
void init_microphone(void)
{
    /* Only an RX handle is requested; the TX slot is passed as NULL. */
    i2s_chan_config_t rx_chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_AUTO, I2S_ROLE_MASTER);
    ESP_ERROR_CHECK(i2s_new_channel(&rx_chan_cfg, NULL, &rx_handle));

    i2s_pdm_rx_config_t mic_cfg = {
        .clk_cfg = I2S_PDM_RX_CLK_DEFAULT_CONFIG(8000),
        /* The default mono slot is the left slot (whose 'select pin' of the PDM microphone is pulled down) */
        .slot_cfg = I2S_PDM_RX_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT, I2S_SLOT_MODE_MONO),
        .gpio_cfg = {
            .clk = 42,
            .din = 41,
            .invert_flags = { .clk_inv = false },
        },
    };
    ESP_ERROR_CHECK(i2s_channel_init_pdm_rx_mode(rx_handle, &mic_cfg));
}
esp_err_t console_cmd_stt_register(void)
{
    init_microphone();
    esp_err_t ret;
    esp_console_cmd_t command = { .command = "stt", .help = "stt help", .hint = NULL, .func = &do_cmd_stt, .argtable = NULL };
    ret = esp_console_cmd_register(&command);
    if (ret)
    {
        ESP_LOGE(TAG, "Unable to register stt");
    }
    return ret;
}
Key Components
- Audio Capture: Uses I2S PDM interface for high-quality audio sampling
- Base64 Encoding: Converts raw PCM data for API compatibility
- HTTP Client: Handles secure communication with Baidu Voice API
- JSON Parsing: Processes API responses using cJSON library
Use Kconfig options (exposed through sdkconfig) instead of hard-coded strings for the actual credentials:
// Replace with secure credential storage.
// The credentials are taken from CONFIG_* macros instead of string literals
// in the source. NOTE(review): presumably these are Kconfig options surfaced
// through sdkconfig.h - confirm they are defined in the project's Kconfig.
static const char *cuid = CONFIG_VOICE_API_CUID;
static const char *token = CONFIG_VOICE_API_TOKEN;
Testing
- Input `stt rec 5` to start recording for 5 seconds
stt rec 5
- After recording, the system will display the recognized text
Result: How are you

1.4 Text-to-Speech (TTS) Implementation
Functional Flow
- Text input via serial interface
- Cloud API request generation
- Audio stream reception
- Real-time audio playback

Technical Implementation
#### 1. Text Processing:
- URL encoding for special characters
- Voice parameter customization (speed/pitch)
#### 2. Stream Handling
 
#include <stdio.h>
#include <string.h>
#include "sdkconfig.h"
#include "driver/i2s_std.h"
#include "esp_http_client.h"
#include "esp_log.h"
#include "console_tts.h"
#include "audio.h"
/* Tag attached to every log line emitted by this module. */
static const char *TAG = "console_tts";
/* Speaker (TX) channel handle; filled in by init_speaker(). */
static i2s_chan_handle_t tx_handle = NULL;
/* Text-to-speech endpoint and server certificate placeholder. */
static const char *url = "https://tsn.baidu.com/text2audio";
static const char *cert = "your_cert";
/*
 * Placeholders only: never commit a real device id or access token to source
 * control or documentation. Supply the real values from configuration or
 * secure storage at build or run time.
 */
static const char *cuid = "your_cuid";
static const char *token = "your_token";
/* Plugin descriptor placed in the ".console_cmd_desc" section. */
static const console_cmd_plugin_desc_t __attribute__((section(".console_cmd_desc"), used)) PLUGIN = { .name = "console_cmd_tts", .plugin_regd_fn = &console_cmd_tts_register };
/* One entry of the `tts` sub-command table. */
typedef struct tts tts_op_t;
struct tts
{
    /* Sub-command name, compared against argv[start_index]. */
    char *name;
    /* Handler invoked when the name and argument count both match. */
    esp_err_t (*operation)(tts_op_t *self, int argc, char *argv[]);
    /* Exact argc the dispatcher requires for this sub-command. */
    int arg_cnt;
    /* Index in argv at which the sub-command name is expected. */
    int start_index;
    /* Help/usage text printed by `tts help` and on handler failure. */
    char *help;
};
/* Handlers referenced by the table below; defined later in this file. */
static esp_err_t tts_help_op(tts_op_t *self, int argc, char *argv[]);
static esp_err_t tts_play_op(tts_op_t *self, int argc, char *argv[]);

/* Table of `tts` sub-commands scanned by the dispatcher. */
static tts_op_t cmd_list[] = {
    {
        .name = "help",
        .operation = tts_help_op,
        .arg_cnt = 2,
        .start_index = 1,
        .help = "tts help: Prints the help text for all tts commands",
    },
    {
        .name = "play",
        .operation = tts_play_op,
        .arg_cnt = 3,
        .start_index = 1,
        .help = "tts play <text>: play tts text.",
    },
};
/*
 * Print the help line of every `tts` sub-command that has non-empty help text.
 * The arguments are unused; the function always returns ESP_OK.
 */
static esp_err_t tts_help_op(tts_op_t *self, int argc, char *argv[])
{
    const tts_op_t *const table_end = cmd_list + sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (const tts_op_t *entry = cmd_list; entry != table_end; entry++)
    {
        const char *text = entry->help;
        if (text == NULL || text[0] == '\0')
        {
            continue;
        }
        printf(" %s\n", text);
    }
    return ESP_OK;
}
/*
 * Console entry point for `tts`: find the sub-command whose name equals
 * argv[start_index] and whose arg_cnt equals argc, then run its handler.
 * A failing handler gets its usage text logged. Every path returns ESP_OK,
 * including the "Command not available" case.
 */
static esp_err_t do_cmd_tts(int argc, char **argv)
{
    const int total = sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (int idx = 0; idx < total; idx++)
    {
        /* Work on a copy so the handler never receives a pointer into the table. */
        tts_op_t entry = cmd_list[idx];

        /* argv must be long enough to contain the sub-command name. */
        if (!(argc > entry.start_index))
        {
            continue;
        }
        if (strcmp(entry.name, argv[entry.start_index]) != 0)
        {
            continue;
        }
        if (entry.arg_cnt != argc)
        {
            continue;
        }

        if (entry.operation != NULL && entry.operation(&entry, argc, argv) != ESP_OK)
        {
            ESP_LOGE(TAG, "Usage:\n%s", entry.help);
        }
        return ESP_OK;
    }

    ESP_LOGE(TAG, "Command not available");
    return ESP_OK;
}
/* ASCII case-insensitive string equality (HTTP header names ignore case). */
static bool tts_header_name_is(const char *name, const char *expected)
{
    if (name == NULL)
    {
        return false;
    }
    while (*name != '\0' && *expected != '\0')
    {
        char a = *name++;
        char b = *expected++;
        if (a >= 'A' && a <= 'Z')
        {
            a = (char)(a - 'A' + 'a');
        }
        if (b >= 'A' && b <= 'Z')
        {
            b = (char)(b - 'A' + 'a');
        }
        if (a != b)
        {
            return false;
        }
    }
    return *name == '\0' && *expected == '\0';
}

/*
 * HTTP client event callback for the TTS request.
 *
 * Streams each received body chunk to the speaker, enabling the TX channel on
 * the first chunk and disabling it on the next non-data event. The body is
 * only played when the response Content-Type starts with "audio"; any other
 * body is logged as text so an error reply is not written to the speaker.
 *
 * @return Always ESP_OK.
 */
esp_err_t _tts_event_handler(esp_http_client_event_t *evt)
{
    static bool is_playing = false;
    /* Assume audio until a Content-Type header says otherwise. */
    static bool is_audio = true;

    if (evt->event_id != HTTP_EVENT_ON_DATA)
    {
        /* Any non-data event ends the current playback burst. */
        if (is_playing)
        {
            is_playing = false;
            i2s_channel_disable(tx_handle);
        }

        if (evt->event_id == HTTP_EVENT_ON_CONNECTED)
        {
            /* New connection: forget the previous response's content type. */
            is_audio = true;
        }
        else if (evt->event_id == HTTP_EVENT_ON_HEADER
                 && evt->header_value != NULL
                 && tts_header_name_is(evt->header_key, "Content-Type"))
        {
            is_audio = (strncmp(evt->header_value, "audio", 5) == 0);
            if (!is_audio)
            {
                ESP_LOGE(TAG, "TTS service returned a non-audio response (%s)", evt->header_value);
            }
        }
        return ESP_OK;
    }

    if (evt->data == NULL || evt->data_len <= 0)
    {
        return ESP_OK;
    }
    if (!is_audio)
    {
        /* Show the error body rather than playing it. */
        ESP_LOGE(TAG, "TTS error response: %.*s", evt->data_len, (const char *)evt->data);
        return ESP_OK;
    }

    if (!is_playing)
    {
        is_playing = true;
        i2s_channel_enable(tx_handle);
    }

    size_t bytes_written = 0;
    esp_err_t err = i2s_channel_write(tx_handle, evt->data, (size_t)evt->data_len, &bytes_written, 1000);
    if (err != ESP_OK)
    {
        ESP_LOGW(TAG, "I2S write failed: %s", esp_err_to_name(err));
    }
    return ESP_OK;
}
esp_err_t console_cmd_tts_write(uint8_t *data, int len)
{
    size_t bytes_written = 0;
    i2s_channel_enable(tx_handle);
    i2s_channel_write(tx_handle, data, len, &bytes_written, 1000);
    i2s_channel_disable(tx_handle);
    return ESP_OK;
}
esp_err_t query(void)
{
    i2s_channel_enable(tx_handle);
    i2s_channel_write(tx_handle, (uint8_t *)_audio1, sizeof(_audio1), NULL, 1000);
    i2s_channel_disable(tx_handle);
    return ESP_OK;
}
esp_err_t wait(void)
{
    i2s_channel_enable(tx_handle);
    i2s_channel_write(tx_handle, (uint8_t *)_audio2, sizeof(_audio2), NULL, 1000);
    i2s_channel_disable(tx_handle);
    return ESP_OK;
}
esp_err_t console_cmd_tts_play(const char *text)
{
    esp_http_client_config_t config = {
        .url = url,
        .event_handler = _tts_event_handler,
        .cert_pem = cert,
        .buffer_size = 10240,
        .disable_auto_redirect = true,
    };
    char *payload = malloc(4096);
    snprintf(payload, 4096, "tex=%s&tok=%s&cuid=%s&ctp=1&lan=en&spd=5&pit=5&vol=10&per=0&aue=5", text, token, cuid);
    esp_http_client_handle_t client = esp_http_client_init(&config);
    esp_http_client_set_header(client, "Content-Type", "application/x-www-form-urlencoded");
    esp_http_client_set_header(client, "Accept", "*/*");
    esp_http_client_set_method(client, HTTP_METHOD_POST);
    esp_http_client_set_post_field(client, payload, strlen(payload));
    esp_http_client_set_timeout_ms(client, 60000); 
    esp_err_t err = esp_http_client_perform(client);
    if (err != ESP_OK)
    {
        ESP_LOGE(TAG, "HTTP request failed: %s", esp_err_to_name(err));
    }
    esp_http_client_cleanup(client);
    free(payload);
    return ESP_OK;
}
/*
 * Handler for `tts play <text>`: synthesise and play the given text.
 *
 * @param self  Table entry that matched; supplies arg_cnt, start_index and
 *              help. It is used directly rather than re-indexing cmd_list
 *              with start_index, which is an argv index and not a table index.
 * @param argc  Argument count from the console.
 * @param argv  Arguments; argv[start_index + 1] is the text to speak.
 * @return ESP_FAIL for a malformed command line or empty text, which makes
 *         the dispatcher print the usage text. ESP_OK otherwise: playback
 *         failures are logged here and are not usage errors.
 */
static esp_err_t tts_play_op(tts_op_t *self, int argc, char *argv[])
{
    if (argc < self->arg_cnt)
    {
        ESP_LOGE(TAG, "Error: Invalid command\n");
        ESP_LOGE(TAG, "%s\n", self->help);
        return ESP_FAIL;
    }

    const char *text = argv[self->start_index + 1];
    if (text == NULL || text[0] == '\0')
    {
        ESP_LOGE(TAG, "Error: Invalid text\n");
        return ESP_FAIL;
    }

    const esp_err_t err = console_cmd_tts_play(text);
    if (err != ESP_OK)
    {
        ESP_LOGE(TAG, "TTS playback failed: %s", esp_err_to_name(err));
    }
    return ESP_OK;
}
/*
 * Create the speaker TX channel (stored in tx_handle) and initialise it in
 * standard I2S mode: 8000 Hz clock config, MSB slot format, 16-bit mono,
 * BCLK on GPIO 2, WS on GPIO 3 and data-out on GPIO 1. Any driver error
 * aborts via ESP_ERROR_CHECK.
 */
void init_speaker(void)
{
    /* Only a TX handle is requested; the RX slot is passed as NULL. */
    i2s_chan_config_t speaker_chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_AUTO, I2S_ROLE_MASTER);
    ESP_ERROR_CHECK(i2s_new_channel(&speaker_chan_cfg, &tx_handle, NULL));

    i2s_std_config_t speaker_cfg = {
        .clk_cfg  = I2S_STD_CLK_DEFAULT_CONFIG(8000),
        .slot_cfg = I2S_STD_MSB_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT, I2S_SLOT_MODE_MONO),
        .gpio_cfg = {
            /* some codecs may require mclk signal, this example doesn't need it */
            .mclk = I2S_GPIO_UNUSED,
            .bclk = 2,
            .ws   = 3,
            .dout = 1,
            .din  = I2S_GPIO_UNUSED,
            .invert_flags = { .mclk_inv = false, .bclk_inv = false, .ws_inv = false },
        },
    };
    ESP_ERROR_CHECK(i2s_channel_init_std_mode(tx_handle, &speaker_cfg));
}
esp_err_t console_cmd_tts_register(void)
{
    init_speaker();
    esp_err_t ret;
    esp_console_cmd_t command = { .command = "tts", .help = "tts help", .hint = NULL, .func = &do_cmd_tts, .argtable = NULL };
    ret = esp_console_cmd_register(&command);
    if (ret)
    {
        ESP_LOGE(TAG, "Unable to register tts");
    }
    return ret;
}
Use Kconfig options (exposed through sdkconfig) instead of hard-coded strings for the actual credentials:
// Replace with secure credential storage.
// The credentials are taken from CONFIG_* macros instead of string literals
// in the source. NOTE(review): presumably these are Kconfig options surfaced
// through sdkconfig.h - confirm they are defined in the project's Kconfig.
static const char *cuid = CONFIG_VOICE_API_CUID;
static const char *token = CONFIG_VOICE_API_TOKEN;
Testing
- Input `tts play "how are you"` to play the audio
tts play "how are you"
- The system will play the audio
 
Here's a condensed version of challenges & solutions:
1.5 Key Challenges & Solutions
| Challenge | Solution | 
|---|---|
| Audio quality issues | IIR noise filtering; auto gain control | 
| Network instability | Exponential backoff retry; dual WiFi/ESP-NOW mode | 
| Memory constraints | Stream buffering with RING_BUF_SIZE=8KB; NVS credential storage | 
| API limitations | Request batching; local voice cache | 
| Real-time sync | Double buffering; DMA-driven I2S |