1. Networking and Communications

Project Overview

My final project is an ESP32-based smart glasses system requiring internet connectivity for data transmission and interactive capabilities. The implementation focuses on two core functionalities: Speech-to-Text (STT) and
Text-to-Speech (TTS).

1.1 System Architecture

The smart glasses system consists of:

ESP32 microcontroller with WiFi capabilities
MEMS microphone for voice input
Speaker for audio output
Custom PCB for peripheral integration
Cloud-based speech processing services

Connection Diagram

1.2 Baidu Cloud Speech Service

Key Steps

Service Setup
• Register at Baidu AI Open Platform
• Create an application → enable Speech Recognition (STT) and Text-to-Speech (TTS)
• Get the credentials: App ID, API Key, Secret Key

Authentication

# Get access token (refresh every 30 days)
GET https://aip.baidubce.com/oauth/2.0/token?
    grant_type=client_credentials&
    client_id=YOUR_API_KEY&
    client_secret=YOUR_SECRET_KEY

API Specifications

Service	Endpoint	Key Parameters
STT	`vop.baidu.com/server_api`	Format, Sample Rate, Channel
TTS	`tsn.baidu.com/text2audio`	Voice, Speed, Audio Format

Basic Configuration

• STT Requirements:

- Audio: PCM/WAV, 8k/16k sample rate
- Single channel
- Max 60s duration

• TTS Defaults:

- Voice: Standard female (ID 0)
- Speed: Normal (5)
- Format: MP3

Error Handling

Code	Meaning	Solution
3300	Invalid input	Check audio parameters
3301	Poor audio quality	Improve recording
3302	Auth failure	Renew access token
3303	Server timeout	Retry request

Best Practices

Security
Store credentials using ESP32's encrypted NVS storage

Audio Processing

- Noise filtering
- Auto gain control
- Silence removal

Network
Implement retry logic with exponential backoff

1.3 Speech-to-Text (STT) Implementation

Functional Requirements

Device wake-up detection
Voice recording through onboard microphone
Audio transmission to cloud service
Text conversion processing
Response playback via TTS

Technical Implementation

1. Audio Capture (I2S PDM Interface):

8kHz sampling rate
16-bit PCM format
Single channel configuration

2. Base64 Encoding

3. Cloud Communication

HTTP POST with JSON payload
SSL/TLS encryption
60s timeout handling

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "sdkconfig.h"
#include "driver/i2s_pdm.h"
#include "esp_http_client.h"
#include "esp_log.h"
#include "console_stt.h"
#include "console_tts.h"
#include "mbedtls/base64.h"

#include "cJSON.h"

/* Tag passed to the ESP_LOGx macros in this file. */
static const char *TAG = "console_stt";

/* Byte count requested from each i2s_channel_read() call; also used as the
 * element count of the static int16_t capture buffer in console_cmd_stt_rec(). */
#define SAMPLE_SIZE (16 * 1024)
/* Bytes of PCM per second of audio: 8000 samples/s * (16 / 8) bytes per sample. */
#define BYTE_RATE   (8000 * (16 / 8))

/* I2S receive channel; created in init_microphone(). */
static i2s_chan_handle_t rx_handle = NULL;

/* HTTP response body collected by _http_event_handler(), kept NUL-terminated. */
static char gResult[2048] = { 0 };
/* Length value maintained by _http_event_handler() for the data in gResult. */
static size_t gResultLen = 0;
/* Recognised text extracted from gResult by stt_rec_op(). */
static char gContent[2048] = { 0 };
/* Set to true when the HTTP client reports HTTP_EVENT_ON_FINISH. */
static bool gIsFinish = false;

/* Speech recognition endpoint.
 * NOTE(review): console_cmd_stt_rec() currently repeats this URL as a literal
 * instead of reading this variable. */
static const char *url = "https://vop.baidu.com/server_api";

/* Placeholders: certificate handed to the HTTP client as cert_pem, plus the
 * "cuid" and "token" values placed in the JSON request body. */
static const char *cert = "your_cert";
static const char *cuid = "your_cuid";
static const char *token = "your_token";

/* Plugin descriptor emitted into the ".console_cmd_desc" section; it names
 * console_cmd_stt_register() as the registration function. */
static const console_cmd_plugin_desc_t __attribute__((section(".console_cmd_desc"), used)) PLUGIN = { .name = "console_cmd_stt", .plugin_regd_fn = &console_cmd_stt_register };

/* One entry of the "stt" sub-command table consulted by do_cmd_stt(). */
typedef struct stt
{
    char *name;        /* sub-command keyword, compared against argv[start_index] */
    esp_err_t (*operation)(struct stt *self, int argc, char *argv[]); /* handler; may be NULL */
    int arg_cnt;       /* exact argc required for the entry to be dispatched */
    int start_index;   /* index in argv where the sub-command keyword is expected */
    char *help;        /* usage text printed by "stt help" and on handler failure */
} stt_op_t;

/* Handlers referenced by cmd_list below; defined later in this file. */
static esp_err_t stt_help_op(stt_op_t *self, int argc, char *argv[]);
static esp_err_t stt_rec_op(stt_op_t *self, int argc, char *argv[]);

/* Sub-commands of "stt". do_cmd_stt() scans this table in order and runs the
 * first entry whose name and argument count match the command line. */
static stt_op_t cmd_list[] = {
    { .name = "help", .operation = stt_help_op, .arg_cnt = 2, .start_index = 1, .help = "stt help: Prints the help text for all stt commands" },
    { .name = "rec", .operation = stt_rec_op, .arg_cnt = 3, .start_index = 1, .help = "stt rec <duration>: speech to text" },
};

/*
 * "stt help" handler: prints the help text of every table entry that has a
 * non-empty help string. The arguments are not used. Always returns ESP_OK.
 */
static esp_err_t stt_help_op(stt_op_t *self, int argc, char *argv[])
{
    const stt_op_t *const table_end = cmd_list + sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (const stt_op_t *entry = cmd_list; entry != table_end; ++entry)
    {
        const char *text = entry->help;

        /* Skip entries without usable help text. */
        if (text == NULL || text[0] == '\0')
        {
            continue;
        }
        printf(" %s\n", text);
    }

    return ESP_OK;
}

/*
 * Console entry point for the "stt" command.
 *
 * Scans cmd_list in order and dispatches to the first entry whose name equals
 * argv[start_index] and whose arg_cnt equals argc. When the handler reports a
 * failure the entry's usage text is logged. Always returns ESP_OK (0); an
 * unmatched command line is only logged.
 */
static esp_err_t do_cmd_stt(int argc, char **argv)
{
    const int total = sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (int idx = 0; idx < total; ++idx)
    {
        /* The handler receives a pointer to this copy, not to the table entry. */
        stt_op_t entry = cmd_list[idx];

        /* Not enough arguments to even hold this entry's keyword. */
        if (argc < entry.start_index + 1)
        {
            continue;
        }
        if (strcmp(entry.name, argv[entry.start_index]) != 0)
        {
            continue;
        }
        if (entry.arg_cnt != argc)
        {
            continue;
        }

        if (entry.operation != NULL && entry.operation(&entry, argc, argv) != ESP_OK)
        {
            ESP_LOGE(TAG, "Usage:\n%s", entry.help);
        }
        return ESP_OK;
    }

    ESP_LOGE(TAG, "Command not available");

    return ESP_OK;
}

esp_err_t console_cmd_stt_get_result(char *ret, int max_len)
{
    cJSON *root = cJSON_Parse(gResult);
    if (root)
    {
        cJSON *result = cJSON_GetObjectItem(root, "result");
        if (result == NULL)
        {
            ESP_LOGE(TAG, "Missing 'result' field");
            printf("%s", gResult);
            goto ret;
        }
        if (!cJSON_IsArray(result))
        {
            ESP_LOGE(TAG, "'result' field is not an array");
            goto ret;
        }
        cJSON *item = cJSON_GetArrayItem(result, 0);
        if (item == NULL)
        {
            ESP_LOGE(TAG, "Missing array item");
            goto ret;
        }
        strncpy(ret, item->valuestring, max_len);
    }
    else
    {
        ESP_LOGE(TAG, "Invalid response format");
        return ESP_FAIL;
    }

ret:
    cJSON_Delete(root);
    return ESP_OK;
}

/*
 * HTTP client event callback for the STT request.
 *
 * Appends every received body chunk to gResult (kept NUL-terminated, with
 * gResultLen tracking the total length) and raises gIsFinish when the client
 * reports that the request finished. Data that does not fit into gResult is
 * dropped and reported once per oversized chunk. Always returns ESP_OK.
 */
static esp_err_t _http_event_handler(esp_http_client_event_t *evt)
{
    switch (evt->event_id)
    {
        case HTTP_EVENT_ON_DATA:
            if (evt->data_len > 0)
            {
                /* One byte of gResult is reserved for the terminating NUL. */
                const size_t capacity = sizeof(gResult) - 1;
                size_t room = (gResultLen < capacity) ? (capacity - gResultLen) : 0;
                size_t chunk = (size_t)evt->data_len;

                if (chunk > room)
                {
                    ESP_LOGE(TAG, "Response larger than %u bytes, truncating", (unsigned)capacity);
                    chunk = room;
                }
                if (chunk > 0)
                {
                    memcpy(gResult + gResultLen, evt->data, chunk);
                    /* Accumulate: a response may arrive in several chunks. */
                    gResultLen += chunk;
                    gResult[gResultLen] = '\0';
                }
            }

            break;
        case HTTP_EVENT_ON_FINISH:
            gIsFinish = true;
            break;
        default:
            break;
    }
    return ESP_OK;
}

esp_err_t console_cmd_stt_rec(int duration)
{
    static int16_t i2s_readraw_buff[SAMPLE_SIZE];
    uint32_t flash_rec_time = BYTE_RATE * duration;
    uint8_t *pcm_buff = malloc(256 * 1024);
    size_t bytes_read;
    int flash_wr_size = 0;
    ESP_ERROR_CHECK(i2s_channel_enable(rx_handle));
    query();
    printf("Recording for %d seconds...\n", duration);
    while (flash_wr_size < flash_rec_time)
    {
        if (i2s_channel_read(rx_handle, (char *)(i2s_readraw_buff), SAMPLE_SIZE, &bytes_read, 1000) == ESP_OK)
        {
            memcpy(pcm_buff + flash_wr_size, i2s_readraw_buff, bytes_read);
            flash_wr_size += bytes_read;
        }
    }
    printf("\n");
    ESP_ERROR_CHECK(i2s_channel_disable(rx_handle));
    uint8_t *base64_pcm = malloc(flash_wr_size * 2);
    size_t base64_size = 0;
    // Base64 encode the PCM data
    mbedtls_base64_encode(base64_pcm, flash_wr_size * 2, &base64_size, pcm_buff, flash_wr_size);
    base64_pcm[base64_size] = '\0';

    esp_http_client_config_t config = {
        .url = "https://vop.baidu.com/server_api",
        .event_handler = _http_event_handler,
        .cert_pem = cert,
        .buffer_size = 10240,
    };

    esp_http_client_handle_t client = esp_http_client_init(&config);

    // 构造请求体
    cJSON *root = cJSON_CreateObject();
    cJSON *format = cJSON_CreateString("pcm");
    cJSON *rate = cJSON_CreateNumber(8000);
    cJSON *channel = cJSON_CreateNumber(1);
    cJSON *qcuid = cJSON_CreateString(cuid);
    cJSON *qtoken = cJSON_CreateString(token);
    cJSON *speech = cJSON_CreateString((const char *)base64_pcm);
    cJSON *len = cJSON_CreateNumber(flash_wr_size);
    cJSON_AddItemToObject(root, "format", format);
    cJSON_AddItemToObject(root, "rate", rate);
    cJSON_AddItemToObject(root, "channel", channel);
    cJSON_AddItemToObject(root, "cuid", qcuid);
    cJSON_AddItemToObject(root, "token", qtoken);
    cJSON_AddItemToObject(root, "speech", speech);
    cJSON_AddItemToObject(root, "len", len);

    char *json_data = cJSON_PrintUnformatted(root);

    esp_http_client_set_header(client, "Content-Type", "application/json");
    esp_http_client_set_method(client, HTTP_METHOD_POST);
    esp_http_client_set_post_field(client, json_data, strlen(json_data));
    esp_http_client_set_timeout_ms(client, 60000);

    gIsFinish = false;
    gResultLen = 0;
    esp_err_t err = esp_http_client_perform(client);
    if (err != ESP_OK)
    {
        ESP_LOGE(TAG, "HTTP request failed: %s", esp_err_to_name(err));
    }
    else
    {
        while (!gIsFinish)
        {
            vTaskDelay(10 / portTICK_PERIOD_MS);
        }
    }

    esp_http_client_cleanup(client);
    free(base64_pcm);
    free(pcm_buff);
    cJSON_Delete(root);

    return ESP_OK;
}

/*
 * "stt rec <duration>" handler: records, sends the audio for recognition and
 * prints the recognised text.
 *
 * self  Table entry being executed; supplies arg_cnt, start_index and help.
 * argv  argv[start_index + 1] holds the duration in seconds. Unparsable or
 *       non-positive values fall back to DEFAULT_SECONDS; larger values are
 *       clamped to MAX_SECONDS.
 *
 * Returns ESP_FAIL only for a malformed command line (the dispatcher then
 * logs the usage text). Recording or recognition failures are logged here
 * and reported as ESP_OK so that no misleading usage message is shown.
 */
static esp_err_t stt_rec_op(stt_op_t *self, int argc, char *argv[])
{
    enum { DEFAULT_SECONDS = 10, MAX_SECONDS = 15 };

    /* Use the entry we were handed; start_index is an argv index, not a table index. */
    if (argc < self->arg_cnt)
    {
        ESP_LOGE(TAG, "Error: Invalid command\n");
        ESP_LOGE(TAG, "%s\n", self->help);
        return ESP_FAIL;
    }

    const char *arg = argv[self->start_index + 1];
    char *end = NULL;
    long parsed = strtol(arg, &end, 10);
    int duration;

    if (end == arg || parsed <= 0)
    {
        duration = DEFAULT_SECONDS;
    }
    else if (parsed > MAX_SECONDS)
    {
        duration = MAX_SECONDS;
    }
    else
    {
        duration = (int)parsed;
    }

    memset(gContent, 0, sizeof(gContent));
    if (console_cmd_stt_rec(duration) != ESP_OK)
    {
        ESP_LOGE(TAG, "Speech recognition request failed");
        return ESP_OK;
    }
    if (console_cmd_stt_get_result(gContent, sizeof(gContent)) != ESP_OK)
    {
        ESP_LOGE(TAG, "No recognition result available");
    }
    printf("Result: %s\n", gContent);
    return ESP_OK;
}

/*
 * Create the I2S receive channel (rx_handle) and configure it for PDM input:
 * 8000 Hz clock configuration, 16-bit mono slot. Any driver error aborts via
 * ESP_ERROR_CHECK.
 */
void init_microphone(void)
{
    enum { MIC_CLK_GPIO = 42, MIC_DIN_GPIO = 41 };

    i2s_chan_config_t rx_chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_AUTO, I2S_ROLE_MASTER);
    /* Only a receive handle is requested; the TX slot is passed as NULL. */
    ESP_ERROR_CHECK(i2s_new_channel(&rx_chan_cfg, NULL, &rx_handle));

    i2s_pdm_rx_config_t mic_cfg = {
        .clk_cfg = I2S_PDM_RX_CLK_DEFAULT_CONFIG(8000),
        /* The default mono slot is the left slot (whose 'select pin' of the PDM microphone is pulled down) */
        .slot_cfg = I2S_PDM_RX_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT, I2S_SLOT_MODE_MONO),
        .gpio_cfg = {
            .clk = MIC_CLK_GPIO,
            .din = MIC_DIN_GPIO,
            .invert_flags = {
                .clk_inv = false,
            },
        },
    };
    ESP_ERROR_CHECK(i2s_channel_init_pdm_rx_mode(rx_handle, &mic_cfg));
}
esp_err_t console_cmd_stt_register(void)
{
    init_microphone();

    esp_err_t ret;
    esp_console_cmd_t command = { .command = "stt", .help = "stt help", .hint = NULL, .func = &do_cmd_stt, .argtable = NULL };

    ret = esp_console_cmd_register(&command);
    if (ret)
    {
        ESP_LOGE(TAG, "Unable to register stt");
    }

    return ret;
}

Key Components

Audio Capture: Uses I2S PDM interface for high-quality audio sampling
Base64 Encoding: Converts raw PCM data for API compatibility
HTTP Client: Handles secure communication with Baidu Voice API
JSON Parsing: Processes API responses using cJSON library

Security Note: The authentication tokens shown are placeholders. Never hard-code real credentials in source; supply them through build-time configuration (Kconfig `CONFIG_*` options) or encrypted NVS storage:

// Replace with secure credential storage
static const char *cuid = CONFIG_VOICE_API_CUID;
static const char *token = CONFIG_VOICE_API_TOKEN;

Testing

Input stt rec 5 to start recording for 5 seconds

stt rec 5

After recording, the system will display the recognized text

Result: How are you

1.4 Text-to-Speech (TTS) Implementation

Functional Flow

Text input via serial interface
Cloud API request generation
Audio stream reception
Real-time audio playback

Technical Implementation

1. Text Processing:

URL encoding for special characters
Voice parameter customization (speed/pitch)

2. Stream Handling

#include <stdio.h>
#include <string.h>
#include "sdkconfig.h"
#include "driver/i2s_std.h"
#include "esp_http_client.h"
#include "esp_log.h"
#include "console_tts.h"
#include "audio.h"

/* Tag passed to the ESP_LOGx macros in this file. */
static const char *TAG = "console_tts";

/* I2S transmit channel used for playback; created in init_speaker(). */
static i2s_chan_handle_t tx_handle = NULL;

/* Text-to-speech endpoint and the certificate handed to the HTTP client as cert_pem. */
static const char *url = "https://tsn.baidu.com/text2audio";
static const char *cert =  "your_cert";

/* Placeholders only: never commit a real device id or access token.
 * Provide the actual values from secure storage or build configuration. */
static const char *cuid = "your_cuid";
static const char *token = "your_token";

/* Plugin descriptor emitted into the ".console_cmd_desc" section; it names
 * console_cmd_tts_register() as the registration function. */
static const console_cmd_plugin_desc_t __attribute__((section(".console_cmd_desc"), used)) PLUGIN = { .name = "console_cmd_tts", .plugin_regd_fn = &console_cmd_tts_register };

/* One entry of the "tts" sub-command table consulted by do_cmd_tts(). */
typedef struct tts
{
    char *name;        /* sub-command keyword, compared against argv[start_index] */
    esp_err_t (*operation)(struct tts *self, int argc, char *argv[]); /* handler; may be NULL */
    int arg_cnt;       /* exact argc required for the entry to be dispatched */
    int start_index;   /* index in argv where the sub-command keyword is expected */
    char *help;        /* usage text printed by "tts help" and on handler failure */
} tts_op_t;

/* Handlers referenced by cmd_list below; defined later in this file. */
static esp_err_t tts_help_op(tts_op_t *self, int argc, char *argv[]);
static esp_err_t tts_play_op(tts_op_t *self, int argc, char *argv[]);

/* Sub-commands of "tts". do_cmd_tts() scans this table in order and runs the
 * first entry whose name and argument count match the command line. */
static tts_op_t cmd_list[] = {
    { .name = "help", .operation = tts_help_op, .arg_cnt = 2, .start_index = 1, .help = "tts help: Prints the help text for all tts commands" },
    { .name = "play", .operation = tts_play_op, .arg_cnt = 3, .start_index = 1, .help = "tts play <text>: play tts text." },
};

/*
 * "tts help" handler: prints the help text of every table entry that has a
 * non-empty help string. The arguments are not used. Always returns ESP_OK.
 */
static esp_err_t tts_help_op(tts_op_t *self, int argc, char *argv[])
{
    const size_t entries = sizeof(cmd_list) / sizeof(cmd_list[0]);
    size_t pos = 0;

    while (pos < entries)
    {
        const char *text = cmd_list[pos].help;

        /* Entries with missing or empty help text print nothing. */
        if (text != NULL && text[0] != '\0')
        {
            printf(" %s\n", text);
        }
        pos++;
    }

    return ESP_OK;
}

/*
 * Console entry point for the "tts" command.
 *
 * Scans cmd_list in order and dispatches to the first entry whose name equals
 * argv[start_index] and whose arg_cnt equals argc. When the handler reports a
 * failure the entry's usage text is logged. Always returns ESP_OK (0); an
 * unmatched command line is only logged.
 */
static esp_err_t do_cmd_tts(int argc, char **argv)
{
    const int total = sizeof(cmd_list) / sizeof(cmd_list[0]);

    for (int idx = 0; idx < total; ++idx)
    {
        /* The handler receives a pointer to this copy, not to the table entry. */
        tts_op_t entry = cmd_list[idx];

        int keyword_present = !(argc < entry.start_index + 1);
        if (!keyword_present)
        {
            continue;
        }

        int matches = (strcmp(entry.name, argv[entry.start_index]) == 0) && (entry.arg_cnt == argc);
        if (!matches)
        {
            continue;
        }

        if (entry.operation != NULL)
        {
            esp_err_t outcome = entry.operation(&entry, argc, argv);
            if (outcome != ESP_OK)
            {
                ESP_LOGE(TAG, "Usage:\n%s", entry.help);
            }
        }
        return ESP_OK;
    }

    ESP_LOGE(TAG, "Command not available");

    return ESP_OK;
}

/*
 * HTTP client event callback for the TTS request.
 *
 * Every received body chunk is written to the I2S transmit channel, which is
 * enabled on the first chunk. Any other event disables the channel again if
 * it was enabled by this handler. Always returns ESP_OK.
 */
esp_err_t _tts_event_handler(esp_http_client_event_t *evt)
{
    /* Remembers across calls whether this handler enabled the channel. */
    static bool channel_active = false;

    if (evt->event_id == HTTP_EVENT_ON_DATA)
    {
        size_t sent = 0;

        if (!channel_active)
        {
            channel_active = true;
            i2s_channel_enable(tx_handle);
        }
        i2s_channel_write(tx_handle, evt->data, evt->data_len, &sent, 1000);
        return ESP_OK;
    }

    if (channel_active)
    {
        channel_active = false;
        i2s_channel_disable(tx_handle);
    }
    return ESP_OK;
}

/*
 * Play a caller-supplied buffer: enables the transmit channel, writes `len`
 * bytes from `data` with a 1000 ms timeout and disables the channel again.
 * Driver results are not inspected; always returns ESP_OK.
 */
esp_err_t console_cmd_tts_write(uint8_t *data, int len)
{
    size_t sent = 0;

    i2s_channel_enable(tx_handle);
    i2s_channel_write(tx_handle, data, len, &sent, 1000);
    i2s_channel_disable(tx_handle);

    return ESP_OK;
}

/*
 * Play the built-in clip _audio1 on the transmit channel (enable, write with
 * a 1000 ms timeout, disable). Driver results are not inspected; always
 * returns ESP_OK.
 */
esp_err_t query(void)
{
    uint8_t *clip = (uint8_t *)_audio1;
    const size_t clip_len = sizeof(_audio1);

    i2s_channel_enable(tx_handle);
    i2s_channel_write(tx_handle, clip, clip_len, NULL, 1000);
    i2s_channel_disable(tx_handle);

    return ESP_OK;
}

/*
 * Play the built-in clip _audio2 on the transmit channel (enable, write with
 * a 1000 ms timeout, disable). Driver results are not inspected; always
 * returns ESP_OK.
 */
esp_err_t wait(void)
{
    uint8_t *clip = (uint8_t *)_audio2;
    const size_t clip_len = sizeof(_audio2);

    i2s_channel_enable(tx_handle);
    i2s_channel_write(tx_handle, clip, clip_len, NULL, 1000);
    i2s_channel_disable(tx_handle);

    return ESP_OK;
}

esp_err_t console_cmd_tts_play(const char *text)
{
    esp_http_client_config_t config = {
        .url = url,
        .event_handler = _tts_event_handler,
        .cert_pem = cert,
        .buffer_size = 10240,
        .disable_auto_redirect = true,
    };

    char *payload = malloc(4096);
    snprintf(payload, 4096, "tex=%s&tok=%s&cuid=%s&ctp=1&lan=en&spd=5&pit=5&vol=10&per=0&aue=5", text, token, cuid);
    esp_http_client_handle_t client = esp_http_client_init(&config);
    esp_http_client_set_header(client, "Content-Type", "application/x-www-form-urlencoded");
    esp_http_client_set_header(client, "Accept", "*/*");
    esp_http_client_set_method(client, HTTP_METHOD_POST);
    esp_http_client_set_post_field(client, payload, strlen(payload));
    esp_http_client_set_timeout_ms(client, 60000); 

    esp_err_t err = esp_http_client_perform(client);
    if (err != ESP_OK)
    {
        ESP_LOGE(TAG, "HTTP request failed: %s", esp_err_to_name(err));
    }

    esp_http_client_cleanup(client);
    free(payload);
    return ESP_OK;
}

/*
 * "tts play <text>" handler: sends the text to the TTS service and plays it.
 *
 * self  Table entry being executed; supplies arg_cnt, start_index and help.
 * argv  argv[start_index + 1] holds the text to speak.
 *
 * Returns ESP_FAIL for a malformed command line or empty text (the
 * dispatcher then logs the usage text). A playback failure is logged here
 * and reported as ESP_OK so that no misleading usage message is shown.
 */
static esp_err_t tts_play_op(tts_op_t *self, int argc, char *argv[])
{
    /* Use the entry we were handed; start_index is an argv index, not a table index. */
    if (argc < self->arg_cnt)
    {
        ESP_LOGE(TAG, "Error: Invalid command\n");
        ESP_LOGE(TAG, "%s\n", self->help);
        return ESP_FAIL;
    }
    const char *text = argv[self->start_index + 1];

    if (text[0] == '\0')
    {
        ESP_LOGE(TAG, "Error: Invalid text\n");
        return ESP_FAIL;
    }
    if (console_cmd_tts_play(text) != ESP_OK)
    {
        ESP_LOGE(TAG, "TTS playback failed");
    }
    return ESP_OK;
}

/*
 * Create the I2S transmit channel (tx_handle) and configure it in standard
 * mode: 8000 Hz clock configuration, 16-bit mono MSB slot. Any driver error
 * aborts via ESP_ERROR_CHECK.
 */
void init_speaker(void)
{
    enum { SPK_DOUT_GPIO = 1, SPK_BCLK_GPIO = 2, SPK_WS_GPIO = 3 };

    i2s_chan_config_t spk_chan_cfg = I2S_CHANNEL_DEFAULT_CONFIG(I2S_NUM_AUTO, I2S_ROLE_MASTER);
    /* Only a transmit handle is requested; the RX slot is passed as NULL. */
    ESP_ERROR_CHECK(i2s_new_channel(&spk_chan_cfg, &tx_handle, NULL));

    i2s_std_config_t spk_cfg = {
        .clk_cfg  = I2S_STD_CLK_DEFAULT_CONFIG(8000),
        .slot_cfg = I2S_STD_MSB_SLOT_DEFAULT_CONFIG(I2S_DATA_BIT_WIDTH_16BIT, I2S_SLOT_MODE_MONO),
        .gpio_cfg = {
            /* some codecs may require mclk signal, this example doesn't need it */
            .mclk = I2S_GPIO_UNUSED,
            .bclk = SPK_BCLK_GPIO,
            .ws   = SPK_WS_GPIO,
            .dout = SPK_DOUT_GPIO,
            .din  = I2S_GPIO_UNUSED,
            .invert_flags = {
                .mclk_inv = false,
                .bclk_inv = false,
                .ws_inv   = false,
            },
        },
    };
    ESP_ERROR_CHECK(i2s_channel_init_std_mode(tx_handle, &spk_cfg));
}

esp_err_t console_cmd_tts_register(void)
{
    init_speaker();

    esp_err_t ret;
    esp_console_cmd_t command = { .command = "tts", .help = "tts help", .hint = NULL, .func = &do_cmd_tts, .argtable = NULL };

    ret = esp_console_cmd_register(&command);
    if (ret)
    {
        ESP_LOGE(TAG, "Unable to register tts");
    }

    return ret;
}

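Security Note: Treat the authentication tokens in the listing as placeholders. Never hard-code real credentials in source; supply them through build-time configuration (Kconfig `CONFIG_*` options) or encrypted NVS storage: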

// Replace with secure credential storage
static const char *cuid = CONFIG_VOICE_API_CUID;
static const char *token = CONFIG_VOICE_API_TOKEN;

Testing

Input tts play "how are you" to play the audio

tts play "how are you"

The system will play the audio

Here's a condensed version of challenges & solutions:

1.5 Key Challenges & Solutions

Challenge	Solution
Audio quality issues	IIR noise filtering; auto gain control
Network instability	Exponential backoff retry; dual WiFi/ESP-NOW mode
Memory constraints	Stream buffering with RING_BUF_SIZE=8KB; NVS credential storage
API limitations	Request batching; local voice cache
Real-time sync	Double buffering; DMA-driven I2S

1. Networking and Communications

Project Overview

1.1 System Architecture

Connection Diagram

1.2 Baidu Cloud Speech Service

Key Steps

API Specifications

Basic Configuration

Error Handling

Best Practices

1.3 Speech-to-Text (STT) Implementation

Functional Requirements

Technical Implementation

1. Audio Capture (I2S PDM Interface):

2. Base64 Encoding

3. Cloud Communication

Key Components

Testing

1.4 Text-to-Speech (TTS) Implementation

Functional Flow

Technical Implementation

Testing

1.5 Key Challenges & Solutions

1.6 Development Resources

Code Repository

Project Overview​

1.1 System Architecture​

Connection Diagram

1.2 Baidu Cloud Speech Service​

Key Steps​

API Specifications​

Basic Configuration​

Error Handling​

Best Practices​

1.3 Speech-to-Text (STT) Implementation​

Functional Requirements​

Technical Implementation​

1. ​​Audio Capture​​ (I2S PDM Interface):​

2. ​Base64 Encoding​​​

3. Cloud Communication​

Key Components​

Testing​

1.4 Text-to-Speech (TTS) Implementation​

Functional Flow​

Technical Implementation​

Testing​

1.5 Key Challenges & Solutions​

1.6 Development Resources​

Code Repository​

Project Overview

1.1 System Architecture

Connection Diagram

1.2 Baidu Cloud Speech Service

Key Steps

API Specifications

Basic Configuration

Error Handling

Best Practices

1.3 Speech-to-Text (STT) Implementation

Functional Requirements

Technical Implementation

1. Audio Capture (I2S PDM Interface):

2. Base64 Encoding

3. Cloud Communication

Key Components

Testing

1.4 Text-to-Speech (TTS) Implementation

Functional Flow

Technical Implementation

Testing

1.5 Key Challenges & Solutions

1.6 Development Resources

Code Repository