This shows you the differences between two versions of the page.
| — |
iothings:laboratoare:2025_code:lab6_7 [2025/11/01 21:14] (current) dan.tudose created |
||
|---|---|---|---|
| Line 1: | Line 1: | ||
| + | <code C main.cpp> | ||
| + | // main.cpp -- streaming wake-word validator with low-latency inference | ||
| + | #include <Arduino.h> | ||
| + | #include <ESP_I2S.h> | ||
| + | #include <Adafruit_NeoPixel.h> | ||
| + | #include <arduinoFFT.h> | ||
| + | #include <math.h> | ||
| + | #include "audio_model.h" | ||
| + | #define AUTO_PICK_CHANNEL 1 | ||
| + | #define FORCE_RIGHT_CHANNEL 1 // used if auto-pick disabled | ||
| + | |||
| + | // Audio capture configuration ------------------------------------------------ | ||
| + | constexpr int SAMPLE_RATE_HZ = 16000; | ||
| + | constexpr int CLIP_SECONDS = 1; | ||
| + | constexpr int CLIP_SAMPLES = SAMPLE_RATE_HZ * CLIP_SECONDS; | ||
| + | |||
| + | // Feature extraction parameters (must match training script) | ||
| + | constexpr int FRAME_LEN = 400; // 25 ms | ||
| + | constexpr int HOP = 160; // 10 ms | ||
| + | constexpr int FFT_LEN = 512; | ||
| + | constexpr int N_BANDS = 10; | ||
| + | constexpr int N_FRAMES = 1 + (CLIP_SAMPLES - FRAME_LEN) / HOP; | ||
| + | static_assert(N_FRAMES * N_BANDS == KW_INPUT_DIM, "Feature dimension mismatch"); | ||
| + | constexpr int N_BINS = (FFT_LEN / 2) + 1; | ||
| + | constexpr int BINS_PER_BAND = N_BINS / N_BANDS; | ||
| + | |||
| + | // Stream chunk configuration -------------------------------------------------- | ||
| + | constexpr int CHUNK_FRAMES = HOP; // process every 10 ms hop | ||
| + | constexpr size_t BYTES_PER_FRAME = 8; // 32-bit stereo (4B * 2) | ||
| + | constexpr size_t RAW_CHUNK_BYTES = CHUNK_FRAMES * BYTES_PER_FRAME; | ||
| + | |||
| + | // Inference smoothing and gating | ||
| + | constexpr float RMS_GATE = 0.0030f; // silence gate (tune to mic noise floor) | ||
| + | constexpr float HELLO_THRESH = 0.65f; // smoother decision threshold | ||
| + | constexpr int SMOOTH_N = 3; | ||
| + | constexpr int CHUNKS_PER_INFERENCE = 4; // run NN every ~40 ms | ||
| + | constexpr uint32_t DETECT_COOLDOWN_MS = 1500; | ||
| + | |||
| + | constexpr int NEOPIXEL_PIN = 3; | ||
| + | constexpr int NEOPIXEL_COUNT = 1; | ||
| + | |||
| + | // ----------------------------------------------------------------------------- | ||
| + | |||
// Minimum, maximum and mean over all values of one feature vector.
// Members default to zero so a default-constructed summary is never read
// uninitialised.
struct FeatureSummary {
  float min = 0.0f;
  float max = 0.0f;
  float mean = 0.0f;
};
| + | |||
// Output of one keyword-model evaluation: the softmax probability of the "on"
// class plus both raw logits. Members default to zero so a default-constructed
// result is never read uninitialised.
struct ModelResult {
  float prob = 0.0f;
  float logit_off = 0.0f;
  float logit_on = 0.0f;
};
| + | |||
| + | static I2SClass g_i2s; | ||
| + | static arduinoFFT g_fft; | ||
| + | static Adafruit_NeoPixel g_pixel(NEOPIXEL_COUNT, NEOPIXEL_PIN, NEO_GRB + NEO_KHZ800); | ||
| + | |||
| + | // Ring buffer to hold the most recent CLIP_SAMPLES samples | ||
| + | static int16_t g_ring[CLIP_SAMPLES]; | ||
| + | static size_t g_ringWrite = 0; | ||
| + | static size_t g_ringCount = 0; | ||
| + | |||
| + | // Working buffers | ||
| + | static int16_t g_pcm[CLIP_SAMPLES]; | ||
| + | static float g_feat[KW_INPUT_DIM]; | ||
| + | static int16_t g_chunkMono[CHUNK_FRAMES]; | ||
| + | static int16_t g_chunkL[CHUNK_FRAMES]; | ||
| + | static int16_t g_chunkR[CHUNK_FRAMES]; | ||
| + | static uint8_t g_rawChunk[RAW_CHUNK_BYTES]; | ||
| + | static int16_t g_frameBuffer[FRAME_LEN]; | ||
| + | static float g_frameFeatures[N_FRAMES][N_BANDS]; | ||
| + | static size_t g_frameStart = 0; | ||
| + | static size_t g_frameCount = 0; | ||
| + | |||
| + | static float g_hann[FRAME_LEN]; | ||
| + | static double g_fftReal[FFT_LEN]; | ||
| + | static double g_fftImag[FFT_LEN]; | ||
| + | |||
| + | static bool g_channelChosen = false; | ||
| + | static bool g_useRight = true; | ||
| + | static bool g_dumpedRaw = false; | ||
| + | |||
| + | // ----------------------------------------------------------------------------- | ||
| + | static void initHann() { | ||
| + | for (int i = 0; i < FRAME_LEN; ++i) { | ||
| + | g_hann[i] = 0.5f - 0.5f * cosf(2.0f * PI * i / (FRAME_LEN - 1)); | ||
| + | } | ||
| + | } | ||
| + | |||
| + | static void ledOff() { | ||
| + | g_pixel.setPixelColor(0, g_pixel.Color(0, 0, 0)); | ||
| + | g_pixel.show(); | ||
| + | } | ||
| + | |||
| + | static void ledBlink() { | ||
| + | g_pixel.setPixelColor(0, g_pixel.Color(0, 0, 255)); | ||
| + | g_pixel.show(); | ||
| + | delay(100); | ||
| + | ledOff(); | ||
| + | } | ||
| + | |||
// Root-mean-square level of `count` 16-bit samples, scaled so that a
// full-scale value of 32768 maps to 1.0. Returns 0 for an empty range.
static float computeRms(const int16_t* data, size_t count) {
  if (count == 0) {
    return 0.0f;
  }

  // Accumulate in double so the sum of squares keeps full precision.
  double sumSquares = 0.0;
  const int16_t* const end = data + count;
  for (const int16_t* p = data; p != end; ++p) {
    const double sample = static_cast<double>(*p);
    sumSquares += sample * sample;
  }

  const double meanSquare = sumSquares / static_cast<double>(count);
  return static_cast<float>(sqrt(meanSquare)) / 32768.0f;
}
| + | |||
| + | static void chooseChannel(double energyL, double energyR) { | ||
| + | constexpr double EPS = 1e-9; | ||
| + | bool pickRight = (energyR + EPS >= energyL); | ||
| + | if (!g_channelChosen || pickRight != g_useRight) { | ||
| + | Serial.print("# Auto channel -> "); | ||
| + | Serial.println(pickRight ? "RIGHT" : "LEFT"); | ||
| + | } | ||
| + | g_useRight = pickRight; | ||
| + | g_channelChosen = true; | ||
| + | } | ||
| + | |||
| + | static bool readChunkMono(int16_t* dst, int frames) { | ||
| + | const size_t wantBytes = static_cast<size_t>(frames) * BYTES_PER_FRAME; | ||
| + | size_t got = g_i2s.readBytes(reinterpret_cast<char*>(g_rawChunk), wantBytes); | ||
| + | if (got != wantBytes) { | ||
| + | Serial.printf("! readBytes short: got %u need %u\n", static_cast<unsigned>(got), static_cast<unsigned>(wantBytes)); | ||
| + | return false; | ||
| + | } | ||
| + | |||
| + | const int32_t* src = reinterpret_cast<const int32_t*>(g_rawChunk); | ||
| + | double energyL = 0.0; | ||
| + | double energyR = 0.0; | ||
| + | for (int i = 0; i < frames; ++i) { | ||
| + | int32_t Lraw = src[2 * i + 0]; | ||
| + | int32_t Rraw = src[2 * i + 1]; | ||
| + | int16_t L = static_cast<int16_t>(Lraw >> 16); | ||
| + | int16_t R = static_cast<int16_t>(Rraw >> 16); | ||
| + | g_chunkL[i] = L; | ||
| + | g_chunkR[i] = R; | ||
| + | energyL += static_cast<double>(L) * L; | ||
| + | energyR += static_cast<double>(R) * R; | ||
| + | } | ||
| + | |||
| + | #if AUTO_PICK_CHANNEL | ||
| + | chooseChannel(energyL, energyR); | ||
| + | #endif | ||
| + | |||
| + | if (!g_dumpedRaw) { | ||
| + | g_dumpedRaw = true; | ||
| + | Serial.println("# raw32 dump (first 16 frames):"); | ||
| + | for (int i = 0; i < min(frames, 16); ++i) { | ||
| + | int32_t Lraw = src[2 * i + 0]; | ||
| + | int32_t Rraw = src[2 * i + 1]; | ||
| + | Serial.printf(" [%02d] L=0x%08lx R=0x%08lx\n", | ||
| + | i, | ||
| + | static_cast<unsigned long>(Lraw), | ||
| + | static_cast<unsigned long>(Rraw)); | ||
| + | } | ||
| + | Serial.printf("# channel energy L=%.1f R=%.1f\n", energyL, energyR); | ||
| + | } | ||
| + | |||
| + | const int16_t* chosen = nullptr; | ||
| + | #if AUTO_PICK_CHANNEL | ||
| + | chosen = g_useRight ? g_chunkR : g_chunkL; | ||
| + | #else | ||
| + | chosen = FORCE_RIGHT_CHANNEL ? g_chunkR : g_chunkL; | ||
| + | #endif | ||
| + | |||
| + | for (int i = 0; i < frames; ++i) { | ||
| + | dst[i] = chosen[i]; | ||
| + | } | ||
| + | return true; | ||
| + | } | ||
| + | |||
| + | static void ringPush(const int16_t* samples, int frames) { | ||
| + | for (int i = 0; i < frames; ++i) { | ||
| + | g_ring[g_ringWrite] = samples[i]; | ||
| + | g_ringWrite = (g_ringWrite + 1) % CLIP_SAMPLES; | ||
| + | if (g_ringCount < static_cast<size_t>(CLIP_SAMPLES)) { | ||
| + | g_ringCount++; | ||
| + | } | ||
| + | } | ||
| + | } | ||
| + | |||
| + | static bool ringCopyToClip(int16_t* dst) { | ||
| + | if (g_ringCount < static_cast<size_t>(CLIP_SAMPLES)) { | ||
| + | return false; | ||
| + | } | ||
| + | size_t start = g_ringWrite; | ||
| + | for (int i = 0; i < CLIP_SAMPLES; ++i) { | ||
| + | dst[i] = g_ring[(start + i) % CLIP_SAMPLES]; | ||
| + | } | ||
| + | return true; | ||
| + | } | ||
| + | |||
| + | static void computeBandsForFrame(const int16_t* frame, float* outBands) { | ||
| + | for (int i = 0; i < FRAME_LEN; ++i) { | ||
| + | float sample = static_cast<float>(frame[i]) / 32768.0f; | ||
| + | g_fftReal[i] = static_cast<double>(sample * g_hann[i]); | ||
| + | g_fftImag[i] = 0.0; | ||
| + | } | ||
| + | for (int i = FRAME_LEN; i < FFT_LEN; ++i) { | ||
| + | g_fftReal[i] = 0.0; | ||
| + | g_fftImag[i] = 0.0; | ||
| + | } | ||
| + | |||
| + | g_fft.Windowing(g_fftReal, FFT_LEN, FFT_WIN_TYP_RECTANGLE, FFT_FORWARD); | ||
| + | g_fft.Compute(g_fftReal, g_fftImag, FFT_LEN, FFT_FORWARD); | ||
| + | g_fft.ComplexToMagnitude(g_fftReal, g_fftImag, FFT_LEN); | ||
| + | |||
| + | constexpr float EPS = 1e-10f; | ||
| + | for (int band = 0; band < N_BANDS; ++band) { | ||
| + | int b0 = band * BINS_PER_BAND; | ||
| + | int b1 = (band == N_BANDS - 1) ? N_BINS : (band + 1) * BINS_PER_BAND; | ||
| + | double acc = 0.0; | ||
| + | int count = 0; | ||
| + | for (int k = b0; k < b1; ++k) { | ||
| + | double mag = g_fftReal[k]; | ||
| + | acc += mag * mag; | ||
| + | ++count; | ||
| + | } | ||
| + | float meanPower = (count > 0) ? static_cast<float>(acc / static_cast<double>(count)) : 0.0f; | ||
| + | outBands[band] = 10.0f * log10f(meanPower + EPS); | ||
| + | } | ||
| + | } | ||
| + | |||
| + | static bool updateFrameFeatures() { | ||
| + | if (g_ringCount < static_cast<size_t>(FRAME_LEN)) { | ||
| + | return false; | ||
| + | } | ||
| + | |||
| + | size_t idx = (g_ringWrite + CLIP_SAMPLES - FRAME_LEN) % CLIP_SAMPLES; | ||
| + | for (int i = 0; i < FRAME_LEN; ++i) { | ||
| + | g_frameBuffer[i] = g_ring[idx]; | ||
| + | idx = (idx + 1) % CLIP_SAMPLES; | ||
| + | } | ||
| + | |||
| + | float bands[N_BANDS]; | ||
| + | computeBandsForFrame(g_frameBuffer, bands); | ||
| + | |||
| + | size_t slot; | ||
| + | if (g_frameCount < static_cast<size_t>(N_FRAMES)) { | ||
| + | slot = (g_frameStart + g_frameCount) % N_FRAMES; | ||
| + | g_frameCount++; | ||
| + | } else { | ||
| + | slot = g_frameStart; | ||
| + | g_frameStart = (g_frameStart + 1) % N_FRAMES; | ||
| + | } | ||
| + | |||
| + | for (int b = 0; b < N_BANDS; ++b) { | ||
| + | g_frameFeatures[slot][b] = bands[b]; | ||
| + | } | ||
| + | return g_frameCount == static_cast<size_t>(N_FRAMES); | ||
| + | } | ||
| + | |||
| + | static FeatureSummary buildFeatureVector() { | ||
| + | FeatureSummary summary; | ||
| + | summary.min = 1e9f; | ||
| + | summary.max = -1e9f; | ||
| + | summary.mean = 0.0f; | ||
| + | |||
| + | if (g_frameCount < static_cast<size_t>(N_FRAMES)) { | ||
| + | summary.min = summary.max = summary.mean = 0.0f; | ||
| + | return summary; | ||
| + | } | ||
| + | |||
| + | size_t idx = g_frameStart; | ||
| + | int featIndex = 0; | ||
| + | for (int f = 0; f < N_FRAMES; ++f) { | ||
| + | const float* bands = g_frameFeatures[idx]; | ||
| + | for (int b = 0; b < N_BANDS; ++b) { | ||
| + | float val = bands[b]; | ||
| + | g_feat[featIndex++] = val; | ||
| + | summary.min = min(summary.min, val); | ||
| + | summary.max = max(summary.max, val); | ||
| + | summary.mean += val; | ||
| + | } | ||
| + | idx = (idx + 1) % N_FRAMES; | ||
| + | } | ||
| + | summary.mean /= static_cast<float>(N_FRAMES * N_BANDS); | ||
| + | return summary; | ||
| + | } | ||
| + | |||
| + | static ModelResult runKeywordModel(float* features /* mutated in place */) { | ||
| + | float hidden[KW_HIDDEN_DIM]; | ||
| + | float logits[KW_OUTPUT_DIM]; | ||
| + | |||
| + | for (int i = 0; i < KW_INPUT_DIM; ++i) { | ||
| + | features[i] = (features[i] - kw_mu[i]) * kw_sigma_inv[i]; | ||
| + | } | ||
| + | |||
| + | for (int j = 0; j < KW_HIDDEN_DIM; ++j) { | ||
| + | float sum = kw_b0[j]; | ||
| + | for (int i = 0; i < KW_INPUT_DIM; ++i) { | ||
| + | sum += kw_W0[j + i * KW_HIDDEN_DIM] * features[i]; | ||
| + | } | ||
| + | hidden[j] = (sum > 0.0f) ? sum : 0.0f; | ||
| + | } | ||
| + | |||
| + | for (int j = 0; j < KW_OUTPUT_DIM; ++j) { | ||
| + | float sum = kw_b1[j]; | ||
| + | for (int i = 0; i < KW_HIDDEN_DIM; ++i) { | ||
| + | sum += kw_W1[j + i * KW_OUTPUT_DIM] * hidden[i]; | ||
| + | } | ||
| + | logits[j] = sum; | ||
| + | } | ||
| + | |||
| + | float m = max(logits[0], logits[1]); | ||
| + | float e0 = expf(logits[0] - m); | ||
| + | float e1 = expf(logits[1] - m); | ||
| + | |||
| + | ModelResult res; | ||
| + | res.logit_off = logits[0]; | ||
| + | res.logit_on = logits[1]; | ||
| + | res.prob = e1 / (e0 + e1); | ||
| + | return res; | ||
| + | } | ||
| + | |||
| + | // ----------------------------------------------------------------------------- | ||
| + | static void setupI2S() { | ||
| + | g_i2s.setPins(/*bclk*/18, /*ws*/20, /*dout*/-1, /*din*/19, /*mclk*/-1); | ||
| + | bool ok = g_i2s.begin(I2S_MODE_STD, SAMPLE_RATE_HZ, I2S_DATA_BIT_WIDTH_32BIT, I2S_SLOT_MODE_STEREO); | ||
| + | if (!ok) Serial.println("ERR: I2S.begin failed"); | ||
| + | |||
| + | ok = g_i2s.configureRX(SAMPLE_RATE_HZ, | ||
| + | I2S_DATA_BIT_WIDTH_32BIT, | ||
| + | I2S_SLOT_MODE_STEREO, | ||
| + | I2S_RX_TRANSFORM_NONE); | ||
| + | if (!ok) Serial.println("ERR: I2S.configureRX failed"); | ||
| + | } | ||
| + | |||
| + | void setup() { | ||
| + | Serial.begin(115200); | ||
| + | delay(2000); | ||
| + | |||
| + | g_pixel.begin(); | ||
| + | ledOff(); | ||
| + | initHann(); | ||
| + | setupI2S(); | ||
| + | |||
| + | Serial.printf("# Wake word validator (streaming). chunk=%d frames, clip=%d samples\n", | ||
| + | CHUNK_FRAMES, CLIP_SAMPLES); | ||
| + | } | ||
| + | |||
| + | void loop() { | ||
| + | if (!readChunkMono(g_chunkMono, CHUNK_FRAMES)) { | ||
| + | delay(5); | ||
| + | return; | ||
| + | } | ||
| + | |||
| + | ringPush(g_chunkMono, CHUNK_FRAMES); | ||
| + | bool framesReady = updateFrameFeatures(); | ||
| + | |||
| + | if (g_ringCount < static_cast<size_t>(CLIP_SAMPLES) || !framesReady) { | ||
| + | return; | ||
| + | } | ||
| + | |||
| + | static int chunkAccumulator = 0; | ||
| + | if (++chunkAccumulator < CHUNKS_PER_INFERENCE) { | ||
| + | return; | ||
| + | } | ||
| + | chunkAccumulator = 0; | ||
| + | |||
| + | if (!ringCopyToClip(g_pcm)) { | ||
| + | return; | ||
| + | } | ||
| + | |||
| + | static bool dumpedPcm = false; | ||
| + | if (!dumpedPcm) { | ||
| + | dumpedPcm = true; | ||
| + | Serial.print("# pcm[0..15]:"); | ||
| + | for (int i = 0; i < 16; ++i) { | ||
| + | Serial.printf(" %d", g_pcm[i]); | ||
| + | } | ||
| + | Serial.println(); | ||
| + | } | ||
| + | |||
| + | float rms = computeRms(g_pcm, CLIP_SAMPLES); | ||
| + | Serial.printf("rms=%.5f\n", rms); | ||
| + | if (rms < RMS_GATE) { | ||
| + | Serial.println("probHello=0.0000 (gated by RMS)"); | ||
| + | return; | ||
| + | } | ||
| + | |||
| + | FeatureSummary summary = buildFeatureVector(); | ||
| + | Serial.printf("feat[min=%.2f max=%.2f mean=%.2f]\n", summary.min, summary.max, summary.mean); | ||
| + | |||
| + | static bool printedFeatRaw = false; | ||
| + | if (!printedFeatRaw) { | ||
| + | printedFeatRaw = true; | ||
| + | Serial.print("# featRaw[0..7]:"); | ||
| + | for (int i = 0; i < 8; ++i) { | ||
| + | Serial.printf(" %.2f", g_feat[i]); | ||
| + | } | ||
| + | Serial.println(); | ||
| + | } | ||
| + | |||
| + | ModelResult res = runKeywordModel(g_feat); | ||
| + | |||
| + | static bool printedFeatNorm = false; | ||
| + | if (!printedFeatNorm) { | ||
| + | printedFeatNorm = true; | ||
| + | Serial.print("# featNorm[0..7]:"); | ||
| + | for (int i = 0; i < 8; ++i) { | ||
| + | Serial.printf(" %.2f", g_feat[i]); | ||
| + | } | ||
| + | Serial.println(); | ||
| + | } | ||
| + | |||
| + | static float smooth[SMOOTH_N] = {0}; | ||
| + | static int smoothIdx = 0; | ||
| + | smooth[smoothIdx] = res.prob; | ||
| + | smoothIdx = (smoothIdx + 1) % SMOOTH_N; | ||
| + | |||
| + | float avg = 0.0f; | ||
| + | int valid = 0; | ||
| + | for (int i = 0; i < SMOOTH_N; ++i) { | ||
| + | avg += smooth[i]; | ||
| + | if (smooth[i] > 0.0f) ++valid; | ||
| + | } | ||
| + | if (valid > 0) { | ||
| + | avg /= static_cast<float>(valid); | ||
| + | } else { | ||
| + | avg = res.prob; | ||
| + | } | ||
| + | |||
| + | Serial.printf("logits[off=%.3f on=%.3f] prob=%.4f avg=%.4f\n", | ||
| + | res.logit_off, res.logit_on, res.prob, avg); | ||
| + | |||
| + | static uint32_t lastDetectMs = 0; | ||
| + | uint32_t nowMs = millis(); | ||
| + | if (avg > HELLO_THRESH && (nowMs - lastDetectMs) > DETECT_COOLDOWN_MS) { | ||
| + | ledBlink(); | ||
| + | lastDetectMs = nowMs; | ||
| + | } | ||
| + | } | ||
| + | |||
| + | </code> | ||