main.cpp
#include <Arduino.h>
#include <Adafruit_NeoPixel.h>
#include "driver/i2s.h"
#include <arduinoFFT.h>
 
// ===== Sparrow audio pins (ICS-43434 I2S mic) =====
static const int I2S_WS_PIN  = 20; // LRCLK / WS
static const int I2S_SCK_PIN = 18; // BCLK
static const int I2S_SD_PIN  = 19; // DOUT from mic
 
// ===== LED =====
static const int NEOPIXEL_PIN = 3;
Adafruit_NeoPixel pixel(1, NEOPIXEL_PIN, NEO_GRB + NEO_KHZ800);
 
// ===== Audio / DSP params =====
static const int SAMPLE_RATE   = 16000;     // 16 kHz mic rate
static const int N_FFT         = 1024;      // 64 ms window
static const int HOP_SAMPLES   = N_FFT/2;   // 50% overlap (optional)
static const int N_BANDS       = 16;        // log-spaced bands up to Nyquist
static const int BOOTSTRAP_W   = 40;        // ~2–3 s of audio given overlap
static const int K_CLUSTERS    = 4;         // small K for "normal" modes
static const float ANOM_MULT   = 3.0f;      // thresh = median + ANOM_MULT*MAD
 
// ===== Buffers =====
static int16_t  pcm[N_FFT];
static float    win[N_FFT];
static double   vReal[N_FFT];
static double   vImag[N_FFT];
arduinoFFT FFT = arduinoFFT(vReal, vImag, N_FFT, SAMPLE_RATE);
 
// ===== Feature vector: 16 band log-energies + 2 spectral stats (optional) =====
static const int FEAT_DIM = N_BANDS + 2;
 
// ===== Simple utilities =====
static inline int16_t convert_24_to_16(int32_t s32) {
  // 24-bit sample left-justified in 32-bit slot -> scale down
  return (int16_t)(s32 >> 11); // tweak shift if amplitude looks off
}
 
void i2s_init() {
  i2s_config_t cfg = {
    .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX),
    .sample_rate = SAMPLE_RATE,
    .bits_per_sample = I2S_BITS_PER_SAMPLE_32BIT,
    .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
    .communication_format = I2S_COMM_FORMAT_STAND_I2S,
    .intr_alloc_flags = 0,
    .dma_buf_count = 4,
    .dma_buf_len = 1024,
    .use_apll = false,
    .tx_desc_auto_clear = false,
    .fixed_mclk = 0
  };
  i2s_pin_config_t pins = {
    .bck_io_num   = I2S_SCK_PIN,
    .ws_io_num    = I2S_WS_PIN,
    .data_out_num = I2S_PIN_NO_CHANGE,
    .data_in_num  = I2S_SD_PIN
  };
  i2s_driver_install(I2S_NUM_0, &cfg, 0, nullptr);
  i2s_set_pin(I2S_NUM_0, &pins);
  i2s_zero_dma_buffer(I2S_NUM_0);
}
 
void make_hann() {
  for (int i = 0; i < N_FFT; i++) {
    win[i] = 0.5f * (1.0f - cosf(2.0f * PI * i / (N_FFT - 1)));
  }
}
 
// Log-spaced frequency edges for bands
void calc_band_edges(int (&edges)[N_BANDS+1]) {
  float fmin = 50.0f;                 // ignore DC / very low
  float fmax = SAMPLE_RATE / 2.0f;    // Nyquist
  for (int b=0; b<=N_BANDS; b++) {
    float t = (float)b / (float)N_BANDS;
    float f = fmin * powf(fmax / fmin, t);
    int bin = (int)roundf(f * N_FFT / SAMPLE_RATE);
    if (bin < 1) bin = 1;
    if (bin > N_FFT/2) bin = N_FFT/2;
    edges[b] = bin;
  }
  // ensure strictly increasing
  for (int b=1; b<=N_BANDS; b++) if (edges[b] <= edges[b-1]) edges[b] = edges[b-1] + 1;
  edges[N_BANDS] = min(edges[N_BANDS], N_FFT/2);
}
 
bool capture_frame_blocking() {
  // Read N_FFT mono samples
  int filled = 0;
  while (filled < N_FFT) {
    int32_t tmp[256];
    size_t br = 0;
    if (i2s_read(I2S_NUM_0, (void*)tmp, sizeof(tmp), &br, portMAX_DELAY) != ESP_OK) return false;
    int got = br / sizeof(int32_t);
    for (int i=0; i<got && filled < N_FFT; i++) {
      pcm[filled++] = convert_24_to_16(tmp[i]);
    }
  }
  return true;
}
 
// Compute feature vector into out[FEAT_DIM]
void extract_features(float *out) {
  // window -> FFT
  for (int i=0; i<N_FFT; i++) {
    vReal[i] = (double)((float)pcm[i] * win[i]);
    vImag[i] = 0.0;
  }
  FFT.Windowing(FFT_WIN_TYP_RECTANGLE, FFT_FORWARD); // already applied Hann, so RECT here
  FFT.Compute(FFT_FORWARD);
  FFT.ComplexToMagnitude();
 
  // magnitude spectrum is in vReal[0..N_FFT/2]
  int edges[N_BANDS+1];
  calc_band_edges(edges);
 
  // band log-energies
  for (int b=0; b<N_BANDS; b++) {
    int s = edges[b], e = edges[b+1];
    double sum = 0.0;
    for (int k=s; k<e; k++) {
      double m = vReal[k];
      sum += m*m; // power
    }
    float energy = (float)sum + 1e-9f;
    out[b] = logf(energy);
  }
 
  // spectral centroid & rolloff (0.85)
  double num=0.0, den=0.0;
  for (int k=1; k<=N_FFT/2; k++) {
    double f = (double)k * SAMPLE_RATE / N_FFT;
    double p = vReal[k]*vReal[k];
    num += f * p; den += p;
  }
  float centroid = (den>0) ? (float)(num/den) : 0.0f;
 
  double cumulative = 0.0, target = 0.85 * den;
  float rolloff = 0.0f;
  for (int k=1; k<=N_FFT/2; k++) {
    cumulative += vReal[k]*vReal[k];
    if (cumulative >= target) {
      rolloff = (float)k * SAMPLE_RATE / N_FFT;
      break;
    }
  }
  out[N_BANDS]   = centroid;
  out[N_BANDS+1] = rolloff;
}
 
// ===== Stats helpers =====
struct OnlineMeanStd {
  double mean=0, M2=0; int n=0;
  void add(double x){ n++; double d=x-mean; mean += d/n; M2 += d*(x-mean); }
  double std() const { return (n>1) ? sqrt(M2/(n-1)) : 1.0; }
};
 
// ===== K-means (from scratch) on standardized features =====
struct KMeans {
  int k;
  float centroids[K_CLUSTERS][FEAT_DIM];
  bool initialized=false;
 
  KMeans(int kk):k(kk){}
 
  static float dist2(const float *a, const float *b){
    float d=0;
    for(int i=0;i<FEAT_DIM;i++){ float t=a[i]-b[i]; d+=t*t; }
    return d;
  }
  int assign(const float *x) const {
    int best=0; float bd=dist2(x, centroids[0]);
    for(int c=1;c<k;c++){ float d=dist2(x,centroids[c]); if(d<bd){bd=d; best=c;} }
    return best;
  }
  void init_plus_plus(const float *X, int n){
    // pick first at random
    int pick = random(n);
    memcpy(centroids[0], &X[pick*FEAT_DIM], sizeof(float)*FEAT_DIM);
    // others by D^2
    for(int c=1;c<k;c++){
      // compute distances
      double sumD=0;
      for(int i=0;i<n;i++){
        float bestd = dist2(&X[i*FEAT_DIM], centroids[0]);
        for(int j=1;j<c;j++){ float d=dist2(&X[i*FEAT_DIM], centroids[j]); if(d<bestd) bestd=d; }
        sumD += bestd;
      }
      double r = (random(0,1000000)/1000000.0) * sumD;
      double acc=0;
      int idx=0;
      for(; idx<n; idx++){
        float bestd = dist2(&X[idx*FEAT_DIM], centroids[0]);
        for(int j=1;j<c;j++){ float d=dist2(&X[idx*FEAT_DIM], centroids[j]); if(d<bestd) bestd=d; }
        acc += bestd; if(acc >= r) break;
      }
      if (idx>=n) idx=n-1;
      memcpy(centroids[c], &X[idx*FEAT_DIM], sizeof(float)*FEAT_DIM);
    }
    initialized=true;
  }
  void fit(const float *X, int n, int iters=10){
    if(!initialized) init_plus_plus(X,n);
    // assignment buffer
    int *as = (int*)malloc(sizeof(int)*n);
    for(int it=0; it<iters; it++){
      // assign
      for(int i=0;i<n;i++) as[i]=assign(&X[i*FEAT_DIM]);
      // recompute
      float sums[K_CLUSTERS][FEAT_DIM]; int cnt[K_CLUSTERS];
      for(int c=0;c<k;c++){ memset(sums[c],0,sizeof(sums[c])); cnt[c]=0; }
      for(int i=0;i<n;i++){
        int c=as[i]; cnt[c]++;
        for(int j=0;j<FEAT_DIM;j++) sums[c][j]+=X[i*FEAT_DIM+j];
      }
      for(int c=0;c<k;c++){
        if(cnt[c]>0){
          for(int j=0;j<FEAT_DIM;j++) centroids[c][j]=sums[c][j]/cnt[c];
        }
      }
    }
    free(as);
  }
};
 
static KMeans kmeans(K_CLUSTERS);
 
// Standardization (z-score) learned from bootstrap
static float feat_mean[FEAT_DIM], feat_std[FEAT_DIM];
 
// Anomaly scaling using distances in bootstrap
static float dist_median=0.0f, dist_mad=1.0f;
 
float median_of(float *arr, int n){
  // simple partial sort
  for(int i=0;i<n-1;i++){
    for(int j=i+1;j<n;j++) if(arr[j]<arr[i]) { float t=arr[i]; arr[i]=arr[j]; arr[j]=t; }
  }
  return (n%2) ? arr[n/2] : 0.5f*(arr[n/2-1]+arr[n/2]);
}
 
void standardize(const float *x, float *z){
  for(int i=0;i<FEAT_DIM;i++){
    float s = (feat_std[i] > 1e-6f) ? feat_std[i] : 1.0f;
    z[i] = (x[i] - feat_mean[i]) / s;
  }
}
 
void online_led(float anomaly){
  // green normal, red anomaly
  bool isAnom = (anomaly > (dist_median + ANOM_MULT * dist_mad));
  pixel.setPixelColor(0, pixel.Color(isAnom ? 255:0, isAnom ? 0:255, 0));
  pixel.show();
}
 
void setup() {
  Serial.begin(115200);
  delay(200);
 
  pixel.begin(); pixel.clear(); pixel.show();
 
  i2s_init();
  make_hann();
 
  Serial.println("Audio anomaly (no SDK) — bootstrapping normal audio...");
}
 
void loop() {
  static bool model_ready=false;
  static int collected=0;
 
  static float Xbuf[BOOTSTRAP_W * FEAT_DIM];   // feature windows
  static float Zbuf[BOOTSTRAP_W * FEAT_DIM];   // standardized features (after we learn stats)
  static float Dbuf[BOOTSTRAP_W];              // distances for MAD
 
  // Capture one frame (with optional hop/overlap)
  if (!capture_frame_blocking()) return;
 
  // DSP → features
  float feat[FEAT_DIM];
  extract_features(feat);
 
  if (!model_ready) {
    // Accumulate bootstrap features (raw)
    if (collected < BOOTSTRAP_W) {
      memcpy(&Xbuf[collected*FEAT_DIM], feat, sizeof(float)*FEAT_DIM);
      collected++;
      Serial.printf("bootstrap %d/%d\n", collected, BOOTSTRAP_W);
    }
 
    if (collected >= BOOTSTRAP_W) {
      // Compute per-feature mean/std
      for (int j=0;j<FEAT_DIM;j++){
        OnlineMeanStd s;
        for(int i=0;i<BOOTSTRAP_W;i++) s.add(Xbuf[i*FEAT_DIM+j]);
        feat_mean[j] = (float)s.mean;
        feat_std[j]  = (float)s.std();
      }
      // Standardize and fit K-means
      for (int i=0;i<BOOTSTRAP_W;i++){
        standardize(&Xbuf[i*FEAT_DIM], &Zbuf[i*FEAT_DIM]);
      }
      kmeans.init_plus_plus(Zbuf, BOOTSTRAP_W);
      kmeans.fit(Zbuf, BOOTSTRAP_W, 12);
 
      // Distances to nearest centroid on bootstrap → median/MAD
      for (int i=0;i<BOOTSTRAP_W;i++){
        int c = kmeans.assign(&Zbuf[i*FEAT_DIM]);
        float d = KMeans::dist2(&Zbuf[i*FEAT_DIM], kmeans.centroids[c]);
        Dbuf[i] = sqrtf(d);
      }
      // median
      float tmp[BOOTSTRAP_W];
      memcpy(tmp, Dbuf, sizeof(tmp));
      dist_median = median_of(tmp, BOOTSTRAP_W);
      // MAD = median(|d - median|)
      for (int i=0;i<BOOTSTRAP_W;i++) tmp[i] = fabsf(Dbuf[i] - dist_median);
      dist_mad = median_of(tmp, BOOTSTRAP_W) + 1e-6f;
 
      model_ready = true;
      Serial.printf("Model ready. median=%.3f MAD=%.3f (thresh≈%.3f)\n",
                    dist_median, dist_mad, dist_median + ANOM_MULT*dist_mad);
    }
  } else {
    // Standardize current feature, compute anomaly
    float z[FEAT_DIM];
    standardize(feat, z);
    int c = kmeans.assign(z);
    float d = sqrtf(KMeans::dist2(z, kmeans.centroids[c]));
    float thresh = dist_median + ANOM_MULT * dist_mad;
    bool isAnom = (d > thresh);
 
    // gentle online centroid update on non-anomalous frames
    if (!isAnom) {
      const float alpha = 0.05f;
      for (int j=0;j<FEAT_DIM;j++) {
        kmeans.centroids[c][j] = (1-alpha)*kmeans.centroids[c][j] + alpha*z[j];
      }
    }
 
    Serial.printf("dist=%.3f  thresh≈%.3f  %s\n", d, thresh, isAnom?"ANOMALY":"ok");
    online_led(d);
  }
 
  // crude 50% overlap: reuse last half by shifting; here we just sleep to approximate hop
  // delay((HOP_SAMPLES * 1000) / SAMPLE_RATE);
}