#include "portrait_engine.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <new>
#include <vector>

#include "app_state.h"
#include "dl_image_jpeg.hpp"
#include "esp_camera.h"
#include "esp_heap_caps.h"
#include "esp_log.h"
#include "esp_timer.h"
#include "freertos/FreeRTOS.h"
#include "freertos/semphr.h"
#include "freertos/task.h"
#include "human_face_detect.hpp"

namespace {

constexpr float kPi = 3.14159265358979323846f;
constexpr int kMinTracePixels = 18;
constexpr int kHairMinTracePixels = 28;
constexpr int kTraceDecimate = 2;          // 元の3より細かく拾う(後でRDPで間引く)
constexpr float kRdpEpsilon = 1.6f;        // RDP簡略化の許容誤差(px)
constexpr int kWorkerStackBytes = 32768;

const char *TAG = "portrait_engine";

SemaphoreHandle_t s_mutex = nullptr;
SemaphoreHandle_t s_capture_done = nullptr;
TaskHandle_t s_worker_task = nullptr;
HumanFaceDetect *s_detector = nullptr;
portrait_result_t s_result = {};
uint8_t *s_last_jpeg = nullptr;
size_t s_last_jpeg_len = 0;
bool s_capture_busy = false;
esp_err_t s_last_capture_err = ESP_OK;

struct FaceInfo {
    bool found = false;
    float score = 0.0f;
    int box[4] = {0, 0, 0, 0};
    int keypoint[10] = {0};
};

void *portrait_alloc(size_t size)
{
    void *ptr = heap_caps_malloc(size, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
    if (ptr == nullptr) {
        ptr = heap_caps_malloc(size, MALLOC_CAP_8BIT);
    }
    return ptr;
}

void portrait_free(void *ptr)
{
    heap_caps_free(ptr);
}

int clamp_int(int v, int lo, int hi)
{
    return std::max(lo, std::min(v, hi));
}

void reset_result_locked(const char *message)
{
    std::memset(&s_result, 0, sizeof(s_result));
    s_result.timestamp_ms = static_cast<uint32_t>(esp_timer_get_time() / 1000);
    if (message != nullptr) {
        std::strncpy(s_result.message, message, sizeof(s_result.message) - 1);
    }
}

bool add_stroke(portrait_result_t &result, portrait_stroke_type_t type, uint16_t first, uint16_t count)
{
    if (count < 2 || result.stroke_count >= PORTRAIT_MAX_STROKES) {
        return false;
    }
    portrait_stroke_t &stroke = result.strokes[result.stroke_count++];
    stroke.type = type;
    stroke.first_point = first;
    stroke.point_count = count;
    return true;
}

bool add_point(portrait_result_t &result, int x, int y)
{
    if (result.point_count >= PORTRAIT_MAX_POINTS) {
        return false;
    }
    portrait_point_t &point = result.points[result.point_count++];
    point.x = static_cast<int16_t>(x);
    point.y = static_cast<int16_t>(y);
    return true;
}

void store_jpeg_copy(const uint8_t *data, size_t len)
{
    if (data == nullptr || len == 0) {
        return;
    }

    uint8_t *copy = static_cast<uint8_t *>(portrait_alloc(len));
    if (copy == nullptr) {
        ESP_LOGW(TAG, "could not allocate %u bytes for last JPEG", static_cast<unsigned>(len));
        return;
    }
    std::memcpy(copy, data, len);

    portrait_engine_lock();
    portrait_free(s_last_jpeg);
    s_last_jpeg = copy;
    s_last_jpeg_len = len;
    portrait_engine_unlock();
}

bool begin_capture_request()
{
    bool accepted = false;
    portrait_engine_lock();
    if (!s_capture_busy) {
        s_capture_busy = true;
        s_last_capture_err = ESP_ERR_INVALID_STATE;
        accepted = true;
    }
    portrait_engine_unlock();
    return accepted;
}

void finish_capture_request(esp_err_t err)
{
    portrait_engine_lock();
    s_last_capture_err = err;
    s_capture_busy = false;
    portrait_engine_unlock();
    if (s_capture_done != nullptr) {
        xSemaphoreGive(s_capture_done);
    }
}

void rgb888_to_gray(const uint8_t *rgb, int w, int h, uint8_t *gray)
{
    const int pixels = w * h;
    for (int i = 0; i < pixels; ++i) {
        const uint8_t r = rgb[i * 3 + 0];
        const uint8_t g = rgb[i * 3 + 1];
        const uint8_t b = rgb[i * 3 + 2];
        gray[i] = static_cast<uint8_t>((77 * r + 150 * g + 29 * b) >> 8);
    }
}

// ---- separable box blur (1D水平 + 1D垂直). O(w*h)で軽い. ----
void box_blur_1d_h(const uint8_t *src, uint8_t *dst, int w, int h, int radius)
{
    const int span = radius * 2 + 1;
    for (int y = 0; y < h; ++y) {
        const uint8_t *row = src + y * w;
        uint8_t *out = dst + y * w;
        int sum = 0;
        // 初期窓
        for (int x = -radius; x <= radius; ++x) {
            sum += row[clamp_int(x, 0, w - 1)];
        }
        for (int x = 0; x < w; ++x) {
            out[x] = static_cast<uint8_t>(sum / span);
            const int x_out = x - radius;
            const int x_in = x + radius + 1;
            sum -= row[clamp_int(x_out, 0, w - 1)];
            sum += row[clamp_int(x_in, 0, w - 1)];
        }
    }
}

void box_blur_1d_v(const uint8_t *src, uint8_t *dst, int w, int h, int radius)
{
    const int span = radius * 2 + 1;
    for (int x = 0; x < w; ++x) {
        int sum = 0;
        for (int y = -radius; y <= radius; ++y) {
            sum += src[clamp_int(y, 0, h - 1) * w + x];
        }
        for (int y = 0; y < h; ++y) {
            dst[y * w + x] = static_cast<uint8_t>(sum / span);
            const int y_out = y - radius;
            const int y_in = y + radius + 1;
            sum -= src[clamp_int(y_out, 0, h - 1) * w + x];
            sum += src[clamp_int(y_in, 0, h - 1) * w + x];
        }
    }
}

// box×3でガウス近似(中心極限定理). tmp1とtmp2を交互に使う.
void approx_gaussian_blur(const uint8_t *src, uint8_t *dst, uint8_t *tmp,
                          int w, int h, int radius)
{
    if (radius < 1) {
        std::memcpy(dst, src, static_cast<size_t>(w) * h);
        return;
    }
    // pass 1
    box_blur_1d_h(src, tmp, w, h, radius);
    box_blur_1d_v(tmp, dst, w, h, radius);
    // pass 2
    box_blur_1d_h(dst, tmp, w, h, radius);
    box_blur_1d_v(tmp, dst, w, h, radius);
    // pass 3
    box_blur_1d_h(dst, tmp, w, h, radius);
    box_blur_1d_v(tmp, dst, w, h, radius);
}

// ---- 簡易CLAHE: タイル平均を線形補間し局所コントラストを引き上げる ----
// 完全なCLAHEは重いのでヒストグラム使わず, 「タイル平均からの偏差を強調」する版.
void light_local_contrast(uint8_t *img, int w, int h, int tile = 16, float gain = 1.6f)
{
    const int tx = std::max(1, w / tile);
    const int ty = std::max(1, h / tile);
    const int n_tx = (w + tx - 1) / tx;
    const int n_ty = (h + ty - 1) / ty;

    std::vector<int> tile_mean(n_tx * n_ty, 0);
    for (int j = 0; j < n_ty; ++j) {
        for (int i = 0; i < n_tx; ++i) {
            const int x0 = i * tx;
            const int y0 = j * ty;
            const int x1 = std::min(w, x0 + tx);
            const int y1 = std::min(h, y0 + ty);
            int sum = 0;
            int cnt = 0;
            for (int yy = y0; yy < y1; ++yy) {
                for (int xx = x0; xx < x1; ++xx) {
                    sum += img[yy * w + xx];
                    ++cnt;
                }
            }
            tile_mean[j * n_tx + i] = cnt > 0 ? sum / cnt : 128;
        }
    }

    // バイリニア補間してピクセルごとの局所平均を求め, 偏差をgain倍
    for (int y = 0; y < h; ++y) {
        const float fy = (static_cast<float>(y) + 0.5f) / static_cast<float>(ty) - 0.5f;
        const int j0 = clamp_int(static_cast<int>(std::floor(fy)), 0, n_ty - 1);
        const int j1 = clamp_int(j0 + 1, 0, n_ty - 1);
        const float ay = fy - static_cast<float>(j0);
        for (int x = 0; x < w; ++x) {
            const float fx = (static_cast<float>(x) + 0.5f) / static_cast<float>(tx) - 0.5f;
            const int i0 = clamp_int(static_cast<int>(std::floor(fx)), 0, n_tx - 1);
            const int i1 = clamp_int(i0 + 1, 0, n_tx - 1);
            const float ax = fx - static_cast<float>(i0);

            const float m00 = static_cast<float>(tile_mean[j0 * n_tx + i0]);
            const float m10 = static_cast<float>(tile_mean[j0 * n_tx + i1]);
            const float m01 = static_cast<float>(tile_mean[j1 * n_tx + i0]);
            const float m11 = static_cast<float>(tile_mean[j1 * n_tx + i1]);
            const float local_mean = (1.0f - ax) * (1.0f - ay) * m00
                                   + ax * (1.0f - ay) * m10
                                   + (1.0f - ax) * ay * m01
                                   + ax * ay * m11;

            const int idx = y * w + x;
            const float v = static_cast<float>(img[idx]);
            const float enhanced = local_mean + (v - local_mean) * gain;
            img[idx] = static_cast<uint8_t>(clamp_int(static_cast<int>(enhanced + 0.5f), 0, 255));
        }
    }
}

// ---- XDoG: tanhソフト閾値版 ----
// 元式: x = g1 + p*(g1 - g2);  out = (x>=ε) ? 1 : 1 + tanh(φ(x-ε))
// 整数演算寄りに: g1, g2 は uint8 (0..255). pは22, εは0.5*255=128, φは10.
// tanh(z)を [-3,3] のLUTで近似(端は±1で飽和).
struct TanhLut {
    static constexpr int kN = 257;
    static constexpr float kRange = 3.0f;
    float v[kN];
    TanhLut()
    {
        for (int i = 0; i < kN; ++i) {
            const float z = -kRange + 2.0f * kRange * static_cast<float>(i) / static_cast<float>(kN - 1);
            v[i] = std::tanh(z);
        }
    }
    float operator()(float z) const
    {
        if (z <= -kRange) return -1.0f;
        if (z >=  kRange) return  1.0f;
        const float u = (z + kRange) / (2.0f * kRange) * static_cast<float>(kN - 1);
        const int i = static_cast<int>(u);
        const float a = u - static_cast<float>(i);
        return v[i] * (1.0f - a) + v[i + 1] * a;
    }
};

void mark_xdog_edges_tanh(const uint8_t *small_blur,
                          const uint8_t *large_blur,
                          uint8_t *edge,
                          int w,
                          int h,
                          const int roi[4])
{
    static const TanhLut kTanh;
    std::memset(edge, 0, static_cast<size_t>(w) * h);

    constexpr float p = 22.0f;
    constexpr float epsilon = 0.5f;   // 0..1 スケール
    constexpr float phi = 10.0f;
    // 出力のしきい値: 出力 < 0.5 をエッジとみなす(線=黒).
    constexpr float edge_threshold = 0.5f;

    for (int y = roi[1]; y < roi[3]; ++y) {
        for (int x = roi[0]; x < roi[2]; ++x) {
            const int idx = y * w + x;
            const float g1 = static_cast<float>(small_blur[idx]) / 255.0f;
            const float g2 = static_cast<float>(large_blur[idx]) / 255.0f;
            const float xdog = g1 + p * (g1 - g2);
            float out;
            if (xdog >= epsilon) {
                out = 1.0f;
            } else {
                out = 1.0f + kTanh(phi * (xdog - epsilon));
            }
            if (out < edge_threshold) {
                edge[idx] = 1;
            }
        }
    }
}

float distance2(float ax, float ay, float bx, float by)
{
    const float dx = ax - bx;
    const float dy = ay - by;
    return dx * dx + dy * dy;
}

struct FaceGeometry {
    bool valid = false;
    int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
    int bw = 1, bh = 1;
    float eye_dist = 0.0f;
    float lx = 0, ly = 0, rx = 0, ry = 0;
    float lmx = 0, lmy = 0, rmx = 0, rmy = 0;
    float nose_x = 0, nose_y = 0;
    float eye_y = 0;
};

FaceGeometry make_face_geometry(const FaceInfo &face)
{
    FaceGeometry g;
    if (!face.found) return g;

    g.lx  = face.keypoint[0]; g.ly  = face.keypoint[1];
    g.lmx = face.keypoint[2]; g.lmy = face.keypoint[3];
    g.nose_x = face.keypoint[4]; g.nose_y = face.keypoint[5];
    g.rx  = face.keypoint[6]; g.ry  = face.keypoint[7];
    g.rmx = face.keypoint[8]; g.rmy = face.keypoint[9];
    g.eye_dist = std::sqrt(distance2(g.lx, g.ly, g.rx, g.ry));
    if (g.eye_dist < 4.0f) return g;

    g.valid = true;
    g.x0 = face.box[0]; g.y0 = face.box[1];
    g.x1 = face.box[2]; g.y1 = face.box[3];
    g.bw = std::max(1, g.x1 - g.x0);
    g.bh = std::max(1, g.y1 - g.y0);
    g.eye_y = (g.ly + g.ry) * 0.5f;
    return g;
}

// ---- 目/口の領域からエッジを除去(記号化のため) ----
void mask_symbol_regions(uint8_t *edge, int w, int h, const FaceGeometry &g)
{
    if (!g.valid) return;

    const float eye_mask_r = g.eye_dist * 0.18f;
    const float eye_mask_r2 = eye_mask_r * eye_mask_r;
    const float mouth_cx = (g.lmx + g.rmx) * 0.5f;
    const float mouth_cy = (g.lmy + g.rmy) * 0.5f;
    const float mouth_w = std::sqrt(distance2(g.lmx, g.lmy, g.rmx, g.rmy));
    const float mouth_rx = std::max(g.eye_dist * 0.25f, mouth_w * 0.8f);
    const float mouth_ry = std::max(g.eye_dist * 0.12f, 4.0f);

    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            const int idx = y * w + x;
            if (!edge[idx]) continue;
            if (distance2(x, y, g.lx, g.ly) <= eye_mask_r2 ||
                distance2(x, y, g.rx, g.ry) <= eye_mask_r2) {
                edge[idx] = 0;
                continue;
            }
            const float ex = (x - mouth_cx) / mouth_rx;
            const float ey = (y - mouth_cy) / mouth_ry;
            if (ex * ex + ey * ey <= 1.0f) {
                edge[idx] = 0;
            }
        }
    }
}

// ---- 連結成分(4近傍)で最大領域を抽出. 結果は mask_out に1で書き込む ----
//   work: w*h バイト, ラベル兼用テンポラリ
int largest_component(const uint8_t *src, uint8_t *mask_out, uint8_t *work,
                      int w, int h, int min_area)
{
    std::memset(work, 0, static_cast<size_t>(w) * h);
    std::memset(mask_out, 0, static_cast<size_t>(w) * h);

    std::vector<int> stack;
    stack.reserve(1024);
    int best_size = 0;
    std::vector<int> best_pixels;

    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            const int s = y * w + x;
            if (!src[s] || work[s]) continue;
            stack.clear();
            stack.push_back(s);
            work[s] = 1;
            std::vector<int> pixels;
            pixels.reserve(64);
            while (!stack.empty()) {
                const int p = stack.back();
                stack.pop_back();
                pixels.push_back(p);
                const int px = p % w;
                const int py = p / w;
                if (px > 0)     { const int q = p - 1; if (src[q] && !work[q]) { work[q] = 1; stack.push_back(q); } }
                if (px < w - 1) { const int q = p + 1; if (src[q] && !work[q]) { work[q] = 1; stack.push_back(q); } }
                if (py > 0)     { const int q = p - w; if (src[q] && !work[q]) { work[q] = 1; stack.push_back(q); } }
                if (py < h - 1) { const int q = p + w; if (src[q] && !work[q]) { work[q] = 1; stack.push_back(q); } }
            }
            if (static_cast<int>(pixels.size()) > best_size) {
                best_size = static_cast<int>(pixels.size());
                best_pixels.swap(pixels);
            }
        }
    }
    if (best_size < min_area) return 0;
    for (int p : best_pixels) mask_out[p] = 1;
    return best_size;
}

// ---- 簡易ダイレーション(3x3, 4近傍ベース) ----
void dilate_3x3(const uint8_t *src, uint8_t *dst, int w, int h)
{
    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
            uint8_t v = src[y * w + x];
            if (!v) {
                if (x > 0     && src[y * w + x - 1]) v = 1;
                else if (x < w-1 && src[y * w + x + 1]) v = 1;
                else if (y > 0     && src[(y - 1) * w + x]) v = 1;
                else if (y < h-1 && src[(y + 1) * w + x]) v = 1;
            }
            dst[y * w + x] = v;
        }
    }
}

// ---- マスク輪郭(最も外側)から bbox 辺に張り付かない最長アーチを抽出 ----
// やり方: マスク=1 の境界画素(=1だが4近傍に0を持つ画素) を
// 開始点から8近傍時計回りで辿り, 閉路を得る.
// その後, 「bbox辺に近い点(margin内)」を falseとして, 最長のtrueランを取る.
struct Arc {
    std::vector<int> xs;
    std::vector<int> ys;
};

bool is_boundary_pixel(const uint8_t *mask, int w, int h, int x, int y)
{
    if (!mask[y * w + x]) return false;
    if (x == 0 || y == 0 || x == w - 1 || y == h - 1) return true;
    if (!mask[(y - 1) * w + x]) return true;
    if (!mask[(y + 1) * w + x]) return true;
    if (!mask[y * w + (x - 1)]) return true;
    if (!mask[y * w + (x + 1)]) return true;
    return false;
}

// Mooreの境界追跡(8近傍時計回り)
Arc trace_boundary(const uint8_t *mask, int w, int h)
{
    Arc out;
    int sx = -1, sy = -1;
    for (int y = 0; y < h && sy < 0; ++y) {
        for (int x = 0; x < w; ++x) {
            if (is_boundary_pixel(mask, w, h, x, y)) { sx = x; sy = y; break; }
        }
    }
    if (sx < 0) return out;

    static const int dx8[8] = { 1, 1, 0, -1, -1, -1, 0, 1 };
    static const int dy8[8] = { 0, 1, 1,  1,  0, -1, -1, -1 };
    int cx = sx, cy = sy;
    int dir = 6; // 上から来た想定
    out.xs.push_back(cx); out.ys.push_back(cy);

    const int max_steps = w * h * 4;
    for (int step = 0; step < max_steps; ++step) {
        bool moved = false;
        const int start_dir = (dir + 6) & 7; // 90度反時計から走査
        for (int k = 0; k < 8; ++k) {
            const int d = (start_dir + k) & 7;
            const int nx = cx + dx8[d];
            const int ny = cy + dy8[d];
            if (nx < 0 || ny < 0 || nx >= w || ny >= h) continue;
            if (mask[ny * w + nx]) {
                cx = nx; cy = ny; dir = d;
                out.xs.push_back(cx); out.ys.push_back(cy);
                moved = true;
                break;
            }
        }
        if (!moved) break;
        if (cx == sx && cy == sy && out.xs.size() > 2) break;
    }
    return out;
}

// 配列の最長 keep ラン抽出. closed=trueのときラップアラウンドを考慮.
void longest_keep_run(const std::vector<bool> &keep, std::vector<int> &out_indices, bool closed)
{
    out_indices.clear();
    const int n = static_cast<int>(keep.size());
    if (n == 0) return;

    int start = 0;
    if (closed) {
        bool any_false = false;
        for (int i = 0; i < n; ++i) if (!keep[i]) { any_false = true; break; }
        if (!any_false) {
            for (int i = 0; i < n; ++i) out_indices.push_back(i);
            return;
        }
        // 最初のfalseを探す
        for (int i = 0; i < n; ++i) if (!keep[i]) { start = i; break; }
    }

    int best_s = -1, best_e = -1;
    int i = 0;
    while (i < n) {
        const int idx = (start + i) % n;
        if (keep[idx]) {
            int j = i;
            while (j < n && keep[(start + j) % n]) ++j;
            if (j - i > best_e - best_s) {
                best_s = i; best_e = j;
            }
            i = j;
        } else {
            ++i;
        }
    }
    if (best_s < 0) return;
    for (int k = best_s; k < best_e; ++k) out_indices.push_back((start + k) % n);
}

// ---- 髪シルエット抽出 ----
// 顔肌パッチの平均輝度を基準に, 平均-2σ より暗い領域を髪候補とする.
// bbox内に限定し, 目の高さ+marginより上のみを対象, 連結→最大成分→外形→bbox辺除去アーチ.
bool extract_hair_arc(const uint8_t *gray, int w, int h, const FaceGeometry &g,
                      uint8_t *tmp_a, uint8_t *tmp_b, uint8_t *tmp_c,
                      Arc &out)
{
    if (!g.valid) return false;

    // 肌パッチ: 両目間下〜鼻まで, 中央 0.6*eye_dist 幅
    const int s_x0 = clamp_int(static_cast<int>((g.lx + g.rx) * 0.5f - g.eye_dist * 0.3f), g.x0, g.x1 - 1);
    const int s_x1 = clamp_int(static_cast<int>((g.lx + g.rx) * 0.5f + g.eye_dist * 0.3f), s_x0 + 1, g.x1);
    const int s_y0 = clamp_int(static_cast<int>(g.eye_y + g.eye_dist * 0.2f), g.y0, g.y1 - 1);
    const int s_y1 = clamp_int(static_cast<int>(g.nose_y), s_y0 + 1, g.y1);
    if (s_x1 - s_x0 < 4 || s_y1 - s_y0 < 4) return false;

    int sum = 0, sum2 = 0, cnt = 0;
    for (int y = s_y0; y < s_y1; ++y) {
        for (int x = s_x0; x < s_x1; ++x) {
            const int v = gray[y * w + x];
            sum += v; sum2 += v * v; ++cnt;
        }
    }
    const float skin_mean = static_cast<float>(sum) / static_cast<float>(cnt);
    const float skin_var = static_cast<float>(sum2) / static_cast<float>(cnt) - skin_mean * skin_mean;
    const float skin_std = std::sqrt(std::max(skin_var, 0.0f)) + 1.0f;
    const float thr = std::max(0.0f, skin_mean - 2.0f * skin_std);

    // bbox内 & 目より上の暗部 -> tmp_a
    std::memset(tmp_a, 0, static_cast<size_t>(w) * h);
    const int eye_y_i = static_cast<int>(std::floor(g.eye_y));
    const int top_y = clamp_int(g.y0, 0, h);
    const int bot_y = clamp_int(eye_y_i, top_y, h);
    const int lft_x = clamp_int(g.x0, 0, w);
    const int rgt_x = clamp_int(g.x1, lft_x, w);
    for (int y = top_y; y < bot_y; ++y) {
        for (int x = lft_x; x < rgt_x; ++x) {
            if (gray[y * w + x] < thr) tmp_a[y * w + x] = 1;
        }
    }

    // closing(膨張×2)→ 最大連結成分
    dilate_3x3(tmp_a, tmp_b, w, h);
    dilate_3x3(tmp_b, tmp_a, w, h);
    if (largest_component(tmp_a, tmp_b, tmp_c, w, h, 200) == 0) return false;

    // 境界追跡
    Arc boundary = trace_boundary(tmp_b, w, h);
    if (boundary.xs.size() < 8) return false;

    // bbox4辺 + eye_y 下端に張り付く点を除去, 最長keepランを取る
    const int margin = 3;
    std::vector<bool> keep(boundary.xs.size(), false);
    for (size_t i = 0; i < boundary.xs.size(); ++i) {
        const int x = boundary.xs[i];
        const int y = boundary.ys[i];
        keep[i] = (y < eye_y_i - margin) &&
                  (y > g.y0 + margin) &&
                  (x > g.x0 + margin) &&
                  (x < g.x1 - margin);
    }
    std::vector<int> idxs;
    longest_keep_run(keep, idxs, true);
    if (idxs.size() < 4) return false;

    out.xs.clear(); out.ys.clear();
    out.xs.reserve(idxs.size());
    out.ys.reserve(idxs.size());
    for (int i : idxs) { out.xs.push_back(boundary.xs[i]); out.ys.push_back(boundary.ys[i]); }
    return true;
}

// ---- 顎シルエット抽出 ----
// 肌色領域(肌平均±tol)の bbox内 & 目より下の最大成分の下端アーチ.
bool extract_jaw_arc(const uint8_t *gray, int w, int h, const FaceGeometry &g,
                     uint8_t *tmp_a, uint8_t *tmp_b, uint8_t *tmp_c,
                     Arc &out)
{
    if (!g.valid) return false;

    const int s_x0 = clamp_int(static_cast<int>((g.lx + g.rx) * 0.5f - g.eye_dist * 0.3f), g.x0, g.x1 - 1);
    const int s_x1 = clamp_int(static_cast<int>((g.lx + g.rx) * 0.5f + g.eye_dist * 0.3f), s_x0 + 1, g.x1);
    const int s_y0 = clamp_int(static_cast<int>(g.eye_y + g.eye_dist * 0.2f), g.y0, g.y1 - 1);
    const int s_y1 = clamp_int(static_cast<int>(g.nose_y), s_y0 + 1, g.y1);
    if (s_x1 - s_x0 < 4 || s_y1 - s_y0 < 4) return false;

    int sum = 0, sum2 = 0, cnt = 0;
    for (int y = s_y0; y < s_y1; ++y) {
        for (int x = s_x0; x < s_x1; ++x) {
            const int v = gray[y * w + x];
            sum += v; sum2 += v * v; ++cnt;
        }
    }
    const float skin_mean = static_cast<float>(sum) / static_cast<float>(cnt);
    const float skin_var = static_cast<float>(sum2) / static_cast<float>(cnt) - skin_mean * skin_mean;
    const float skin_std = std::sqrt(std::max(skin_var, 0.0f)) + 1.0f;
    const float tol = std::max(20.0f, 2.5f * skin_std);

    // bbox内 & 目より下の肌色 -> tmp_a
    std::memset(tmp_a, 0, static_cast<size_t>(w) * h);
    const int eye_y_i = static_cast<int>(std::floor(g.eye_y));
    const int top_y = clamp_int(eye_y_i, 0, h);
    const int bot_y = clamp_int(g.y1, top_y, h);
    const int lft_x = clamp_int(g.x0, 0, w);
    const int rgt_x = clamp_int(g.x1, lft_x, w);
    for (int y = top_y; y < bot_y; ++y) {
        for (int x = lft_x; x < rgt_x; ++x) {
            if (std::fabs(static_cast<float>(gray[y * w + x]) - skin_mean) < tol) {
                tmp_a[y * w + x] = 1;
            }
        }
    }

    dilate_3x3(tmp_a, tmp_b, w, h);
    dilate_3x3(tmp_b, tmp_a, w, h);
    if (largest_component(tmp_a, tmp_b, tmp_c, w, h, 300) == 0) return false;

    Arc boundary = trace_boundary(tmp_b, w, h);
    if (boundary.xs.size() < 8) return false;

    const int margin = 3;
    std::vector<bool> keep(boundary.xs.size(), false);
    for (size_t i = 0; i < boundary.xs.size(); ++i) {
        const int x = boundary.xs[i];
        const int y = boundary.ys[i];
        keep[i] = (y > eye_y_i + margin) &&
                  (y < g.y1 - margin) &&
                  (x > g.x0 + margin) &&
                  (x < g.x1 - margin);
    }
    std::vector<int> idxs;
    longest_keep_run(keep, idxs, true);
    if (idxs.size() < 4) return false;

    out.xs.clear(); out.ys.clear();
    out.xs.reserve(idxs.size());
    out.ys.reserve(idxs.size());
    for (int i : idxs) { out.xs.push_back(boundary.xs[i]); out.ys.push_back(boundary.ys[i]); }
    return true;
}

// ---- RDP (Ramer-Douglas-Peucker) 簡略化 ----
float perp_dist2(int ax, int ay, int bx, int by, int px, int py)
{
    const float dx = static_cast<float>(bx - ax);
    const float dy = static_cast<float>(by - ay);
    const float len2 = dx * dx + dy * dy;
    if (len2 < 1e-6f) {
        const float ex = static_cast<float>(px - ax);
        const float ey = static_cast<float>(py - ay);
        return ex * ex + ey * ey;
    }
    const float t = ((px - ax) * dx + (py - ay) * dy) / len2;
    const float qx = ax + t * dx;
    const float qy = ay + t * dy;
    const float ex = px - qx;
    const float ey = py - qy;
    return ex * ex + ey * ey;
}

void rdp_simplify(const std::vector<int> &xs, const std::vector<int> &ys,
                  std::vector<int> &out_x, std::vector<int> &out_y, float eps)
{
    const int n = static_cast<int>(xs.size());
    out_x.clear(); out_y.clear();
    if (n == 0) return;
    if (n <= 2) { out_x = xs; out_y = ys; return; }

    const float eps2 = eps * eps;
    std::vector<bool> keep(n, false);
    keep[0] = true; keep[n - 1] = true;

    std::vector<std::pair<int,int>> stack;
    stack.emplace_back(0, n - 1);
    while (!stack.empty()) {
        auto [a, b] = stack.back();
        stack.pop_back();
        float max_d2 = 0.0f;
        int max_i = -1;
        for (int i = a + 1; i < b; ++i) {
            const float d2 = perp_dist2(xs[a], ys[a], xs[b], ys[b], xs[i], ys[i]);
            if (d2 > max_d2) { max_d2 = d2; max_i = i; }
        }
        if (max_i >= 0 && max_d2 > eps2) {
            keep[max_i] = true;
            stack.emplace_back(a, max_i);
            stack.emplace_back(max_i, b);
        }
    }
    for (int i = 0; i < n; ++i) {
        if (keep[i]) { out_x.push_back(xs[i]); out_y.push_back(ys[i]); }
    }
}

// ---- 連結成分(エッジ画素)の貪欲トレース → RDPでストローク化 ----
int count_unvisited_neighbors(const uint8_t *edge, const uint8_t *visited, int w, int h, int x, int y)
{
    int count = 0;
    for (int dy = -1; dy <= 1; ++dy) {
        for (int dx = -1; dx <= 1; ++dx) {
            if (dx == 0 && dy == 0) continue;
            const int nx = x + dx, ny = y + dy;
            if (nx < 0 || nx >= w || ny < 0 || ny >= h) continue;
            const int idx = ny * w + nx;
            if (edge[idx] && !visited[idx]) ++count;
        }
    }
    return count;
}

bool find_next_pixel(const uint8_t *edge, const uint8_t *visited, int w, int h,
                     int x, int y, int *next_x, int *next_y)
{
    int best_x = -1, best_y = -1, best_score = -1;
    for (int dy = -1; dy <= 1; ++dy) {
        for (int dx = -1; dx <= 1; ++dx) {
            if (dx == 0 && dy == 0) continue;
            const int nx = x + dx, ny = y + dy;
            if (nx < 0 || nx >= w || ny < 0 || ny >= h) continue;
            const int idx = ny * w + nx;
            if (!edge[idx] || visited[idx]) continue;
            const int score = count_unvisited_neighbors(edge, visited, w, h, nx, ny);
            if (score > best_score) { best_score = score; best_x = nx; best_y = ny; }
        }
    }
    if (best_x < 0) return false;
    *next_x = best_x; *next_y = best_y;
    return true;
}

void trace_edges_to_strokes(uint8_t *edge, uint8_t *visited,
                            int w, int h, const int roi[4],
                            portrait_result_t &result)
{
    std::memset(visited, 0, static_cast<size_t>(w) * h);

    std::vector<int> raw_x, raw_y, simp_x, simp_y;
    raw_x.reserve(512); raw_y.reserve(512);

    for (int y = roi[1]; y < roi[3]; ++y) {
        for (int x = roi[0]; x < roi[2]; ++x) {
            const int s = y * w + x;
            if (!edge[s] || visited[s]) continue;
            if (result.stroke_count >= PORTRAIT_MAX_STROKES) return;
            if (result.point_count >= PORTRAIT_MAX_POINTS) return;

            raw_x.clear(); raw_y.clear();
            int trace_pixels = 0;
            int cx = x, cy = y;

            while (true) {
                const int idx = cy * w + cx;
                if (visited[idx] || !edge[idx]) break;
                visited[idx] = 1;
                if ((trace_pixels % kTraceDecimate) == 0) {
                    raw_x.push_back(cx); raw_y.push_back(cy);
                }
                ++trace_pixels;
                int nx = 0, ny = 0;
                if (!find_next_pixel(edge, visited, w, h, cx, cy, &nx, &ny)) break;
                cx = nx; cy = ny;
            }

            // 最後の点も拾う
            if (!raw_x.empty() && (raw_x.back() != cx || raw_y.back() != cy)) {
                raw_x.push_back(cx); raw_y.push_back(cy);
            }

            const int min_pixels = kMinTracePixels;
            if (trace_pixels < min_pixels || raw_x.size() < 2) continue;

            rdp_simplify(raw_x, raw_y, simp_x, simp_y, kRdpEpsilon);
            if (simp_x.size() < 2) continue;

            const uint16_t first = result.point_count;
            int written = 0;
            for (size_t i = 0; i < simp_x.size(); ++i) {
                if (!add_point(result, simp_x[i], simp_y[i])) break;
                ++written;
            }
            if (written >= 2) {
                add_stroke(result, PORTRAIT_STROKE_XDOG, first,
                           static_cast<uint16_t>(written));
            } else {
                result.point_count = first;
            }
        }
    }
}

// ---- 記号化パーツ(目の渦巻き・口) ----
void add_eye_spiral(portrait_result_t &result, portrait_stroke_type_t type,
                    float cx, float cy, float radius, float tilt, bool ccw)
{
    constexpr int points = 40;
    constexpr int turns = 3;
    const uint16_t first = result.point_count;
    const float sign = ccw ? 1.0f : -1.0f;
    int written = 0;

    for (int i = 0; i < points; ++i) {
        const float u = static_cast<float>(i) / static_cast<float>(points - 1);
        const float t = sign * turns * 2.0f * kPi * u + tilt;
        const float r = radius * u;
        if (add_point(result,
                      static_cast<int>(std::lround(cx + r * std::cos(t))),
                      static_cast<int>(std::lround(cy + r * std::sin(t))))) {
            ++written;
        }
    }
    add_stroke(result, type, first, static_cast<uint16_t>(written));
}

void add_symbol_parts(portrait_result_t &result, const FaceGeometry &g)
{
    if (!g.valid) return;
    const float tilt = std::atan2(g.ry - g.ly, g.rx - g.lx);
    const float eye_r = g.eye_dist * 0.12f;
    add_eye_spiral(result, PORTRAIT_STROKE_LEFT_EYE,  g.lx, g.ly, eye_r, tilt, true);
    add_eye_spiral(result, PORTRAIT_STROKE_RIGHT_EYE, g.rx, g.ry, eye_r, tilt, false);

    const uint16_t first = result.point_count;
    add_point(result, static_cast<int>(std::lround(g.lmx)), static_cast<int>(std::lround(g.lmy)));
    add_point(result, static_cast<int>(std::lround(g.rmx)), static_cast<int>(std::lround(g.rmy)));
    add_stroke(result, PORTRAIT_STROKE_MOUTH, first, 2);
}

// ---- 髪/顎アーチ Arc -> ストロークとして書き出し(RDP適用) ----
void add_arc_as_stroke(portrait_result_t &result, portrait_stroke_type_t type, const Arc &arc)
{
    if (arc.xs.size() < 2) return;

    // 境界追跡は1pxごとに点を吐くので, RDP前に粗く間引く. 始点/終点は必ず残す.
    constexpr int kArcDecimate = 3;
    std::vector<int> dx_buf, dy_buf;
    dx_buf.reserve(arc.xs.size() / kArcDecimate + 2);
    dy_buf.reserve(arc.xs.size() / kArcDecimate + 2);
    for (size_t i = 0; i < arc.xs.size(); ++i) {
        if (i == 0 || i + 1 == arc.xs.size() || (i % kArcDecimate) == 0) {
            dx_buf.push_back(arc.xs[i]);
            dy_buf.push_back(arc.ys[i]);
        }
    }

    std::vector<int> sx, sy;
    rdp_simplify(dx_buf, dy_buf, sx, sy, kRdpEpsilon);
    if (sx.size() < 2) return;

    const uint16_t first = result.point_count;
    int written = 0;
    for (size_t i = 0; i < sx.size(); ++i) {
        if (!add_point(result, sx[i], sy[i])) break;
        ++written;
    }
    if (written >= 2) {
        add_stroke(result, type, first, static_cast<uint16_t>(written));
    } else {
        result.point_count = first;
    }
}

void compute_roi(const FaceGeometry &g, int w, int h, int roi[4])
{
    if (!g.valid) {
        roi[0] = 0; roi[1] = 0; roi[2] = w; roi[3] = h;
        return;
    }
    roi[0] = clamp_int(g.x0 - g.bw / 5, 0, w - 1);
    roi[1] = clamp_int(g.y0 - g.bh / 4, 0, h - 1);
    roi[2] = clamp_int(g.x1 + g.bw / 5, roi[0] + 1, w);
    roi[3] = clamp_int(g.y1 + g.bh / 8, roi[1] + 1, h);
}

esp_err_t detect_face(const dl::image::img_t &img, FaceInfo &face)
{
    if (s_detector == nullptr) {
        s_detector = new (std::nothrow) HumanFaceDetect();
        if (s_detector == nullptr) return ESP_ERR_NO_MEM;
    }

    ESP_LOGI(TAG, "running human_face_detect on %dx%d frame", img.width, img.height);
    auto &results = s_detector->run(img);
    float best_score = -1.0f;
    for (const auto &res : results) {
        if (res.score <= best_score) continue;
        best_score = res.score;
        face.found = true;
        face.score = res.score;
        for (int i = 0; i < 4; ++i) face.box[i] = res.box[i];
        if (res.keypoint.size() >= 10) {
            for (int i = 0; i < 10; ++i) face.keypoint[i] = res.keypoint[i];
        } else {
            face.found = false;
        }
    }
    ESP_LOGI(TAG, "human_face_detect result: %s score=%.3f",
             face.found ? "face" : "none", face.score);
    return ESP_OK;
}

esp_err_t generate_portrait(const dl::image::img_t &img, const FaceInfo &face)
{
    const int w = img.width;
    const int h = img.height;
    const size_t pixels = static_cast<size_t>(w) * h;
    const uint8_t *rgb = static_cast<const uint8_t *>(img.data);

    uint8_t *gray       = static_cast<uint8_t *>(portrait_alloc(pixels));
    uint8_t *blur_small = static_cast<uint8_t *>(portrait_alloc(pixels));
    uint8_t *blur_large = static_cast<uint8_t *>(portrait_alloc(pixels));
    uint8_t *edge       = static_cast<uint8_t *>(portrait_alloc(pixels));
    uint8_t *visited    = static_cast<uint8_t *>(portrait_alloc(pixels));
    uint8_t *tmp_a      = static_cast<uint8_t *>(portrait_alloc(pixels));
    uint8_t *tmp_b      = static_cast<uint8_t *>(portrait_alloc(pixels));

    if (!gray || !blur_small || !blur_large || !edge || !visited || !tmp_a || !tmp_b) {
        portrait_free(gray); portrait_free(blur_small); portrait_free(blur_large);
        portrait_free(edge); portrait_free(visited);
        portrait_free(tmp_a); portrait_free(tmp_b);
        return ESP_ERR_NO_MEM;
    }

    // 約16.8KBの構造体なのでスタックに置かずヒープから確保する.
    portrait_result_t *next_ptr = static_cast<portrait_result_t *>(portrait_alloc(sizeof(portrait_result_t)));
    if (next_ptr == nullptr) {
        portrait_free(gray); portrait_free(blur_small); portrait_free(blur_large);
        portrait_free(edge); portrait_free(visited);
        portrait_free(tmp_a); portrait_free(tmp_b);
        return ESP_ERR_NO_MEM;
    }
    portrait_result_t &next = *next_ptr;
    std::memset(&next, 0, sizeof(next));
    next.timestamp_ms = static_cast<uint32_t>(esp_timer_get_time() / 1000);
    next.canvas_w = static_cast<int16_t>(w);
    next.canvas_h = static_cast<int16_t>(h);
    next.has_face = face.found;
    next.face_score = face.score;
    std::strncpy(next.message,
                 face.found ? "portrait generated" : "no face detected; XDoG only",
                 sizeof(next.message) - 1);
    for (int i = 0; i < 4; ++i)  next.bbox[i] = static_cast<int16_t>(face.box[i]);
    for (int i = 0; i < 10; ++i) next.keypoint[i] = static_cast<int16_t>(face.keypoint[i]);

    const FaceGeometry g = make_face_geometry(face);

    // --- 前処理 ---
    rgb888_to_gray(rgb, w, h, gray);
    light_local_contrast(gray, w, h, /*tile=*/16, /*gain=*/1.6f);

    // --- ガウシアン近似 ×2 ---
    // sigma比 1:1.6 を半径比 1:2 ではなく, box×3 通過の合成σで近づける.
    // box半径 r の box×3 のσ ≈ r * sqrt(3*(2r+1)*(2r+1)/12) ≈ r で良い近似.
    // ここでは r=1 (σ≈0.6相当) と r=2 (σ≈0.96相当) で sigma_ratio≈1.6 を狙う.
    approx_gaussian_blur(gray, blur_small, tmp_a, w, h, 1);
    approx_gaussian_blur(gray, blur_large, tmp_a, w, h, 2);

    // --- XDoG (tanhソフト閾値) ---
    int roi[4];
    compute_roi(g, w, h, roi);
    mark_xdog_edges_tanh(blur_small, blur_large, edge, w, h, roi);
    mask_symbol_regions(edge, w, h, g);

    // --- まず軽い記号パーツを先に積む(目の渦巻き, 口) ---
    add_symbol_parts(next, g);

    // --- 髪 / 顎シルエット(あれば先に積む) ---
    Arc hair_arc, jaw_arc;
    if (g.valid) {
        if (extract_hair_arc(gray, w, h, g, tmp_a, tmp_b, visited, hair_arc)) {
            add_arc_as_stroke(next, PORTRAIT_STROKE_XDOG, hair_arc);
        }
        if (extract_jaw_arc(gray, w, h, g, tmp_a, tmp_b, visited, jaw_arc)) {
            add_arc_as_stroke(next, PORTRAIT_STROKE_XDOG, jaw_arc);
        }
    }

    // --- 残り予算でXDoGストロークをトレース ---
    trace_edges_to_strokes(edge, visited, w, h, roi, next);

    portrait_engine_lock();
    s_result = next;
    portrait_engine_unlock();

    portrait_free(gray); portrait_free(blur_small); portrait_free(blur_large);
    portrait_free(edge); portrait_free(visited);
    portrait_free(tmp_a); portrait_free(tmp_b);
    portrait_free(next_ptr);
    return ESP_OK;
}

void portrait_worker(void *)
{
    while (true) {
        ulTaskNotifyTake(pdTRUE, portMAX_DELAY);
        ESP_LOGI(TAG, "worker started capture");
        esp_err_t err = portrait_engine_capture_and_generate();
        if (err != ESP_OK) {
            ESP_LOGE(TAG, "capture from worker failed: %s", esp_err_to_name(err));
        }
        finish_capture_request(err);
        ESP_LOGI(TAG, "worker finished capture: %s", esp_err_to_name(err));
    }
}

} // namespace

extern "C" esp_err_t portrait_engine_init(void)
{
    if (s_mutex == nullptr) {
        s_mutex = xSemaphoreCreateMutex();
        if (s_mutex == nullptr) return ESP_ERR_NO_MEM;
    }
    if (s_capture_done == nullptr) {
        s_capture_done = xSemaphoreCreateBinary();
        if (s_capture_done == nullptr) return ESP_ERR_NO_MEM;
    }

    portrait_engine_lock();
    reset_result_locked("waiting for capture");
    portrait_engine_unlock();

    if (s_worker_task == nullptr) {
        BaseType_t ok = xTaskCreate(
            portrait_worker, "portrait_worker", kWorkerStackBytes, nullptr, 5, &s_worker_task);
        if (ok != pdPASS) return ESP_ERR_NO_MEM;
    }
    return ESP_OK;
}

extern "C" esp_err_t portrait_engine_capture_and_generate(void)
{
    app_state_set_state(STATE_CAPTURING);
    ESP_LOGI(TAG, "capture frame");

    camera_fb_t *fb = esp_camera_fb_get();
    if (fb == nullptr) {
        portrait_engine_lock();
        reset_result_locked("camera capture failed");
        portrait_engine_unlock();
        app_state_set_state(STATE_IDLE);
        return ESP_FAIL;
    }

    if (fb->format != PIXFORMAT_JPEG) {
        esp_camera_fb_return(fb);
        portrait_engine_lock();
        reset_result_locked("camera frame was not JPEG");
        portrait_engine_unlock();
        app_state_set_state(STATE_IDLE);
        return ESP_ERR_NOT_SUPPORTED;
    }

    store_jpeg_copy(fb->buf, fb->len);
    ESP_LOGI(TAG, "captured JPEG: %u bytes", static_cast<unsigned>(fb->len));

    dl::image::jpeg_img_t jpeg = {
        .data = static_cast<void *>(fb->buf),
        .data_len = fb->len,
    };
    dl::image::img_t img = dl::image::sw_decode_jpeg(jpeg, dl::image::DL_IMAGE_PIX_TYPE_RGB888);
    esp_camera_fb_return(fb);

    if (img.data == nullptr || img.width <= 0 || img.height <= 0) {
        portrait_engine_lock();
        reset_result_locked("JPEG decode failed");
        portrait_engine_unlock();
        app_state_set_state(STATE_IDLE);
        return ESP_FAIL;
    }
    ESP_LOGI(TAG, "decoded JPEG: %dx%d", img.width, img.height);

    FaceInfo face;
    esp_err_t err = detect_face(img, face);
    if (err == ESP_OK) {
        err = generate_portrait(img, face);
    }

    heap_caps_free(img.data);
    app_state_set_state(STATE_IDLE);
    return err;
}

extern "C" esp_err_t portrait_engine_capture_and_wait(uint32_t timeout_ms)
{
    if (s_worker_task == nullptr || s_capture_done == nullptr) return ESP_ERR_INVALID_STATE;
    if (!begin_capture_request()) return ESP_ERR_INVALID_STATE;
    while (xSemaphoreTake(s_capture_done, 0) == pdTRUE) {}
    xTaskNotifyGive(s_worker_task);
    if (xSemaphoreTake(s_capture_done, pdMS_TO_TICKS(timeout_ms)) != pdTRUE) {
        return ESP_ERR_TIMEOUT;
    }
    portrait_engine_lock();
    esp_err_t err = s_last_capture_err;
    portrait_engine_unlock();
    return err;
}

extern "C" void portrait_engine_request_capture(void)
{
    if (s_worker_task != nullptr && begin_capture_request()) {
        xTaskNotifyGive(s_worker_task);
    } else {
        ESP_LOGW(TAG, "capture request ignored because capture is already busy");
    }
}

extern "C" bool portrait_engine_capture_is_busy(void)
{
    portrait_engine_lock();
    bool busy = s_capture_busy;
    portrait_engine_unlock();
    return busy;
}

extern "C" void portrait_engine_lock(void)
{
    if (s_mutex != nullptr) xSemaphoreTake(s_mutex, portMAX_DELAY);
}

extern "C" void portrait_engine_unlock(void)
{
    if (s_mutex != nullptr) xSemaphoreGive(s_mutex);
}

extern "C" const portrait_result_t *portrait_engine_get_result_locked(void)
{
    return &s_result;
}

extern "C" const uint8_t *portrait_engine_get_jpeg_locked(size_t *len)
{
    if (len != nullptr) *len = s_last_jpeg_len;
    return s_last_jpeg;
}

extern "C" const char *portrait_stroke_type_name(portrait_stroke_type_t type)
{
    switch (type) {
    case PORTRAIT_STROKE_XDOG:       return "xdog";
    case PORTRAIT_STROKE_LEFT_EYE:   return "spiral_left_eye";
    case PORTRAIT_STROKE_RIGHT_EYE:  return "spiral_right_eye";
    case PORTRAIT_STROKE_MOUTH:      return "mouth";
    default:                         return "unknown";
    }
}
