pulsar/windows/runner/screen_encoder.cpp

#include "screen_encoder.h"

#include <chrono>
#include <codecapi.h>
#include <mfapi.h>
#include <mferror.h>
#include <mfidl.h>

using Microsoft::WRL::ComPtr;

// Duration of one frame expressed in 100 ns ticks; the integer division
// yields 333333, i.e. roughly 30 frames per second.
static constexpr LONGLONG kFrameDuration = 10'000'000 / 30;
// Average bitrate requested on the encoder's output media type.
static constexpr UINT32 kBitrate = 6 * 1'000'000;

// Locates the parameter sets inside an Annex-B H.264 buffer.
//
// Every start code (00 00 00 01 or 00 00 01) that is followed by at least one
// header byte is recorded together with its NAL unit type (the low five bits
// of that byte).  On success `sps_start` receives the offset of the start code
// of the last SPS (type 7) and `pps_end` the offset one past the last PPS
// (type 8): the start of the NAL that follows it, or `size` when the PPS is
// the final NAL in the buffer.
//
// Returns false when either parameter set is absent (outputs untouched) or
// when the resulting span would be empty, i.e. the PPS precedes the SPS
// (outputs have already been written in that case).
static bool ExtractSpsPps(const uint8_t* data, size_t size,
                          size_t& sps_start, size_t& pps_end) {
  constexpr int kNalSps = 7;
  constexpr int kNalPps = 8;

  struct NalUnit {
    size_t offset;  // position of the first start-code byte
    int    kind;    // nal_unit_type
  };
  std::vector<NalUnit> units;

  // Length of the start code beginning at `pos` (4 or 3), or 0 when there is
  // none or no header byte follows it.  Requires pos + 4 <= size.
  const auto start_code_len = [data, size](size_t pos) -> size_t {
    if (data[pos] != 0 || data[pos + 1] != 0) return 0;
    if (data[pos + 2] == 0 && data[pos + 3] == 1 && pos + 4 < size) return 4;
    if (data[pos + 2] == 1 && pos + 3 < size) return 3;
    return 0;
  };

  for (size_t pos = 0; pos + 4 <= size;) {
    const size_t len = start_code_len(pos);
    if (len == 0) {
      ++pos;
      continue;
    }
    units.push_back({pos, data[pos + len] & 0x1F});
    pos += len;
  }

  // Walk backwards so the first hit of each kind is the last one in the
  // stream.
  size_t sps_at = SIZE_MAX;
  size_t pps_at = SIZE_MAX;
  for (size_t k = units.size(); k-- > 0;) {
    if (sps_at == SIZE_MAX && units[k].kind == kNalSps) sps_at = k;
    if (pps_at == SIZE_MAX && units[k].kind == kNalPps) pps_at = k;
    if (sps_at != SIZE_MAX && pps_at != SIZE_MAX) break;
  }
  if (sps_at == SIZE_MAX || pps_at == SIZE_MAX) return false;

  sps_start = units[sps_at].offset;
  const bool pps_is_last = (pps_at + 1 >= units.size());
  pps_end = pps_is_last ? size : units[pps_at + 1].offset;
  return pps_end > sps_start;
}

// ─── ScreenEncoder ───────────────────────────────────────────────────────────

// Nothing to set up here; capture and encoder resources are created by the
// worker thread once Start() has been called.
ScreenEncoder::ScreenEncoder() = default;

// Tears down any running session: Stop() joins the worker thread and releases
// the sink, encoder and capture objects.
ScreenEncoder::~ScreenEncoder() {
  Stop();
}

// Begins a capture/encode session that reports to `sink`.
//
// Any session already in progress is stopped first.  The sink is installed
// under sink_mu_, the per-session state is reset, and CaptureLoop() is
// launched on a fresh worker thread.
void ScreenEncoder::Start(
    std::unique_ptr<flutter::EventSink<flutter::EncodableValue>> sink) {
  // Stop() joins a previous worker, so nothing else touches the members
  // written below until the new thread is created.
  Stop();

  {
    std::lock_guard<std::mutex> guard(sink_mu_);
    sink_ = std::move(sink);
  }

  // New stream: the parameter sets have to be sent again and sample
  // timestamps restart at zero.
  config_sent_ = false;
  sample_ts_   = 0;

  running_ = true;
  thread_  = std::thread([this] { CaptureLoop(); });
}

// Ends the current session, if any.  Safe to call when nothing is running.
void ScreenEncoder::Stop() {
  // Ask the worker to leave its loop and wait until it has done so.
  running_ = false;
  if (thread_.joinable()) {
    thread_.join();
  }

  // Drop the sink so that no further events can be delivered.
  {
    std::lock_guard<std::mutex> guard(sink_mu_);
    sink_ = nullptr;
  }

  // Release the encoder first, then the capture objects, then the device.
  encoder_.Reset();
  dupl_.Reset();
  staging_.Reset();
  d3d_ctx_.Reset();
  d3d_dev_.Reset();

  // Zero dimensions make the next EncodeFrame() build a new encoder.
  enc_height_ = 0;
  enc_width_  = 0;
}

// Raises the keyframe-request flag.  The capture loop reads and clears it when
// it hands the next captured frame to EncodeFrame().
void ScreenEncoder::ForceKeyframe() {
  force_kf_.store(true);
}

// ─── D3D / DXGI init ─────────────────────────────────────────────────────────

bool ScreenEncoder::InitD3D() {
  D3D_FEATURE_LEVEL level;
  HRESULT hr = D3D11CreateDevice(nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0,
                                  nullptr, 0, D3D11_SDK_VERSION,
                                  &d3d_dev_, &level, &d3d_ctx_);
  if (FAILED(hr)) return false;

  ComPtr<IDXGIDevice>  dxgi_dev;
  ComPtr<IDXGIAdapter> adapter;
  ComPtr<IDXGIOutput>  output;
  ComPtr<IDXGIOutput1> output1;

  d3d_dev_.As(&dxgi_dev);
  dxgi_dev->GetAdapter(&adapter);
  if (FAILED(adapter->EnumOutputs(0, &output))) return false;
  if (FAILED(output.As(&output1)))              return false;

  hr = output1->DuplicateOutput(d3d_dev_.Get(), &dupl_);
  return SUCCEEDED(hr);
}

// ─── Encoder init ────────────────────────────────────────────────────────────

// Instantiates and configures an H.264 encoder transform for frames of
// `width` x `height` pixels.
//
// The first synchronous encoder returned by MFTEnumEx is activated.  The
// output type (H.264, High profile, 30 fps, kBitrate) is set before the input
// type (NV12, progressive), after which the transform is told that streaming
// begins.
//
// Returns true on success and records the dimensions in enc_width_ /
// enc_height_.  On failure returns false and leaves encoder_ null so a
// half-configured transform is never used.
bool ScreenEncoder::InitEncoder(UINT width, UINT height) {
  const auto fail = [this] {
    encoder_.Reset();
    return false;
  };

  MFT_REGISTER_TYPE_INFO out_info{MFMediaType_Video, MFVideoFormat_H264};

  UINT32        count     = 0;
  IMFActivate** activates = nullptr;
  HRESULT hr = MFTEnumEx(MFT_CATEGORY_VIDEO_ENCODER,
                         MFT_ENUM_FLAG_SYNCMFT | MFT_ENUM_FLAG_SORTANDFILTER,
                         nullptr, &out_info, &activates, &count);
  if (FAILED(hr)) return fail();
  if (count == 0) {
    CoTaskMemFree(activates);  // no-op when the array pointer is null
    return fail();
  }

  hr = activates[0]->ActivateObject(IID_PPV_ARGS(&encoder_));
  for (UINT32 i = 0; i < count; ++i) activates[i]->Release();
  CoTaskMemFree(activates);
  if (FAILED(hr)) return fail();

  // Output: H264
  ComPtr<IMFMediaType> out_type;
  if (FAILED(MFCreateMediaType(&out_type))) return fail();
  out_type->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
  out_type->SetGUID(MF_MT_SUBTYPE,    MFVideoFormat_H264);
  MFSetAttributeSize(out_type.Get(), MF_MT_FRAME_SIZE, width, height);
  MFSetAttributeRatio(out_type.Get(), MF_MT_FRAME_RATE, 30, 1);
  out_type->SetUINT32(MF_MT_AVG_BITRATE,    kBitrate);
  out_type->SetUINT32(MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive);
  out_type->SetUINT32(MF_MT_MPEG2_PROFILE,  eAVEncH264VProfile_High);
  // Keyframe every 30 frames.  The attribute has to be on the media type
  // before it is handed to the encoder; setting it afterwards only changes
  // this local object.
  // NOTE(review): not every encoder honours this attribute; if the keyframe
  // interval is not respected, configure the GOP size through the encoder's
  // codec API instead - confirm against the encoder in use.
  out_type->SetUINT32(MF_MT_MAX_KEYFRAME_SPACING, 30);
  if (FAILED(encoder_->SetOutputType(0, out_type.Get(), 0))) return fail();

  // Input: NV12
  ComPtr<IMFMediaType> in_type;
  if (FAILED(MFCreateMediaType(&in_type))) return fail();
  in_type->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
  in_type->SetGUID(MF_MT_SUBTYPE,    MFVideoFormat_NV12);
  MFSetAttributeSize(in_type.Get(), MF_MT_FRAME_SIZE, width, height);
  MFSetAttributeRatio(in_type.Get(), MF_MT_FRAME_RATE, 30, 1);
  in_type->SetUINT32(MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive);
  // Rows are tightly packed: BgraToNv12() writes `width` bytes per row.
  in_type->SetUINT32(MF_MT_DEFAULT_STRIDE, static_cast<UINT32>(width));
  if (FAILED(encoder_->SetInputType(0, in_type.Get(), 0))) return fail();

  // Results are deliberately not checked, as before: a failure here shows up
  // on the first ProcessInput/ProcessOutput call.
  encoder_->ProcessMessage(MFT_MESSAGE_COMMAND_FLUSH,          0);
  encoder_->ProcessMessage(MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, 0);
  encoder_->ProcessMessage(MFT_MESSAGE_NOTIFY_START_OF_STREAM, 0);

  enc_width_  = width;
  enc_height_ = height;
  return true;
}

// ─── Frame capture ───────────────────────────────────────────────────────────

bool ScreenEncoder::CaptureFrame(std::vector<uint8_t>& bgra,
                                  UINT& width, UINT& height) {
  DXGI_OUTDUPL_FRAME_INFO info{};
  ComPtr<IDXGIResource>   res;

  HRESULT hr = dupl_->AcquireNextFrame(16, &info, &res);
  if (hr == DXGI_ERROR_WAIT_TIMEOUT) return false;
  if (FAILED(hr)) {
    dupl_.Reset();
    staging_.Reset();
    InitD3D();
    return false;
  }

  ComPtr<ID3D11Texture2D> tex;
  res.As(&tex);

  D3D11_TEXTURE2D_DESC desc{};
  tex->GetDesc(&desc);
  width  = desc.Width;
  height = desc.Height;

  // Recreate staging texture on size change
  if (!staging_) {
    D3D11_TEXTURE2D_DESC sd{};
    sd.Width            = width;
    sd.Height           = height;
    sd.MipLevels        = 1;
    sd.ArraySize        = 1;
    sd.Format           = DXGI_FORMAT_B8G8R8A8_UNORM;
    sd.SampleDesc.Count = 1;
    sd.Usage            = D3D11_USAGE_STAGING;
    sd.CPUAccessFlags   = D3D11_CPU_ACCESS_READ;
    d3d_dev_->CreateTexture2D(&sd, nullptr, &staging_);
  }

  d3d_ctx_->CopyResource(staging_.Get(), tex.Get());
  dupl_->ReleaseFrame();

  D3D11_MAPPED_SUBRESOURCE mapped{};
  hr = d3d_ctx_->Map(staging_.Get(), 0, D3D11_MAP_READ, 0, &mapped);
  if (FAILED(hr)) return false;

  bgra.resize(width * height * 4);
  const uint8_t* src = static_cast<const uint8_t*>(mapped.pData);
  for (UINT row = 0; row < height; ++row)
    memcpy(&bgra[row * width * 4], src + row * mapped.RowPitch, width * 4);

  d3d_ctx_->Unmap(staging_.Get(), 0);
  return true;
}

// ─── Color conversion: BGRA → NV12 ──────────────────────────────────────────

// Converts a tightly packed BGRA image (w * 4 bytes per row) to NV12.
//
// `nv12` is resized to hold a w x h luma plane followed by an interleaved
// U/V plane with one U,V pair per 2x2 pixel block; both planes use a row
// stride of `w` bytes.  The chroma pair of a block is computed from its
// top-left pixel only.  The integer coefficients are the usual 8-bit
// studio-range RGB -> YUV approximation.
//
// For even `w` and `h` the buffer is exactly w * h * 3 / 2 bytes.  Odd sizes
// do not corrupt memory: the chroma plane is rounded up to (h + 1) / 2 rows
// and the V byte that would fall outside a row of odd width is skipped.
void ScreenEncoder::BgraToNv12(const uint8_t* bgra, std::vector<uint8_t>& nv12,
                                UINT w, UINT h) {
  // size_t arithmetic so that large images cannot overflow 32-bit indices.
  const size_t width       = w;
  const size_t height      = h;
  const size_t luma_size   = width * height;
  const size_t chroma_rows = (height + 1) / 2;
  nv12.resize(luma_size + width * chroma_rows);

  uint8_t* luma   = nv12.data();
  uint8_t* chroma = luma + luma_size;

  for (size_t row = 0; row < height; ++row) {
    const uint8_t* px       = bgra + row * width * 4;
    uint8_t*       luma_row = luma + row * width;
    // Chroma is sampled on even rows only.
    uint8_t* chroma_row =
        (row % 2 == 0) ? chroma + (row / 2) * width : nullptr;

    for (size_t col = 0; col < width; ++col, px += 4) {
      const int b = px[0], g = px[1], r = px[2];
      luma_row[col] =
          static_cast<uint8_t>(((66*r + 129*g + 25*b + 128) >> 8) + 16);

      if (chroma_row != nullptr && col % 2 == 0) {
        chroma_row[col] = static_cast<uint8_t>(
            ((-38*r - 74*g + 112*b + 128) >> 8) + 128);
        // With an odd width the last pair has no room for V inside the row.
        if (col + 1 < width) {
          chroma_row[col + 1] = static_cast<uint8_t>(
              ((112*r - 94*g - 18*b + 128) >> 8) + 128);
        }
      }
    }
  }
}

// ─── Encode one frame ────────────────────────────────────────────────────────

void ScreenEncoder::EncodeFrame(const std::vector<uint8_t>& bgra,
                                 UINT w, UINT h, bool keyframe) {
  if (enc_width_ != w || enc_height_ != h) {
    encoder_.Reset();
    config_sent_ = false;
    if (!InitEncoder(w, h)) return;
  }

  // Build NV12 input sample
  std::vector<uint8_t> nv12;
  BgraToNv12(bgra.data(), nv12, w, h);

  ComPtr<IMFSample>      in_sample;
  ComPtr<IMFMediaBuffer> in_buf;
  MFCreateMemoryBuffer((DWORD)nv12.size(), &in_buf);
  {
    BYTE* ptr = nullptr;
    in_buf->Lock(&ptr, nullptr, nullptr);
    memcpy(ptr, nv12.data(), nv12.size());
    in_buf->Unlock();
  }
  in_buf->SetCurrentLength((DWORD)nv12.size());
  MFCreateSample(&in_sample);
  in_sample->SetSampleTime(sample_ts_);
  in_sample->SetSampleDuration(kFrameDuration);
  in_sample->AddBuffer(in_buf.Get());
  if (keyframe) in_sample->SetUINT32(MFSampleExtension_CleanPoint, 1);
  sample_ts_ += kFrameDuration;

  if (FAILED(encoder_->ProcessInput(0, in_sample.Get(), 0))) return;

  // Drain output samples
  MFT_OUTPUT_STREAM_INFO si{};
  encoder_->GetOutputStreamInfo(0, &si);

  while (true) {
    ComPtr<IMFSample>      out_sample;
    ComPtr<IMFMediaBuffer> out_buf;
    if (!(si.dwFlags & MFT_OUTPUT_STREAM_PROVIDES_SAMPLES)) {
      MFCreateMemoryBuffer(si.cbSize ? si.cbSize : w * h * 2, &out_buf);
      MFCreateSample(&out_sample);
      out_sample->AddBuffer(out_buf.Get());
    }

    MFT_OUTPUT_DATA_BUFFER out_data{};
    out_data.pSample = out_sample.Get();
    DWORD status = 0;
    HRESULT hr = encoder_->ProcessOutput(0, 1, &out_data, &status);
    if (out_data.pEvents) out_data.pEvents->Release();
    if (hr == MF_E_TRANSFORM_NEED_MORE_INPUT) break;
    if (FAILED(hr)) break;

    ComPtr<IMFSample> result(out_data.pSample);
    if (!result) break;

    ComPtr<IMFMediaBuffer> flat;
    result->ConvertToContiguousBuffer(&flat);
    BYTE* enc = nullptr; DWORD enc_len = 0;
    flat->Lock(&enc, nullptr, &enc_len);

    if (!config_sent_) {
      size_t sps_start = 0, pps_end = 0;
      if (ExtractSpsPps(enc, enc_len, sps_start, pps_end)) {
        std::vector<uint8_t> cfg(1 + (pps_end - sps_start));
        cfg[0] = 0x01;
        memcpy(&cfg[1], enc + sps_start, pps_end - sps_start);
        SendEvent(std::move(cfg));
        config_sent_ = true;
      }
    }

    const int64_t now_ms =
        std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::system_clock::now().time_since_epoch())
            .count();
    std::vector<uint8_t> frame(1 + 8 + enc_len);
    frame[0] = 0x02;
    for (int i = 0; i < 8; ++i)
      frame[1 + i] = static_cast<uint8_t>(now_ms >> (56 - i * 8));
    memcpy(&frame[9], enc, enc_len);

    flat->Unlock();
    SendEvent(std::move(frame));
  }
}

// Hands `data` to the event sink, if one is installed.  sink_mu_ is held for
// the duration of the call so Stop() cannot drop the sink underneath it.
// NOTE(review): this is reached from the capture thread via EncodeFrame();
// confirm that the sink may be invoked off the thread that created it.
void ScreenEncoder::SendEvent(std::vector<uint8_t> data) {
  std::lock_guard<std::mutex> guard(sink_mu_);
  if (!sink_) {
    return;
  }
  sink_->Success(flutter::EncodableValue(std::move(data)));
}

// ─── Capture loop ────────────────────────────────────────────────────────────

void ScreenEncoder::CaptureLoop() {
  CoInitializeEx(nullptr, COINIT_MULTITHREADED);

  if (!InitD3D()) {
    CoUninitialize();
    return;
  }

  while (running_) {
    std::vector<uint8_t> bgra;
    UINT w = 0, h = 0;
    if (CaptureFrame(bgra, w, h)) {
      bool kf = force_kf_.exchange(false);
      EncodeFrame(bgra, w, h, kf);
    }
  }

  if (encoder_) {
    encoder_->ProcessMessage(MFT_MESSAGE_NOTIFY_END_OF_STREAM, 0);
    encoder_->ProcessMessage(MFT_MESSAGE_COMMAND_DRAIN, 0);
  }
  CoUninitialize();
}