// pulsar/windows/runner/screen_encoder.cpp
//
// 361 lines
// 12 KiB
// C++

#include "screen_encoder.h"

#include <chrono>
#include <thread>

#include <codecapi.h>
#include <mfapi.h>
#include <mferror.h>
#include <mfidl.h>
using Microsoft::WRL::ComPtr;
// Duration stamped on every input sample, in 100 ns units: one second
// (10,000,000 units) divided by 30 frames, i.e. 333333.
static constexpr LONGLONG kFrameDuration = 10'000'000 / 30;
// Value written to MF_MT_AVG_BITRATE on the encoder's output type.
static constexpr UINT32 kBitrate = 6 * 1'000'000;
// Locates the parameter sets inside an Annex-B H.264 buffer.
//
// Every start code (00 00 00 01 or 00 00 01) that is followed by at least one
// byte is recorded together with the low five bits of that byte (the NAL unit
// type). On success `sps_start` is the offset of the start code of the last
// SPS (type 7) and `pps_end` is the offset of the start code that follows the
// last PPS (type 8), or `size` when that PPS is the final NAL unit.
//
// Returns false, leaving both outputs untouched, when no SPS or no PPS is
// present. Also returns false when the selected SPS does not begin before the
// end of the selected PPS; in that case the outputs have already been written.
static bool ExtractSpsPps(const uint8_t* data, size_t size,
                          size_t& sps_start, size_t& pps_end) {
  // Parallel arrays: start-code offset and NAL type, in stream order.
  std::vector<size_t> offsets;
  std::vector<int> kinds;

  size_t pos = 0;
  while (pos + 4 <= size) {
    const bool two_zeros = data[pos] == 0 && data[pos + 1] == 0;
    size_t prefix = 0;
    if (two_zeros && data[pos + 2] == 0 && data[pos + 3] == 1 &&
        pos + 4 < size) {
      prefix = 4;
    } else if (two_zeros && data[pos + 2] == 1 && pos + 3 < size) {
      prefix = 3;
    }
    if (prefix == 0) {
      ++pos;
      continue;
    }
    offsets.push_back(pos);
    kinds.push_back(data[pos + prefix] & 0x1F);
    pos += prefix;  // resume scanning at the NAL header byte
  }

  // Walk backwards so the first hit of each type is the last one in the stream.
  size_t last_sps = SIZE_MAX;
  size_t last_pps = SIZE_MAX;
  for (size_t k = kinds.size(); k-- > 0;) {
    if (last_sps == SIZE_MAX && kinds[k] == 7) last_sps = k;
    if (last_pps == SIZE_MAX && kinds[k] == 8) last_pps = k;
    if (last_sps != SIZE_MAX && last_pps != SIZE_MAX) break;
  }
  if (last_sps == SIZE_MAX || last_pps == SIZE_MAX) return false;

  sps_start = offsets[last_sps];
  const size_t following = last_pps + 1;
  pps_end = following < offsets.size() ? offsets[following] : size;
  return sps_start < pps_end;
}
// ─── ScreenEncoder ───────────────────────────────────────────────────────────
// Construction performs no work of its own.
ScreenEncoder::ScreenEncoder() = default;

// Destruction runs Stop() so the worker thread is joined and the held
// objects are released first.
ScreenEncoder::~ScreenEncoder() {
  Stop();
}
// Begins a new capture session that reports through `sink`.
//
// Any session already in progress is stopped first. The sink is installed
// under `sink_mu_`, the per-session counters are reset, and a worker thread
// running CaptureLoop() is launched.
void ScreenEncoder::Start(
    std::unique_ptr<flutter::EventSink<flutter::EncodableValue>> sink) {
  Stop();

  {
    std::lock_guard<std::mutex> guard(sink_mu_);
    sink_ = std::move(sink);
  }

  // Reset session state before the worker thread exists.
  sample_ts_ = 0;
  config_sent_ = false;
  running_ = true;
  thread_ = std::thread([this] { CaptureLoop(); });
}
// Ends the current capture session, if any.
//
// Clears the run flag, joins the worker thread, drops the event sink under
// `sink_mu_`, releases the encoder and D3D/DXGI objects, and zeroes the
// remembered encoder dimensions so the next frame triggers InitEncoder().
void ScreenEncoder::Stop() {
  running_ = false;
  if (thread_.joinable()) {
    thread_.join();
  }

  {
    std::lock_guard<std::mutex> guard(sink_mu_);
    sink_.reset();
  }

  // Release order is kept: encoder, duplication, staging, context, device.
  encoder_.Reset();
  dupl_.Reset();
  staging_.Reset();
  d3d_ctx_.Reset();
  d3d_dev_.Reset();

  enc_height_ = 0;
  enc_width_ = 0;
}
// Raises the keyframe-request flag; CaptureLoop() consumes it (and clears it)
// when it encodes the next captured frame.
void ScreenEncoder::ForceKeyframe() {
  force_kf_.store(true);
}
// ─── D3D / DXGI init ─────────────────────────────────────────────────────────
// Creates the D3D11 device/context and a desktop-duplication interface.
//
// A hardware device is created on the default adapter, then output 0 of that
// adapter is duplicated into `dupl_`. Returns true only when every step
// succeeded; on failure `dupl_` is not usable and the caller must not
// dereference it.
bool ScreenEncoder::InitD3D() {
  D3D_FEATURE_LEVEL level{};
  HRESULT hr = D3D11CreateDevice(nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0,
                                 nullptr, 0, D3D11_SDK_VERSION,
                                 &d3d_dev_, &level, &d3d_ctx_);
  if (FAILED(hr)) return false;

  // Each query below can fail; checking them avoids calling through a null
  // interface pointer on the next line.
  ComPtr<IDXGIDevice> dxgi_dev;
  if (FAILED(d3d_dev_.As(&dxgi_dev))) return false;

  ComPtr<IDXGIAdapter> adapter;
  if (FAILED(dxgi_dev->GetAdapter(&adapter))) return false;

  ComPtr<IDXGIOutput> output;
  if (FAILED(adapter->EnumOutputs(0, &output))) return false;

  ComPtr<IDXGIOutput1> output1;
  if (FAILED(output.As(&output1))) return false;

  hr = output1->DuplicateOutput(d3d_dev_.Get(), &dupl_);
  return SUCCEEDED(hr);
}
// ─── Encoder init ────────────────────────────────────────────────────────────
// Creates and configures an H.264 encoder transform for `width` x `height`.
//
// The first synchronous video-encoder MFT that outputs H.264 is activated
// into `encoder_`. Its output type is H.264 High profile, progressive,
// 30 fps at kBitrate; its input type is NV12 with a stride equal to `width`.
// On success the streaming-start messages are sent and the dimensions are
// remembered in `enc_width_` / `enc_height_`. Returns false on any failure,
// in which case the remembered dimensions are left unchanged.
bool ScreenEncoder::InitEncoder(UINT width, UINT height) {
  MFT_REGISTER_TYPE_INFO out_info{MFMediaType_Video, MFVideoFormat_H264};
  UINT32 count = 0;
  IMFActivate** activates = nullptr;
  HRESULT hr = MFTEnumEx(MFT_CATEGORY_VIDEO_ENCODER,
                         MFT_ENUM_FLAG_SYNCMFT | MFT_ENUM_FLAG_SORTANDFILTER,
                         nullptr, &out_info, &activates, &count);
  if (FAILED(hr)) return false;
  if (count == 0) {
    // The enumeration succeeded, so the (possibly null) array is ours to free.
    CoTaskMemFree(activates);
    return false;
  }
  hr = activates[0]->ActivateObject(IID_PPV_ARGS(&encoder_));
  for (UINT32 i = 0; i < count; ++i) activates[i]->Release();
  CoTaskMemFree(activates);
  if (FAILED(hr)) return false;

  // Output: H264
  ComPtr<IMFMediaType> out_type;
  if (FAILED(MFCreateMediaType(&out_type))) return false;
  out_type->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
  out_type->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_H264);
  MFSetAttributeSize(out_type.Get(), MF_MT_FRAME_SIZE, width, height);
  MFSetAttributeRatio(out_type.Get(), MF_MT_FRAME_RATE, 30, 1);
  out_type->SetUINT32(MF_MT_AVG_BITRATE, kBitrate);
  out_type->SetUINT32(MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive);
  out_type->SetUINT32(MF_MT_MPEG2_PROFILE, eAVEncH264VProfile_High);
  // Keyframe every 30 frames. This must be on the type *before* it is handed
  // to the encoder; setting it afterwards only mutates our local object.
  // NOTE(review): whether this particular encoder honors the attribute is not
  // visible here — confirm, or configure the GOP size through ICodecAPI.
  out_type->SetUINT32(MF_MT_MAX_KEYFRAME_SPACING, 30);
  if (FAILED(encoder_->SetOutputType(0, out_type.Get(), 0))) return false;

  // Input: NV12
  ComPtr<IMFMediaType> in_type;
  if (FAILED(MFCreateMediaType(&in_type))) return false;
  in_type->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
  in_type->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_NV12);
  MFSetAttributeSize(in_type.Get(), MF_MT_FRAME_SIZE, width, height);
  MFSetAttributeRatio(in_type.Get(), MF_MT_FRAME_RATE, 30, 1);
  in_type->SetUINT32(MF_MT_INTERLACE_MODE, MFVideoInterlace_Progressive);
  in_type->SetUINT32(MF_MT_DEFAULT_STRIDE, static_cast<UINT32>(width));
  if (FAILED(encoder_->SetInputType(0, in_type.Get(), 0))) return false;

  encoder_->ProcessMessage(MFT_MESSAGE_COMMAND_FLUSH, 0);
  encoder_->ProcessMessage(MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, 0);
  encoder_->ProcessMessage(MFT_MESSAGE_NOTIFY_START_OF_STREAM, 0);

  enc_width_ = width;
  enc_height_ = height;
  return true;
}
// ─── Frame capture ───────────────────────────────────────────────────────────
bool ScreenEncoder::CaptureFrame(std::vector<uint8_t>& bgra,
UINT& width, UINT& height) {
DXGI_OUTDUPL_FRAME_INFO info{};
ComPtr<IDXGIResource> res;
HRESULT hr = dupl_->AcquireNextFrame(16, &info, &res);
if (hr == DXGI_ERROR_WAIT_TIMEOUT) return false;
if (FAILED(hr)) {
dupl_.Reset();
staging_.Reset();
InitD3D();
return false;
}
ComPtr<ID3D11Texture2D> tex;
res.As(&tex);
D3D11_TEXTURE2D_DESC desc{};
tex->GetDesc(&desc);
width = desc.Width;
height = desc.Height;
// Recreate staging texture on size change
if (!staging_) {
D3D11_TEXTURE2D_DESC sd{};
sd.Width = width;
sd.Height = height;
sd.MipLevels = 1;
sd.ArraySize = 1;
sd.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
sd.SampleDesc.Count = 1;
sd.Usage = D3D11_USAGE_STAGING;
sd.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
d3d_dev_->CreateTexture2D(&sd, nullptr, &staging_);
}
d3d_ctx_->CopyResource(staging_.Get(), tex.Get());
dupl_->ReleaseFrame();
D3D11_MAPPED_SUBRESOURCE mapped{};
hr = d3d_ctx_->Map(staging_.Get(), 0, D3D11_MAP_READ, 0, &mapped);
if (FAILED(hr)) return false;
bgra.resize(width * height * 4);
const uint8_t* src = static_cast<const uint8_t*>(mapped.pData);
for (UINT row = 0; row < height; ++row)
memcpy(&bgra[row * width * 4], src + row * mapped.RowPitch, width * 4);
d3d_ctx_->Unmap(staging_.Get(), 0);
return true;
}
// ─── Color conversion: BGRA → NV12 ──────────────────────────────────────────
// Converts a tightly packed BGRA image into NV12 (full-resolution Y plane
// followed by an interleaved U/V plane, both with a stride of `w` bytes).
//
// Chroma is taken from the top-left pixel of each 2x2 block (no averaging).
// `nv12` is resized to hold the result. For even `w` and `h` that is exactly
// w*h*3/2 bytes; for odd dimensions the chroma plane is rounded up to
// ceil(h/2) rows and the trailing V of an odd-width row is dropped, so no
// write ever lands outside the buffer.
void ScreenEncoder::BgraToNv12(const uint8_t* bgra, std::vector<uint8_t>& nv12,
                               UINT w, UINT h) {
  const size_t stride = w;
  const size_t luma_bytes = stride * h;
  const size_t chroma_rows = (static_cast<size_t>(h) + 1) / 2;
  nv12.resize(luma_bytes + chroma_rows * stride);

  uint8_t* y_plane = nv12.data();
  uint8_t* uv_plane = y_plane + luma_bytes;

  for (UINT row = 0; row < h; ++row) {
    const uint8_t* src_row = bgra + row * stride * 4;
    uint8_t* y_row = y_plane + row * stride;
    uint8_t* uv_row = uv_plane + (row / 2) * stride;
    const bool chroma_row = (row & 1) == 0;

    for (UINT col = 0; col < w; ++col) {
      const uint8_t* p = src_row + static_cast<size_t>(col) * 4;
      const int b = p[0], g = p[1], r = p[2];
      y_row[col] =
          static_cast<uint8_t>(((66 * r + 129 * g + 25 * b + 128) >> 8) + 16);

      if (chroma_row && (col & 1) == 0) {
        uv_row[col] = static_cast<uint8_t>(
            ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128);
        // With an odd width the last column has no slot for V in this row.
        if (col + 1 < w) {
          uv_row[col + 1] = static_cast<uint8_t>(
              ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128);
        }
      }
    }
  }
}
// ─── Encode one frame ────────────────────────────────────────────────────────
void ScreenEncoder::EncodeFrame(const std::vector<uint8_t>& bgra,
UINT w, UINT h, bool keyframe) {
if (enc_width_ != w || enc_height_ != h) {
encoder_.Reset();
config_sent_ = false;
if (!InitEncoder(w, h)) return;
}
// Build NV12 input sample
std::vector<uint8_t> nv12;
BgraToNv12(bgra.data(), nv12, w, h);
ComPtr<IMFSample> in_sample;
ComPtr<IMFMediaBuffer> in_buf;
MFCreateMemoryBuffer((DWORD)nv12.size(), &in_buf);
{
BYTE* ptr = nullptr;
in_buf->Lock(&ptr, nullptr, nullptr);
memcpy(ptr, nv12.data(), nv12.size());
in_buf->Unlock();
}
in_buf->SetCurrentLength((DWORD)nv12.size());
MFCreateSample(&in_sample);
in_sample->SetSampleTime(sample_ts_);
in_sample->SetSampleDuration(kFrameDuration);
in_sample->AddBuffer(in_buf.Get());
if (keyframe) in_sample->SetUINT32(MFSampleExtension_CleanPoint, 1);
sample_ts_ += kFrameDuration;
if (FAILED(encoder_->ProcessInput(0, in_sample.Get(), 0))) return;
// Drain output samples
MFT_OUTPUT_STREAM_INFO si{};
encoder_->GetOutputStreamInfo(0, &si);
while (true) {
ComPtr<IMFSample> out_sample;
ComPtr<IMFMediaBuffer> out_buf;
if (!(si.dwFlags & MFT_OUTPUT_STREAM_PROVIDES_SAMPLES)) {
MFCreateMemoryBuffer(si.cbSize ? si.cbSize : w * h * 2, &out_buf);
MFCreateSample(&out_sample);
out_sample->AddBuffer(out_buf.Get());
}
MFT_OUTPUT_DATA_BUFFER out_data{};
out_data.pSample = out_sample.Get();
DWORD status = 0;
HRESULT hr = encoder_->ProcessOutput(0, 1, &out_data, &status);
if (out_data.pEvents) out_data.pEvents->Release();
if (hr == MF_E_TRANSFORM_NEED_MORE_INPUT) break;
if (FAILED(hr)) break;
ComPtr<IMFSample> result(out_data.pSample);
if (!result) break;
ComPtr<IMFMediaBuffer> flat;
result->ConvertToContiguousBuffer(&flat);
BYTE* enc = nullptr; DWORD enc_len = 0;
flat->Lock(&enc, nullptr, &enc_len);
if (!config_sent_) {
size_t sps_start = 0, pps_end = 0;
if (ExtractSpsPps(enc, enc_len, sps_start, pps_end)) {
std::vector<uint8_t> cfg(1 + (pps_end - sps_start));
cfg[0] = 0x01;
memcpy(&cfg[1], enc + sps_start, pps_end - sps_start);
SendEvent(std::move(cfg));
config_sent_ = true;
}
}
const int64_t now_ms =
std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
std::vector<uint8_t> frame(1 + 8 + enc_len);
frame[0] = 0x02;
for (int i = 0; i < 8; ++i)
frame[1 + i] = static_cast<uint8_t>(now_ms >> (56 - i * 8));
memcpy(&frame[9], enc, enc_len);
flat->Unlock();
SendEvent(std::move(frame));
}
}
// Delivers `data` to the current event sink, if one is installed.
// The sink is accessed under `sink_mu_`; without a sink the data is dropped.
void ScreenEncoder::SendEvent(std::vector<uint8_t> data) {
  std::lock_guard<std::mutex> guard(sink_mu_);
  if (!sink_) return;
  sink_->Success(flutter::EncodableValue(std::move(data)));
}
// ─── Capture loop ────────────────────────────────────────────────────────────
void ScreenEncoder::CaptureLoop() {
CoInitializeEx(nullptr, COINIT_MULTITHREADED);
if (!InitD3D()) {
CoUninitialize();
return;
}
while (running_) {
std::vector<uint8_t> bgra;
UINT w = 0, h = 0;
if (CaptureFrame(bgra, w, h)) {
bool kf = force_kf_.exchange(false);
EncodeFrame(bgra, w, h, kf);
}
}
if (encoder_) {
encoder_->ProcessMessage(MFT_MESSAGE_NOTIFY_END_OF_STREAM, 0);
encoder_->ProcessMessage(MFT_MESSAGE_COMMAND_DRAIN, 0);
}
CoUninitialize();
}