Handling MF_E_TRANSFORM_STREAM_CHANGE from video decoder MFT

I am trying to decode even a single H.264 frame with the H264 Decoder MFT, but I have been having problems with ProcessOutput(). I have eliminated as many of the failing HRESULTs as I can, but I am currently stuck on MF_E_TRANSFORM_STREAM_CHANGE. It is returned after I set pSample to my own allocated output_sample and call ProcessOutput() (this decoder requires the caller to allocate the output sample). I tried resetting the output type with SetOutputType() to the same type I used in my configure_decoder() function, but that call also returns a failing HRESULT. I am not sure what to do next.

enter image description here

// libs
#pragma comment(lib, "D3D11.lib")
#pragma comment(lib, "mfplat.lib")
#pragma comment(lib, "mf.lib")
#pragma comment(lib, "evr.lib")
#pragma comment(lib, "mfuuid.lib")
#pragma comment(lib, "Winmm.lib")
// std
#include <iostream>
#include <string>
#include <fstream>
// Windows
#include <windows.h>
#include <atlbase.h>
// DirectX
#include <d3d11.h>
// Media Foundation
#include <mfapi.h>
#include <mfplay.h>
#include <mfreadwrite.h>
#include <mferror.h>
// Others
#include <chrono>
#include <thread>
#include <direct.h> // for mkdir()
#include <Codecapi.h> // for CODECAPI_AVDecVideoAcceleration_H264
#include <comdef.h>
// Custom

// Constants
// Frame dimensions in pixels; written as MF_MT_FRAME_SIZE on the media types built in
// configure_decoder() and configure_color_conversion().
constexpr UINT decode_width = 1920;
constexpr UINT decode_height = 1080;

// Initializes COM on the calling thread (apartment-threaded) and then starts
// Media Foundation. Returns the first failing HRESULT; on success returns the
// HRESULT produced by MFStartup().
HRESULT init_mf()
{
    const HRESULT com_result = CoInitializeEx(nullptr, COINIT_APARTMENTTHREADED);
    if (FAILED(com_result))
        return com_result;

    const HRESULT startup_result = MFStartup(MF_VERSION);
    if (FAILED(startup_result))
        return startup_result;

    std::cout << "- Initialized Media Foundation" << std::endl;

    return startup_result;
}

// Creates a hardware D3D11 device with video support.
//   out_device - receives the created device.
//   in_context - receives the device's immediate context (an output despite its name).
// Returns the HRESULT of D3D11CreateDevice().
HRESULT init_dxgi(CComPtr<ID3D11Device>& out_device, CComPtr<ID3D11DeviceContext>& in_context)
{
    // D3D11_CREATE_DEVICE_DEBUG can be OR-ed in here when the debug layer is wanted.
    const UINT creation_flags = D3D11_CREATE_DEVICE_VIDEO_SUPPORT;

    const HRESULT result = D3D11CreateDevice(
        nullptr,                    // default adapter
        D3D_DRIVER_TYPE_HARDWARE,
        nullptr,                    // no software rasterizer module
        creation_flags,
        nullptr, 0,                 // default feature-level list
        D3D11_SDK_VERSION,
        &out_device,
        nullptr,                    // selected feature level is not needed
        &in_context);
    if (FAILED(result))
        return result;

    std::cout << "- Initialized DXGI" << std::endl;

    return result;
}

// Finds a synchronous H.264 -> NV12 video decoder MFT, activates the first match and
// fetches its attribute store.
//   out_transform  - receives the activated decoder.
//   out_activate   - receives the activation object of the chosen decoder.
//   out_attributes - receives the decoder's attribute store.
// Returns MF_E_NOT_FOUND when the enumeration yields no decoder, otherwise the first
// failing HRESULT, or S_OK.
HRESULT get_decoder(CComPtr<IMFTransform>& out_transform, CComPtr<IMFActivate>& out_activate,
    CComPtr<IMFAttributes>& out_attributes)
{
    HRESULT hr = S_OK;

    // Array of activation objects; the array itself is freed by CComHeapPtr (CoTaskMemFree),
    // but every element carries a reference that we must release ourselves.
    CComHeapPtr<IMFActivate*> activate_raw;
    uint32_t activateCount = 0;

    // Input & output types
    const MFT_REGISTER_TYPE_INFO in_info = { MFMediaType_Video, MFVideoFormat_H264 };
    const MFT_REGISTER_TYPE_INFO out_info = { MFMediaType_Video, MFVideoFormat_NV12 };

    // Get decoders matching the specified attributes
    if (FAILED(hr = MFTEnum2(MFT_CATEGORY_VIDEO_DECODER, MFT_ENUM_FLAG_SYNCMFT | MFT_ENUM_FLAG_SORTANDFILTER, &in_info, &out_info,
        nullptr, &activate_raw, &activateCount)))
        return hr;

    // Nothing matched: indexing element 0 would read past the (possibly null) array.
    if (activateCount == 0 || activate_raw.m_pData == nullptr)
        return MF_E_NOT_FOUND;

    // Choose the first returned decoder. Attach() takes over the reference the enumeration
    // already holds instead of adding a second one that nobody would release.
    out_activate.Attach(activate_raw[0]);
    activate_raw[0] = nullptr;

    // Release the decoders we are not going to use.
    for (uint32_t i = 1; i < activateCount; ++i)
    {
        activate_raw[i]->Release();
        activate_raw[i] = nullptr;
    }

    // Drop any previous contents so the out-parameters can be written safely.
    out_transform.Release();
    out_attributes.Release();

    // Activate
    if (FAILED(hr = out_activate->ActivateObject(IID_PPV_ARGS(&out_transform))))
        return hr;

    // Get attributes
    if (FAILED(hr = out_transform->GetAttributes(&out_attributes)))
        return hr;

    std::cout << "- get_decoder() Found " << activateCount << " decoders" << std::endl;

    return hr;
}

// Sets the decoder's input type (H.264) and output type (NV12) for a
// decode_width x decode_height, 60 fps stream.
//   in_transform       - decoder to configure.
//   in_device_manager  - currently unused; the MFT_MESSAGE_SET_D3D_MANAGER call is commented out.
//   in_input_stream_id - decoder input stream identifier.
//   output_stream_id   - decoder output stream identifier.
// Returns the first failing HRESULT, or the result of SetOutputType() on success.
HRESULT configure_decoder(const CComPtr<IMFTransform>& in_transform, CComPtr<IMFDXGIDeviceManager>& in_device_manager,
    const DWORD in_input_stream_id, const DWORD output_stream_id
)
{
    HRESULT hr = S_OK;

    // Handing the Direct3D device manager to the decoder (DXVA) is disabled for now:
    /*hr = in_transform->ProcessMessage(MFT_MESSAGE_SET_D3D_MANAGER, reinterpret_cast<ULONG_PTR>(in_device_manager.p));
    if (FAILED(hr))
        return hr;*/

    // ---- Input type: start from the first type the decoder offers, then fill in details ----
    CComPtr<IMFMediaType> input_type;
    hr = in_transform->GetInputAvailableType(in_input_stream_id, 0, &input_type);
    if (FAILED(hr))
        return hr;

    hr = input_type->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
    if (FAILED(hr))
        return hr;

    hr = input_type->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_H264);
    if (FAILED(hr))
        return hr;

    hr = input_type->SetUINT32(MF_MT_AVG_BITRATE, 30000000);
    if (FAILED(hr))
        return hr;

    hr = MFSetAttributeSize(input_type, MF_MT_FRAME_SIZE, decode_width, decode_height);
    if (FAILED(hr))
        return hr;

    hr = MFSetAttributeRatio(input_type, MF_MT_FRAME_RATE, 60, 1);
    if (FAILED(hr))
        return hr;

    hr = input_type->SetUINT32(MF_MT_INTERLACE_MODE, MFVideoInterlaceMode::MFVideoInterlace_Progressive);
    if (FAILED(hr))
        return hr;

    hr = in_transform->SetInputType(in_input_stream_id, input_type, 0);
    if (FAILED(hr))
        return hr;

    // ---- Output type: built from scratch as NV12 ----
    CComPtr<IMFMediaType> output_type;
    hr = MFCreateMediaType(&output_type);
    if (FAILED(hr))
        return hr;

    hr = output_type->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
    if (FAILED(hr))
        return hr;

    hr = output_type->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_NV12);
    if (FAILED(hr))
        return hr;

    hr = MFSetAttributeSize(output_type, MF_MT_FRAME_SIZE, decode_width, decode_height);
    if (FAILED(hr))
        return hr;

    hr = MFSetAttributeRatio(output_type, MF_MT_FRAME_RATE, 60, 1);
    if (FAILED(hr))
        return hr;

    hr = MFSetAttributeRatio(output_type, MF_MT_PIXEL_ASPECT_RATIO, 1, 1);
    if (FAILED(hr))
        return hr;

    // The interlace mode is deliberately not set on the output type (call left disabled):
    /*hr = output_type->SetUINT32(MF_MT_INTERLACE_MODE, MFVideoInterlaceMode::MFVideoInterlace_Progressive);
    if (FAILED(hr))
        return hr;*/

    hr = in_transform->SetOutputType(output_stream_id, output_type, 0);
    if (FAILED(hr))
        return hr;

    std::cout << "- Set decoder configuration" << std::endl;

    // Per the original author's note, the AMD decoder crashes on GetInputStatus(), so it stays disabled:
    /*DWORD flags;
    hr = in_transform->GetInputStatus(0, &flags);
    if (FAILED(hr))
        return hr;*/

    return hr;
}

// Configures the color-conversion transform to take NV12 frames and produce ARGB32
// frames, both decode_width x decode_height, on stream 0.
// Returns the first failing HRESULT, or the result of SetOutputType() on success.
HRESULT configure_color_conversion(IMFTransform* in_color_transform)
{
    // Builds a video media type with the given subtype and the shared frame size.
    const auto build_video_type = [](const GUID& subtype, CComPtr<IMFMediaType>& out_type) -> HRESULT
    {
        HRESULT result = MFCreateMediaType(&out_type);
        if (SUCCEEDED(result))
            result = out_type->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video);
        if (SUCCEEDED(result))
            result = out_type->SetGUID(MF_MT_SUBTYPE, subtype);
        if (SUCCEEDED(result))
            result = MFSetAttributeSize(out_type, MF_MT_FRAME_SIZE, decode_width, decode_height);
        return result;
    };

    // Input side: NV12
    CComPtr<IMFMediaType> nv12_input;
    HRESULT hr = build_video_type(MFVideoFormat_NV12, nv12_input);
    if (SUCCEEDED(hr))
        hr = in_color_transform->SetInputType(0, nv12_input, 0);
    if (FAILED(hr))
        return hr;

    // Output side: ARGB32
    CComPtr<IMFMediaType> argb_output;
    hr = build_video_type(MFVideoFormat_ARGB32, argb_output);
    if (SUCCEEDED(hr))
        hr = in_color_transform->SetOutputType(0, argb_output, 0);

    return hr;
}

// Runs one texture through the color-conversion transform.
//   in_transform - configured color-conversion MFT (stream 0 is used for input and output).
//   in_texture   - source texture; it is copied first so the caller's texture is not
//                  referenced by the sample handed to the transform.
//   p_sample_out - receives the converted sample; the caller owns the returned reference.
// Returns E_POINTER for null arguments, otherwise the first failing HRESULT, or the
// result of ProcessOutput() on success.
HRESULT color_convert(IMFTransform* in_transform, ID3D11Texture2D* in_texture, IMFSample** p_sample_out)
{
    if (p_sample_out == nullptr)
        return E_POINTER;
    *p_sample_out = nullptr;
    if (in_transform == nullptr || in_texture == nullptr)
        return E_POINTER;

    HRESULT hr = S_OK;

    // Copy texture, since the one that desktop duplication generates can die at random
    CD3D11_TEXTURE2D_DESC desc;
    in_texture->GetDesc(&desc);
    CComPtr<ID3D11Device> device;
    in_texture->GetDevice(&device);
    const CD3D11_TEXTURE2D_DESC copy_desc(desc.Format, desc.Width, desc.Height, 1, 1, D3D11_BIND_SHADER_RESOURCE |
        D3D11_BIND_RENDER_TARGET);
    CComPtr<ID3D11Texture2D> copy_texture;
    // A failed allocation leaves copy_texture null; bail out instead of dereferencing it below.
    if (FAILED(hr = device->CreateTexture2D(&copy_desc, nullptr, &copy_texture)))
        return hr;
    CComPtr<ID3D11DeviceContext> device_context;
    device->GetImmediateContext(&device_context);
    device_context->CopyResource(copy_texture, in_texture);

    // Create buffer
    CComPtr<IMFMediaBuffer> input_buffer;
    if (FAILED(hr = MFCreateDXGISurfaceBuffer(__uuidof(ID3D11Texture2D), copy_texture.p, 0, false, &input_buffer)))
        return hr;

    // Create sample
    CComPtr<IMFSample> input_sample;
    if (FAILED(hr = MFCreateSample(&input_sample)))
        return hr;
    if (FAILED(hr = input_sample->AddBuffer(input_buffer)))
        return hr;

    // Set input sample times
    if (FAILED(hr = input_sample->SetSampleTime(100)))
        return hr;
    if (FAILED(hr = input_sample->SetSampleDuration(1000)))
        return hr;

    // Process input
    if (FAILED(hr = in_transform->ProcessInput(0, input_sample, 0)))
        return hr;

    // Process output
    DWORD status = 0;
    MFT_OUTPUT_DATA_BUFFER output_buffer = {};

    MFT_OUTPUT_STREAM_INFO mft_stream_info;
    ZeroMemory(&mft_stream_info, sizeof(MFT_OUTPUT_STREAM_INFO));

    if (FAILED(hr = in_transform->GetOutputStreamInfo(0, &mft_stream_info)))
        return hr;

    // pSample is left null above, which only works when the transform allocates its own samples.
    ATLASSERT(mft_stream_info.dwFlags & MFT_OUTPUT_STREAM_PROVIDES_SAMPLES);

    hr = in_transform->ProcessOutput(0, 1, &output_buffer, &status);

    // The transform may hand back an event collection; the caller must release it.
    if (output_buffer.pEvents != nullptr)
    {
        output_buffer.pEvents->Release();
        output_buffer.pEvents = nullptr;
    }

    if (FAILED(hr))
    {
        // Do not leak a sample that was attached to a failed call.
        if (output_buffer.pSample != nullptr)
            output_buffer.pSample->Release();
        return hr;
    }

    // Ownership of the sample reference moves to the caller.
    *p_sample_out = output_buffer.pSample;

    // Test output to file
    /*IMFMediaBuffer* buffer;
    if (FAILED(hr = output_buffer.pSample->ConvertToContiguousBuffer(&buffer)))
        return hr;

    unsigned char* data;
    DWORD length;
    if (FAILED(hr = buffer->GetCurrentLength(&length)))
        return hr;

    if (FAILED(hr = buffer->Lock(&data, nullptr, &length)))
        return hr;

    std::ofstream fout;
    fout.open("raw.nv12", std::ios::binary | std::ios::out);
    fout.write((char*)data, length);
    fout.close();

    if (FAILED(hr = buffer->Unlock()))
        return hr;*/
        // End test output

    return hr;
}

int main()
{
    HRESULT hr;

    if (FAILED(hr = init_mf()))
        return hr;

    // Initialize DXGI
    CComPtr<ID3D11Device> device;
    CComPtr<ID3D11DeviceContext> context;
    if (FAILED(hr = init_dxgi(device, context)))
        return hr;

    // Create device manager
    CComPtr<IMFDXGIDeviceManager> device_manager;
    UINT resetToken;
    if (FAILED(hr = MFCreateDXGIDeviceManager(&resetToken, &device_manager)))
        return hr;

    // https://docs.microsoft.com/en-us/windows/win32/api/dxva2api/nf-dxva2api-idirect3ddevicemanager9-resetdevice
    // When you first create the Direct3D device manager, call this method with a pointer to the Direct3D device.
    if (FAILED(hr = device_manager->ResetDevice(device, resetToken)))
        return hr;

    // Get decoder
    CComPtr<IMFTransform> decoder_transform;
    CComPtr<IMFActivate> decoder_activate;
    CComPtr<IMFAttributes> decoder_attributes;
    CComQIPtr<IMFMediaEventGenerator> decoder_event_generator;
    if (FAILED(hr = get_decoder(decoder_transform, decoder_activate, decoder_attributes)))
        return hr;

    // Get the name of the decoder
    CComHeapPtr<wchar_t> friendly_name;
    uint32_t friendly_name_length;
    if (FAILED(hr = decoder_activate->GetAllocatedString(MFT_FRIENDLY_NAME_Attribute, &friendly_name, &friendly_name_length)))
        return hr;
    std::wcout << "- Selected decoder: " << static_cast<wchar_t const*>(friendly_name) << std::endl;

    // Enable hardware acceleration
    if (FAILED(hr = decoder_attributes->SetUINT32(CODECAPI_AVDecVideoAcceleration_H264, true)))
        return hr;
    // Enable low-latency mode - otherwise the decoder will require many input frames before it's able produce any output
    if (FAILED(hr = decoder_attributes->SetUINT32(CODECAPI_AVLowLatencyMode, true)))
        return hr;

    // Get decoder stream IDs
    DWORD input_stream_id, output_stream_id;
    hr = decoder_transform->GetStreamIDs(1, &input_stream_id, 1, &output_stream_id);
    if (hr == E_NOTIMPL) // Doesn't mean failed, see remarks
    {                    // https://docs.microsoft.com/en-us/windows/win32/api/mftransform/nf-mftransform-imftransform-getstreamids
        input_stream_id = 0;
        output_stream_id = 0;
        hr = S_OK;
    }
    if (FAILED(hr))
        return hr;

    // Init decoder-related objects/variables
    if (FAILED(hr = configure_decoder(decoder_transform, device_manager, input_stream_id, output_stream_id)))
        return hr;

    // Apparently you can do this, idek man
    decoder_event_generator = decoder_transform;

    // Init color conversion-related objects/variables
    IMFTransform* color_transform;
    if (FAILED(hr = CoCreateInstance(CLSID_VideoProcessorMFT, nullptr, CLSCTX_INPROC_SERVER,
        IID_IMFTransform, (void**)&color_transform)))
        return hr;

    if (FAILED(hr = color_transform->ProcessMessage(MFT_MESSAGE_SET_D3D_MANAGER, reinterpret_cast<ULONG_PTR>(device_manager.p))))
        return hr;

    if (FAILED(hr = configure_color_conversion(color_transform)))
        return hr;

    if (FAILED(hr = decoder_transform->ProcessMessage(MFT_MESSAGE_COMMAND_FLUSH, NULL)))
        return hr;
    if (FAILED(hr = decoder_transform->ProcessMessage(MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, NULL)))
        return hr;
    if (FAILED(hr = decoder_transform->ProcessMessage(MFT_MESSAGE_NOTIFY_START_OF_STREAM, NULL)))
        return hr;

    int frameIndex = 1;
    do
    {
        // Read frame data from .h264 file
        std::ifstream fin("..\\CaptureAndEncode\\Encoded Frames\\frame" + std::to_string(frameIndex) + ".h264", std::ios::binary |
            std::ios::in);

        if (!fin)
            throw std::runtime_error("Invalid file path specified");

        // Get file length
        fin.seekg(0, std::ios::end);
        size_t length = fin.tellg();
        fin.seekg(0, std::ios::beg);

        // Variables
        CComPtr<IMFMediaBuffer> input_buffer;
        CComPtr<IMFSample> input_sample;
        DWORD cbMaxLength, cbCurrentLength;
        BYTE* pBuffer;

        // Create memory buffer and make the underlying array readable/writeable by using Lock()
        if (FAILED(hr = MFCreateMemoryBuffer(length, &input_buffer)))
            return hr;
        if (FAILED(hr = input_buffer->Lock(&pBuffer, &cbMaxLength, &cbCurrentLength)))
            return hr;

        // Copy frame data from file to array;
        fin.read((char*)pBuffer, length);
        fin.close();

        // Unlock it again (no longer readable/writeable)
        if (FAILED(hr = input_buffer->Unlock()))
            return hr;
        if (FAILED(hr = input_buffer->SetCurrentLength(length)))
            return hr;

        // Create sample and add the buffer to it
        if (FAILED(hr = MFCreateSample(&input_sample)))
            return hr;
        if (FAILED(hr = input_sample->AddBuffer(input_buffer)))
            return hr;

        // Process input
        hr = decoder_transform->ProcessInput(0, input_sample, 0);
        if (SUCCEEDED(hr))
            std::cout << "decoder_transform::ProcessInput() - SUCCESS" << std::endl;
        else if (hr == MF_E_NOTACCEPTING)
            std::cout << "decoder_transform::ProcessInput() - MF_E_NOTACCEPTING" << std::endl;
        else
            std::cout << "decoder_transform::ProcessInput() - ERROR" << std::endl;

        // Output H264 -> NV12
        DWORD status;
        MFT_OUTPUT_DATA_BUFFER output_buffer;
        output_buffer.pSample = nullptr;
        output_buffer.pEvents = nullptr;
        output_buffer.dwStreamID = 0;
        output_buffer.dwStatus = 0;

        MFT_OUTPUT_STREAM_INFO mft_stream_info;
        ZeroMemory(&mft_stream_info, sizeof(MFT_OUTPUT_STREAM_INFO));

        if (FAILED(hr = decoder_transform->GetOutputStreamInfo(0, &mft_stream_info)))
            return hr;

        if ((mft_stream_info.dwFlags & MFT_OUTPUT_STREAM_PROVIDES_SAMPLES) == 0 &&
            (mft_stream_info.dwFlags & MFT_OUTPUT_STREAM_CAN_PROVIDE_SAMPLES) == 0)
        {
            std::cout << "This decoder requires that we allocate the output sample ourselves" << std::endl;

            CComPtr<IMFSample> output_sample;
            CComPtr<IMFMediaBuffer> output_media_buffer;

            if (FAILED(hr = MFCreateSample(&output_sample)))
                return hr;

            if (FAILED(hr = MFCreateMemoryBuffer(mft_stream_info.cbSize, &output_media_buffer)))
                return hr;

            if (FAILED(hr = output_sample->AddBuffer(output_media_buffer)))
                return hr;

            output_buffer.pSample = output_sample.Detach();
        }

        hr = decoder_transform->ProcessOutput(0, 1, &output_buffer, &status);
        if (SUCCEEDED(hr))
        {
            std::cout << "decoder_transform::ProcessOutput() - SUCCESS" << std::endl;
            // Test output to file
            IMFMediaBuffer* buffer;
            if (FAILED(hr = output_buffer.pSample->ConvertToContiguousBuffer(&buffer)))
                return hr;

            unsigned char* data;
            DWORD length;
            if (FAILED(hr = buffer->GetCurrentLength(&length)))
                return hr;

            if (FAILED(hr = buffer->Lock(&data, nullptr, &length)))
                return hr;

            std::ofstream fout;
            fout.open("raw.nv12", std::ios::binary | std::ios::out);
            fout.write((char*)data, length);
            fout.close();

            if (FAILED(hr = buffer->Unlock()))
                return hr;
            // End test output
        }   
        else if (hr == MF_E_TRANSFORM_NEED_MORE_INPUT)
            std::cout << "decoder_transform::ProcessOutput() - MF_E_TRANSFORM_NEED_MORE_INPUT" << std::endl;
        else if (hr == E_INVALIDARG)
            std::cout << "decoder_transform::ProcessOutput() - E_INVALIDARG" << std::endl;
        else if (hr == MF_E_TRANSFORM_STREAM_CHANGE)
        {
            std::cout << "decoder_transform::ProcessOutput() - MF_E_TRANSFORM_STREAM_CHANGE" << std::endl;

            CComPtr<IMFMediaType> output_type;
            if (FAILED(hr = decoder_transform->GetOutputAvailableType(0, 0, &output_type)))
                return hr;

            // Create output type
            /*if (FAILED(hr = MFCreateMediaType(&output_type)))
                return hr;*/

            // Reconfigure output settings
            if (FAILED(hr = output_type->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Video)))
                return hr;
            if (FAILED(hr = output_type->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_NV12)))
                return hr;
            if (FAILED(hr = MFSetAttributeRatio(output_type, MF_MT_PIXEL_ASPECT_RATIO, 1, 1)))
                return hr;

            // Set output type
            if (FAILED(hr = decoder_transform->SetOutputType(output_stream_id, output_type, 0)))
                return hr;
        }

        frameIndex++;

        input_sample.Release();
    } while (frameIndex <= 60);


    return hr;
}

You just need to follow the steps described under "Handling Stream Changes" in the Media Foundation documentation:

  1. The client calls IMFTransform::GetOutputAvailableType. This method returns an updated set of output types.
  2. The client calls SetOutputType to set a new output type.
  3. The client resumes calling ProcessInput/ProcessOutput.

In the question body above you are trying to do step 3 without doing step 2. Most likely your media type differs somewhat from the one the MFT offers, so the MFT rejects it and blocks processing until the mismatch is resolved.