Julia CUDA: LoadError: GPU broadcast resulted in non-concrete element type Any

I have to analyze some data using maximum likelihood methods, but CUDA doesn't like how I handle type instability. Any idea how I could fix this? I tried my best to force concrete return types by declaring the type of every function argument, but it doesn't seem to work.

EDIT: I moved back some function declarations inside where they belonged. Here is an extract of the problematic part of the program:

# Likelihood function from the question; judging by the name it is a
# log-likelihood (TODO confirm — the body is elided in the post).
#
# Arguments:
#   a_c, a_p, θ_1            – scalar Float64 parameters
#   θ_2p, θ_2c, ϵ_p, σ_p,
#   ϵ_c, σ_c                 – CuArray{Float64} data arrays
#
# NOTE(review): the array arguments only accept `CuArray`. The InvalidIRError
# quoted later in the post shows the same data arriving as `CuDeviceVector`
# inside the broadcast kernel, which this signature does not match.
function ln_likelihood( a_c::Float64,
                        a_p::Float64,
                        θ_1::Float64,
                        θ_2p::CuArray{Float64},
                        θ_2c::CuArray{Float64},
                        ϵ_p::CuArray{Float64},
                        σ_p::CuArray{Float64},
                        ϵ_c::CuArray{Float64},
                        σ_c::CuArray{Float64})
    ...
    # Returns a Float64 (according to the author; the computation itself is not shown).
end

# Builds the likelihood as a function of an (a_c, a_p) pair, with θ_1 and the six
# data arrays held fixed, and inspects type inference for broadcasting it over
# `range`. Parts of the body are elided in the post (`...`); judging by the name,
# the elided code presumably searches for the maximum — TODO confirm.
function trova_max_likelihood(  θ_1::Float64,
                                θ_2p::CuArray{Float64},
                                θ_2c::CuArray{Float64},
                                ϵ_p::CuArray{Float64},
                                σ_p::CuArray{Float64},
                                ϵ_c::CuArray{Float64},
                                σ_c::CuArray{Float64})

    ...

    # Likelihood as a function of (a_c, a_p) only.
    function funzione_likelirobin(a_c::Float64, a_p::Float64)
        # `global` makes these names refer to module-level variables, so the
        # arguments of trova_max_likelihood with the same names are NOT used here
        # (the closure type in the @code_warntype output has no captured fields).
        # NOTE(review): the call site passes g-prefixed variables (gθ_1, …), so
        # globals with these unprefixed names may not exist — confirm against the
        # full script.
        global θ_1,θ_2p,θ_2c, ϵ_p, σ_p, ϵ_c, σ_c 
        ln_likelihood(a_c,a_p,θ_1,θ_2p,θ_2c, ϵ_p, σ_p, ϵ_c, σ_c)
    end

    # Tuple-taking wrapper so the function can be broadcast over an array of pairs.
    funzione_likelihood(x::Tuple{Float64, Float64}) = funzione_likelirobin(x[1],x[2])

    # Show the inferred types for the broadcast call.
    @code_warntype funzione_likelihood.(range)
    # Here range::CuArray{Tuple{Float64,Float64}} (its construction is elided).
    ...
end


# Top-level call site. The arguments are the script's g-prefixed variables;
# their definitions are not shown in the post.
trova_max_likelihood(gθ_1, gθ_2p, gθ_2c, gϵ_p, gσ_p, gϵ_c, gσ_c)

And the output I get:

Variables
  #self#::Core.Const(var"##dotfunction#274#175"{var"#funzione_likelihood#174"{var"#funzione_likelirobin#173"}}(var"#funzione_likelihood#174"{var"#funzione_likelirobin#173"}(var"#funzione_likelirobin#173"())))
  x1::CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer}

Body::Union{}
1 ─ %1 = Core.getfield(#self#, :funzione_likelihood)::Core.Const(var"#funzione_likelihood#174"{var"#funzione_likelirobin#173"}(var"#funzione_likelirobin#173"()))
│   %2 = Base.broadcasted(%1, x1)::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, var"#funzione_likelihood#174"{var"#funzione_likelirobin#173"}, Tuple{CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer}}}
│        Base.materialize(%2)
└──      Core.Const(:(return %3))
ERROR: LoadError: GPU broadcast resulted in non-concrete element type Any.
This probably means that the function you are broadcasting contains an error or type instability.
Stacktrace:
 [1] error(s::String)
   @ Base .\error.jl:33
 [2] copy
   @ ~\.julia\packages\GPUArrays\gkF6S\src\host\broadcast.jl:44 [inlined]
 [3] materialize
   @ .\broadcast.jl:883 [inlined]
 [4] trova_max_likelihood(θ_1::Float64, θ_2p::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, θ_2c::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, ϵ_p::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, σ_p::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, ϵ_c::CuArray{Float64, 1, C, σ_c::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer})
   @ Main ~\Documents\GitHub\lab2\Lab2\Esercizio 5\esercizio5.jl:82
 [5] top-level scope
   @ ~\Documents\GitHub\lab2\Lab2\Esercizio 5\esercizio5.jl:99
 [6] eval
   @ .\boot.jl:360 [inlined]
 [7] include_string(mapexpr::typeof(identity), mod::Module, code::String, filename::String)
   @ Base .\loading.jl:1094
in expression starting at C:\Users\marce\Documents\GitHub\lab2\Lab2\Esercizio 5\esercizio5.jl:99

EDIT 2: I tried switching to regular arrays, and the code above did not work with them. To get it running I had to delete the `global` line and define the inner function as:

# Inner function of trova_max_likelihood with the `global` line removed:
# θ_1 … σ_c are now taken from the enclosing function's scope (the closure type
# in the @code_warntype output carries their types as parameters).
function funzione_likelirobin(a_c::Float64, a_p::Float64)
        ln_likelihood(a_c,a_p,θ_1,θ_2p,θ_2c, ϵ_p, σ_p, ϵ_c, σ_c)
end

So I made the same change in the code with CuArrays. The output I get is now:

Variables
  #self#::var"##dotfunction#260#56"{var"#funzione_likelihood#55"{var"#funzione_likelirobin#54"{Float64, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}}
  x1::CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer}

Body::CuArray{_A, 1, CUDA.Mem.DeviceBuffer} where _A
1 ─ %1 = Core.getfield(#self#, :funzione_likelihood)::var"#funzione_likelihood#55"{var"#funzione_likelirobin#54"{Float64, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}
│   %2 = Base.broadcasted(%1, x1)::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, var"#funzione_likelihood#55"{var"#funzione_likelirobin#54"{Float64, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}, Tuple{CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer}}}
│   %3 = Base.materialize(%2)::CuArray{_A, 1, CUDA.Mem.DeviceBuffer} where _A
└──      return %3
ERROR: LoadError: InvalidIRError: compiling kernel broadcast_kernel(CUDA.CuKernelContext, CuDeviceVector{Float64, 1}, Base.Broadcast.Broadcasted{Nothing, Tuple{Base.OneTo{Int64}}, var"#funzione_likelihood#55"{var"#funzione_likelirobin#54"{Float64, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}}}, Tuple{Base.Broadcast.Extruded{CuDeviceVector{Tuple{Float64, Float64}, 1}, Tuple{Bool}, Tuple{Int64}}}}, Int64) resulted in invalid LLVM IR
Reason: unsupported dynamic function invocation (call to ln_likelihood)
Stacktrace:
 [1] funzione_likelirobin
   @ ~\Documents\GitHub\lab2\Lab2\Esercizio 5\esercizio5.jl:76
 [2] funzione_likelihood
   @ ~\Documents\GitHub\lab2\Lab2\Esercizio 5\esercizio5.jl:79
 [3] _broadcast_getindex_evalf
   @ .\broadcast.jl:648
 [4] _broadcast_getindex
   @ .\broadcast.jl:621
 [5] getindex
   @ .\broadcast.jl:575
 [6] broadcast_kernel
   @ ~\.julia\packages\GPUArrays\gkF6S\src\host\broadcast.jl:59
Stacktrace:
  [1] check_ir(job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{GPUArrays.var"#broadcast_kernel#17", Tuple{CUDA.CuKernelContext, CuDeviceVector{Float64, 1}, Base.Broadcast.Broadcasted{Nothing, Tuple{Base.OneTo{Int64}}, var"#funzione_likelihood#55"{var"#funzione_likelirobin#54"{Float64, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}}}, Tuple{Base.Broadcast.Extruded{CuDeviceVector{Tuple{Float64, Float64}, 1}, Tuple{Bool}, Tuple{Int64}}}}, Int64}}}, args::LLVM.Module)
    @ GPUCompiler ~\.julia\packages\GPUCompiler\HeCT6\src\validation.jl:111
  [2] macro expansion
    @ ~\.julia\packages\GPUCompiler\HeCT6\src\driver.jl:326 [inlined]
  [3] macro expansion
    @ ~\.julia\packages\TimerOutputs\YJq3h\src\TimerOutput.jl:252 [inlined]
  [4] macro expansion
    @ ~\.julia\packages\GPUCompiler\HeCT6\src\driver.jl:324 [inlined]
  [5] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler ~\.julia\packages\GPUCompiler\HeCT6\src\utils.jl:64
  [6] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA ~\.julia\packages\CUDA\sCev8\src\compiler\execution.jl:326
  [7] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler ~\.julia\packages\GPUCompiler\HeCT6\src\cache.jl:90
  [8] cufunction(f::GPUArrays.var"#broadcast_kernel#17", tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceVector{Float64, 1}, Base.Broadcast.Broadcasted{Nothing, Tuple{Base.OneTo{Int64}}, var"#funzione_likelihood#55"{var"#funzione_likelirobin#54"{Float64, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}}}, Tuple{Base.Broadcast.Extruded{CuDeviceVector{Tuple{Float64, Float64}, 1}, Tuple{Bool}, Tuple{Int64}}}}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA ~\.julia\packages\CUDA\sCev8\src\compiler\execution.jl:297
  [9] cufunction(f::GPUArrays.var"#broadcast_kernel#17", tt::Type{Tuple{CUDA.CuKernelContext, CuDeviceVector{Float64, 1}, Base.Broadcast.Broadcasted{Nothing, Tuple{Base.OneTo{Int64}}, var"#funzione_likelihood#55"{var"#funzione_likelirobin#54"{Float64, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}}}, Tuple{Base.Broadcast.Extruded{CuDeviceVector{Tuple{Float64, Float64}, 1}, Tuple{Bool}, Tuple{Int64}}}}, Int64}})
    @ CUDA ~\.julia\packages\CUDA\sCev8\src\compiler\execution.jl:291
 [10] macro expansion
    @ ~\.julia\packages\CUDA\sCev8\src\compiler\execution.jl:102 [inlined]
 [11] launch_heuristic(::CUDA.CuArrayBackend, ::GPUArrays.var"#broadcast_kernel#17", ::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, ::Base.Broadcast.Broadcasted{Nothing, Tuple{Base.OneTo{Int64}}, var"#funzione_likelihood#55"{var"#funzione_likelirobin#54"{Float64, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}, Tuple{Base.Broadcast.Extruded{CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer}, Tuple{Bool}, Tuple{Int64}}}}, ::Int64; elements::Int64, elements_per_thread::Int64)
    @ CUDA ~\.julia\packages\CUDA\sCev8\src\gpuarrays.jl:17
 [12] copyto!(dest::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, bc::Base.Broadcast.Broadcasted{Nothing, Tuple{Base.OneTo{Int64}}, var"#funzione_likelihood#55"{var"#funzione_likelirobin#54"{Float64, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}, Tuple{CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer}}})
    @ GPUArrays ~\.julia\packages\GPUArrays\gkF6S\src\host\broadcast.jl:65
in expression starting at C:\Users\marce\Documents\GitHub\lab2\Lab2\Esercizio 5\esercizio5.jl:100

Solution 1:

The code you have provided seems to be a bit short of an MWE. However, filling in some random data of the specified types, we have:

using CUDA
# Random stand-in inputs so the example is runnable. All of these are non-const
# module-level globals, which is what the `global` declaration inside
# trova_max_likelihood ends up referring to.
a_c, a_p, θ_1 = rand(3)   # three scalar Float64 values
N = 1000                  # length of each data array
# Six GPU arrays of N Float64 values each.
θ_2p, θ_2c, ϵ_p, σ_p, ϵ_c, σ_c = ntuple(x->CUDA.randn(Float64, N), 6)


# Stand-in likelihood used to reproduce the problem: adds the three scalar
# parameters to the sums of the six data arrays.
#
# Arguments:
#   a_c, a_p, θ_1            – scalar Float64 parameters
#   θ_2p, θ_2c, ϵ_p, σ_p,
#   ϵ_c, σ_c                 – Float64 data arrays
# Returns a Float64.
#
# The array arguments are typed `AbstractArray{Float64}` rather than
# `CuArray{Float64}`. A `CuArray` still matches, but so does the `CuDeviceVector`
# that the data is converted to inside a GPU broadcast kernel; with the narrower
# signature no method applies there, and kernel compilation fails with
# "unsupported dynamic function invocation (call to ln_likelihood)".
# NOTE(review): whether the body itself (e.g. `sum` over a device array) compiles
# inside a kernel still has to be confirmed on a GPU.
function ln_likelihood( a_c::Float64,
                        a_p::Float64,
                        θ_1::Float64,
                        θ_2p::AbstractArray{Float64},
                        θ_2c::AbstractArray{Float64},
                        ϵ_p::AbstractArray{Float64},
                        σ_p::AbstractArray{Float64},
                        ϵ_c::AbstractArray{Float64},
                        σ_c::AbstractArray{Float64})
    # ... (the real computation goes here; the sum below is only a placeholder)
    return a_c + a_p + θ_1 + sum(θ_2p) + sum(θ_2c) + sum(ϵ_p) + sum(σ_p) + sum(ϵ_c) + sum(σ_c)
end


# Reproduction of the question's function, keeping its `global` declaration:
# builds the likelihood as a function of an (a_c, a_p) pair and prints the
# inferred types for broadcasting it over a GPU array of such pairs.
function trova_max_likelihood(  θ_1::Float64,
                                θ_2p::CuArray{Float64},
                                θ_2c::CuArray{Float64},
                                ϵ_p::CuArray{Float64},
                                σ_p::CuArray{Float64},
                                ϵ_c::CuArray{Float64},
                                σ_c::CuArray{Float64})

    # ...

    # Likelihood as a function of (a_c, a_p) only.
    function funzione_likelirobin(a_c::Float64, a_p::Float64)
        # `global` makes these names refer to the module-level variables of the
        # same name, not to the arguments of trova_max_likelihood; nothing is
        # captured by the closure.
        global θ_1, θ_2p, θ_2c, ϵ_p, σ_p, ϵ_c, σ_c
        ln_likelihood(a_c, a_p, θ_1, θ_2p, θ_2c, ϵ_p, σ_p, ϵ_c, σ_c)
    end

    # Tuple-taking wrapper so the function can be broadcast over an array of pairs.
    funzione_likelihood(x::Tuple{Float64, Float64}) = funzione_likelirobin(x[1],x[2])

    # The question does not show how `range` is built, so create one here:
    # a 10-element GPU array filled with the pair (1.0, 2.0).
    # (This local name shadows `Base.range` inside the function.)
    range = CUDA.fill((1., 2.), 10)

    # Print the inferred types for the broadcast call.
    @code_warntype funzione_likelihood.(range)
    # Here range::CuArray{Tuple{Float64,Float64}}
end

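which runs for me without any error, and gives blue, stably-inferred types in the @code_warntype output. Note, however, that @code_warntype only displays the inferred types without executing the broadcast, and that `Body::Union{}` means the call is inferred never to return normally, just as in the failing output in the question: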

julia> trova_max_likelihood(θ_1, θ_2p, θ_2c, ϵ_p, σ_p, ϵ_c, σ_c)
MethodInstance for (::var"##dotfunction#413#25"{var"#funzione_likelihood#24"{var"#funzione_likelirobin#23"}})(::CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer})
  from (::var"##dotfunction#413#25")(x1) in Main
Arguments
  #self#::Core.Const(var"##dotfunction#413#25"{var"#funzione_likelihood#24"{var"#funzione_likelirobin#23"}}(var"#funzione_likelihood#24"{var"#funzione_likelirobin#23"}(var"#funzione_likelirobin#23"())))
  x1::CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer}
Body::Union{}
1 ─ %1 = Core.getfield(#self#, :funzione_likelihood)::Core.Const(var"#funzione_likelihood#24"{var"#funzione_likelirobin#23"}(var"#funzione_likelirobin#23"()))
│   %2 = Base.broadcasted(%1, x1)::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, var"#funzione_likelihood#24"{var"#funzione_likelirobin#23"}, Tuple{CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer}}}
│        Base.materialize(%2)
└──      Core.Const(:(return %3))

So it would seem that the instability is likely coming from somewhere in the code you have elided with `...`.

That said, I would strongly recommend avoiding global variables, which can be a major source of type instability. It would be preferable either to pass the variables explicitly through the function signature or, if you have to, to capture local variables in a closure.

Using a closure instead of globals as follows

using CUDA
# Random stand-in inputs so the example is runnable (non-const module-level globals).
a_c, a_p, θ_1 = rand(3)   # three scalar Float64 values
N = 1000                  # length of each data array
# Six GPU arrays of N Float64 values each.
θ_2p, θ_2c, ϵ_p, σ_p, ϵ_c, σ_c = ntuple(x->CUDA.randn(Float64, N), 6)


# Stand-in likelihood used to reproduce the problem: adds the three scalar
# parameters to the sums of the six data arrays.
#
# Arguments:
#   a_c, a_p, θ_1            – scalar Float64 parameters
#   θ_2p, θ_2c, ϵ_p, σ_p,
#   ϵ_c, σ_c                 – Float64 data arrays
# Returns a Float64.
#
# The array arguments are typed `AbstractArray{Float64}` rather than
# `CuArray{Float64}`. A `CuArray` still matches, but so does the `CuDeviceVector`
# that the data is converted to inside a GPU broadcast kernel; with the narrower
# signature no method applies there, and kernel compilation fails with
# "unsupported dynamic function invocation (call to ln_likelihood)".
# NOTE(review): whether the body itself (e.g. `sum` over a device array) compiles
# inside a kernel still has to be confirmed on a GPU.
function ln_likelihood( a_c::Float64,
                        a_p::Float64,
                        θ_1::Float64,
                        θ_2p::AbstractArray{Float64},
                        θ_2c::AbstractArray{Float64},
                        ϵ_p::AbstractArray{Float64},
                        σ_p::AbstractArray{Float64},
                        ϵ_c::AbstractArray{Float64},
                        σ_c::AbstractArray{Float64})
    # ... (the real computation goes here; the sum below is only a placeholder)
    return a_c + a_p + θ_1 + sum(θ_2p) + sum(θ_2c) + sum(ϵ_p) + sum(σ_p) + sum(ϵ_c) + sum(σ_c)
end


# Closure-based variant: the inner function uses this function's own arguments
# instead of module-level globals. Builds the likelihood as a function of an
# (a_c, a_p) pair and prints the inferred types for broadcasting it over a GPU
# array of such pairs.
function trova_max_likelihood(  θ_1::Float64,
                                θ_2p::CuArray{Float64},
                                θ_2c::CuArray{Float64},
                                ϵ_p::CuArray{Float64},
                                σ_p::CuArray{Float64},
                                ϵ_c::CuArray{Float64},
                                σ_c::CuArray{Float64})

    # ...

    # Likelihood as a function of (a_c, a_p) only; θ_1 … σ_c are captured from
    # the enclosing scope, so their concrete types become part of the closure type.
    function funzione_likelirobin(a_c::Float64, a_p::Float64)
        ln_likelihood(a_c, a_p, θ_1, θ_2p, θ_2c, ϵ_p, σ_p, ϵ_c, σ_c)
    end

    # Tuple-taking wrapper so the function can be broadcast over an array of pairs.
    funzione_likelihood(x::Tuple{Float64, Float64}) = funzione_likelirobin(x[1],x[2])

    # The question does not show how `range` is built, so create one here:
    # a 10-element GPU array filled with the pair (1.0, 2.0).
    # (This local name shadows `Base.range` inside the function.)
    range = CUDA.fill((1., 2.), 10)

    # Print the inferred types for the broadcast call.
    @code_warntype funzione_likelihood.(range)
    # Here range::CuArray{Tuple{Float64,Float64}}
end

yields a slightly different @code_warntype output, but still with blue, stably-inferred types and no error

julia> trova_max_likelihood(θ_1, θ_2p, θ_2c, ϵ_p, σ_p, ϵ_c, σ_c)
MethodInstance for (::var"##dotfunction#414#30"{var"#funzione_likelihood#29"{var"#funzione_likelirobin#28"{Float64, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}})(::CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer})
  from (::var"##dotfunction#414#30")(x1) in Main
Arguments
  #self#::var"##dotfunction#414#30"{var"#funzione_likelihood#29"{var"#funzione_likelirobin#28"{Float64, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}}
  x1::CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer}
Body::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}
1 ─ %1 = Core.getfield(#self#, :funzione_likelihood)::var"#funzione_likelihood#29"{var"#funzione_likelirobin#28"{Float64, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}
│   %2 = Base.broadcasted(%1, x1)::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1}, Nothing, var"#funzione_likelihood#29"{var"#funzione_likelirobin#28"{Float64, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}, CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}}}, Tuple{CuArray{Tuple{Float64, Float64}, 1, CUDA.Mem.DeviceBuffer}}}
│   %3 = Base.materialize(%2)::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}
└──      return %3