Flux.jl : Customizing optimizer
I'm trying to implement a gradient-free optimizer function to train convolutional neural networks with Julia using Flux.jl. The reference paper is this: https://arxiv.org/abs/2005.05955. This paper proposes RSO, a gradient-free optimization algorithm updates single weight at a time on a sampling bases. The pseudocode of this algorithm is depicted in the picture below.
optimizer_pseudocode
I'm using MNIST dataset.
function train(; kws...)
args = Args(; kws...) # collect options in a stuct for convinience
if CUDA.functional() && args.use_cuda
@info "Training on CUDA GPU"
CUDA.allwoscalar(false)
device = gpu
else
@info "Training on CPU"
device = cpu
end
# Prepare datasets
x_train, x_test, y_train, y_test = getdata(args, device)
# Create DataLoaders (mini-batch iterators)
train_loader = DataLoader((x_train, y_train), batchsize=args.batchsize, shuffle=true)
test_loader = DataLoader((x_test, y_test), batchsize=args.batchsize)
# Construct model
model = build_model() |> device
ps = Flux.params(model) # model's trainable parameters
best_param = ps
if args.optimiser == "SGD"
# Regular training step with SGD
elseif args.optimiser == "RSO"
# Run RSO function and update ps
best_param .= RSO(x_train, y_train, args.RSOupdate, model, args.batchsize, device)
end
And the corresponding RSO function:
function RSO(X,L,C,model, batch_size, device)
"""
model = convolutional model structure
X = Input data
L = labels
C = Number of rounds to update parameters
W = Weight set of layers
Wd = Weight tensors of layer d that generates an activation
wid = weight tensor that generates an activation aᵢ
wj = a weight in wid
"""
# Normalize input data to have zero mean and unit standard deviation
X .= (X .- sum(X))./std(X)
train_loader = DataLoader((X, L), batchsize=batch_size, shuffle=true)
#println("model = $(typeof(model))")
std_prep = []
σ_d = Float64[]
D = 1
for layer in model
D += 1
Wd = Flux.params(layer)
# Initialize the weights of the network with Gaussian distribution
for id in Wd
wj = convert(Array{Float32, 4}, rand(Normal(0, sqrt(2/length(id))), (3,3,4,4)))
id = wj
append!(std_prep, vec(wj))
end
# Compute std of all elements in the weight tensor Wd
push!(σ_d, std(std_prep))
end
W = Flux.params(model)
# Weight update
for _ in 1:C
d = D
while d > 0
for id in 1:length(W[d])
# Randomly sample change in weights from Gaussian distribution
for j in 1:length(w[d][id])
# Randomly sample mini-batch
(x, l) = train_loader[rand(1:length(train_loader))]
# Sample a weight from normal distribution
ΔWj[d][id][j] = rand(Normal(0, σ_d[d]), 1)
loss, acc = loss_and_accuracy(data_loader, model, device)
W = argmin(F(x,l, W+ΔWj), F(x,l,W), F(x,l, W-ΔWj))
end
end
d -= 1
end
end
return W
end
The problem here is the second block of the RSO function. I'm trying to evaluate the loss with the change of single weight in three scenarios, which are F(w, l, W+gW), F(w, l, W), F(w, l, W-gW)
, and choose the weight-set with minimum loss. But how do I do that using Flux.jl? The loss function I'm trying to use is logitcrossentropy(ŷ, y, agg=sum)
. In order to generate y_hat, we should use model(W), but changing single weight parameter in Zygote.Params() form was already challenging....
Solution 1:
Based on the paper you shared, it looks like you need to change the weight arrays per each output neuron per each layer. Unfortunately, this means that the implementation of your optimization routine is going to depend on the layer type, since an "output neuron" for a convolution layer is quite different than a fully-connected layer. In other words, just looping over Flux.params(model)
is not going to be sufficient, since this is just a set of all the weight arrays in the model and each weight array is treated differently depending on which layer it comes from.
Fortunately, Julia's multiple dispatch does make this easier to write if you use separate functions instead of a giant loop. I'll summarize the algorithm using the pseudo-code below:
for layer in model
for output_neuron in layer
for weight_element in parameters(output_neuron)
weight_element = sample(N(0, sqrt(2 / num_outputs(layer))))
end
end
sigmas[layer] = stddev(parameters(layer))
end
for c in 1 to C
for layer in reverse(model)
for output_neuron in layer
for weight_element in parameters(output_neuron)
x, y = sample(batches)
dw = N(0, sigmas[layer])
# optimize weights
end
end
end
end
It's the for output_neuron ...
portions that we need to isolate into separate functions.
In the first block, we don't actually do anything different to every weight_element
, they are all sampled from the same normal distribution. So, we don't actually need to iterate the output neurons, but we do need to know how many there are.
using Statistics: std
# this function will set the weights according to the
# normal distribution and the number of output neurons
# it also returns the standard deviation of the weights
function sample_weight!(layer::Dense)
sample = randn(eltype(layer.weight), size(layer.weight))
num_outputs = size(layer.weight, 1)
# notice the "." notation which is used to mutate the array
layer.weight .= sample .* num_outputs
return std(layer.weight)
end
function sample_weight!(layer::Conv)
sample = randn(eltype(layer.weight), size(layer.weight))
num_outputs = size(layer.weight, 4)
# notice the "." notation which is used to mutate the array
layer.weight .= sample .* num_outputs
return std(layer.weight)
end
sigmas = map(sample_weights!, model)
Now, for the second block, we will do a similar trick by defining different functions for each layer.
function optimize_layer!(loss, layer::Dense, data, sigma)
for i in 1:size(layer.weight, 1)
for j in 1:size(layer.weight, 2)
wj = layer.weight[i, j]
x, y = data[rand(1:length(data))]
dw = randn() * sigma
ws = [wj + dw, wj, wj - dw]
losses = Float32[]
for (k, w) in enumerate(ws)
layer.weight[i, j] = w
losses[k] = loss(x, y)
end
layer.weight[i, j] = ws[argmin(losses)]
end
end
end
function optimize_layer!(loss, layer::Conv, data, sigma)
for i in 1:size(layer.weight, 4)
# we use a view to reference the full kernel
# for this output channel
wid = view(layer.weight, :, :, :, i)
# each index let's us treat wid like a vector
for j in eachindex(wid)
wj = wid[j]
x, y = data[rand(1:length(data))]
dw = randn() * sigma
ws = [wj + dw, wj, wj - dw]
losses = Float32[]
for (k, w) in enumerate(ws)
wid[j] = w
losses[k] = loss(x, y)
end
wid[j] = ws[argmin(losses)]
end
end
end
for c in 1:C
for (layer, sigma) in reverse(zip(model, sigmas))
optimize_layer!(layer, data, sigma) do x, y
logitcrossentropy(model(x), y; agg = sum)
end
end
end
Notice that nowhere did I use Flux.params
which does not help us here. Also, Flux.params
would include both the weight and bias, and the paper doesn't look like it bothers with the bias at all. If you had an optimization method that generically optimized any parameter regardless of layer type the same (i.e. like gradient descent), then you could use for p in Flux.params(model) ...
.
Solution 2:
Thanks @darsnack :)
I found your answer a bit late, so in the meantime I could figure out my own script that works. Mine is indeed a bit hardcoded but could you also give feedback on this?
function RSO(train_loader, test_loader, C,model, batch_size, device, args)
"""
model = convolutional model structure
C = Number of rounds to update parameters (epochs)
batch_size = size of the mini batch that will be used to calculate loss
device = CPU or GPU
"""
# Evaluate initial weight
test_loss, test_acc = loss_and_accuracy(test_loader, model, device)
println("Initial Weight:")
println(" test_loss = $test_loss, test_accuracy = $test_acc")
random_batch = []
for (x, l) in train_loader
push!(random_batch, (x,l))
end
# Initialize weights
std_prep = []
σ_d = Float64[]
D = 0
for layer in model
D += 1
Wd = Flux.params(layer)
# Initialize the weights of the network with Gaussian distribution
for id in Wd
if typeof(id) == Array{Float32, 4}
wj = convert(Array{Float32, 4}, rand(Normal(0, sqrt(2/length(id))), size(id)))
elseif typeof(id) == Vector{Float32}
wj = convert(Vector{Float32}, rand(Normal(0, sqrt(2/length(id))), length(id)))
elseif typeof(id) == Matrix{Float32}
wj = convert(Matrix{Float32}, rand(Normal(0, sqrt(2/length(id))), size(id)))
end
id = wj
append!(std_prep, vec(wj))
end
# Compute std of all elements in the weight tensor Wd
push!(σ_d, std(std_prep))
end
# Weight update
for c in 1:C
d = D
# First update the weights of the layer closest to the labels
# and then sequentially move closer to the input
while d > 0
Wd = Flux.params(model[d])
for id in Wd
# Randomly sample change in weights from Gaussian distribution
for j in 1:length(id)
# Randomly sample mini-batch
(x, y) = rand(random_batch, 1)[1]
x, y = device(x), device(y)
# Sample a weight from normal distribution
ΔWj = rand(Normal(0, σ_d[d]), 1)[1]
# Weight update with three scenario
## F(x,l, W+ΔWj)
id[j] = id[j]+ΔWj
ŷ = model(x)
ls_pos = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]
## F(x,l,W)
id[j] = id[j]-ΔWj
ŷ = model(x)
ls_org = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]
## F(x,l, W-ΔWj)
id[j] = id[j]-ΔWj
ŷ = model(x)
ls_neg = logitcrossentropy(ŷ, y, agg=sum) / size(x)[end]
# Check weight update that gives minimum loss
min_loss = argmin([ls_org, ls_pos, ls_neg])
# Save weight update with minimum loss
if min_loss == 1
id[j] = id[j] + ΔWj
elseif min_loss == 2
id[j] = id[j] + 2*ΔWj
elseif min_loss == 3
id[j] = id[j]
end
end
end
d -= 1
end
train_loss, train_acc = loss_and_accuracy(train_loader, model, device)
test_loss, test_acc = loss_and_accuracy(test_loader, model, device)
track!(args.tracker, test_acc)
println("RSO Round=$c")
println(" train_loss = $train_loss, train_accuracy = $train_acc")
println(" test_loss = $test_loss, test_accuracy = $test_acc")
end
return Flux.params(model)
end