Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu

I get the following error message which I tried to deal with it by throwing .to(self.device) everywhere but it doesn't work.

  ab = torch.lgamma(torch.tensor(a+b, dtype=torch.float, requires_grad=True).to(device=local_device))
Traceback (most recent call last):
  File "Script.py", line 923, in <module>
    average_epoch_loss, out , elbo2 =train(epoch)
  File "Script.py", line 848, in train
    loss_dict = net.get_ELBO(X)
  File "Script.py", line 546, in get_ELBO
    elbo -= compute_kumar2beta_kld(self.kumar_a[:, k].to(self.device), self.kumar_b[:, k].to(self.device), self.prior, (self.K-1-k)* self.prior).mean().to(self.device)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

Here is a snippet of my script which is related to the error:

def compute_kumar2beta_kld(a, b, alpha, beta):
    SMALL = 1e-16
    EULER_GAMMA = 0.5772156649015329
    ab    = torch.mul(a,b)+ SMALL
    a_inv = torch.pow(a + SMALL, -1)
    b_inv = torch.pow(b + SMALL, -1)
    kl = torch.mul(torch.pow(1+ab,-1), beta_fn(a_inv, b))
    for idx in range(10):
        kl += torch.mul(torch.pow(idx+2+ab,-1), beta_fn(torch.mul(idx+2., a_inv), b))
    kl = torch.mul(torch.mul(beta-1,b), kl)
    psi_b = torch.digamma(b+SMALL)
    kl += torch.mul(torch.div(a-alpha,a+SMALL), -EULER_GAMMA - psi_b - b_inv)
    
    kl += torch.log(ab) + torch.log(beta_fn(alpha, beta) + SMALL)
    
    kl += torch.div(-(b-1),b +SMALL)
    return kl

class VAE(GMMVAE):
    def __init__(self, hyperParams, K, nchannel, base_channels, z_dim, w_dim, hidden_dim, device, img_width, batch_size, include_elbo2):
        global local_device
        local_device = device
        super(VAE, self).__init__(K, nchannel, base_channels, z_dim, w_dim, hidden_dim,  device, img_width, batch_size)

        self.prior      = hyperParams['prior']
        self.K          = hyperParams['K']
        self.z_dim      = hyperParams['latent_d']
        self.hidden_dim = hyperParams['hidden_d']
    def get_ELBO(self, X):

        elbo = torch.tensor(0, dtype=torch.float)
        if self.include_elbo2:
            for k in range(self.K-1):
                elbo -= compute_kumar2beta_kld(self.kumar_a[:, k], self.kumar_b[:, k], self.prior, (self.K-1-k)* self.prior).mean().to(self.device)
  

I appreciate it if someone can suggest how to fix the error.


Solution 1:

I am not sure if this is the "only" problem, but one of the device-related problems is this:

elbo = torch.tensor(0, dtype=torch.float) <- this will create the elbo tensor on CPU

and when you do, elbo -= <some result>,

The result is on cuda (or self.device). This will clearly cause a problem. To fix, this just do

elbo = torch.tensor(0, dtype=torch.float, device=self.device)