Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu
I get the following error message, which I tried to deal with by adding `.to(self.device)` everywhere, but it doesn't work.
ab = torch.lgamma(torch.tensor(a+b, dtype=torch.float, requires_grad=True).to(device=local_device))
Traceback (most recent call last):
File "Script.py", line 923, in <module>
average_epoch_loss, out , elbo2 =train(epoch)
File "Script.py", line 848, in train
loss_dict = net.get_ELBO(X)
File "Script.py", line 546, in get_ELBO
elbo -= compute_kumar2beta_kld(self.kumar_a[:, k].to(self.device), self.kumar_b[:, k].to(self.device), self.prior, (self.K-1-k)* self.prior).mean().to(self.device)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
Here is a snippet of my script which is related to the error:
def compute_kumar2beta_kld(a, b, alpha, beta):
    """Approximate KL(Kumaraswamy(a, b) || Beta(alpha, beta)).

    Uses a 10-term truncation of the infinite series approximating
    E_q[log(1 - v)] under the Kumaraswamy posterior (stick-breaking VAE).
    `beta_fn` is assumed to be the Beta-function helper defined elsewhere
    in this file — TODO confirm its signature matches (x, y) -> B(x, y).

    a, b:        Kumaraswamy parameters (tensors).
    alpha, beta: Beta prior parameters (presumably scalars; verify callers).
    Returns the elementwise KL estimate as a tensor.
    """
    eps = 1e-16                       # guards logs and divisions near zero
    euler_mascheroni = 0.5772156649015329
    ab = a * b + eps
    recip_a = torch.pow(a + eps, -1)
    recip_b = torch.pow(b + eps, -1)
    # Truncated Taylor-style series: first 11 terms of sum_m B(m/a, b)/(m + ab).
    series = torch.pow(1 + ab, -1) * beta_fn(recip_a, b)
    for m in range(10):
        series = series + torch.pow(m + 2 + ab, -1) * beta_fn((m + 2.0) * recip_a, b)
    kld = (beta - 1) * b * series
    digamma_b = torch.digamma(b + eps)
    kld = kld + ((a - alpha) / (a + eps)) * (-euler_mascheroni - digamma_b - recip_b)
    kld = kld + torch.log(ab) + torch.log(beta_fn(alpha, beta) + eps)
    kld = kld + (-(b - 1)) / (b + eps)
    return kld
class VAE(GMMVAE):
    """Stick-breaking VAE that extends GMMVAE with Kumaraswamy/Beta KL terms.

    NOTE(review): the snippet of get_ELBO below appears truncated in the
    original paste (no return statement is visible).
    """

    def __init__(self, hyperParams, K, nchannel, base_channels, z_dim, w_dim,
                 hidden_dim, device, img_width, batch_size, include_elbo2):
        # The module-level `local_device` is read by other helpers in this
        # file (e.g. the lgamma call shown above) — kept for compatibility.
        global local_device
        local_device = device
        super(VAE, self).__init__(K, nchannel, base_channels, z_dim, w_dim,
                                  hidden_dim, device, img_width, batch_size)
        self.prior = hyperParams['prior']
        self.K = hyperParams['K']
        self.z_dim = hyperParams['latent_d']
        self.hidden_dim = hyperParams['hidden_d']
        # Bug fix: `include_elbo2` was accepted but never stored, and it is
        # not forwarded to the base class either, yet get_ELBO reads
        # self.include_elbo2 — that would raise AttributeError.
        self.include_elbo2 = include_elbo2

    def get_ELBO(self, X):
        # Device fix: the accumulator must live on the same device as the
        # KL terms subtracted from it. torch.tensor(0, ...) defaults to CPU,
        # which caused "Expected all tensors to be on the same device,
        # but found at least two devices, cuda:0 and cpu".
        # Assumes the base class set self.device — TODO confirm in GMMVAE.
        elbo = torch.tensor(0.0, dtype=torch.float, device=self.device)
        if self.include_elbo2:
            for k in range(self.K - 1):
                # assumes self.kumar_a / self.kumar_b are produced on
                # self.device by the encoder — the trailing .to() is a
                # cheap no-op in that case and a safety net otherwise.
                elbo -= compute_kumar2beta_kld(
                    self.kumar_a[:, k], self.kumar_b[:, k],
                    self.prior, (self.K - 1 - k) * self.prior
                ).mean().to(self.device)
I would appreciate it if someone could suggest how to fix this error.
Solution 1:
I am not sure if this is the "only" problem, but one of the device-related problems is this:
elbo = torch.tensor(0, dtype=torch.float)
← this creates the `elbo` tensor on the CPU,
and when you do `elbo -= <some result>`, the result is on CUDA (i.e., on `self.device`). This will clearly cause a problem. To fix this, just do:
elbo = torch.tensor(0, dtype=torch.float, device=self.device)