Two PyTorch DistributedSampler instances with the same seed produce different shuffling on multiple GPUs
Solution 1:
DistributedSampler is meant for distributed training, where each process should receive a different subset of the data, so it is not what you need here. A regular DataLoader with a seeded RandomSampler will do just fine.
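For context, here is a minimal sketch of what DistributedSampler does even when every rank uses the identical seed (ranks are simulated in a single process here; the dataset is just a list, since the sampler only needs __len__). Each rank still gets a disjoint shard of the shuffled indices, which is why two processes never see the same order:

from torch.utils.data import DistributedSampler

data = list(range(10))          # any object with __len__ is enough for the sampler
for rank in range(2):           # pretend we have two GPUs/processes
    sampler = DistributedSampler(data, num_replicas=2, rank=rank, shuffle=True, seed=42)
    # same seed on both ranks, but each rank receives a different half of the permutation
    print(rank, list(sampler))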
Example (plain DataLoader with a seeded RandomSampler):
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader, RandomSampler

class ToyDataset(Dataset):
    def __init__(self, type):
        self.type = type

    def __getitem__(self, idx):
        return f'{self.type}, {idx}'

    def __len__(self):
        return 10

def get_sampler(dataset, seed=42):
    # Each call builds a fresh generator seeded with the same value,
    # so every sampler created here yields the same index permutation.
    generator = torch.Generator()
    generator.manual_seed(seed)
    sampler = RandomSampler(dataset, generator=generator)
    return sampler

original_dataset = ToyDataset('original')
pcp_dataset = ToyDataset('pcp')

original_loader = DataLoader(original_dataset, batch_size=2, sampler=get_sampler(original_dataset))
pcp_loader = DataLoader(pcp_dataset, batch_size=2, sampler=get_sampler(pcp_dataset))

for data in original_loader:
    print(data)

for data in pcp_loader:
    print(data)
Output:
['original, 2', 'original, 6']
['original, 1', 'original, 8']
['original, 4', 'original, 5']
['original, 0', 'original, 9']
['original, 3', 'original, 7']
['pcp, 2', 'pcp, 6']
['pcp, 1', 'pcp, 8']
['pcp, 4', 'pcp, 5']
['pcp, 0', 'pcp, 9']
['pcp, 3', 'pcp, 7']
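Both loaders walk through the same permutation (2, 6, 1, 8, 4, 5, 0, 9, 3, 7) because each get_sampler call seeds a fresh generator with the same value. If you want to check that programmatically (reusing the objects defined above), something like this should hold:

# the two samplers emit identical index sequences because their generators share the seed
assert list(get_sampler(original_dataset)) == list(get_sampler(pcp_dataset))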