import torch
import argparse
import torch.backends.cudnn as cudnn
import torch.multiprocessing as mp
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data.distributed
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms, models
import horovod.torch as hvd
import os
import math
from tqdm import tqdm
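
# Elastic Horovod: this script is meant to be launched with a variable number
# of workers, e.g. (worker counts and discovery script are illustrative):
#   horovodrun -np 8 --min-np 4 --max-np 12 \
#       --host-discovery-script ./discover_hosts.sh python <this script>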

# Training settings
parser = argparse.ArgumentParser(description='Elastic PyTorch ImageNet Example',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--train-dir', default=os.path.expanduser('~/imagenet/train'),
                    help='path to training data')
parser.add_argument('--val-dir', default=os.path.expanduser('~/imagenet/validation'),
                    help='path to validation data')
parser.add_argument('--log-dir', default='./logs',
                    help='tensorboard log directory')
parser.add_argument('--checkpoint-format', default='./checkpoint-{epoch}.pth.tar',
                    help='checkpoint file format')

# Horovod settings
parser.add_argument('--fp16-allreduce', action='store_true', default=False,
                    help='use fp16 compression during allreduce')
parser.add_argument('--batches-per-allreduce', type=int, default=1,
                    help='number of batches processed locally before '
                         'executing allreduce across workers; it multiplies '
                         'total batch size.')
parser.add_argument('--use-adasum', action='store_true', default=False,
                    help='use adasum algorithm to do reduction')
parser.add_argument('--gradient-predivide-factor', type=float, default=1.0,
                    help='apply gradient predivide factor in optimizer (default: 1.0)')

# Elastic Horovod settings
parser.add_argument('--batches-per-commit', type=int, default=100,
                    help='number of batches processed before calling `state.commit()`; '
                         'commits prevent losing progress if an error occurs, but slow '
                         'down training.')
parser.add_argument('--batches-per-host-check', type=int, default=10,
                    help='number of batches processed before calling '
                         '`state.check_host_updates()`; this check is very fast '
                         'compared to `state.commit()` (which calls it as part of the '
                         'commit process), but it still incurs some cost due to the '
                         'broadcast, so we may not want to perform it every batch.')

# Default settings from https://arxiv.org/abs/1706.02677.
parser.add_argument('--batch-size', type=int, default=32,
                    help='input batch size for training')
parser.add_argument('--val-batch-size', type=int, default=32,
                    help='input batch size for validation')
parser.add_argument('--epochs', type=int, default=90,
                    help='number of epochs to train')
parser.add_argument('--base-lr', type=float, default=0.0125,
                    help='learning rate for a single GPU')
parser.add_argument('--warmup-epochs', type=float, default=5,
                    help='number of warmup epochs')
parser.add_argument('--momentum', type=float, default=0.9,
                    help='SGD momentum')
parser.add_argument('--wd', type=float, default=0.00005,
                    help='weight decay')

parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training')
parser.add_argument('--seed', type=int, default=42,
                    help='random seed')


def train(state):
    model.train()
    epoch = state.epoch
    train_loss = Metric('train_loss')
    train_accuracy = Metric('train_accuracy')

    batch_offset = state.batch
    with tqdm(total=len(train_loader),
              desc='Train Epoch #{}'.format(epoch + 1),
              disable=not verbose) as t:
        for idx, (data, target) in enumerate(train_loader):
            # Elastic Horovod: update the current batch index this epoch
            # and commit / check for host updates. Do not check hosts when
            # we commit as it would be redundant.
            state.batch = batch_idx = batch_offset + idx
            if args.batches_per_commit > 0 and \
                    state.batch % args.batches_per_commit == 0:
                state.commit()
            elif args.batches_per_host_check > 0 and \
                    state.batch % args.batches_per_host_check == 0:
                state.check_host_updates()
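
            # Elastic Horovod: `check_host_updates()` raises HostsUpdatedInterrupt
            # when the worker set has changed; `hvd.elastic.run` then restores the
            # last committed state and re-enters the wrapped function.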

            adjust_learning_rate(epoch, batch_idx)

            if args.cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            # Split data into sub-batches of size batch_size
            for i in range(0, len(data), args.batch_size):
                data_batch = data[i:i + args.batch_size]
                target_batch = target[i:i + args.batch_size]
                output = model(data_batch)
                train_accuracy.update(accuracy(output, target_batch))
                loss = F.cross_entropy(output, target_batch)
                train_loss.update(loss)
                # Average gradients among sub-batches
                loss.div_(math.ceil(float(len(data)) / args.batch_size))
                loss.backward()
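                # Dividing each sub-batch loss by the number of sub-batches makes
                # the accumulated gradient equal to the gradient of the mean loss
                # over the full allreduce batch.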

            # Elastic Horovod: record which samples were processed this batch
            # so we do not reprocess them if a reset event occurs
            state.train_sampler.record_batch(idx, allreduce_batch_size)

            # Gradient is applied across all ranks
            optimizer.step()
            t.set_postfix({'loss': train_loss.avg.item(),
                           'accuracy': 100. * train_accuracy.avg.item()})

            t.update(1)

    if log_writer:
        log_writer.add_scalar('train/loss', train_loss.avg, epoch)
        log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)

    state.commit()


def validate(epoch):
    model.eval()
    val_loss = Metric('val_loss')
    val_accuracy = Metric('val_accuracy')

    with tqdm(total=len(val_loader),
              desc='Validate Epoch #{}'.format(epoch + 1),
              disable=not verbose) as t:
        with torch.no_grad():
            for data, target in val_loader:
                if args.cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)

                val_loss.update(F.cross_entropy(output, target))
                val_accuracy.update(accuracy(output, target))
                t.set_postfix({'loss': val_loss.avg.item(),
                               'accuracy': 100. * val_accuracy.avg.item()})
                t.update(1)

    if log_writer:
        log_writer.add_scalar('val/loss', val_loss.avg, epoch)
        log_writer.add_scalar('val/accuracy', val_accuracy.avg, epoch)


# Horovod: using `lr = base_lr * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = base_lr` ---> `lr = base_lr * hvd.size()` during
# the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
# After the warmup, reduce the learning rate by 10x at the 30th, 60th, and 80th epochs.
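# For example, with the defaults (base_lr=0.0125, batch_size=32,
# batches_per_allreduce=1) on 8 workers, the post-warmup learning rate is
# 0.0125 * 8 = 0.1 for an effective batch of 256, matching the linear
# scaling rule from the paper above.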
def adjust_learning_rate(epoch, batch_idx):
    if epoch < args.warmup_epochs:
        epoch += float(batch_idx + 1) / len(train_loader)
        lr_adj = 1. / hvd.size() * (epoch * (hvd.size() - 1) / args.warmup_epochs + 1)
    elif epoch < 30:
        lr_adj = 1.
    elif epoch < 60:
        lr_adj = 1e-1
    elif epoch < 80:
        lr_adj = 1e-2
    else:
        lr_adj = 1e-3
    for param_group in optimizer.param_groups:
        param_group['lr'] = args.base_lr * hvd.size() * args.batches_per_allreduce * lr_adj


def accuracy(output, target):
    # get the index of the max log-probability
    pred = output.max(1, keepdim=True)[1]
    return pred.eq(target.view_as(pred)).cpu().float().mean()


def save_checkpoint(epoch):
    if hvd.rank() == 0:
        filepath = args.checkpoint_format.format(epoch=epoch + 1)
        state = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        torch.save(state, filepath)


def end_epoch(state):
    state.epoch += 1
    state.batch = 0
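    # Advance the sampler to the new epoch; this also clears its record of
    # already-processed indices so the next epoch sees the full dataset.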
    state.train_sampler.set_epoch(state.epoch)
    state.commit()


# Horovod: average metrics from distributed training.
class Metric(object):
    def __init__(self, name):
        self.name = name
        self.sum = torch.tensor(0.)
        self.n = torch.tensor(0.)

    def update(self, val):
        self.sum += hvd.allreduce(val.detach().cpu(), name=self.name)
        self.n += 1

    @property
    def avg(self):
        return self.sum / self.n


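# Elastic Horovod: `hvd.elastic.run` wraps the training loop so that when a
# worker fails or hosts are added/removed, the last committed state is
# restored on the active workers and the function is re-executed from there.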
@hvd.elastic.run
def full_train(state):
    while state.epoch < args.epochs:
        train(state)
        validate(state.epoch)
        save_checkpoint(state.epoch)
        end_epoch(state)


if __name__ == '__main__':
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    allreduce_batch_size = args.batch_size * args.batches_per_allreduce
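    # Each rank draws `allreduce_batch_size` samples per step and splits them
    # into sub-batches of `batch_size`, so the effective global batch is
    # batch_size * batches_per_allreduce * hvd.size().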

    hvd.init()
    torch.manual_seed(args.seed)

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    cudnn.benchmark = True

    # Horovod: print logs on the first worker.
    verbose = 1 if hvd.rank() == 0 else 0

    # Horovod: write TensorBoard logs on first worker.
    log_writer = SummaryWriter(args.log_dir) if hvd.rank() == 0 else None

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(4)

    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe.
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and
            mp._supports_context and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    train_dataset = \
        datasets.ImageFolder(args.train_dir,
                             transform=transforms.Compose([
                                 transforms.RandomResizedCrop(224),
                                 transforms.RandomHorizontalFlip(),
                                 transforms.ToTensor(),
                                 transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                      std=[0.229, 0.224, 0.225])
                             ]))
    # Elastic Horovod: use ElasticSampler to partition data among workers.
    train_sampler = hvd.elastic.ElasticSampler(train_dataset)
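    # The sampler re-partitions the remaining indices whenever the worker set
    # changes; together with `record_batch()` in train(), samples already
    # processed in the current epoch are not replayed after a reset.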
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=allreduce_batch_size,
        sampler=train_sampler,
        **kwargs)

    val_dataset = \
        datasets.ImageFolder(args.val_dir,
                             transform=transforms.Compose([
                                 transforms.Resize(256),
                                 transforms.CenterCrop(224),
                                 transforms.ToTensor(),
                                 transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                      std=[0.229, 0.224, 0.225])
                             ]))
    val_sampler = hvd.elastic.ElasticSampler(val_dataset)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.val_batch_size,
        sampler=val_sampler,
        **kwargs)

    # Set up standard ResNet-50 model.
    model = models.resnet50()

    # By default, Adasum doesn't need scaling up learning rate.
    # For sum/average with gradient accumulation: scale learning rate by batches_per_allreduce.
    lr_scaler = args.batches_per_allreduce * hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = args.batches_per_allreduce * hvd.local_size()

    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.SGD(model.parameters(),
                          lr=(args.base_lr * lr_scaler),
                          momentum=args.momentum, weight_decay=args.wd)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters(),
        compression=compression,
        backward_passes_per_step=args.batches_per_allreduce,
        op=hvd.Adasum if args.use_adasum else hvd.Average,
        gradient_predivide_factor=args.gradient_predivide_factor)
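    # Setting `backward_passes_per_step` to `batches_per_allreduce` defers the
    # allreduce until all sub-batch gradients have been accumulated, rather
    # than synchronizing after every backward pass.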

    # Restore from the most recent checkpoint, if one exists.
    # Horovod: restore on the first worker, which will broadcast weights to other workers.
    resume_from_epoch = 0
    if hvd.rank() == 0:
        for try_epoch in range(args.epochs, 0, -1):
            if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
                resume_from_epoch = try_epoch
                break

        if resume_from_epoch > 0:
            filepath = args.checkpoint_format.format(epoch=resume_from_epoch)
            checkpoint = torch.load(filepath)
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])

    state = hvd.elastic.TorchState(model=model,
                                   optimizer=optimizer,
                                   train_sampler=train_sampler,
                                   val_sampler=val_sampler,
                                   epoch=resume_from_epoch,
                                   batch=0)
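
    # Elastic Horovod: objects registered with TorchState are saved in memory
    # on each `state.commit()` and synchronized across workers when training
    # resumes after a reset event.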

    full_train(state)