Implementation of cosine learning rate training policy (#29440)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/29440

As titled; same as diff D18195868.
We fix the Windows compilation issue by changing the macro, inspired by D15511736.

Test Plan:
buck test -v 2 caffe2/caffe2/fb/dper/layer_models/tests/split_1:sparse_nn_test  -- test_composite_cosine_lr_policy
canary: https://fburl.com/fblearner/ky7wh3vg

Differential Revision: D18392276

fbshipit-source-id: 83c84c985cd23b1cc43efedfef176ff3c67acb6e
Jiang Liu 2019-11-08 12:26:07 -08:00 committed by Facebook Github Bot
parent edcf659e42
commit 3bc014ecf2
3 changed files with 151 additions and 1 deletion

caffe2/sgd/learning_rate_functors.h

@@ -1,9 +1,16 @@
 #ifndef CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_
 #define CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_
+#define _USE_MATH_DEFINES
 #include <cmath>
 #include <list>
 #include <map>
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#include <math.h>
+#endif // _MSC_VER
 #include "caffe2/core/context.h"
 #include "caffe2/core/operator.h"
@@ -287,6 +294,52 @@ class CyclicalLearningRate : public LearningRateFunctor<T> {
   T decay_;
 };
+
+// Cosine: return a learning rate with a cosine schedule between
+// lower bound min_lr and upper bound max_lr.
+// See https://arxiv.org/pdf/1608.03983.pdf
+template <typename T>
+class CosineLearningRate : public LearningRateFunctor<T> {
+ public:
+  CosineLearningRate(
+      const T min_lr,
+      const T max_lr,
+      const int64_t period,
+      const T t_mult,
+      const T lr_shrink)
+      : min_lr_(min_lr),
+        max_lr_(max_lr),
+        period_(period),
+        t_mult_(t_mult),
+        lr_shrink_(lr_shrink) {}
+
+  T operator()(const int64_t iter) const override {
+    T i, t_i, t_curr;
+    if (t_mult_ != 1.0) {
+      // the period is rescaled by t_mult_ after every restart
+      i = floor(
+          log(1 - double(iter) / double(period_) * (1.0 - t_mult_)) /
+          log(t_mult_));
+      t_i = pow(t_mult_, i) * period_;
+      t_curr = iter - (1.0 - pow(t_mult_, i)) / (1.0 - t_mult_) * period_;
+    } else {
+      // fixed period
+      i = floor(double(iter) / double(period_));
+      t_i = period_;
+      t_curr = iter - t_i * i;
+    }
+    T lr_shrink = pow(lr_shrink_, i);
+    T min_lr = min_lr_ * lr_shrink;
+    T max_lr = max_lr_ * lr_shrink;
+    T final_lr =
+        min_lr + 0.5 * (max_lr - min_lr) * (1 + cos(M_PI * t_curr / t_i));
+    return final_lr;
+  }
+
+  T min_lr_;
+  T max_lr_;
+  int64_t period_;
+  T t_mult_;
+  T lr_shrink_;
+};
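To make the schedule concrete, here is a minimal standalone sketch of the fixed-period branch (t_mult == 1); the free function and driver are illustrative, only the formula comes from the class above:

// Mirrors the fixed-period branch of CosineLearningRate::operator().
#include <cmath>
#include <cstdint>
#include <cstdio>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

// Illustrative free-function version of the schedule above.
double cosine_lr(int64_t iter, double min_lr, double max_lr,
                 int64_t period, double lr_shrink) {
  double i = std::floor(double(iter) / double(period)); // completed periods
  double t_curr = double(iter) - double(period) * i;    // position in period
  double shrink = std::pow(lr_shrink, i);               // decay per restart
  return shrink *
      (min_lr +
       0.5 * (max_lr - min_lr) * (1 + std::cos(M_PI * t_curr / double(period))));
}

int main() {
  // With min_lr=0.1, max_lr=0.5, period=50, lr_shrink=0.99:
  // iter 0 -> 0.5 (max), iter 25 -> 0.3 (midpoint), iter 49 -> ~0.1004,
  // iter 50 -> 0.495 (restart at max, shrunk once).
  for (int64_t iter : {0, 25, 49, 50}) {
    std::printf("iter %2lld -> lr %.4f\n", (long long)iter,
                cosine_lr(iter, 0.1, 0.5, 50, 0.99));
  }
  return 0;
}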
 // constantThenLinearWarmup: first use a constant multiplier
 // and then ramp up to the global lr
 template <typename T>
@@ -316,6 +369,47 @@ class ConstantThenLinearWarmupLearningRate : public LearningRateFunctor<T> {
   LinearWarmupLearningRate<T> linear_warmup_lr_;
 };
+
+// CompositeCosineLearningRate: first use a constant multiplier,
+// then ramp up to the global lr, and then follow a cosine learning rate
+template <typename T>
+class CompositeCosineLearningRate : public LearningRateFunctor<T> {
+ public:
+  CompositeCosineLearningRate(
+      const T start_warmup_multiplier,
+      const int64_t constant_warmup_num_iter,
+      const int64_t linear_warmup_num_iter,
+      const T cosine_min_lr,
+      const T cosine_max_lr,
+      const int64_t cosine_period,
+      const T cosine_t_mult,
+      const T cosine_lr_shrink)
+      : constant_warmup_num_iter_(constant_warmup_num_iter),
+        linear_warmup_num_iter_(linear_warmup_num_iter),
+        constant_then_linear_warmup_lr_(
+            start_warmup_multiplier,
+            constant_warmup_num_iter,
+            linear_warmup_num_iter),
+        cosine_lr_(
+            cosine_min_lr,
+            cosine_max_lr,
+            cosine_period,
+            cosine_t_mult,
+            cosine_lr_shrink) {}
+
+  T operator()(const int64_t iter) const override {
+    if (iter < constant_warmup_num_iter_ + linear_warmup_num_iter_) {
+      return constant_then_linear_warmup_lr_(iter);
+    }
+    return cosine_lr_(
+        iter - constant_warmup_num_iter_ - linear_warmup_num_iter_);
+  }
+
+  int64_t constant_warmup_num_iter_;
+  int64_t linear_warmup_num_iter_;
+  ConstantThenLinearWarmupLearningRate<T> constant_then_linear_warmup_lr_;
+  CosineLearningRate<T> cosine_lr_;
+};
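The handoff is the subtle part: warmup owns iterations [0, constant_warmup_num_iter + linear_warmup_num_iter), and the cosine functor then receives a counter re-based to zero, so the cosine phase always begins at cosine_max_lr no matter how long warmup ran. A self-contained toy model of that behavior, with illustrative constants, lr_shrink omitted for brevity, and a simplified ramp that assumes LinearWarmupLearningRate moves the multiplier linearly from start_warmup_multiplier to 1:

// Toy model of CompositeCosineLearningRate's phase dispatch; all constants
// and the stand-in warmup ramp are illustrative, not from the commit.
#include <cmath>
#include <cstdint>
#include <cstdio>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

double composite_lr(int64_t iter) {
  const int64_t constant_iters = 100, linear_iters = 100;
  const double start_mult = 0.1, min_lr = 0.1, max_lr = 0.5, period = 50.0;
  if (iter < constant_iters) {
    return start_mult; // constant warmup multiplier
  }
  if (iter < constant_iters + linear_iters) {
    // stand-in linear ramp from start_mult toward 1.0
    double frac = double(iter - constant_iters) / double(linear_iters);
    return start_mult + (1.0 - start_mult) * frac;
  }
  // cosine phase: counter re-based to zero, so it starts at max_lr
  double t_curr =
      std::fmod(double(iter - constant_iters - linear_iters), period);
  return min_lr + 0.5 * (max_lr - min_lr) * (1 + std::cos(M_PI * t_curr / period));
}

int main() {
  // 0 -> 0.1 (constant), 150 -> 0.55 (ramp), 200 -> 0.5 (cosine restart),
  // 225 -> 0.3 (cosine midpoint).
  for (int64_t iter : {0, 150, 200, 225}) {
    std::printf("iter %3lld -> %.3f\n", (long long)iter, composite_lr(iter));
  }
  return 0;
}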
 // CompositeCyclicalLearningRate: first use a constant multiplier
 // and then ramp up to the global lr, and then use a cyclical learning rate
 template <typename T>

caffe2/sgd/learning_rate_op.cc

@@ -33,8 +33,10 @@ Required:
 `hill`: uses those in both `linearWarmup` and `inv`, plus `end_multiplier`
 `composite`: uses `sub_policy_num_iters` and additional args with format
 `cyclic`: uses `max_lr`, `stepsize`
+`cosine`: uses `min_lr`, `max_lr`, `period`, `t_mult`, `lr_shrink`
 `constantThenLinearWarmup`: uses `start_warmup_multiplier`, `constant_warmup_num_iter`, `linear_warmup_num_iter`
 `compositeCyclical`: uses `start_warmup_multiplier`, `constant_warmup_num_iter`, `linear_warmup_num_iter`, `cyclical_max_lr`, `cyclical_step_size`, `cyclical_decay`
+`compositeCosine`: uses `start_warmup_multiplier`, `constant_warmup_num_iter`, `linear_warmup_num_iter`, `cosine_min_lr`, `cosine_max_lr`, `cosine_period`, `cosine_t_mult`, `cosine_lr_shrink`
 sub_policy_{sub_policy_index}_{sub_policy_arg}, for example:
 sub_policy_0_policy: "exp", sub_policy_0_gamma: 0.99,
 sub_policy_0_lr_scale: 1.2
@@ -58,10 +60,15 @@ Optional:
 `m3`: defaults to 0.5, the third piece lr of piece warmup
 `start_warmup_multiplier`: defaults to 0.1, part of constantThenLinearWarmup
 `constant_warmup_num_iter`: defaults to 10000000, part of constantThenLinearWarmup, CompositeCyclicalLRPolicy, CompositeCosineLRPolicy
-`linear_warmup_num_iter`: defaults to 10000000, part of constantThenLinearWarmup and CompositeCyclicalLRPolicy
+`linear_warmup_num_iter`: defaults to 10000000, part of constantThenLinearWarmup, CompositeCyclicalLRPolicy, CompositeCosineLRPolicy
 `cyclical_max_lr`: defaults to 0.05, part of CompositeCyclicalLRPolicy
 `cyclical_step_size`: defaults to 1000000, part of CompositeCyclicalLRPolicy
 `cyclical_decay`: defaults to 1.0, part of CompositeCyclicalLRPolicy
+`cosine_min_lr`: defaults to 0.1, part of CompositeCosineLRPolicy
+`cosine_max_lr`: defaults to 0.5, part of CompositeCosineLRPolicy
+`cosine_period`: defaults to 50, part of CompositeCosineLRPolicy
+`cosine_t_mult`: defaults to 1.0, part of CompositeCosineLRPolicy
+`cosine_lr_shrink`: defaults to 0.99, part of CompositeCosineLRPolicy
 
 Usage:
 train_net.LearningRate(*iterations*, "*label*", base_lr=*float*,
@@ -120,6 +127,13 @@ Example usage:
     .Arg(
         "cyclical_decay",
         "defaults to 0.999, part of CompositeCyclicalLRPolicy")
+    .Arg("cosine_min_lr", "defaults to 0.1, part of CompositeCosineLRPolicy")
+    .Arg("cosine_max_lr", "defaults to 0.5, part of CompositeCosineLRPolicy")
+    .Arg("cosine_period", "defaults to 50, part of CompositeCosineLRPolicy")
+    .Arg("cosine_t_mult", "defaults to 1.0, part of CompositeCosineLRPolicy")
+    .Arg(
+        "cosine_lr_shrink",
+        "defaults to 0.99, part of CompositeCosineLRPolicy")
     .Input(0, "input", "description needed")
     .Output(0, "output", "description needed")
     .DeviceInferenceFunction([](const OperatorDef& def) {
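For reference, one way these arguments could be attached when constructing the op directly from C++. The argument names (policy, base_lr, and the cosine_* args) come from the schema above; the builder itself is only an illustrative sketch using the generated protobuf API, and the input/output names and values are placeholders:

// Illustrative construction of a LearningRate OperatorDef that selects the
// new compositeCosine policy; values mirror the documented defaults.
#include "caffe2/proto/caffe2_pb.h"

caffe2::OperatorDef MakeCompositeCosineLR() {
  caffe2::OperatorDef op;
  op.set_type("LearningRate");
  op.add_input("iteration");
  op.add_output("lr");

  auto add_f = [&op](const char* name, float v) {
    auto* arg = op.add_arg();
    arg->set_name(name);
    arg->set_f(v);
  };
  auto add_i = [&op](const char* name, int64_t v) {
    auto* arg = op.add_arg();
    arg->set_name(name);
    arg->set_i(v);
  };

  auto* policy = op.add_arg();
  policy->set_name("policy");
  policy->set_s("compositeCosine");

  add_f("base_lr", 0.1f);
  add_f("start_warmup_multiplier", 0.1f);
  add_i("constant_warmup_num_iter", 10000000);
  add_i("linear_warmup_num_iter", 10000000);
  add_f("cosine_min_lr", 0.1f);
  add_f("cosine_max_lr", 0.5f);
  add_i("cosine_period", 50);
  add_f("cosine_t_mult", 1.0f);
  add_f("cosine_lr_shrink", 0.99f);
  return op;
}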

caffe2/sgd/learning_rate_op.h

@@ -210,6 +210,48 @@ class LearningRateOp final : public Operator<Context> {
           cyclical_max_lr,
           cyclical_step_size,
           cyclical_decay);
+    } else if (policy == "cosine") {
+      T max_lr =
+          this->template GetSingleArgument<float>(arg_prefix + "max_lr", 0.5);
+      T min_lr =
+          this->template GetSingleArgument<float>(arg_prefix + "min_lr", 0.1);
+      int64_t period =
+          this->template GetSingleArgument<int>(arg_prefix + "period", 50);
+      T t_mult =
+          this->template GetSingleArgument<float>(arg_prefix + "t_mult", 1.0);
+      T lr_shrink = this->template GetSingleArgument<float>(
+          arg_prefix + "lr_shrink", 0.99);
+      DCHECK_GE(max_lr, min_lr);
+      return new CosineLearningRate<T>(
+          min_lr, max_lr, period, t_mult, lr_shrink);
+    } else if (policy == "compositeCosine") {
+      T start_warmup_multiplier = this->template GetSingleArgument<float>(
+          arg_prefix + "start_warmup_multiplier", 0.1);
+      int64_t constant_warmup_num_iter = this->template GetSingleArgument<int>(
+          arg_prefix + "constant_warmup_num_iter", 10000000);
+      int64_t linear_warmup_num_iter = this->template GetSingleArgument<int>(
+          arg_prefix + "linear_warmup_num_iter", 10000000);
+      T cosine_max_lr = this->template GetSingleArgument<float>(
+          arg_prefix + "cosine_max_lr", 0.5);
+      T cosine_min_lr = this->template GetSingleArgument<float>(
+          arg_prefix + "cosine_min_lr", 0.1);
+      int64_t cosine_period = this->template GetSingleArgument<int>(
+          arg_prefix + "cosine_period", 50);
+      T cosine_t_mult = this->template GetSingleArgument<float>(
+          arg_prefix + "cosine_t_mult", 1.0);
+      T cosine_lr_shrink = this->template GetSingleArgument<float>(
+          arg_prefix + "cosine_lr_shrink", 0.99);
+      DCHECK_GE(cosine_max_lr, cosine_min_lr);
+      return new CompositeCosineLearningRate<T>(
+          start_warmup_multiplier,
+          constant_warmup_num_iter,
+          linear_warmup_num_iter,
+          cosine_min_lr,
+          cosine_max_lr,
+          cosine_period,
+          cosine_t_mult,
+          cosine_lr_shrink);
+    } else {
+      CAFFE_THROW("Unknown learning rate policy: ", policy);
+      return NULL;
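A quick way to sanity-check the warm-restart branch (t_mult != 1), which none of the defaults above exercise: the sketch below assumes the header from this diff is on the include path; with t_mult = 2 each cosine period doubles, so restarts land at iterations 50, 150, 350, and each restart is further damped by lr_shrink:

// Illustrative driver for CosineLearningRate's warm-restart branch.
#include <cstdint>
#include <cstdio>

#include "caffe2/sgd/learning_rate_functors.h"

int main() {
  caffe2::CosineLearningRate<float> lr(
      /*min_lr=*/0.1f,
      /*max_lr=*/0.5f,
      /*period=*/50,
      /*t_mult=*/2.0f,
      /*lr_shrink=*/0.99f);
  // Expected shape: 0.5 at iter 0; ~0.1 just before iter 50; restart to
  // 0.5 * 0.99 at iter 50; the second period then runs for 100 iterations.
  for (int64_t iter : {0, 49, 50, 149, 150}) {
    std::printf("iter %3lld -> lr %.4f\n", (long long)iter, lr(iter));
  }
  return 0;
}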