mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Implementation of cosine learning rate training policy (#29440)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/29440 as titled. same as diff: D18195868. We fix the windows compiling issue by changing the macro, inspired from: D15511736 Test Plan: buck test -v 2 caffe2/caffe2/fb/dper/layer_models/tests/split_1:sparse_nn_test -- test_composite_cosine_lr_policy canary: https://fburl.com/fblearner/ky7wh3vg Differential Revision: D18392276 fbshipit-source-id: 83c84c985cd23b1cc43efedfef176ff3c67acb6e
This commit is contained in:
parent
edcf659e42
commit
3bc014ecf2
|
|
@ -1,9 +1,16 @@
|
|||
#ifndef CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_
|
||||
#define CAFFE2_SGD_LEARNING_RATE_FUNCTORS_H_
|
||||
#define _USE_MATH_DEFINES
|
||||
|
||||
#include <cmath>
|
||||
#include <list>
|
||||
#include <map>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _USE_MATH_DEFINES
|
||||
#include <math.h>
|
||||
#endif // _MSC_VER
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
|
|
@ -287,6 +294,52 @@ class CyclicalLearningRate : public LearningRateFunctor<T> {
|
|||
T decay_;
|
||||
};
|
||||
|
||||
// Cosine: return a learning rate with a cosine schedule
|
||||
// lower bound min_lr, upper bound max_lr.
|
||||
// See https://arxiv.org/pdf/1608.03983.pdf
|
||||
template <typename T>
|
||||
class CosineLearningRate : public LearningRateFunctor<T> {
|
||||
public:
|
||||
CosineLearningRate(
|
||||
const T min_lr,
|
||||
const T max_lr,
|
||||
const int64_t period,
|
||||
const T t_mult,
|
||||
const T lr_shrink)
|
||||
: min_lr_(min_lr),
|
||||
max_lr_(max_lr),
|
||||
period_(period),
|
||||
t_mult_(t_mult),
|
||||
lr_shrink_(lr_shrink) {}
|
||||
T operator()(const int64_t iter) const override {
|
||||
T i, t_i, t_curr;
|
||||
if (t_mult_ != 1.0) {
|
||||
// the period is changed every time
|
||||
i = floor(
|
||||
log(1 - double(iter) / double(period_) * (1.0 - t_mult_)) /
|
||||
log(t_mult_));
|
||||
t_i = pow(t_mult_, i) * period_;
|
||||
t_curr = iter - (1.0 - pow(t_mult_, i)) / (1.0 - t_mult_) * period_;
|
||||
} else {
|
||||
// fixed period
|
||||
i = floor(double(iter) / double(period_));
|
||||
t_i = period_;
|
||||
t_curr = iter - t_i * i;
|
||||
}
|
||||
T lr_shrink = pow(lr_shrink_, i);
|
||||
T min_lr = min_lr_ * lr_shrink;
|
||||
T max_lr = max_lr_ * lr_shrink;
|
||||
T final_lr =
|
||||
min_lr + 0.5 * (max_lr - min_lr) * (1 + cos(M_PI * t_curr / t_i));
|
||||
return final_lr;
|
||||
}
|
||||
T min_lr_;
|
||||
T max_lr_;
|
||||
int64_t period_;
|
||||
T t_mult_;
|
||||
T lr_shrink_;
|
||||
};
|
||||
|
||||
// constantThenLinearWarmup: first use a constant multiplier
|
||||
// and then ramp up to the global lr
|
||||
template <typename T>
|
||||
|
|
@ -316,6 +369,47 @@ class ConstantThenLinearWarmupLearningRate : public LearningRateFunctor<T> {
|
|||
LinearWarmupLearningRate<T> linear_warmup_lr_;
|
||||
};
|
||||
|
||||
// CompositeCosineLearningRate: first use a constant multiplier
|
||||
// and then ramp up to the global lr, and then use a cosine learning rate
|
||||
template <typename T>
|
||||
class CompositeCosineLearningRate : public LearningRateFunctor<T> {
|
||||
public:
|
||||
CompositeCosineLearningRate(
|
||||
const T start_warmup_multiplier,
|
||||
const int64_t constant_warmup_num_iter,
|
||||
const int64_t linear_warmup_num_iter,
|
||||
const T cosine_min_lr,
|
||||
const T cosine_max_lr,
|
||||
const int64_t cosine_period,
|
||||
const T consine_t_mult,
|
||||
const T cosine_lr_shrink)
|
||||
: constant_warmup_num_iter_(constant_warmup_num_iter),
|
||||
linear_warmup_num_iter_(linear_warmup_num_iter),
|
||||
constant_then_linear_warmup_lr_(
|
||||
start_warmup_multiplier,
|
||||
constant_warmup_num_iter,
|
||||
linear_warmup_num_iter),
|
||||
cosine_lr_(
|
||||
cosine_min_lr,
|
||||
cosine_max_lr,
|
||||
cosine_period,
|
||||
consine_t_mult,
|
||||
cosine_lr_shrink) {}
|
||||
|
||||
T operator()(const int64_t iter) const override {
|
||||
if (iter < constant_warmup_num_iter_ + linear_warmup_num_iter_) {
|
||||
return constant_then_linear_warmup_lr_(iter);
|
||||
}
|
||||
return cosine_lr_(
|
||||
iter - constant_warmup_num_iter_ - linear_warmup_num_iter_);
|
||||
}
|
||||
|
||||
int64_t constant_warmup_num_iter_;
|
||||
int64_t linear_warmup_num_iter_;
|
||||
ConstantThenLinearWarmupLearningRate<T> constant_then_linear_warmup_lr_;
|
||||
CosineLearningRate<T> cosine_lr_;
|
||||
};
|
||||
|
||||
// CompositeCyclicalLearningRate: first use a constant multiplier
|
||||
// and then ramp up to the global lr, and then use a cyclical learning rate
|
||||
template <typename T>
|
||||
|
|
|
|||
|
|
@ -33,8 +33,10 @@ Required:
|
|||
`hill`: uses those in both `linearWarmup` and `inv`, plus `end_multiplier`
|
||||
`composite`: uses `sub_policy_num_iters` and additional args with format
|
||||
`cyclic`: uses `max_lr`, `stepsize`
|
||||
`cosine`: uses `min_lr`, `max_lr`, `period`, `t_mult`, `lr_shrink`
|
||||
`constantThenLinearWarmup`: uses `start_warmup_multiplier`, `constant_warmup_num_iter`, `linear_warmup_num_iter`
|
||||
`compositeCyclical`: uses `start_warmup_multiplier`, `constant_warmup_num_iter`, `linear_warmup_num_iter`, `cyclical_max_lr`, `cyclical_step_size`, `cyclical_decay`
|
||||
`compositeCosine`: uses `start_warmup_multiplier`, `constant_warmup_num_iter`, `linear_warmup_num_iter`, `cosine_min_lr`, `cosine_max_lr`, `cosine_period`, `cosine_t_mult`, `cosine_lr_shrink`
|
||||
sub_policy_{sub_policy_index}_{sub_policy_arg}, for example:
|
||||
sub_policy_0_policy: "exp", sub_policy_0_gamma: 0.99,
|
||||
sub_policy_0_lr_scale: 1.2
|
||||
|
|
@ -58,10 +60,15 @@ Optional:
|
|||
`m3`: defaults to 0.5, the third piece lr of piece warmup
|
||||
`start_warmup_multiplier`: defaults to 0.1, part of constantThenLinearWarmup
|
||||
`constant_warmup_num_iter`: defaults to 10000000, part of constantThenLinearWarmup, CompositeCyclicalLRPolicy, CompositeCosineLRPolicy
|
||||
`linear_warmup_num_iter`: defaults to 10000000, part of constantThenLinearWarmup and CompositeCyclicalLRPolicy
|
||||
`linear_warmup_num_iter`: defaults to 10000000, part of constantThenLinearWarmup, CompositeCyclicalLRPolicy, CompositeCosineLRPolicy
|
||||
`cyclical_max_lr`: defaults to 0.05, part of CompositeCyclicalLRPolicy
|
||||
`cyclical_step_size`: defaults to 1000000, part of CompositeCyclicalLRPolicy
|
||||
`cyclical_decay`: defaults to 1.0, part of CompositeCyclicalLRPolicy
|
||||
`cosine_min_lr`: defaults to 0.01, part of CompositeCosineLRPolicy
|
||||
`cosine_max_lr`: defaults to 0.05, part of CompositeCosineLRPolicy
|
||||
`cosine_period`: defaults to 50, part of CompositeCosineLRPolicy
|
||||
`cosine_t_mult`: defaults to 1.0, part of CompositeCosineLRPolicy
|
||||
`cosine_lr_shrink`: defaults to 0.99, part of CompositeCosineLRPolicy
|
||||
|
||||
Usage:
|
||||
train_net.LearningRate(*iterations*, "*label*", base_lr=*float*,
|
||||
|
|
@ -120,6 +127,13 @@ Example usage:
|
|||
.Arg(
|
||||
"cyclical_decay",
|
||||
"defaults to 0.999, part of CompositeCyclicalLRPolicy")
|
||||
.Arg("cosine_min_lr", "defaults to 0.01, part of CompositeCosineLRPolicy")
|
||||
.Arg("cosine_max_lr", "defaults to 0.05, part of CompositeCosineLRPolicy")
|
||||
.Arg("cosine_period", "defaults to 50, part of CompositeCosineLRPolicy")
|
||||
.Arg("cosine_t_mult", "defaults to 1.0, part of CompositeCosineLRPolicy")
|
||||
.Arg(
|
||||
"cosine_lr_shrink",
|
||||
"defaults to 0.99, part of CompositeCosineLRPolicy")
|
||||
.Input(0, "input", "description needed")
|
||||
.Output(0, "output", "description needed")
|
||||
.DeviceInferenceFunction([](const OperatorDef& def) {
|
||||
|
|
|
|||
|
|
@ -210,6 +210,48 @@ class LearningRateOp final : public Operator<Context> {
|
|||
cyclical_max_lr,
|
||||
cyclical_step_size,
|
||||
cyclical_decay);
|
||||
} else if (policy == "cosine") {
|
||||
T max_lr =
|
||||
this->template GetSingleArgument<float>(arg_prefix + "max_lr", 0.5);
|
||||
T min_lr =
|
||||
this->template GetSingleArgument<float>(arg_prefix + "min_lr", 0.1);
|
||||
int64_t period =
|
||||
this->template GetSingleArgument<int>(arg_prefix + "period", 50);
|
||||
T t_mult =
|
||||
this->template GetSingleArgument<float>(arg_prefix + "t_mult", 1.0);
|
||||
T lr_shrink = this->template GetSingleArgument<float>(
|
||||
arg_prefix + "lr_shrink", 0.99);
|
||||
DCHECK_GE(max_lr, min_lr);
|
||||
return new CosineLearningRate<T>(
|
||||
min_lr, max_lr, period, t_mult, lr_shrink);
|
||||
} else if (policy == "compositeCosine") {
|
||||
T start_warmup_multiplier = this->template GetSingleArgument<float>(
|
||||
arg_prefix + "start_warmup_multiplier", 0.1);
|
||||
int64_t constant_warmup_num_iter = this->template GetSingleArgument<int>(
|
||||
arg_prefix + "constant_warmup_num_iter", 10000000);
|
||||
int64_t linear_warmup_num_iter = this->template GetSingleArgument<int>(
|
||||
arg_prefix + "linear_warmup_num_iter", 10000000);
|
||||
T cosine_max_lr = this->template GetSingleArgument<float>(
|
||||
arg_prefix + "cosine_max_lr", 0.5);
|
||||
T cosine_min_lr = this->template GetSingleArgument<float>(
|
||||
arg_prefix + "cosine_min_lr", 0.1);
|
||||
int64_t cosine_period = this->template GetSingleArgument<int>(
|
||||
arg_prefix + "cosine_period", 50);
|
||||
T cosine_t_mult = this->template GetSingleArgument<float>(
|
||||
arg_prefix + "cosine_t_mult", 1.0);
|
||||
T cosine_lr_shrink = this->template GetSingleArgument<float>(
|
||||
arg_prefix + "cosine_lr_shrink", 0.99);
|
||||
|
||||
DCHECK_GE(cosine_max_lr, cosine_min_lr);
|
||||
return new CompositeCosineLearningRate<T>(
|
||||
start_warmup_multiplier,
|
||||
constant_warmup_num_iter,
|
||||
linear_warmup_num_iter,
|
||||
cosine_min_lr,
|
||||
cosine_max_lr,
|
||||
cosine_period,
|
||||
cosine_t_mult,
|
||||
cosine_lr_shrink);
|
||||
} else {
|
||||
CAFFE_THROW("Unknown learning rate policy: ", policy);
|
||||
return NULL;
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user