Line data Source code
1 : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 : Copyright (c) 2016-2021 The VES code team 3 : (see the PEOPLE-VES file at the root of this folder for a list of names) 4 : 5 : See http://www.ves-code.org for more information. 6 : 7 : This file is part of VES code module. 8 : 9 : The VES code module is free software: you can redistribute it and/or modify 10 : it under the terms of the GNU Lesser General Public License as published by 11 : the Free Software Foundation, either version 3 of the License, or 12 : (at your option) any later version. 13 : 14 : The VES code module is distributed in the hope that it will be useful, 15 : but WITHOUT ANY WARRANTY; without even the implied warranty of 16 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 : GNU Lesser General Public License for more details. 18 : 19 : You should have received a copy of the GNU Lesser General Public License 20 : along with the VES code module. If not, see <http://www.gnu.org/licenses/>. 21 : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ 22 : 23 : #include "Optimizer.h" 24 : #include "CoeffsVector.h" 25 : 26 : #include "core/ActionRegister.h" 27 : #include "core/ActionSet.h" 28 : 29 : 30 : namespace PLMD { 31 : namespace ves { 32 : 33 : //+PLUMEDOC VES_OPTIMIZER OPT_ADAM 34 : /* 35 : Adaptive moment estimation (ADAM) optimizer. 36 : 37 : \attention 38 : __This optimizer is still experimental and not fully documented. The syntax might change. Restarting does not work. We recommend to use the averaged stochastic gradient decent optimizer (\ref OPT_AVERAGED_SGD) for now__. 39 : 40 : 41 : \par Examples 42 : 43 : */ 44 : //+ENDPLUMEDOC 45 : 46 : class Opt_Adam: public Optimizer { 47 : private: 48 : unsigned int time_; 49 : double beta_1_; 50 : double beta_2_; 51 : double epsilon_; 52 : double one_minus_weight_decay_; 53 : bool amsgrad_; 54 : bool adamw_; 55 : // 1st gradient moment uses the "AuxCoeffs", so only 2nd moment needs new CoeffVectors 56 : std::vector<std::unique_ptr<CoeffsVector>> var_coeffs_pntrs_; 57 : // used only for AMSGrad variant 58 : std::vector<std::unique_ptr<CoeffsVector>> varmax_coeffs_pntrs_; 59 : protected: 60 : CoeffsVector& VarCoeffs(const unsigned int coeffs_id = 0) const; 61 : CoeffsVector& VarmaxCoeffs(const unsigned int coeffs_id = 0) const; 62 : public: 63 : static void registerKeywords(Keywords&); 64 : explicit Opt_Adam(const ActionOptions&); 65 : void coeffsUpdate(const unsigned int c_id = 0) override; 66 : }; 67 : 68 : inline 69 : CoeffsVector& Opt_Adam::VarCoeffs(const unsigned int coeffs_id) const {return *var_coeffs_pntrs_[coeffs_id];} 70 : 71 : inline 72 : CoeffsVector& Opt_Adam::VarmaxCoeffs(const unsigned int coeffs_id) const {return *varmax_coeffs_pntrs_[coeffs_id];} 73 : 74 : 75 : PLUMED_REGISTER_ACTION(Opt_Adam,"OPT_ADAM") 76 : 77 : 78 4 : void Opt_Adam::registerKeywords(Keywords& keys) { 79 4 : Optimizer::registerKeywords(keys); 80 4 : Optimizer::useFixedStepSizeKeywords(keys); 81 4 : Optimizer::useMultipleWalkersKeywords(keys); 82 4 : Optimizer::useMaskKeywords(keys); 83 4 : Optimizer::useDynamicTargetDistributionKeywords(keys); 84 8 : keys.add("optional","BETA_1","Parameter for the first moment estimate. Defaults to 0.9"); 85 8 : keys.add("optional","BETA_2","Parameter for the second moment estimate. Defaults to 0.999"); 86 8 : keys.add("optional","EPSILON","Small parameter to avoid division by zero. Defaults to 1e-8"); 87 8 : keys.add("optional","ADAMW_WEIGHT_DECAY","Weight decay parameter for the AdamW variant. Defaults to 0"); 88 8 : keys.addFlag("AMSGRAD", false, "Use the AMSGrad variant"); 89 4 : } 90 : 91 : 92 2 : Opt_Adam::Opt_Adam(const ActionOptions&ao): 93 : PLUMED_VES_OPTIMIZER_INIT(ao), 94 2 : time_(0), 95 2 : beta_1_(0.9), 96 2 : beta_2_(0.999), 97 2 : epsilon_(0.00000001), 98 2 : one_minus_weight_decay_(1.0), 99 2 : amsgrad_(false), 100 2 : adamw_(false), 101 2 : var_coeffs_pntrs_(0) 102 : { 103 : // add citation and print it to log 104 2 : log << " Adam type stochastic gradient decent\n"; 105 2 : parseFlag("AMSGRAD",amsgrad_); 106 2 : if (amsgrad_) { 107 1 : log << " Using the AMSGrad variant of the Adam algorithm, see and cite\n"; 108 : } 109 : 110 2 : double tmp_weight_decay = 0.0; 111 2 : parse("ADAMW_WEIGHT_DECAY",tmp_weight_decay); 112 2 : if (tmp_weight_decay != 0.0) { 113 0 : adamw_ = true; 114 0 : log << " Using the AdamW variant (Adam with weight decay), see and cite\n"; 115 0 : one_minus_weight_decay_ = 1 - tmp_weight_decay; 116 0 : log << " weight decay parameter: " << tmp_weight_decay << "\n"; 117 : } 118 : 119 2 : log << " Adam parameters:\n"; 120 2 : parse("BETA_1",beta_1_); 121 2 : plumed_massert(beta_1_ > 0 && beta_1_ <= 1, "BETA_1 must be between 0 and 1"); 122 2 : log << " beta_1: " << beta_1_ << "\n"; 123 : 124 2 : parse("BETA_2",beta_2_); 125 2 : plumed_massert(beta_2_ > 0 && beta_2_ <= 1, "BETA_2 must be between 0 and 1"); 126 2 : log << " beta_2: " << beta_2_ << "\n"; 127 : 128 2 : parse("EPSILON",epsilon_); 129 2 : plumed_massert(epsilon_ > 0 && epsilon_ <= 1, "EPSILON must be between 0 and 1"); 130 2 : log << " epsilon: " << epsilon_ << "\n"; 131 : 132 : 133 : // set up the coeff vector for the 2nd moment of the gradient (variance) 134 4 : for (unsigned i = 0; i < numberOfCoeffsSets(); ++i) { 135 2 : var_coeffs_pntrs_.emplace_back(std::unique_ptr<CoeffsVector>(new CoeffsVector(Coeffs(i)))); 136 4 : VarCoeffs(i).replaceLabelString("coeffs","grad_var"); 137 2 : VarCoeffs(i).setAllValuesToZero(); // can Coeffs(i) even be non-zero at this point? 138 : 139 : // add second set of coefficients to store the maximum values of the 2nd moment 140 2 : if (amsgrad_) { 141 1 : varmax_coeffs_pntrs_.emplace_back(std::unique_ptr<CoeffsVector>(new CoeffsVector(VarCoeffs(i)))); 142 2 : VarmaxCoeffs(i).replaceLabelString("coeffs","grad_varmax"); 143 : } 144 : 145 : // also rename the Coeffs used for the mean of the gradient 146 4 : AuxCoeffs(i).replaceLabelString("coeffs","grad_mean"); 147 : } 148 : 149 2 : checkRead(); 150 2 : } 151 : 152 : 153 20 : void Opt_Adam::coeffsUpdate(const unsigned int c_id) { 154 20 : time_++; 155 : // AuxCoeffs is used for first moment (mean) 156 20 : AuxCoeffs(c_id) *= beta_1_; 157 20 : AuxCoeffs(c_id) += (1 - beta_1_ ) * Gradient(c_id) * CoeffsMask(c_id); 158 20 : VarCoeffs(c_id) *= beta_2_; 159 20 : VarCoeffs(c_id) += (1 - beta_2_ ) * Gradient(c_id) * Gradient(c_id) * CoeffsMask(c_id); 160 : 161 20 : if (amsgrad_) { 162 120 : for (size_t i = 0; i< VarCoeffs(c_id).getSize(); ++i) { 163 110 : if (VarCoeffs(c_id).getValue(i) > VarmaxCoeffs(c_id).getValue(i)) { 164 95 : VarmaxCoeffs(c_id)[i] = VarCoeffs(c_id).getValue(i); 165 : } 166 : } 167 : } 168 : 169 : // store sqrt of VarCoeffs in vector, easier than writing a CoeffsVector::sqrt() function 170 : // also directly add epsilon and invert to multiply with the Coeffs in last step 171 : std::vector<double> var_coeffs_sqrt; 172 20 : if (!amsgrad_) { 173 120 : for (size_t i = 0; i< VarCoeffs(c_id).getSize(); ++i) { 174 110 : var_coeffs_sqrt.push_back(1 / (sqrt(VarCoeffs(c_id).getValue(i)) + epsilon)); 175 : } 176 : } 177 : else { // use VarmaxCoffs instead of VarCoeffs 178 120 : for (size_t i = 0; i< VarmaxCoeffs(c_id).getSize(); ++i) { 179 110 : var_coeffs_sqrt.push_back(1 / (sqrt(VarmaxCoeffs(c_id).getValue(i)) + epsilon)); 180 : } 181 : } 182 : 183 : // bias correction 184 20 : double scalefactor = StepSize(c_id) * sqrt(1 - pow(beta_2_, time_)) / (1 - pow(beta_1_, time_)); 185 : 186 20 : if (adamw_) { // check is not necessary but probably faster than always multiplying by 1 187 0 : Coeffs(c_id) *= one_minus_weight_decay_ * CoeffsMask(c_id); 188 : } 189 : 190 : // coeff update 191 20 : Coeffs(c_id) -= scalefactor * AuxCoeffs(c_id) * var_coeffs_sqrt * CoeffsMask(c_id); 192 20 : } 193 : 194 : 195 : } 196 : }