Line data Source code
1 : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 : Copyright (c) 2016-2021 The VES code team 3 : (see the PEOPLE-VES file at the root of this folder for a list of names) 4 : 5 : See http://www.ves-code.org for more information. 6 : 7 : This file is part of VES code module. 8 : 9 : The VES code module is free software: you can redistribute it and/or modify 10 : it under the terms of the GNU Lesser General Public License as published by 11 : the Free Software Foundation, either version 3 of the License, or 12 : (at your option) any later version. 13 : 14 : The VES code module is distributed in the hope that it will be useful, 15 : but WITHOUT ANY WARRANTY; without even the implied warranty of 16 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 : GNU Lesser General Public License for more details. 18 : 19 : You should have received a copy of the GNU Lesser General Public License 20 : along with the VES code module. If not, see <http://www.gnu.org/licenses/>. 21 : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ 22 : 23 : #include "Optimizer.h" 24 : #include "CoeffsVector.h" 25 : 26 : #include "core/ActionRegister.h" 27 : #include "core/ActionSet.h" 28 : 29 : 30 : namespace PLMD { 31 : namespace ves { 32 : 33 : //+PLUMEDOC VES_OPTIMIZER OPT_ADAM 34 : /* 35 : Adaptive moment estimation (ADAM) optimizer. 36 : 37 : \attention 38 : __This optimizer is still experimental and not fully documented. The syntax might change. Restarting does not work. We recommend to use the averaged stochastic gradient decent optimizer (\ref OPT_AVERAGED_SGD) for now__. 39 : 40 : 41 : \par Examples 42 : 43 : */ 44 : //+ENDPLUMEDOC 45 : 46 : class Opt_Adam: public Optimizer { 47 : private: 48 : unsigned int time_; 49 : double beta_1_; 50 : double beta_2_; 51 : double epsilon_; 52 : double one_minus_weight_decay_; 53 : bool amsgrad_; 54 : bool adamw_; 55 : // 1st gradient moment uses the "AuxCoeffs", so only 2nd moment needs new CoeffVectors 56 : std::vector<std::unique_ptr<CoeffsVector>> var_coeffs_pntrs_; 57 : // used only for AMSGrad variant 58 : std::vector<std::unique_ptr<CoeffsVector>> varmax_coeffs_pntrs_; 59 : protected: 60 : CoeffsVector& VarCoeffs(const unsigned int coeffs_id = 0) const; 61 : CoeffsVector& VarmaxCoeffs(const unsigned int coeffs_id = 0) const; 62 : public: 63 : static void registerKeywords(Keywords&); 64 : explicit Opt_Adam(const ActionOptions&); 65 : void coeffsUpdate(const unsigned int c_id = 0) override; 66 : }; 67 : 68 : inline 69 : CoeffsVector& Opt_Adam::VarCoeffs(const unsigned int coeffs_id) const { 70 : return *var_coeffs_pntrs_[coeffs_id]; 71 : } 72 : 73 : inline 74 : CoeffsVector& Opt_Adam::VarmaxCoeffs(const unsigned int coeffs_id) const { 75 : return *varmax_coeffs_pntrs_[coeffs_id]; 76 : } 77 : 78 : 79 : PLUMED_REGISTER_ACTION(Opt_Adam,"OPT_ADAM") 80 : 81 : 82 4 : void Opt_Adam::registerKeywords(Keywords& keys) { 83 4 : Optimizer::registerKeywords(keys); 84 4 : Optimizer::useFixedStepSizeKeywords(keys); 85 4 : Optimizer::useMultipleWalkersKeywords(keys); 86 4 : Optimizer::useMaskKeywords(keys); 87 4 : Optimizer::useDynamicTargetDistributionKeywords(keys); 88 4 : keys.add("optional","BETA_1","Parameter for the first moment estimate. Defaults to 0.9"); 89 4 : keys.add("optional","BETA_2","Parameter for the second moment estimate. Defaults to 0.999"); 90 4 : keys.add("optional","EPSILON","Small parameter to avoid division by zero. Defaults to 1e-8"); 91 4 : keys.add("optional","ADAMW_WEIGHT_DECAY","Weight decay parameter for the AdamW variant. Defaults to 0"); 92 4 : keys.addFlag("AMSGRAD", false, "Use the AMSGrad variant"); 93 4 : } 94 : 95 : 96 2 : Opt_Adam::Opt_Adam(const ActionOptions&ao): 97 : PLUMED_VES_OPTIMIZER_INIT(ao), 98 2 : time_(0), 99 2 : beta_1_(0.9), 100 2 : beta_2_(0.999), 101 2 : epsilon_(0.00000001), 102 2 : one_minus_weight_decay_(1.0), 103 2 : amsgrad_(false), 104 2 : adamw_(false), 105 2 : var_coeffs_pntrs_(0) { 106 : // add citation and print it to log 107 2 : log << " Adam type stochastic gradient decent\n"; 108 2 : parseFlag("AMSGRAD",amsgrad_); 109 2 : if (amsgrad_) { 110 1 : log << " Using the AMSGrad variant of the Adam algorithm, see and cite\n"; 111 : } 112 : 113 2 : double tmp_weight_decay = 0.0; 114 2 : parse("ADAMW_WEIGHT_DECAY",tmp_weight_decay); 115 2 : if (tmp_weight_decay != 0.0) { 116 0 : adamw_ = true; 117 0 : log << " Using the AdamW variant (Adam with weight decay), see and cite\n"; 118 0 : one_minus_weight_decay_ = 1 - tmp_weight_decay; 119 0 : log << " weight decay parameter: " << tmp_weight_decay << "\n"; 120 : } 121 : 122 2 : log << " Adam parameters:\n"; 123 2 : parse("BETA_1",beta_1_); 124 2 : plumed_massert(beta_1_ > 0 && beta_1_ <= 1, "BETA_1 must be between 0 and 1"); 125 2 : log << " beta_1: " << beta_1_ << "\n"; 126 : 127 2 : parse("BETA_2",beta_2_); 128 2 : plumed_massert(beta_2_ > 0 && beta_2_ <= 1, "BETA_2 must be between 0 and 1"); 129 2 : log << " beta_2: " << beta_2_ << "\n"; 130 : 131 2 : parse("EPSILON",epsilon_); 132 2 : plumed_massert(epsilon_ > 0 && epsilon_ <= 1, "EPSILON must be between 0 and 1"); 133 2 : log << " epsilon: " << epsilon_ << "\n"; 134 : 135 : 136 : // set up the coeff vector for the 2nd moment of the gradient (variance) 137 4 : for (unsigned i = 0; i < numberOfCoeffsSets(); ++i) { 138 2 : var_coeffs_pntrs_.emplace_back(std::unique_ptr<CoeffsVector>(new CoeffsVector(Coeffs(i)))); 139 4 : VarCoeffs(i).replaceLabelString("coeffs","grad_var"); 140 2 : VarCoeffs(i).setAllValuesToZero(); // can Coeffs(i) even be non-zero at this point? 141 : 142 : // add second set of coefficients to store the maximum values of the 2nd moment 143 2 : if (amsgrad_) { 144 1 : varmax_coeffs_pntrs_.emplace_back(std::unique_ptr<CoeffsVector>(new CoeffsVector(VarCoeffs(i)))); 145 2 : VarmaxCoeffs(i).replaceLabelString("coeffs","grad_varmax"); 146 : } 147 : 148 : // also rename the Coeffs used for the mean of the gradient 149 4 : AuxCoeffs(i).replaceLabelString("coeffs","grad_mean"); 150 : } 151 : 152 2 : checkRead(); 153 2 : } 154 : 155 : 156 20 : void Opt_Adam::coeffsUpdate(const unsigned int c_id) { 157 20 : time_++; 158 : // AuxCoeffs is used for first moment (mean) 159 20 : AuxCoeffs(c_id) *= beta_1_; 160 20 : AuxCoeffs(c_id) += (1 - beta_1_ ) * Gradient(c_id) * CoeffsMask(c_id); 161 20 : VarCoeffs(c_id) *= beta_2_; 162 20 : VarCoeffs(c_id) += (1 - beta_2_ ) * Gradient(c_id) * Gradient(c_id) * CoeffsMask(c_id); 163 : 164 20 : if (amsgrad_) { 165 120 : for (size_t i = 0; i< VarCoeffs(c_id).getSize(); ++i) { 166 110 : if (VarCoeffs(c_id).getValue(i) > VarmaxCoeffs(c_id).getValue(i)) { 167 95 : VarmaxCoeffs(c_id)[i] = VarCoeffs(c_id).getValue(i); 168 : } 169 : } 170 : } 171 : 172 : // store sqrt of VarCoeffs in vector, easier than writing a CoeffsVector::sqrt() function 173 : // also directly add epsilon and invert to multiply with the Coeffs in last step 174 : std::vector<double> var_coeffs_sqrt; 175 20 : if (!amsgrad_) { 176 120 : for (size_t i = 0; i< VarCoeffs(c_id).getSize(); ++i) { 177 110 : var_coeffs_sqrt.push_back(1 / (sqrt(VarCoeffs(c_id).getValue(i)) + epsilon)); 178 : } 179 : } else { // use VarmaxCoffs instead of VarCoeffs 180 120 : for (size_t i = 0; i< VarmaxCoeffs(c_id).getSize(); ++i) { 181 110 : var_coeffs_sqrt.push_back(1 / (sqrt(VarmaxCoeffs(c_id).getValue(i)) + epsilon)); 182 : } 183 : } 184 : 185 : // bias correction 186 20 : double scalefactor = StepSize(c_id) * sqrt(1 - pow(beta_2_, time_)) / (1 - pow(beta_1_, time_)); 187 : 188 20 : if (adamw_) { // check is not necessary but probably faster than always multiplying by 1 189 0 : Coeffs(c_id) *= one_minus_weight_decay_ * CoeffsMask(c_id); 190 : } 191 : 192 : // coeff update 193 20 : Coeffs(c_id) -= scalefactor * AuxCoeffs(c_id) * var_coeffs_sqrt * CoeffsMask(c_id); 194 20 : } 195 : 196 : 197 : } 198 : }