LCOV - code coverage report
Current view: top level - ves - Opt_Adam.cpp (source / functions)
Test: plumed test coverage
Date: 2024-10-11 08:09:47

            Hit    Total    Coverage
Lines:       68       73      93.2 %
Functions:    6        7      85.7 %

          Line data    Source code
       1             : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
       2             :    Copyright (c) 2016-2021 The VES code team
       3             :    (see the PEOPLE-VES file at the root of this folder for a list of names)
       4             : 
       5             :    See http://www.ves-code.org for more information.
       6             : 
       7             :    This file is part of VES code module.
       8             : 
       9             :    The VES code module is free software: you can redistribute it and/or modify
      10             :    it under the terms of the GNU Lesser General Public License as published by
      11             :    the Free Software Foundation, either version 3 of the License, or
      12             :    (at your option) any later version.
      13             : 
      14             :    The VES code module is distributed in the hope that it will be useful,
      15             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      16             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      17             :    GNU Lesser General Public License for more details.
      18             : 
      19             :    You should have received a copy of the GNU Lesser General Public License
      20             :    along with the VES code module.  If not, see <http://www.gnu.org/licenses/>.
      21             : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
      22             : 
      23             : #include "Optimizer.h"
      24             : #include "CoeffsVector.h"
      25             : 
      26             : #include "core/ActionRegister.h"
      27             : #include "core/ActionSet.h"
      28             : 
      29             : 
      30             : namespace PLMD {
      31             : namespace ves {
      32             : 
      33             : //+PLUMEDOC VES_OPTIMIZER OPT_ADAM
      34             : /*
      35             : Adaptive moment estimation (ADAM) optimizer.
      36             : 
      37             : \attention
      38             : __This optimizer is still experimental and not fully documented. The syntax might change, and restarting does not work. For now, we recommend using the averaged stochastic gradient descent optimizer (\ref OPT_AVERAGED_SGD) instead__.
      39             : 
      40             : 
      41             : \par Examples
      42             : 
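                     : The following input is a minimal sketch rather than a tested recommendation:
                     : it assumes a VES bias action (e.g. \ref VES_LINEAR_EXPANSION) labeled b1 has
                     : been defined earlier in the input, and the STRIDE and STEPSIZE values are
                     : placeholders that should be adapted to the problem at hand.
                     : 
                     : \plumedfile
                     : # sketch only: b1 is a hypothetical label for a previously defined VES bias
                     : OPT_ADAM ...
                     :   LABEL=o1
                     :   BIAS=b1
                     :   STRIDE=1000
                     :   STEPSIZE=0.001
                     :   BETA_1=0.9
                     :   BETA_2=0.999
                     : ... OPT_ADAM
                     : \endplumedfile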
      43             : */
      44             : //+ENDPLUMEDOC
      45             : 
      46             : class Opt_Adam: public Optimizer {
      47             : private:
      48             :   unsigned int time_;
      49             :   double beta_1_;
      50             :   double beta_2_;
      51             :   double epsilon_;
      52             :   double one_minus_weight_decay_;
      53             :   bool amsgrad_;
      54             :   bool adamw_;
      55             :   // 1st gradient moment uses the "AuxCoeffs", so only the 2nd moment needs new CoeffsVectors
      56             :   std::vector<std::unique_ptr<CoeffsVector>> var_coeffs_pntrs_;
      57             :   // used only for AMSGrad variant
      58             :   std::vector<std::unique_ptr<CoeffsVector>> varmax_coeffs_pntrs_;
      59             : protected:
      60             :   CoeffsVector& VarCoeffs(const unsigned int coeffs_id = 0) const;
      61             :   CoeffsVector& VarmaxCoeffs(const unsigned int coeffs_id = 0) const;
      62             : public:
      63             :   static void registerKeywords(Keywords&);
      64             :   explicit Opt_Adam(const ActionOptions&);
      65             :   void coeffsUpdate(const unsigned int c_id = 0) override;
      66             : };
      67             : 
      68             : inline
      69             : CoeffsVector& Opt_Adam::VarCoeffs(const unsigned int coeffs_id) const {return *var_coeffs_pntrs_[coeffs_id];}
      70             : 
      71             : inline
      72             : CoeffsVector& Opt_Adam::VarmaxCoeffs(const unsigned int coeffs_id) const {return *varmax_coeffs_pntrs_[coeffs_id];}
      73             : 
      74             : 
      75       10423 : PLUMED_REGISTER_ACTION(Opt_Adam,"OPT_ADAM")
      76             : 
      77             : 
      78           3 : void Opt_Adam::registerKeywords(Keywords& keys) {
      79           3 :   Optimizer::registerKeywords(keys);
      80           3 :   Optimizer::useFixedStepSizeKeywords(keys);
      81           3 :   Optimizer::useMultipleWalkersKeywords(keys);
      82           3 :   Optimizer::useMaskKeywords(keys);
      83           3 :   Optimizer::useDynamicTargetDistributionKeywords(keys);
      84           6 :   keys.add("optional","BETA_1","Parameter for the first moment estimate. Defaults to 0.9");
      85           6 :   keys.add("optional","BETA_2","Parameter for the second moment estimate. Defaults to 0.999");
      86           6 :   keys.add("optional","EPSILON","Small parameter to avoid division by zero. Defaults to 1e-8");
      87           9 :   keys.add("optional","ADAMW_WEIGHT_DECAY","Weight decay parameter for the AdamW variant. Defaults to 0");
      88           6 :   keys.addFlag("AMSGRAD", false, "Use the AMSGrad variant");
      89           3 : }
      90             : 
      91             : 
      92           2 : Opt_Adam::Opt_Adam(const ActionOptions&ao):
      93             :   PLUMED_VES_OPTIMIZER_INIT(ao),
      94           2 :   time_(0),
      95           2 :   beta_1_(0.9),
      96           2 :   beta_2_(0.999),
      97           2 :   epsilon_(0.00000001),
      98           2 :   one_minus_weight_decay_(1.0),
      99           2 :   amsgrad_(false),
     100           2 :   adamw_(false),
     101           2 :   var_coeffs_pntrs_(0)
     102             : {
     103             :   // add citation and print it to log
     104           2 :   log << "  Adam type stochastic gradient descent\n";
     105           2 :   parseFlag("AMSGRAD",amsgrad_);
     106           2 :   if (amsgrad_) {
     107           1 :     log << "  Using the AMSGrad variant of the Adam algorithm, see and cite\n";
     108             :   }
     109             : 
     110           2 :   double tmp_weight_decay = 0.0;
     111           2 :   parse("ADAMW_WEIGHT_DECAY",tmp_weight_decay);
     112           2 :   if (tmp_weight_decay != 0.0) {
     113           0 :     adamw_ = true;
     114           0 :     log << "  Using the AdamW variant (Adam with weight decay), see and cite\n";
     115           0 :     one_minus_weight_decay_ = 1 - tmp_weight_decay;
     116           0 :     log << "    weight decay parameter: " << tmp_weight_decay << "\n";
     117             :   }
     118             : 
     119           2 :   log << "  Adam parameters:\n";
     120           2 :   parse("BETA_1",beta_1_);
     121           2 :   plumed_massert(beta_1_ > 0 && beta_1_ <= 1, "BETA_1 must be between 0 and 1");
     122           2 :   log << "    beta_1: " << beta_1_ << "\n";
     123             : 
     124           2 :   parse("BETA_2",beta_2_);
     125           2 :   plumed_massert(beta_2_ > 0 && beta_2_ <= 1, "BETA_2 must be between 0 and 1");
     126           2 :   log << "    beta_2: " << beta_2_ << "\n";
     127             : 
     128           2 :   parse("EPSILON",epsilon_);
     129           2 :   plumed_massert(epsilon_ > 0 && epsilon_ <= 1, "EPSILON must be between 0 and 1");
     130           2 :   log << "    epsilon: " << epsilon_ << "\n";
     131             : 
     132             : 
     133             :   // set up the coeff vector for the 2nd moment of the gradient (variance)
     134           4 :   for (unsigned i = 0; i < numberOfCoeffsSets(); ++i) {
     135           2 :     var_coeffs_pntrs_.emplace_back(std::unique_ptr<CoeffsVector>(new CoeffsVector(Coeffs(i))));
     136           4 :     VarCoeffs(i).replaceLabelString("coeffs","grad_var");
     137           2 :     VarCoeffs(i).setAllValuesToZero(); // can Coeffs(i) even be non-zero at this point?
     138             : 
     139             :     // add second set of coefficients to store the maximum values of the 2nd moment
     140           2 :     if (amsgrad_) {
     141           1 :       varmax_coeffs_pntrs_.emplace_back(std::unique_ptr<CoeffsVector>(new CoeffsVector(VarCoeffs(i))));
     142           2 :       VarmaxCoeffs(i).replaceLabelString("coeffs","grad_varmax");
     143             :     }
     144             : 
     145             :     // also rename the Coeffs used for the mean of the gradient
     146           4 :     AuxCoeffs(i).replaceLabelString("coeffs","grad_mean");
     147             :   }
     148             : 
     149           2 :   checkRead();
     150           2 : }
     151             : 
     152             : 
     153          20 : void Opt_Adam::coeffsUpdate(const unsigned int c_id) {
     154          20 :   time_++;
     155             :   // AuxCoeffs is used for first moment (mean)
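                     :   // exponential moving average of the gradient:
                     :   //   m_t = beta_1 * m_{t-1} + (1 - beta_1) * g_t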
     156          20 :   AuxCoeffs(c_id) *= beta_1_;
     157          20 :   AuxCoeffs(c_id) += (1 - beta_1_ ) * Gradient(c_id) * CoeffsMask(c_id);
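                     :   // exponential moving average of the squared gradient:
                     :   //   v_t = beta_2 * v_{t-1} + (1 - beta_2) * g_t^2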
     158          20 :   VarCoeffs(c_id) *= beta_2_;
     159          20 :   VarCoeffs(c_id) += (1 - beta_2_ ) * Gradient(c_id) * Gradient(c_id) * CoeffsMask(c_id);
     160             : 
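                     :   // AMSGrad keeps the elementwise running maximum of v_t and later uses it in
                     :   // place of v_t, so the effective per-coefficient step size never increases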
     161          20 :   if (amsgrad_) {
     162         120 :     for (size_t i = 0; i< VarCoeffs(c_id).getSize(); ++i) {
     163         110 :       if (VarCoeffs(c_id).getValue(i) > VarmaxCoeffs(c_id).getValue(i)) {
     164          95 :         VarmaxCoeffs(c_id)[i] = VarCoeffs(c_id).getValue(i);
     165             :       }
     166             :     }
     167             :   }
     168             : 
     169             :   // store sqrt of VarCoeffs in vector, easier than writing a CoeffsVector::sqrt() function
     170             :   // also directly add epsilon and invert to multiply with the Coeffs in last step
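                     :   // each element holds 1 / (sqrt(v_t) + epsilon), so the final update becomes
                     :   //   coeffs -= alpha_t * m_t / (sqrt(v_t) + epsilon)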
     171             :   std::vector<double> var_coeffs_sqrt;
     172          20 :   if (!amsgrad_) {
     173         120 :     for (size_t i = 0; i< VarCoeffs(c_id).getSize(); ++i) {
     174         110 :       var_coeffs_sqrt.push_back(1 / (sqrt(VarCoeffs(c_id).getValue(i)) + epsilon_));
     175             :     }
     176             :   }
     177             :   else { // use VarmaxCoeffs instead of VarCoeffs
     178         120 :     for (size_t i = 0; i< VarmaxCoeffs(c_id).getSize(); ++i) {
     179         110 :       var_coeffs_sqrt.push_back(1 / (sqrt(VarmaxCoeffs(c_id).getValue(i)) + epsilon_));
     180             :     }
     181             :   }
     182             : 
     183             :   // bias correction
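                     :   // m_t and v_t start at zero and are biased towards zero for small t; scaling
                     :   // the step size by sqrt(1 - beta_2^t) / (1 - beta_1^t) is equivalent (up to
                     :   // the placement of epsilon) to the textbook corrections
                     :   //   m_hat = m_t / (1 - beta_1^t),  v_hat = v_t / (1 - beta_2^t)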
     184          20 :   double scalefactor = StepSize(c_id) * sqrt(1 - pow(beta_2_, time_)) / (1 - pow(beta_1_, time_));
     185             : 
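                     :   // AdamW uses decoupled weight decay: the coefficients are shrunk directly,
                     :   //   theta <- (1 - lambda) * theta,
                     :   // instead of adding lambda * theta to the gradient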
     186          20 :   if (adamw_) { // the check is not necessary, but cheaper than always multiplying by 1
     187           0 :     Coeffs(c_id) *= one_minus_weight_decay_ * CoeffsMask(c_id);
     188             :   }
     189             : 
     190             :   // coeff update
     191          20 :   Coeffs(c_id) -= scalefactor * AuxCoeffs(c_id) * var_coeffs_sqrt * CoeffsMask(c_id);
     192          20 : }
     193             : 
     194             : 
     195             : }
     196             : }

Generated by: LCOV version 1.15