LCOV - code coverage report
Current view: top level - cltools - Benchmark.cpp (source / functions) Hit Total Coverage
Test: plumed test coverage Lines: 20 341 5.9 %
Date: 2024-10-18 13:59:31 Functions: 6 27 22.2 %

          Line data    Source code
       1             : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
       2             :    Copyright (c) 2019-2023 The plumed team
       3             :    (see the PEOPLE file at the root of the distribution for a list of names)
       4             : 
       5             :    See http://www.plumed.org for more information.
       6             : 
       7             :    This file is part of plumed, version 2.
       8             : 
       9             :    plumed is free software: you can redistribute it and/or modify
      10             :    it under the terms of the GNU Lesser General Public License as published by
      11             :    the Free Software Foundation, either version 3 of the License, or
      12             :    (at your option) any later version.
      13             : 
      14             :    plumed is distributed in the hope that it will be useful,
      15             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      16             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      17             :    GNU Lesser General Public License for more details.
      18             : 
      19             :    You should have received a copy of the GNU Lesser General Public License
      20             :    along with plumed.  If not, see <http://www.gnu.org/licenses/>.
      21             : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
      22             : #include "CLTool.h"
      23             : #include "core/CLToolRegister.h"
      24             : #include "tools/Communicator.h"
      25             : #include "tools/Tools.h"
      26             : #include "config/Config.h"
      27             : #include "tools/PlumedHandle.h"
      28             : #include "tools/Stopwatch.h"
      29             : #include "tools/Log.h"
      30             : #include "tools/DLLoader.h"
      31             : #include "tools/Random.h"
      32             : 
      33             : #include <cstdio>
      34             : #include <string>
      35             : #include <vector>
      36             : #include <fstream>
      37             : #include <atomic>
      38             : #include <csignal>
      39             : #include <cstdio>
      40             : #include <random>
      41             : #include <algorithm>
      42             : #include <chrono>
      43             : #include <string_view>
      44             : 
      45             : namespace PLMD {
      46             : namespace cltools {
      47             : 
      48             : //+PLUMEDOC TOOLS benchmark
      49             : /*
      50             : benchmark is a lightweight reimplementation of driver focused on running benchmarks
      51             : 
      52             : The main difference wrt driver is that it generates a trajectory in memory rather than reading it
      53             : from a file. This allows to better time the overhead of the plumed library, without including
      54             : the time needed to read the trajectory.
      55             : 
      56             : It is also possible to load a separate version of the plumed kernel. This enables running
      57             : benchmarks agaist previous plumed versions in a controlled setting, where systematic errors
      58             : in the comparison are minimized.
      59             : 
      60             : \par Examples
      61             : 
      62             : First, you should create a sample `plumed.dat` file for testing. For instance:
      63             : 
      64             : \plumedfile
      65             : WHOLEMOLECULES ENTITY0=1-10000
      66             : p: POSITION ATOM=1
      67             : RESTRAINT ARG=p.x KAPPA=1 AT=0
      68             : \endplumedfile
      69             : 
      70             : Then you can test the performance of this input with the following command:
      71             : \verbatim
      72             : plumed benchmark
      73             : \endverbatim
      74             : 
      75             : You can also test a different (older) version of PLUMED with the same input. To do so,
      76             : you should run
      77             : \verbatim
      78             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so
      79             : \endverbatim
      80             : 
      81             : \warning It is necessary to use the `plumed-runtime` executable here to avoid conflicts between different
      82             : plumed versions. You will find it in your path if you are using the non installed version of plumed,
      83             : and in `$prefix/lib/plumed` if you installed plumed in $prefix,.
      84             : 
      85             : \par Comparing multiple versions
      86             : 
      87             : The best way to compare two versions of plumed on the same input is to pass multiple colon-separated kernels:
      88             : 
      89             : \verbatim
      90             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:/path2/to/lib/libplumedKernel.so:this
      91             : \endverbatim
      92             : 
      93             : Here `this` means the kernel of the version with which you are running the benchmark. This comparison runs the three
      94             : instances simultaneously (alternating them) so that systematic differences in the load of your machine will affect them
      95             : to the same extent.
      96             : 
      97             : In case the different versions require modified plumed.dat files, or if you simply want to compare
      98             : two different plumed input files that compute the same thing, you can also use multiple plumed input files:
      99             : 
     100             : \verbatim
     101             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:this --plumed plumed1.dat:plumed2.dat
     102             : \endverbatim
     103             : 
     104             : Similarly, you might want to run two different inputs using the same kernel, which can be obtained with:
     105             : 
     106             : \verbatim
     107             : plumed-runtime benchmark --plumed plumed1.dat:plumed2.dat
     108             : \endverbatim
     109             : 
     110             : \par Profiling
     111             : 
     112             : If you want to attach a profiler on the fly to the process, you might find it convenient to use `--nsteps -1`.
     113             : The simulation will run forever and can be interrupted with CTRL-C. When interrupted, the result of the timers
     114             : should be displayed anyway.
     115             : You can also run setting a maximum time with `--maxtime`.
     116             : 
     117             : If you run a profiler when testing multiple PLUMED versions you might be confused by which function is from
     118             : each version. It is recommended to recompile separate instances with a separate C++ namespace (`-DPLMD=PLUMED_version_1`)
     119             : so that you will be able to distinguish them. In addition, compiling with `CXXFLAGS="-g -O3"` will make the profiling
     120             : report more complete, likely including code lines.
     121             : 
     122             : \par MPI runs
     123             : 
     124             : You can run emulating a domain decomposition. This is done automatically if plumed has been compiled with MPI
     125             : and you run with `mpirun`
     126             : 
     127             : \verbatim
     128             : mpirun -np 4 plumed-runtime benchmark
     129             : \endverbatim
     130             : 
     131             : If you load separate PLUMED instances as discussed above, they should all be compiled against the same MPI version.
     132             : Notice that when using MPI signals (CTRL-C) might not work.
     133             : 
     134             : Since some of the data transfer could happen asynchronously, you might want to use the `--sleep` option
     135             : to simulate a lag between the `prepareCalc` and `performCalc` actions. This part of the calculation will not contribute
     136             : to timer, but will obviously slow down your test.
     137             : 
     138             : \par Output
     139             : 
     140             : In the output you will see the usual reports about timing produced by the internal
     141             : timers of the tested plumed instances.
     142             : In addition, this tool will monitor the timing externally, with some slightly different criterion:
     143             : - First, the initialization (construction of the input) will be shown with a separate timer,
     144             :   as well as the timing for the first step.
     145             : - Second, the timer corresponding to the calculation will be split in three parts, reporting
     146             :   execution of the first 20% (warm-up) and the next two blocks of 40% each.
     147             : - Finally, you might notice some discrepancy because some of the actions that are usually
     148             :   not expensive are not included in the internal timers. The external timer will
     149             :   thus provide a better estimate of the total elapsed time, including everything.
     150             : 
     151             : The internal timers are still useful to monitor what happens at the different stages
     152             : and, with \ref DEBUG `DETAILED_TIMERS`, what happens in each action.
     153             : 
     154             : When you run multiple version, a comparative analisys of the time spent within PLUMED in the various
     155             : instances will be done, showing the ratio between the total time and the time measured on the first
     156             : instance, which will act as a reference. Errors will be estimated with bootstrapping. The warm-up phase will be discarded for
     157             : this analysis.
     158             : 
     159             : */
     160             : //+ENDPLUMEDOC
     161             : 
     162             : // We use an anonymous namespace here to avoid clashes with variables
     163             : // declared in other parts of the code
     164             : namespace {
     165             : 
     166             : //this is a sugar for changing idea faster about the rng
     167             : using generator = std::mt19937;
     168             : 
     169             : std::atomic<bool> signalReceived(false);
     170             : 
     171             : class SignalHandlerGuard {
     172             : public:
     173           0 :   SignalHandlerGuard(int signal, void (*newHandler)(int)) : signal_(signal) {
     174             :     // Store the current handler before setting the new one
     175           0 :     prevHandler_ = std::signal(signal, newHandler);
     176           0 :     if (prevHandler_ == SIG_ERR) {
     177           0 :       throw std::runtime_error("Failed to set signal handler");
     178             :     }
     179           0 :   }
     180             : 
     181             :   ~SignalHandlerGuard() {
     182             :     // Restore the previous handler upon destruction
     183           0 :     std::signal(signal_, prevHandler_);
     184           0 :   }
     185             : 
     186             :   // Delete copy constructor and assignment operator to prevent copying
     187             :   SignalHandlerGuard(const SignalHandlerGuard&) = delete;
     188             :   SignalHandlerGuard& operator=(const SignalHandlerGuard&) = delete;
     189             : 
     190             : private:
     191             :   int signal_;
     192             :   void (*prevHandler_)(int);
     193             : };
     194             : 
     195           0 : extern "C" void signalHandler(int signal) {
     196           0 :   if (signal == SIGINT) {
     197             :     signalReceived.store(true);
     198           0 :     fprintf(stderr, "Signal interrupt received\n");
     199             :   }
     200           0 :   if (signal == SIGTERM) {
     201             :     signalReceived.store(true);
     202           0 :     fprintf(stderr, "Signal termination received\n");
     203             :   }
     204           0 : }
     205             : 
     206             : /// This base class contains members that are movable with default operations
     207             : struct KernelBase {
     208             :   std::string path;
     209             :   std::string plumed_dat;
     210             :   PlumedHandle handle;
     211             :   Stopwatch stopwatch;
     212             :   std::vector<long long int> timings;
     213             :   double comparative_timing=-1.0;
     214             :   double comparative_timing_error=-1.0;
     215           0 :   KernelBase(const std::string & path_,const std::string & plumed_dat_, Log* log_):
     216           0 :     path(path_),
     217           0 :     plumed_dat(plumed_dat_),
     218           0 :     handle([&]() {
     219           0 :     if(path_=="this") return PlumedHandle();
     220           0 :     else return PlumedHandle::dlopen(path.c_str());
     221             :   }()),
     222           0 :   stopwatch(*log_)
     223             :   {
     224           0 :   }
     225             : };
     226             : 
     227             : /// Local structure handling a kernel and the related timers.
     228             : /// This structure specifically contain the Log, which needs special treatment
     229             : /// in move semantics
     230             : struct Kernel :
     231             :   public KernelBase {
     232             :   Log* log=nullptr;
     233           0 :   Kernel(const std::string & path_,const std::string & the_plumed_dat, Log* log_):
     234             :     KernelBase(path_,the_plumed_dat,log_),
     235           0 :     log(log_)
     236             :   {
     237             :   }
     238             : 
     239           0 :   ~Kernel() {
     240           0 :     if(log) {
     241           0 :       (*log)<<"\n";
     242           0 :       (*log)        <<"Kernel:      "<<path<<"\n";
     243           0 :       (*log)        <<"Input:       "<<plumed_dat<<"\n";
     244           0 :       if(comparative_timing>0.0) {
     245           0 :         (*log).printf("Comparative: %.3f +- %.3f\n",comparative_timing,comparative_timing_error);
     246             :       }
     247             :     }
     248           0 :   }
     249             : 
     250           0 :   Kernel(Kernel && other) noexcept:
     251             :     KernelBase(std::move(other)),
     252           0 :     log(other.log)
     253             :   {
     254           0 :     other.log=nullptr; // ensure no log is done in the moved away object
     255             :   }
     256             : 
     257           0 :   Kernel & operator=(Kernel && other) noexcept
     258             :   {
     259           0 :     if(this != &other) {
     260           0 :       KernelBase::operator=(std::move(other));
     261           0 :       log=other.log;
     262           0 :       other.log=nullptr; // ensure no log is done in the moved away object
     263             :     }
     264           0 :     return *this;
     265             :   }
     266             : };
     267             : 
     268             : namespace  {
     269             : 
     270             : class UniformSphericalVector {
     271             :   //double rminCub;
     272             :   double rCub;
     273             : 
     274             : public:
     275             :   //assuming rmin=0
     276           0 :   UniformSphericalVector(const double rmax):
     277           0 :     rCub (rmax*rmax*rmax/*-rminCub*/) {}
     278           0 :   PLMD::Vector operator()(Random& rng) {
     279           0 :     double rho = std::cbrt (/*rminCub + */rng.RandU01()*rCub);
     280           0 :     double theta =std::acos (2.0*rng.RandU01() -1.0);
     281           0 :     double phi = 2.0 * PLMD::pi * rng.RandU01();
     282             :     return Vector (
     283           0 :              rho * sin (theta) * cos (phi),
     284           0 :              rho * sin (theta) * sin (phi),
     285           0 :              rho * cos (theta));
     286             :   }
     287             : };
     288             : 
     289             : ///Acts as a template for any distribution
     290             : struct AtomDistribution {
     291             :   virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&)=0;
     292           0 :   virtual void box(std::vector<double>& box, unsigned /*natoms*/, unsigned /*step*/, Random&) {
     293             :     std::fill(box.begin(), box.end(),0);
     294           0 :   };
     295             :   virtual ~AtomDistribution() noexcept {}
     296             : };
     297             : 
     298           0 : struct theLine:public AtomDistribution {
     299           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned step, Random&rng) override {
     300             :     auto nat = posToUpdate.size();
     301             :     UniformSphericalVector usv(0.5);
     302             : 
     303           0 :     for (unsigned i=0; i<nat; ++i) {
     304           0 :       posToUpdate[i] = Vector(i, 0, 0) + usv(rng);
     305             :     }
     306           0 :   }
     307             : };
     308             : 
     309           0 : struct uniformSphere:public AtomDistribution {
     310           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     311             : 
     312             :     //giving more or less a cubic udm of volume for each atom: V=nat
     313           0 :     const double rmax= std::cbrt ((3.0/(4.0*PLMD::pi)) * posToUpdate.size());
     314             : 
     315             :     UniformSphericalVector usv(rmax);
     316             :     auto s=posToUpdate.begin();
     317             :     auto e=posToUpdate.end();
     318             :     //I am using the iterators:this is slightly faster,
     319             :     // enough to overcome the cost of the vtable that I added
     320           0 :     for (unsigned i=0; s!=e; ++s,++i) {
     321           0 :       *s = usv (rng);
     322             :     }
     323             : 
     324           0 :   }
     325           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     326           0 :     const double rmax= 2.0*std::cbrt((3.0/(4.0*PLMD::pi)) * natoms);
     327           0 :     box[0]=rmax; box[1]=0.0;  box[2]=0.0;
     328           0 :     box[3]=0.0;  box[4]=rmax; box[5]=0.0;
     329           0 :     box[6]=0.0;  box[7]=0.0;  box[8]=rmax;
     330             : 
     331           0 :   }
     332             : };
     333             : 
     334           0 : struct twoGlobs: public AtomDistribution {
     335           0 :   virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&rng) {
     336             :     //I am using two unigform spheres and 2V=n
     337           0 :     const double rmax= std::cbrt ((3.0/(8.0*PLMD::pi)) * posToUpdate.size());
     338             : 
     339             :     UniformSphericalVector usv(rmax);
     340             :     std::array<Vector,2> centers{
     341             :       PLMD::Vector{0.0,0.0,0.0},
     342             : //so they do not overlap
     343             :       PLMD::Vector{2.0*rmax,2.0*rmax,2.0*rmax}
     344           0 :     };
     345           0 :     std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
     346             :       //RandInt is only declared
     347             :       // return usv (rng) + centers[rng.RandInt(1)];
     348           0 :       return usv (rng) + centers[rng.RandU01()>0.5];
     349             :     });
     350           0 :   }
     351             : 
     352           0 :   virtual void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) {
     353             : 
     354           0 :     const double rmax= 4.0 * std::cbrt ((3.0/(8.0*PLMD::pi)) * natoms);
     355           0 :     box[0]=rmax; box[1]=0.0;  box[2]=0.0;
     356           0 :     box[3]=0.0;  box[4]=rmax; box[5]=0.0;
     357           0 :     box[6]=0.0;  box[7]=0.0;  box[8]=rmax;
     358           0 :   };
     359             : };
     360             : 
     361           0 : struct uniformCube:public AtomDistribution {
     362           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     363             :     //giving more or less a cubic udm of volume for each atom: V = nat
     364           0 :     const double rmax = std::cbrt(static_cast<double>(posToUpdate.size()));
     365             : 
     366             : 
     367             : 
     368             :     // std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
     369             :     //   return Vector (rndR(rng),rndR(rng),rndR(rng));
     370             :     // });
     371             :     auto s=posToUpdate.begin();
     372             :     auto e=posToUpdate.end();
     373             :     //I am using the iterators:this is slightly faster,
     374             :     // enough to overcome the cost of the vtable that I added
     375           0 :     for (unsigned i=0; s!=e; ++s,++i) {
     376           0 :       *s = Vector (rng.RandU01()*rmax,rng.RandU01()*rmax,rng.RandU01()*rmax);
     377             :     }
     378           0 :   }
     379           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     380             :     //+0.05 to avoid overlap
     381           0 :     const double rmax= std::cbrt(natoms)+0.05;
     382           0 :     box[0]=rmax; box[1]=0.0;  box[2]=0.0;
     383           0 :     box[3]=0.0;  box[4]=rmax; box[5]=0.0;
     384           0 :     box[6]=0.0;  box[7]=0.0;  box[8]=rmax;
     385             : 
     386           0 :   }
     387             : };
     388             : 
     389           0 : struct tiledSimpleCubic:public AtomDistribution {
     390           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     391             :     //Tiling the space in this way will not tests 100% the pbc, but
     392             :     //I do not think that write a spacefilling curve, like Hilbert, Peano or Morton
     393             :     //could be a good idea, in this case
     394           0 :     const unsigned rmax = std::ceil(std::cbrt(static_cast<double>(posToUpdate.size())));
     395             : 
     396             :     auto s=posToUpdate.begin();
     397             :     auto e=posToUpdate.end();
     398             :     //I am using the iterators:this is slightly faster,
     399             :     // enough to overcome the cost of the vtable that I added
     400           0 :     for (unsigned k=0; k<rmax&&s!=e; ++k) {
     401           0 :       for (unsigned j=0; j<rmax&&s!=e; ++j) {
     402           0 :         for (unsigned i=0; i<rmax&&s!=e; ++i) {
     403           0 :           *s = Vector (i,j,k);
     404             :           ++s;
     405             :         }
     406             :       }
     407             :     }
     408           0 :   }
     409           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     410           0 :     const double rmax= std::ceil(std::cbrt(static_cast<double>(natoms)));;
     411           0 :     box[0]=rmax; box[1]=0.0;  box[2]=0.0;
     412           0 :     box[3]=0.0;  box[4]=rmax; box[5]=0.0;
     413           0 :     box[6]=0.0;  box[7]=0.0;  box[8]=rmax;
     414             : 
     415           0 :   }
     416             : };
     417           0 : std::unique_ptr<AtomDistribution> getAtomDistribution(std::string_view atomicDistr) {
     418           0 :   std::unique_ptr<AtomDistribution> distribution;
     419           0 :   if(atomicDistr == "line") {
     420           0 :     distribution = std::make_unique<theLine>();
     421           0 :   } else if (atomicDistr == "cube") {
     422           0 :     distribution = std::make_unique<uniformCube>();
     423           0 :   } else if (atomicDistr == "sphere") {
     424           0 :     distribution = std::make_unique<uniformSphere>();
     425           0 :   } else if (atomicDistr == "globs") {
     426           0 :     distribution = std::make_unique<twoGlobs>();
     427           0 :   } else if (atomicDistr == "sc") {
     428           0 :     distribution = std::make_unique<tiledSimpleCubic>();
     429             :   } else {
     430           0 :     plumed_error() << R"(The atomic distribution can be only "line", "cube", "sphere", "globs" and "sc", the input was ")"
     431           0 :                    << atomicDistr <<'"';
     432             :   }
     433           0 :   return distribution;
     434           0 : }
     435             : } //anonymus namespace for benchmark distributions
     436             : class Benchmark:
     437             :   public CLTool
     438             : {
     439             : public:
     440             :   static void registerKeywords( Keywords& keys );
     441             :   explicit Benchmark(const CLToolOptions& co );
     442             :   int main(FILE* in, FILE*out,Communicator& pc) override;
     443           4 :   std::string description()const override {
     444           4 :     return "run a calculation with a fixed trajectory to find bottlenecks in PLUMED";
     445             :   }
     446             : };
     447             : 
     448       15952 : PLUMED_REGISTER_CLTOOL(Benchmark,"benchmark")
     449             : 
     450        5316 : void Benchmark::registerKeywords( Keywords& keys ) {
     451        5316 :   CLTool::registerKeywords( keys );
     452       10632 :   keys.add("compulsory","--plumed","plumed.dat","colon separated path(s) to the input file(s)");
     453       10632 :   keys.add("compulsory","--kernel","this","colon separated path(s) to kernel(s)");
     454       10632 :   keys.add("compulsory","--natoms","100000","the number of atoms to use for the simulation");
     455       10632 :   keys.add("compulsory","--nsteps","2000","number of steps of MD to perform (-1 means forever)");
     456       10632 :   keys.add("compulsory","--maxtime","-1","maximum number of seconds (-1 means forever)");
     457       10632 :   keys.add("compulsory","--sleep","0","number of seconds of sleep, mimicking MD calculation");
     458       10632 :   keys.add("compulsory","--atom-distribution","line","the kind of possible atomic displacement at each step");
     459       10632 :   keys.add("optional","--dump-trajectory","dump the trajectory to this file");
     460       10632 :   keys.addFlag("--domain-decomposition",false,"simulate domain decomposition, implies --shuffle");
     461       10632 :   keys.addFlag("--shuffled",false,"reshuffle atoms");
     462        5316 : }
     463             : 
     464           4 : Benchmark::Benchmark(const CLToolOptions& co ):
     465           4 :   CLTool(co)
     466             : {
     467           4 :   inputdata=commandline;
     468           4 : }
     469             : 
     470             : 
     471           0 : int Benchmark::main(FILE* in, FILE*out,Communicator& pc) {
     472             :   // deterministic initializations to avoid issues with MPI
     473             :   generator rng;
     474           0 :   PLMD::Random atomicGenerator;
     475           0 :   std::unique_ptr<AtomDistribution> distribution;
     476             : 
     477             :   struct FileDeleter {
     478             :     void operator()(FILE*f) const noexcept {
     479           0 :       if(f) std::fclose(f);
     480           0 :     }
     481             :   };
     482             : 
     483           0 :   std::unique_ptr<FILE,FileDeleter> log_dev_null{std::fopen("/dev/null","w")};
     484             : 
     485           0 :   Log log;
     486           0 :   if(pc.Get_rank()==0) {
     487           0 :     log.link(out);
     488             :   } else {
     489           0 :     log.link(log_dev_null.get());
     490             :   }
     491           0 :   log.setLinePrefix("BENCH:  ");
     492           0 :   log <<"Welcome to PLUMED benchmark\n";
     493             :   std::vector<Kernel> kernels;
     494             : 
     495             :   // perform comparative analysis
     496             :   // ensure that kernels vector is destroyed from last to first element upon exit
     497           0 :   auto kernels_deleter=[&log](auto f) {
     498           0 :     if(!f) {
     499           0 :       return;
     500             :     }
     501           0 :     if(f->empty()) {
     502             :       return;
     503             :     }
     504             :     generator bootstrapRng;
     505             : 
     506             :     const auto size=f->back().timings.size();
     507             :     //B are the bootstrap iterations
     508             :     constexpr int B=200;
     509           0 :     const size_t numblocks=size;
     510             :     // for some reasons, blocked bootstrap underestimates error
     511             :     // For now I keep it off. If I remove it, the code below could be simplified
     512             :     // if(numblocks>20) numblocks=20;
     513           0 :     const auto blocksize=size/numblocks;
     514             : 
     515           0 :     if(f->size()<2) {
     516           0 :       log<<"Single run, skipping comparative analysis\n";
     517           0 :     } else if(size<10) {
     518           0 :       log<<"Too small sample, skipping comparative analysis\n";
     519             :     } else try {
     520             : 
     521           0 :         log<<"Running comparative analysis, "<<numblocks<<" blocks with size "<<blocksize<<"\n";
     522             : 
     523           0 :         std::vector<std::size_t> choice(size);
     524           0 :         std::uniform_int_distribution<> distrib(0, numblocks-1);
     525           0 :         std::vector<std::vector<long long int>> blocks(f->size());
     526             : 
     527             :         { int i=0;
     528           0 :           for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
     529             :             size_t l=0;
     530           0 :             blocks[i].assign(numblocks,0);
     531           0 :             for(auto j=0ULL; j<numblocks; j++) {
     532           0 :               for(auto k=0ULL; k<blocksize; k++) {
     533           0 :                 plumed_assert(l<it->timings.size());
     534           0 :                 blocks[i][j]+=it->timings[l];
     535           0 :                 l++;
     536             :               }
     537             :             }
     538             :           }
     539             :         }
     540             : 
     541           0 :         std::vector<std::vector<double>> ratios(f->size());
     542           0 :         for(auto & r : ratios) {
     543             :           //B are the bootstrap iterations
     544           0 :           r.resize(B);
     545             :         }
     546             : 
     547             :         //B are the bootstrap iterations
     548           0 :         for(unsigned b=0; b<B; b++) {
     549           0 :           for(auto & c : choice) c=distrib(bootstrapRng);
     550             :           long long int reference=0;
     551           0 :           for(auto & c : choice) {
     552           0 :             reference+=blocks[0][c];
     553             :           }
     554           0 :           for(auto i=0ULL; i<blocks.size(); i++) {
     555             :             long long int estimate=0;
     556             :             // this would lead to separate bootstrap samples for each estimate:
     557             :             // for(auto & c : choice){c=distrib(bootstrapRng);}
     558           0 :             for(auto & c : choice) {
     559           0 :               estimate+=blocks[i][c];
     560             :             }
     561           0 :             ratios[i][b]=double(estimate)/double(reference);
     562             :           }
     563             :         }
     564             : 
     565             :         {
     566             :           int i=0;
     567           0 :           for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
     568             :             double sum=0.0;
     569             :             double sum2=0.0;
     570           0 :             for(auto r : ratios[i]) {
     571           0 :               sum+=r;
     572           0 :               sum2+=r*r;
     573             :             }
     574             :             //B are the bootstrap iterations
     575           0 :             it->comparative_timing=sum/B;
     576           0 :             it->comparative_timing_error=std::sqrt(sum2/B-sum*sum/(B*B));
     577             :           }
     578             :         }
     579             : 
     580           0 :       } catch(std::exception & e) {
     581           0 :         log<<"Unexpected error during comparative analysis\n";
     582           0 :         log<<e.what()<<"\n";
     583             :       }
     584           0 :     while(!f->empty()) f->pop_back();
     585             : 
     586             :   };
     587             :   std::unique_ptr<decltype(kernels),decltype(kernels_deleter)> kernels_deleter_obj(&kernels,kernels_deleter);
     588             : 
     589             : 
     590             :   // construct the kernels vector:
     591             :   {
     592             :     std::vector<std::string> allpaths;
     593             : 
     594             :     {
     595             :       std::string paths;
     596           0 :       parse("--kernel",paths);
     597           0 :       log <<"Using --kernel=" << paths << "\n";
     598           0 :       allpaths=Tools::getWords(paths,":");
     599             :     }
     600             : 
     601             :     std::vector<std::string> allplumed;
     602             :     {
     603             :       std::string paths;
     604           0 :       parse("--plumed",paths);
     605           0 :       log <<"Using --plumed=" << paths << "\n";
     606           0 :       allplumed=Tools::getWords(paths,":");
     607             :     }
     608             : 
     609           0 :     plumed_assert(allplumed.size()>0 && allpaths.size()>0);
     610             : 
     611             :     // this check only works on MacOS
     612             : #if defined(__APPLE__)
     613             :     // if any of the paths if different from "this", we check if libplumed was loaded locally to avoid conflicts.
     614             :     if(std::any_of(allpaths.begin(),allpaths.end(),[](auto value) {return value != "this";})) {
     615             :       if(DLLoader::isPlumedGlobal()) {
     616             :         plumed_error()<<"It looks like libplumed is loaded in the global namespace, you cannot load a different version of the kernel\n"
     617             :                       <<"Please make sure you use the plumed-runtime executable and that the env var PLUMED_LOAD_NAMESPACE is not set to GLOBAL";
     618             :       }
     619             :     }
     620             : #endif
     621             : 
     622           0 :     if(allplumed.size()>1 && allpaths.size()>1 && allplumed.size() != allpaths.size()) {
     623           0 :       plumed_error() << "--kernel and --plumed should have either one element or the same number of elements";
     624             :     }
     625             : 
     626           0 :     if(allplumed.size()>1 && allpaths.size()==1) for(unsigned i=1; i<allplumed.size(); i++) allpaths.push_back(allpaths[0]);
     627           0 :     if(allplumed.size()==1 && allpaths.size()>1) for(unsigned i=1; i<allpaths.size(); i++) allplumed.push_back(allplumed[0]);
     628             : 
     629           0 :     for(unsigned i=0; i<allpaths.size(); i++) kernels.emplace_back(allpaths[i],allplumed[i],&log);
     630           0 :   }
     631             : 
     632             :   // reverse order so that log happens in the forward order:
     633           0 :   std::reverse(kernels.begin(),kernels.end());
     634             : 
     635             :   // read other flags:
     636           0 :   bool shuffled=false;
     637           0 :   parseFlag("--shuffled",shuffled);
     638             : 
     639           0 :   int nf; parse("--nsteps",nf);
     640           0 :   log << "Using --nsteps=" << nf << "\n";
     641           0 :   unsigned natoms; parse("--natoms",natoms);
     642           0 :   log << "Using --natoms=" << natoms << "\n";
     643           0 :   double maxtime; parse("--maxtime",maxtime);
     644           0 :   log << "Using --maxtime=" << maxtime << "\n";
     645             : 
     646           0 :   bool domain_decomposition=false;
     647           0 :   parseFlag("--domain-decomposition",domain_decomposition);
     648             : 
     649           0 :   if(pc.Get_size()>1) domain_decomposition=true;
     650           0 :   if(domain_decomposition) shuffled=true;
     651             : 
     652           0 :   if (shuffled)
     653           0 :     log << "Using --shuffled\n";
     654           0 :   if (domain_decomposition)
     655           0 :     log << "Using --domain-decomposition\n";
     656             : 
     657             :   double timeToSleep;
     658           0 :   parse("--sleep",timeToSleep);
     659           0 :   log << "Using --sleep=" << timeToSleep << "\n";
     660             : 
     661             :   std::vector<int> shuffled_indexes;
     662             : 
     663             :   {
     664             :     std::string atomicDistr;
     665           0 :     parse("--atom-distribution",atomicDistr);
     666           0 :     distribution = getAtomDistribution(atomicDistr);
     667           0 :     log << "Using --atom-distribution=" << atomicDistr << "\n";
     668             :   }
     669             : 
     670             :   {
     671             :     std::string fileToDump;
     672           0 :     if(parse("--dump-trajectory",fileToDump)) {
     673           0 :       log << "Saving the trajectory to \"" << fileToDump << "\" and exiting\n";
     674           0 :       std::vector<double> cell(9);
     675           0 :       std::vector<Vector> pos(natoms);
     676           0 :       std::ofstream ofile(fileToDump);
     677           0 :       if (nf<0) {
     678             :         //if the user accidentally sets infinite steps, we set it to print only one
     679           0 :         nf=1;
     680             :       }
     681           0 :       for(int step=0; step<nf; ++step) {
     682           0 :         auto sw=kernels[0].stopwatch.startStop("TrajectoryGeneration");
     683           0 :         distribution->positions(pos,step,atomicGenerator);
     684           0 :         distribution->box(cell,natoms,step,atomicGenerator);
     685           0 :         ofile << natoms << "\n"
     686           0 :               << cell[0] << " " << cell[1] << " " << cell[2] << " "
     687           0 :               << cell[3] << " " << cell[4] << " " << cell[5] << " "
     688           0 :               << cell[6] << " " << cell[7] << " " << cell[8] << "\n";
     689           0 :         for(int i=0; i<natoms; ++i) {
     690           0 :           ofile << "X\t" << pos[i]<< "\n";
     691             :         }
     692           0 :       }
     693           0 :       ofile.close();
     694             :       return 0;
     695           0 :     }
     696             :   }
     697             : 
     698           0 :   log <<"Initializing the setup of the kernel(s)\n";
     699           0 :   const auto initial_time=std::chrono::high_resolution_clock::now();
     700             : 
     701           0 :   for(auto & k : kernels) {
     702             :     auto & p(k.handle);
     703           0 :     auto sw=k.stopwatch.startStop("A Initialization");
     704           0 :     if(Communicator::plumedHasMPI() && domain_decomposition) p.cmd("setMPIComm",&pc.Get_comm());
     705           0 :     p.cmd("setRealPrecision",(int)sizeof(double));
     706           0 :     p.cmd("setMDLengthUnits",1.0);
     707           0 :     p.cmd("setMDChargeUnits",1.0);
     708           0 :     p.cmd("setMDMassUnits",1.0);
     709           0 :     p.cmd("setMDEngine","benchmarks");
     710           0 :     p.cmd("setTimestep",1.0);
     711           0 :     p.cmd("setPlumedDat",k.plumed_dat.c_str());
     712           0 :     p.cmd("setLog",out);
     713           0 :     p.cmd("setNatoms",natoms);
     714           0 :     p.cmd("init");
     715           0 :   }
     716             : 
     717           0 :   std::vector<double> cell( 9 ), virial( 9 );
     718           0 :   std::vector<Vector> pos( natoms ), forces( natoms );
     719           0 :   std::vector<double> masses( natoms, 1 ), charges( natoms, 0 );
     720             : 
     721           0 :   if(shuffled) {
     722           0 :     shuffled_indexes.resize(natoms);
     723           0 :     for(unsigned i=0; i<natoms; i++) shuffled_indexes[i]=i;
     724           0 :     std::shuffle(shuffled_indexes.begin(),shuffled_indexes.end(),rng);
     725             :   }
     726             : 
     727             :   // non owning pointers, used for shuffling the execution order
     728             :   std::vector<Kernel*> kernels_ptr;
     729           0 :   for(unsigned i=0; i<kernels.size(); i++) kernels_ptr.push_back(&kernels[i]);
     730             : 
     731           0 :   int plumedStopCondition=0;
     732             :   bool fast_finish=false;
     733             :   int part=0;
     734             : 
     735           0 :   log<<"Starting MD loop\n";
     736           0 :   log<<"Use CTRL+C to stop at any time and collect timers (not working in MPI runs)\n";
     737             :   // trap signals:
     738           0 :   SignalHandlerGuard sigIntGuard(SIGINT, signalHandler);
     739           0 :   SignalHandlerGuard sigTermGuard(SIGTERM, signalHandler);
     740             : 
     741           0 :   for(int step=0; nf<0 || step<nf; ++step) {
     742           0 :     std::shuffle(kernels_ptr.begin(),kernels_ptr.end(),rng);
     743           0 :     distribution->positions(pos,step,atomicGenerator);
     744           0 :     distribution->box(cell,natoms,step,atomicGenerator);
     745             :     double* pos_ptr;
     746             :     double* for_ptr;
     747             :     double* charges_ptr;
     748             :     double* masses_ptr;
     749             :     int* indexes_ptr=nullptr;
     750             :     int n_local_atoms;
     751             : 
     752           0 :     if(domain_decomposition) {
     753           0 :       const auto nproc=pc.Get_size();
     754           0 :       const auto nn=natoms/nproc;
     755             :       //using int to remove warning, MPI don't work with unsigned
     756           0 :       int excess=natoms%nproc;
     757           0 :       const auto myrank=pc.Get_rank();
     758             :       auto shift=0;
     759           0 :       n_local_atoms=nn;
     760           0 :       if(myrank<excess) n_local_atoms+=1;
     761           0 :       for(int i=0; i<myrank; i++) {
     762           0 :         shift+=nn;
     763           0 :         if(i<excess) shift+=1;
     764             :       }
     765           0 :       pos_ptr=&pos[shift][0];
     766           0 :       for_ptr=&forces[shift][0];
     767             :       charges_ptr=&charges[shift];
     768             :       masses_ptr=&masses[shift];
     769           0 :       indexes_ptr=shuffled_indexes.data()+shift;
     770             :     } else {
     771           0 :       pos_ptr=&pos[0][0];
     772           0 :       for_ptr=&forces[0][0];
     773             :       charges_ptr=&charges[0];
     774             :       masses_ptr=&masses[0];
     775           0 :       n_local_atoms=natoms;
     776             :       indexes_ptr=shuffled_indexes.data();
     777             :     }
     778             : 
     779             :     const char* sw_name;
     780           0 :     if(part==0)      sw_name="B0 First step";
     781           0 :     else if(part==1) sw_name="B1 Warm-up";
     782           0 :     else if(part==2) sw_name="B2 Calculation part 1";
     783             :     else             sw_name="B3 Calculation part 2";
     784             : 
     785             : 
     786           0 :     for(unsigned i=0; i<kernels_ptr.size(); i++) {
     787           0 :       auto & p(kernels_ptr[i]->handle);
     788             : 
     789             :       {
     790           0 :         auto sw=kernels_ptr[i]->stopwatch.startPause(sw_name);
     791           0 :         p.cmd("setStep",step);
     792           0 :         p.cmd("setStopFlag",&plumedStopCondition);
     793           0 :         p.cmd("setForces",for_ptr, {n_local_atoms,3});
     794           0 :         p.cmd("setBox",&cell[0], {3,3});
     795           0 :         p.cmd("setVirial",&virial[0], {3,3});
     796           0 :         p.cmd("setPositions",pos_ptr, {n_local_atoms,3});
     797           0 :         p.cmd("setMasses",masses_ptr, {n_local_atoms});
     798           0 :         p.cmd("setCharges",charges_ptr, {n_local_atoms});
     799           0 :         if(shuffled) {
     800           0 :           p.cmd("setAtomsNlocal",n_local_atoms);
     801           0 :           p.cmd("setAtomsGatindex",indexes_ptr, {n_local_atoms});
     802             :         }
     803           0 :         p.cmd("prepareCalc");
     804           0 :       }
     805             : 
     806             :       // mimick MD calculation here
     807             :       {
     808             :         unsigned k=0;
     809           0 :         auto start=std::chrono::high_resolution_clock::now();
     810           0 :         while(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-start).count()<(long long int)1e9*timeToSleep) k+=i*i;
     811             :         std::fprintf(log_dev_null.get(),"%u",k);
     812             :       }
     813             : 
     814             :       {
     815           0 :         auto sw=kernels_ptr[i]->stopwatch.startStop(sw_name);
     816           0 :         p.cmd("performCalc");
     817           0 :       }
     818             : 
     819           0 :       if(kernels_ptr.size()>1 && part>1) kernels_ptr[i]->timings.push_back(kernels_ptr[i]->stopwatch.getLastCycle(sw_name));
     820           0 :       if(plumedStopCondition || signalReceived.load()) fast_finish=true;
     821             :     }
     822           0 :     auto elapsed=std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-initial_time).count();
     823           0 :     if(part==0) part=1;
     824           0 :     if(part<2) {
     825           0 :       if((maxtime>0 && elapsed>(long long int)(0.2*1e9*maxtime)) || (nf>0 && step+1>=nf/5) || (maxtime<0 && nf<0 && step+1>=100)) {
     826             :         part=2;
     827           0 :         log<<"Warm-up completed\n";
     828             :       }
     829             :     }
     830           0 :     if(part<3) {
     831           0 :       if((maxtime>0 && elapsed>(long long int)(0.6*1e9*maxtime)) || (nf>0 && step+1>=3*nf/5)) {
     832             :         part=3;
     833           0 :         log<<"60% completed\n";
     834             :       }
     835             :     }
     836             : 
     837           0 :     if(maxtime>0 && elapsed>(long long int)(1e9*maxtime)) fast_finish=true;
     838             : 
     839             :     {
     840           0 :       unsigned tmp=fast_finish;
     841           0 :       pc.Bcast(tmp,0);
     842           0 :       fast_finish=tmp;
     843             :     }
     844           0 :     if(fast_finish) break;
     845             :   }
     846             : 
     847             :   return 0;
     848           0 : }
     849             : 
     850             : } // namespace unnamed
     851             : } // namespace cltools
     852             : } // namespace PLMD

Generated by: LCOV version 1.16