LCOV - plumed test coverage - cltools/Benchmark.cpp

LCOV - code coverage report

Current view:	top level - cltools - Benchmark.cpp (source / functions)		Hit	Total	Coverage
Test:	plumed test coverage	Lines:	20	381	5.2 %
Date:	2025-04-08 21:11:17	Functions:	6	27	22.2 %

          Line data    Source code

       1             : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
       2             :    Copyright (c) 2019-2023 The plumed team
       3             :    (see the PEOPLE file at the root of the distribution for a list of names)
       4             : 
       5             :    See http://www.plumed.org for more information.
       6             : 
       7             :    This file is part of plumed, version 2.
       8             : 
       9             :    plumed is free software: you can redistribute it and/or modify
      10             :    it under the terms of the GNU Lesser General Public License as published by
      11             :    the Free Software Foundation, either version 3 of the License, or
      12             :    (at your option) any later version.
      13             : 
      14             :    plumed is distributed in the hope that it will be useful,
      15             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      16             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      17             :    GNU Lesser General Public License for more details.
      18             : 
      19             :    You should have received a copy of the GNU Lesser General Public License
      20             :    along with plumed.  If not, see <http://www.gnu.org/licenses/>.
      21             : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
      22             : #include "CLTool.h"
      23             : #include "core/CLToolRegister.h"
      24             : #include "tools/Communicator.h"
      25             : #include "tools/Tools.h"
      26             : #include "config/Config.h"
      27             : #include "tools/PlumedHandle.h"
      28             : #include "tools/Stopwatch.h"
      29             : #include "tools/Log.h"
      30             : #include "tools/DLLoader.h"
      31             : #include "tools/Random.h"
      32             : 
      33             : #include <cstdio>
      34             : #include <string>
      35             : #include <vector>
      36             : #include <fstream>
      37             : #include <atomic>
      38             : #include <csignal>
      39             : #include <cstdio>
      40             : #include <random>
      41             : #include <algorithm>
      42             : #include <chrono>
      43             : #include <string_view>
      44             : 
      45             : namespace PLMD {
      46             : namespace cltools {
      47             : 
      48             : //+PLUMEDOC TOOLS benchmark
      49             : /*
      50             : benchmark is a lightweight reimplementation of [driver](driver.md) that can be used to run benchmark calculations
      51             : 
      52             : The main difference between [driver](driver.md) and benchmark is that benchmark generates a trajectory in memory rather than reading a
      53             : trajectory from a file. This approach is better for timing the overhead of the plumed library.  If you do similar benchmarking with driver
      54             : the timings you get are dominated by the time spent doing the I/O operations that are required to read the trajectory.
      55             : 
      56             : ## Basic usage
      57             : 
      58             : If you want to use benchmark you first create a sample `plumed.dat` file for testing. For example:
      59             : 
      60             : ```plumed
      61             : WHOLEMOLECULES ENTITY0=1-10000
      62             : p: POSITION ATOM=1
      63             : RESTRAINT ARG=p.x KAPPA=1 AT=0
      64             : ```
      65             : 
      66             : You can then run this benchmark using the following command:
      67             : 
      68             : ```plumed
      69             : plumed benchmark
      70             : ```
      71             : 
      72             : Notice, that benchmark will read an input file called `plumed.dat` by default.  You can specify a different name for you PLUMED input file
      73             : by using the `--plumed` flag.
      74             : 
      75             : ## Running with a different PLUMED version
      76             : 
      77             : If you want to run a benchmark against a previous plumed version in a controlled setting you can do so by using the command:
      78             : 
      79             : ```plumed
      80             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so
      81             : ```
      82             : 
      83             : If you use this command the version of PLUMED that is in your environment calls the version of the library that is specified using the
      84             : `--kernel` flag.  Running the benchmark in this way ensures that you are running in a controlled setting, where systematic errors
      85             : in the comparison are minimized.
      86             : 
      87             : > [!WARNING]
      88             : > You use the `plumed-runtime` executable here to avoid conflicts between different
      89             : > plumed versions. You will find the `plumed-runtime` executable in your path if you are using the non installed version of plumed,
      90             : > and in `$prefix/lib/plumed` if you installed plumed in $prefix,.
      91             : 
      92             : ## Comparing multiple versions
      93             : 
      94             : The best way to compare two versions of plumed on the same input is to pass multiple colon-separated kernels as shown below:
      95             : 
      96             : ```plumed
      97             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:/path2/to/lib/libplumedKernel.so:this
      98             : ```
      99             : 
     100             : Here `this` means the kernel of the version with which you are running the benchmark. This comparison runs the three
     101             : instances simultaneously (alternating them) so that systematic differences in the load of your machine will affect them
     102             : to the same extent.
     103             : 
     104             : In case the different versions require modified plumed.dat files, or if you simply want to compare
     105             : two different plumed input files that compute the same thing, you can also use multiple plumed input files:
     106             : 
     107             : ```plumed
     108             : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:this --plumed plumed1.dat:plumed2.dat
     109             : ```
     110             : 
     111             : Similarly, you might want to run two different inputs using the same kernel by using an input like this:
     112             : 
     113             : ```plumed
     114             : plumed-runtime benchmark --plumed plumed1.dat:plumed2.dat
     115             : ```
     116             : 
     117             : ## Profiling
     118             : 
     119             : If you want to attach a profiler to the process on the fly, you might find it convenient to use `--nsteps -1`.
     120             : This options ensures that the simulation runs forever unless interrupted with CTRL-C. When interrupted, the result of the timers
     121             : should be displayed anyway.
     122             : You can also set a maximum time for the calculating by using the `--maxtime` flag.
     123             : 
     124             : If you run a profiler when testing multiple PLUMED versions it can be difficult to determine which function is from
     125             : each version. We therefore recommended you recompile separate PLUMED instances with a separate C++ namespace (`-DPLMD=PLUMED_version_1`)
     126             : so that you will be able to distinguish them. In addition, compiling with `CXXFLAGS="-g -O3"` will make the profiling
     127             : report more complete and will likely highlight lines of code that are particularly computationally demanding.
     128             : 
     129             : ## MPI runs
     130             : 
     131             : You can also run a benchmark that emulates a domain decomposition if plumed has been compiled with MPI
     132             : and you run with `mpirun` and a command like the one shown below:
     133             : 
     134             : ```plumed
     135             : mpirun -np 4 plumed-runtime benchmark
     136             : ```
     137             : 
     138             : If you load separate PLUMED instances as discussed above, they should all be compiled against the same MPI version.
     139             : Notice that when using MPI signals (CTRL-C) might not work.
     140             : 
     141             : Since some of the data transfer could happen asynchronously, you might want to use the `--sleep` option
     142             : to simulate a lag between the `prepareCalc` and `performCalc` actions. This part of the calculation will not contribute
     143             : to the output timings, but will obviously slow down your test.
     144             : 
     145             : ## Output
     146             : 
     147             : In the output you will see the usual reports about timings produced by the internal
     148             : timers of the tested plumed instances.
     149             : 
     150             : In addition, this tool monitors the timing externally, with some slightly different criterion:
     151             : 
     152             : - First, the initialization (construction of the input) will be shown with a separate timer,
     153             :   as well as the timing for the first step.
     154             : - Second, the timer corresponding to the calculation will be split in three parts, reporting
     155             :   execution of the first 20% (warm-up) and the next two blocks of 40% each.
     156             : - Finally, you might notice some discrepancies because some of the actions that are usually
     157             :   not expensive are not included in the internal timers. The external timer will
     158             :   thus provide a better estimate of the total elapsed time that includes everything.
     159             : 
     160             : The internal timers are still useful to monitor what happens at the different stages
     161             : of the calculattion.  If you want more detailed information you can also use a
     162             : [DEBUG](DEBUG.md) action with the `DETAILED_TIMERS`, to determine how much time is spnt in each action.
     163             : 
     164             : When you run multiple version, a comparative analisys of the time spent within PLUMED in the various
     165             : instances will be done.  For each PLUMED instance you run, this analysis shows the ratio between the total time each PLUMED instance ran for and the total time the first
     166             : PLUMED instance ran for. In other words, the first time that the first PLUMED instance ran for is used as the basis for comparisons. Errors on these estimates of the timings
     167             : are calculated using bootstrapping and the warm-up phase is discarded in the analysis.
     168             : 
     169             : */
     170             : //+ENDPLUMEDOC
     171             : 
     172             : // We use an anonymous namespace here to avoid clashes with variables
     173             : // declared in other parts of the code
     174             : namespace {
     175             : 
     176             : //this is a sugar for changing idea faster about the rng
     177             : using generator = std::mt19937;
     178             : 
     179             : std::atomic<bool> signalReceived(false);
     180             : 
     181             : class SignalHandlerGuard {
     182             : public:
     183           0 :   SignalHandlerGuard(int signal, void (*newHandler)(int)) : signal_(signal) {
     184             :     // Store the current handler before setting the new one
     185           0 :     prevHandler_ = std::signal(signal, newHandler);
     186           0 :     if (prevHandler_ == SIG_ERR) {
     187           0 :       throw std::runtime_error("Failed to set signal handler");
     188             :     }
     189           0 :   }
     190             : 
     191             :   ~SignalHandlerGuard() {
     192             :     // Restore the previous handler upon destruction
     193           0 :     std::signal(signal_, prevHandler_);
     194           0 :   }
     195             : 
     196             :   // Delete copy constructor and assignment operator to prevent copying
     197             :   SignalHandlerGuard(const SignalHandlerGuard&) = delete;
     198             :   SignalHandlerGuard& operator=(const SignalHandlerGuard&) = delete;
     199             : 
     200             : private:
     201             :   int signal_;
     202             :   void (*prevHandler_)(int);
     203             : };
     204             : 
     205           0 : extern "C" void signalHandler(int signal) {
     206           0 :   if (signal == SIGINT) {
     207             :     signalReceived.store(true);
     208           0 :     fprintf(stderr, "Signal interrupt received\n");
     209             :   }
     210           0 :   if (signal == SIGTERM) {
     211             :     signalReceived.store(true);
     212           0 :     fprintf(stderr, "Signal termination received\n");
     213             :   }
     214           0 : }
     215             : 
     216             : /// This base class contains members that are movable with default operations
     217             : struct KernelBase {
     218             :   std::string path;
     219             :   std::string plumed_dat;
     220             :   PlumedHandle handle;
     221             :   Stopwatch stopwatch;
     222             :   std::vector<long long int> timings;
     223             :   double comparative_timing=-1.0;
     224             :   double comparative_timing_error=-1.0;
     225           0 :   KernelBase(const std::string & path_,const std::string & plumed_dat_, Log* log_):
     226           0 :     path(path_),
     227           0 :     plumed_dat(plumed_dat_),
     228           0 :     handle([&]() {
     229           0 :     if(path_=="this") {
     230           0 :       return PlumedHandle();
     231             :     } else {
     232           0 :       return PlumedHandle::dlopen(path.c_str());
     233             :     }
     234             :   }()),
     235           0 :   stopwatch(*log_) {
     236           0 :   }
     237             : };
     238             : 
     239             : /// Local structure handling a kernel and the related timers.
     240             : /// This structure specifically contain the Log, which needs special treatment
     241             : /// in move semantics
     242             : struct Kernel :
     243             :   public KernelBase {
     244             :   Log* log=nullptr;
     245           0 :   Kernel(const std::string & path_,const std::string & the_plumed_dat, Log* log_):
     246             :     KernelBase(path_,the_plumed_dat,log_),
     247           0 :     log(log_) {
     248             :   }
     249             : 
     250           0 :   ~Kernel() {
     251           0 :     if(log) {
     252           0 :       (*log)<<"\n";
     253           0 :       (*log)        <<"Kernel:      "<<path<<"\n";
     254           0 :       (*log)        <<"Input:       "<<plumed_dat<<"\n";
     255           0 :       if(comparative_timing>0.0) {
     256           0 :         (*log).printf("Comparative: %.3f +- %.3f\n",comparative_timing,comparative_timing_error);
     257             :       }
     258             :     }
     259           0 :   }
     260             : 
     261           0 :   Kernel(Kernel && other) noexcept:
     262             :     KernelBase(std::move(other)),
     263           0 :     log(other.log) {
     264           0 :     other.log=nullptr; // ensure no log is done in the moved away object
     265             :   }
     266             : 
     267           0 :   Kernel & operator=(Kernel && other) noexcept {
     268           0 :     if(this != &other) {
     269           0 :       KernelBase::operator=(std::move(other));
     270           0 :       log=other.log;
     271           0 :       other.log=nullptr; // ensure no log is done in the moved away object
     272             :     }
     273           0 :     return *this;
     274             :   }
     275             : };
     276             : 
     277             : namespace  {
     278             : 
     279             : class UniformSphericalVector {
     280             :   //double rminCub;
     281             :   double rCub;
     282             : 
     283             : public:
     284             :   //assuming rmin=0
     285           0 :   UniformSphericalVector(const double rmax):
     286           0 :     rCub (rmax*rmax*rmax/*-rminCub*/) {}
     287           0 :   PLMD::Vector operator()(Random& rng) {
     288           0 :     double rho = std::cbrt (/*rminCub + */rng.RandU01()*rCub);
     289           0 :     double theta =std::acos (2.0*rng.RandU01() -1.0);
     290           0 :     double phi = 2.0 * PLMD::pi * rng.RandU01();
     291             :     return Vector (
     292           0 :              rho * sin (theta) * cos (phi),
     293           0 :              rho * sin (theta) * sin (phi),
     294           0 :              rho * cos (theta));
     295             :   }
     296             : };
     297             : 
     298             : ///Acts as a template for any distribution
     299             : struct AtomDistribution {
     300             :   virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&)=0;
     301           0 :   virtual void box(std::vector<double>& box, unsigned /*natoms*/, unsigned /*step*/, Random&) {
     302             :     std::fill(box.begin(), box.end(),0);
     303           0 :   };
     304             :   virtual ~AtomDistribution() noexcept {}
     305             : };
     306             : 
     307           0 : struct theLine:public AtomDistribution {
     308           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned step, Random&rng) override {
     309             :     auto nat = posToUpdate.size();
     310             :     UniformSphericalVector usv(0.5);
     311             : 
     312           0 :     for (unsigned i=0; i<nat; ++i) {
     313           0 :       posToUpdate[i] = Vector(i, 0, 0) + usv(rng);
     314             :     }
     315           0 :   }
     316             : };
     317             : 
     318           0 : struct uniformSphere:public AtomDistribution {
     319           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     320             : 
     321             :     //giving more or less a cubic udm of volume for each atom: V=nat
     322           0 :     const double rmax= std::cbrt ((3.0/(4.0*PLMD::pi)) * posToUpdate.size());
     323             : 
     324             :     UniformSphericalVector usv(rmax);
     325             :     auto s=posToUpdate.begin();
     326             :     auto e=posToUpdate.end();
     327             :     //I am using the iterators:this is slightly faster,
     328             :     // enough to overcome the cost of the vtable that I added
     329           0 :     for (unsigned i=0; s!=e; ++s,++i) {
     330           0 :       *s = usv (rng);
     331             :     }
     332             : 
     333           0 :   }
     334           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     335           0 :     const double rmax= 2.0*std::cbrt((3.0/(4.0*PLMD::pi)) * natoms);
     336           0 :     box[0]=rmax;
     337           0 :     box[1]=0.0;
     338           0 :     box[2]=0.0;
     339           0 :     box[3]=0.0;
     340           0 :     box[4]=rmax;
     341           0 :     box[5]=0.0;
     342           0 :     box[6]=0.0;
     343           0 :     box[7]=0.0;
     344           0 :     box[8]=rmax;
     345             : 
     346           0 :   }
     347             : };
     348             : 
     349           0 : struct twoGlobs: public AtomDistribution {
     350           0 :   virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&rng) {
     351             :     //I am using two unigform spheres and 2V=n
     352           0 :     const double rmax= std::cbrt ((3.0/(8.0*PLMD::pi)) * posToUpdate.size());
     353             : 
     354             :     UniformSphericalVector usv(rmax);
     355             :     std::array<Vector,2> centers{
     356             :       PLMD::Vector{0.0,0.0,0.0},
     357             : //so they do not overlap
     358             :       PLMD::Vector{2.0*rmax,2.0*rmax,2.0*rmax}
     359           0 :     };
     360           0 :     std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
     361             :       //RandInt is only declared
     362             :       // return usv (rng) + centers[rng.RandInt(1)];
     363           0 :       return usv (rng) + centers[rng.RandU01()>0.5];
     364             :     });
     365           0 :   }
     366             : 
     367           0 :   virtual void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) {
     368             : 
     369           0 :     const double rmax= 4.0 * std::cbrt ((3.0/(8.0*PLMD::pi)) * natoms);
     370           0 :     box[0]=rmax;
     371           0 :     box[1]=0.0;
     372           0 :     box[2]=0.0;
     373           0 :     box[3]=0.0;
     374           0 :     box[4]=rmax;
     375           0 :     box[5]=0.0;
     376           0 :     box[6]=0.0;
     377           0 :     box[7]=0.0;
     378           0 :     box[8]=rmax;
     379           0 :   };
     380             : };
     381             : 
     382           0 : struct uniformCube:public AtomDistribution {
     383           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     384             :     //giving more or less a cubic udm of volume for each atom: V = nat
     385           0 :     const double rmax = std::cbrt(static_cast<double>(posToUpdate.size()));
     386             : 
     387             : 
     388             : 
     389             :     // std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
     390             :     //   return Vector (rndR(rng),rndR(rng),rndR(rng));
     391             :     // });
     392             :     auto s=posToUpdate.begin();
     393             :     auto e=posToUpdate.end();
     394             :     //I am using the iterators:this is slightly faster,
     395             :     // enough to overcome the cost of the vtable that I added
     396           0 :     for (unsigned i=0; s!=e; ++s,++i) {
     397           0 :       *s = Vector (rng.RandU01()*rmax,rng.RandU01()*rmax,rng.RandU01()*rmax);
     398             :     }
     399           0 :   }
     400           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     401             :     //+0.05 to avoid overlap
     402           0 :     const double rmax= std::cbrt(natoms)+0.05;
     403           0 :     box[0]=rmax;
     404           0 :     box[1]=0.0;
     405           0 :     box[2]=0.0;
     406           0 :     box[3]=0.0;
     407           0 :     box[4]=rmax;
     408           0 :     box[5]=0.0;
     409           0 :     box[6]=0.0;
     410           0 :     box[7]=0.0;
     411           0 :     box[8]=rmax;
     412             : 
     413           0 :   }
     414             : };
     415             : 
     416           0 : struct tiledSimpleCubic:public AtomDistribution {
     417           0 :   void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
     418             :     //Tiling the space in this way will not tests 100% the pbc, but
     419             :     //I do not think that write a spacefilling curve, like Hilbert, Peano or Morton
     420             :     //could be a good idea, in this case
     421           0 :     const unsigned rmax = std::ceil(std::cbrt(static_cast<double>(posToUpdate.size())));
     422             : 
     423             :     auto s=posToUpdate.begin();
     424             :     auto e=posToUpdate.end();
     425             :     //I am using the iterators:this is slightly faster,
     426             :     // enough to overcome the cost of the vtable that I added
     427           0 :     for (unsigned k=0; k<rmax&&s!=e; ++k) {
     428           0 :       for (unsigned j=0; j<rmax&&s!=e; ++j) {
     429           0 :         for (unsigned i=0; i<rmax&&s!=e; ++i) {
     430           0 :           *s = Vector (i,j,k);
     431             :           ++s;
     432             :         }
     433             :       }
     434             :     }
     435           0 :   }
     436           0 :   void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
     437           0 :     const double rmax= std::ceil(std::cbrt(static_cast<double>(natoms)));;
     438           0 :     box[0]=rmax;
     439           0 :     box[1]=0.0;
     440           0 :     box[2]=0.0;
     441           0 :     box[3]=0.0;
     442           0 :     box[4]=rmax;
     443           0 :     box[5]=0.0;
     444           0 :     box[6]=0.0;
     445           0 :     box[7]=0.0;
     446           0 :     box[8]=rmax;
     447             : 
     448           0 :   }
     449             : };
     450           0 : std::unique_ptr<AtomDistribution> getAtomDistribution(std::string_view atomicDistr) {
     451           0 :   std::unique_ptr<AtomDistribution> distribution;
     452           0 :   if(atomicDistr == "line") {
     453           0 :     distribution = std::make_unique<theLine>();
     454           0 :   } else if (atomicDistr == "cube") {
     455           0 :     distribution = std::make_unique<uniformCube>();
     456           0 :   } else if (atomicDistr == "sphere") {
     457           0 :     distribution = std::make_unique<uniformSphere>();
     458           0 :   } else if (atomicDistr == "globs") {
     459           0 :     distribution = std::make_unique<twoGlobs>();
     460           0 :   } else if (atomicDistr == "sc") {
     461           0 :     distribution = std::make_unique<tiledSimpleCubic>();
     462             :   } else {
     463           0 :     plumed_error() << R"(The atomic distribution can be only "line", "cube", "sphere", "globs" and "sc", the input was ")"
     464           0 :                    << atomicDistr <<'"';
     465             :   }
     466           0 :   return distribution;
     467           0 : }
     468             : } //anonymus namespace for benchmark distributions
     469             : class Benchmark:
     470             :   public CLTool {
     471             : public:
     472             :   static void registerKeywords( Keywords& keys );
     473             :   explicit Benchmark(const CLToolOptions& co );
     474             :   int main(FILE* in, FILE*out,Communicator& pc) override;
     475           5 :   std::string description()const override {
     476           5 :     return "run a calculation with a fixed trajectory to find bottlenecks in PLUMED";
     477             :   }
     478             : };
     479             : 
     480       16259 : PLUMED_REGISTER_CLTOOL(Benchmark,"benchmark")
     481             : 
     482        5418 : void Benchmark::registerKeywords( Keywords& keys ) {
     483        5418 :   CLTool::registerKeywords( keys );
     484        5418 :   keys.add("compulsory","--plumed","plumed.dat","colon separated path(s) to the input file(s)");
     485        5418 :   keys.add("compulsory","--kernel","this","colon separated path(s) to kernel(s)");
     486        5418 :   keys.add("compulsory","--natoms","100000","the number of atoms to use for the simulation");
     487        5418 :   keys.add("compulsory","--nsteps","2000","number of steps of MD to perform (-1 means forever)");
     488        5418 :   keys.add("compulsory","--maxtime","-1","maximum number of seconds (-1 means forever)");
     489        5418 :   keys.add("compulsory","--sleep","0","number of seconds of sleep, mimicking MD calculation");
     490        5418 :   keys.add("compulsory","--atom-distribution","line","the kind of possible atomic displacement at each step");
     491        5418 :   keys.add("optional","--dump-trajectory","dump the trajectory to this file");
     492        5418 :   keys.addFlag("--domain-decomposition",false,"simulate domain decomposition, implies --shuffle");
     493        5418 :   keys.addFlag("--shuffled",false,"reshuffle atoms");
     494        5418 : }
     495             : 
     496           5 : Benchmark::Benchmark(const CLToolOptions& co ):
     497           5 :   CLTool(co) {
     498           5 :   inputdata=commandline;
     499           5 : }
     500             : 
     501             : 
     502           0 : int Benchmark::main(FILE* in, FILE*out,Communicator& pc) {
     503             :   // deterministic initializations to avoid issues with MPI
     504             :   generator rng;
     505           0 :   PLMD::Random atomicGenerator;
     506           0 :   std::unique_ptr<AtomDistribution> distribution;
     507             : 
     508             :   struct FileDeleter {
     509             :     void operator()(FILE*f) const noexcept {
     510             :       if(f) {
     511           0 :         std::fclose(f);
     512             :       }
     513           0 :     }
     514             :   };
     515             : 
     516           0 :   std::unique_ptr<FILE,FileDeleter> log_dev_null{std::fopen("/dev/null","w")};
     517             : 
     518           0 :   Log log;
     519           0 :   if(pc.Get_rank()==0) {
     520           0 :     log.link(out);
     521             :   } else {
     522           0 :     log.link(log_dev_null.get());
     523             :   }
     524           0 :   log.setLinePrefix("BENCH:  ");
     525           0 :   log <<"Welcome to PLUMED benchmark\n";
     526             :   std::vector<Kernel> kernels;
     527             : 
     528             :   // perform comparative analysis
     529             :   // ensure that kernels vector is destroyed from last to first element upon exit
     530           0 :   auto kernels_deleter=[&log](auto f) {
     531           0 :     if(!f) {
     532           0 :       return;
     533             :     }
     534           0 :     if(f->empty()) {
     535             :       return;
     536             :     }
     537             :     generator bootstrapRng;
     538             : 
     539             :     const auto size=f->back().timings.size();
     540             :     //B are the bootstrap iterations
     541             :     constexpr int B=200;
     542           0 :     const size_t numblocks=size;
     543             :     // for some reasons, blocked bootstrap underestimates error
     544             :     // For now I keep it off. If I remove it, the code below could be simplified
     545             :     // if(numblocks>20) numblocks=20;
     546           0 :     const auto blocksize=size/numblocks;
     547             : 
     548           0 :     if(f->size()<2) {
     549           0 :       log<<"Single run, skipping comparative analysis\n";
     550           0 :     } else if(size<10) {
     551           0 :       log<<"Too small sample, skipping comparative analysis\n";
     552             :     } else
     553             :       try {
     554             : 
     555           0 :         log<<"Running comparative analysis, "<<numblocks<<" blocks with size "<<blocksize<<"\n";
     556             : 
     557           0 :         std::vector<std::size_t> choice(size);
     558           0 :         std::uniform_int_distribution<> distrib(0, numblocks-1);
     559           0 :         std::vector<std::vector<long long int>> blocks(f->size());
     560             : 
     561             :         {
     562             :           int i=0;
     563           0 :           for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
     564             :             size_t l=0;
     565           0 :             blocks[i].assign(numblocks,0);
     566           0 :             for(auto j=0ULL; j<numblocks; j++) {
     567           0 :               for(auto k=0ULL; k<blocksize; k++) {
     568           0 :                 plumed_assert(l<it->timings.size());
     569           0 :                 blocks[i][j]+=it->timings[l];
     570           0 :                 l++;
     571             :               }
     572             :             }
     573             :           }
     574             :         }
     575             : 
     576           0 :         std::vector<std::vector<double>> ratios(f->size());
     577           0 :         for(auto & r : ratios) {
     578             :           //B are the bootstrap iterations
     579           0 :           r.resize(B);
     580             :         }
     581             : 
     582             :         //B are the bootstrap iterations
     583           0 :         for(unsigned b=0; b<B; b++) {
     584           0 :           for(auto & c : choice) {
     585           0 :             c=distrib(bootstrapRng);
     586             :           }
     587             :           long long int reference=0;
     588           0 :           for(auto & c : choice) {
     589           0 :             reference+=blocks[0][c];
     590             :           }
     591           0 :           for(auto i=0ULL; i<blocks.size(); i++) {
     592             :             long long int estimate=0;
     593             :             // this would lead to separate bootstrap samples for each estimate:
     594             :             // for(auto & c : choice){c=distrib(bootstrapRng);}
     595           0 :             for(auto & c : choice) {
     596           0 :               estimate+=blocks[i][c];
     597             :             }
     598           0 :             ratios[i][b]=double(estimate)/double(reference);
     599             :           }
     600             :         }
     601             : 
     602             :         {
     603             :           int i=0;
     604           0 :           for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
     605             :             double sum=0.0;
     606             :             double sum2=0.0;
     607           0 :             for(auto r : ratios[i]) {
     608           0 :               sum+=r;
     609           0 :               sum2+=r*r;
     610             :             }
     611             :             //B are the bootstrap iterations
     612           0 :             it->comparative_timing=sum/B;
     613           0 :             it->comparative_timing_error=std::sqrt(sum2/B-sum*sum/(B*B));
     614             :           }
     615             :         }
     616             : 
     617           0 :       } catch(std::exception & e) {
     618           0 :         log<<"Unexpected error during comparative analysis\n";
     619           0 :         log<<e.what()<<"\n";
     620             :       }
     621           0 :     while(!f->empty()) {
     622             :       f->pop_back();
     623             :     }
     624             : 
     625             :   };
     626             :   std::unique_ptr<decltype(kernels),decltype(kernels_deleter)> kernels_deleter_obj(&kernels,kernels_deleter);
     627             : 
     628             : 
     629             :   // construct the kernels vector:
     630             :   {
     631             :     std::vector<std::string> allpaths;
     632             : 
     633             :     {
     634             :       std::string paths;
     635           0 :       parse("--kernel",paths);
     636           0 :       log <<"Using --kernel=" << paths << "\n";
     637           0 :       allpaths=Tools::getWords(paths,":");
     638             :     }
     639             : 
     640             :     std::vector<std::string> allplumed;
     641             :     {
     642             :       std::string paths;
     643           0 :       parse("--plumed",paths);
     644           0 :       log <<"Using --plumed=" << paths << "\n";
     645           0 :       allplumed=Tools::getWords(paths,":");
     646             :     }
     647             : 
     648           0 :     plumed_assert(allplumed.size()>0 && allpaths.size()>0);
     649             : 
     650             :     // this check only works on MacOS
     651             : #if defined(__APPLE__)
     652             :     // if any of the paths if different from "this", we check if libplumed was loaded locally to avoid conflicts.
     653             :     if(std::any_of(allpaths.begin(),allpaths.end(),[](auto value) {
     654             :     return value != "this";
     655             :   })) {
     656             :       if(DLLoader::isPlumedGlobal()) {
     657             :         plumed_error()<<"It looks like libplumed is loaded in the global namespace, you cannot load a different version of the kernel\n"
     658             :                       <<"Please make sure you use the plumed-runtime executable and that the env var PLUMED_LOAD_NAMESPACE is not set to GLOBAL";
     659             :       }
     660             :     }
     661             : #endif
     662             : 
     663           0 :     if(allplumed.size()>1 && allpaths.size()>1 && allplumed.size() != allpaths.size()) {
     664           0 :       plumed_error() << "--kernel and --plumed should have either one element or the same number of elements";
     665             :     }
     666             : 
     667           0 :     if(allplumed.size()>1 && allpaths.size()==1)
     668           0 :       for(unsigned i=1; i<allplumed.size(); i++) {
     669           0 :         allpaths.push_back(allpaths[0]);
     670             :       }
     671           0 :     if(allplumed.size()==1 && allpaths.size()>1)
     672           0 :       for(unsigned i=1; i<allpaths.size(); i++) {
     673           0 :         allplumed.push_back(allplumed[0]);
     674             :       }
     675             : 
     676           0 :     for(unsigned i=0; i<allpaths.size(); i++) {
     677           0 :       kernels.emplace_back(allpaths[i],allplumed[i],&log);
     678             :     }
     679           0 :   }
     680             : 
     681             :   // reverse order so that log happens in the forward order:
     682           0 :   std::reverse(kernels.begin(),kernels.end());
     683             : 
     684             :   // read other flags:
     685           0 :   bool shuffled=false;
     686           0 :   parseFlag("--shuffled",shuffled);
     687             : 
     688             :   int nf;
     689           0 :   parse("--nsteps",nf);
     690           0 :   log << "Using --nsteps=" << nf << "\n";
     691             :   unsigned natoms;
     692           0 :   parse("--natoms",natoms);
     693           0 :   log << "Using --natoms=" << natoms << "\n";
     694             :   double maxtime;
     695           0 :   parse("--maxtime",maxtime);
     696           0 :   log << "Using --maxtime=" << maxtime << "\n";
     697             : 
     698           0 :   bool domain_decomposition=false;
     699           0 :   parseFlag("--domain-decomposition",domain_decomposition);
     700             : 
     701           0 :   if(pc.Get_size()>1) {
     702           0 :     domain_decomposition=true;
     703             :   }
     704           0 :   if(domain_decomposition) {
     705           0 :     shuffled=true;
     706             :   }
     707             : 
     708           0 :   if (shuffled) {
     709           0 :     log << "Using --shuffled\n";
     710             :   }
     711           0 :   if (domain_decomposition) {
     712           0 :     log << "Using --domain-decomposition\n";
     713             :   }
     714             : 
     715             :   double timeToSleep;
     716           0 :   parse("--sleep",timeToSleep);
     717           0 :   log << "Using --sleep=" << timeToSleep << "\n";
     718             : 
     719             :   std::vector<int> shuffled_indexes;
     720             : 
     721             :   {
     722             :     std::string atomicDistr;
     723           0 :     parse("--atom-distribution",atomicDistr);
     724           0 :     distribution = getAtomDistribution(atomicDistr);
     725           0 :     log << "Using --atom-distribution=" << atomicDistr << "\n";
     726             :   }
     727             : 
     728             :   {
     729             :     std::string fileToDump;
     730           0 :     if(parse("--dump-trajectory",fileToDump)) {
     731           0 :       log << "Saving the trajectory to \"" << fileToDump << "\" and exiting\n";
     732           0 :       std::vector<double> cell(9);
     733           0 :       std::vector<Vector> pos(natoms);
     734           0 :       std::ofstream ofile(fileToDump);
     735           0 :       if (nf<0) {
     736             :         //if the user accidentally sets infinite steps, we set it to print only one
     737           0 :         nf=1;
     738             :       }
     739           0 :       for(int step=0; step<nf; ++step) {
     740           0 :         auto sw=kernels[0].stopwatch.startStop("TrajectoryGeneration");
     741           0 :         distribution->positions(pos,step,atomicGenerator);
     742           0 :         distribution->box(cell,natoms,step,atomicGenerator);
     743           0 :         ofile << natoms << "\n"
     744           0 :               << cell[0] << " " << cell[1] << " " << cell[2] << " "
     745           0 :               << cell[3] << " " << cell[4] << " " << cell[5] << " "
     746           0 :               << cell[6] << " " << cell[7] << " " << cell[8] << "\n";
     747           0 :         for(int i=0; i<natoms; ++i) {
     748           0 :           ofile << "X\t" << pos[i]<< "\n";
     749             :         }
     750           0 :       }
     751           0 :       ofile.close();
     752             :       return 0;
     753           0 :     }
     754             :   }
     755             : 
     756           0 :   log <<"Initializing the setup of the kernel(s)\n";
     757           0 :   const auto initial_time=std::chrono::high_resolution_clock::now();
     758             : 
     759           0 :   for(auto & k : kernels) {
     760             :     auto & p(k.handle);
     761           0 :     auto sw=k.stopwatch.startStop("A Initialization");
     762           0 :     if(Communicator::plumedHasMPI() && domain_decomposition) {
     763           0 :       p.cmd("setMPIComm",&pc.Get_comm());
     764             :     }
     765           0 :     p.cmd("setRealPrecision",(int)sizeof(double));
     766           0 :     p.cmd("setMDLengthUnits",1.0);
     767           0 :     p.cmd("setMDChargeUnits",1.0);
     768           0 :     p.cmd("setMDMassUnits",1.0);
     769           0 :     p.cmd("setMDEngine","benchmarks");
     770           0 :     p.cmd("setTimestep",1.0);
     771           0 :     p.cmd("setPlumedDat",k.plumed_dat.c_str());
     772           0 :     p.cmd("setLog",out);
     773           0 :     p.cmd("setNatoms",natoms);
     774           0 :     p.cmd("init");
     775           0 :   }
     776             : 
     777           0 :   std::vector<double> cell( 9 ), virial( 9 );
     778           0 :   std::vector<Vector> pos( natoms ), forces( natoms );
     779           0 :   std::vector<double> masses( natoms, 1 ), charges( natoms, 0 );
     780             : 
     781           0 :   if(shuffled) {
     782           0 :     shuffled_indexes.resize(natoms);
     783           0 :     for(unsigned i=0; i<natoms; i++) {
     784           0 :       shuffled_indexes[i]=i;
     785             :     }
     786           0 :     std::shuffle(shuffled_indexes.begin(),shuffled_indexes.end(),rng);
     787             :   }
     788             : 
     789             :   // non owning pointers, used for shuffling the execution order
     790             :   std::vector<Kernel*> kernels_ptr;
     791           0 :   for(unsigned i=0; i<kernels.size(); i++) {
     792           0 :     kernels_ptr.push_back(&kernels[i]);
     793             :   }
     794             : 
     795           0 :   int plumedStopCondition=0;
     796             :   bool fast_finish=false;
     797             :   int part=0;
     798             : 
     799           0 :   log<<"Starting MD loop\n";
     800           0 :   log<<"Use CTRL+C to stop at any time and collect timers (not working in MPI runs)\n";
     801             :   // trap signals:
     802           0 :   SignalHandlerGuard sigIntGuard(SIGINT, signalHandler);
     803           0 :   SignalHandlerGuard sigTermGuard(SIGTERM, signalHandler);
     804             : 
     805           0 :   for(int step=0; nf<0 || step<nf; ++step) {
     806           0 :     std::shuffle(kernels_ptr.begin(),kernels_ptr.end(),rng);
     807           0 :     distribution->positions(pos,step,atomicGenerator);
     808           0 :     distribution->box(cell,natoms,step,atomicGenerator);
     809             :     double* pos_ptr;
     810             :     double* for_ptr;
     811             :     double* charges_ptr;
     812             :     double* masses_ptr;
     813             :     int* indexes_ptr=nullptr;
     814             :     int n_local_atoms;
     815             : 
     816           0 :     if(domain_decomposition) {
     817           0 :       const auto nproc=pc.Get_size();
     818           0 :       const auto nn=natoms/nproc;
     819             :       //using int to remove warning, MPI don't work with unsigned
     820           0 :       int excess=natoms%nproc;
     821           0 :       const auto myrank=pc.Get_rank();
     822             :       auto shift=0;
     823           0 :       n_local_atoms=nn;
     824           0 :       if(myrank<excess) {
     825           0 :         n_local_atoms+=1;
     826             :       }
     827           0 :       for(int i=0; i<myrank; i++) {
     828           0 :         shift+=nn;
     829           0 :         if(i<excess) {
     830           0 :           shift+=1;
     831             :         }
     832             :       }
     833           0 :       pos_ptr=&pos[shift][0];
     834           0 :       for_ptr=&forces[shift][0];
     835             :       charges_ptr=&charges[shift];
     836             :       masses_ptr=&masses[shift];
     837           0 :       indexes_ptr=shuffled_indexes.data()+shift;
     838             :     } else {
     839           0 :       pos_ptr=&pos[0][0];
     840           0 :       for_ptr=&forces[0][0];
     841             :       charges_ptr=&charges[0];
     842             :       masses_ptr=&masses[0];
     843           0 :       n_local_atoms=natoms;
     844             :       indexes_ptr=shuffled_indexes.data();
     845             :     }
     846             : 
     847             :     const char* sw_name;
     848           0 :     if(part==0) {
     849             :       sw_name="B0 First step";
     850           0 :     } else if(part==1) {
     851             :       sw_name="B1 Warm-up";
     852           0 :     } else if(part==2) {
     853             :       sw_name="B2 Calculation part 1";
     854             :     } else {
     855             :       sw_name="B3 Calculation part 2";
     856             :     }
     857             : 
     858             : 
     859           0 :     for(unsigned i=0; i<kernels_ptr.size(); i++) {
     860           0 :       auto & p(kernels_ptr[i]->handle);
     861             : 
     862             :       {
     863           0 :         auto sw=kernels_ptr[i]->stopwatch.startPause(sw_name);
     864           0 :         p.cmd("setStep",step);
     865           0 :         p.cmd("setStopFlag",&plumedStopCondition);
     866           0 :         p.cmd("setForces",for_ptr, {n_local_atoms,3});
     867           0 :         p.cmd("setBox",&cell[0], {3,3});
     868           0 :         p.cmd("setVirial",&virial[0], {3,3});
     869           0 :         p.cmd("setPositions",pos_ptr, {n_local_atoms,3});
     870           0 :         p.cmd("setMasses",masses_ptr, {n_local_atoms});
     871           0 :         p.cmd("setCharges",charges_ptr, {n_local_atoms});
     872           0 :         if(shuffled) {
     873           0 :           p.cmd("setAtomsNlocal",n_local_atoms);
     874           0 :           p.cmd("setAtomsGatindex",indexes_ptr, {n_local_atoms});
     875             :         }
     876           0 :         p.cmd("prepareCalc");
     877           0 :       }
     878             : 
     879             :       // mimick MD calculation here
     880             :       {
     881             :         unsigned k=0;
     882           0 :         auto start=std::chrono::high_resolution_clock::now();
     883           0 :         while(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-start).count()<(long long int)1e9*timeToSleep) {
     884           0 :           k+=i*i;
     885             :         }
     886             :         std::fprintf(log_dev_null.get(),"%u",k);
     887             :       }
     888             : 
     889             :       {
     890           0 :         auto sw=kernels_ptr[i]->stopwatch.startStop(sw_name);
     891           0 :         p.cmd("performCalc");
     892           0 :       }
     893             : 
     894           0 :       if(kernels_ptr.size()>1 && part>1) {
     895           0 :         kernels_ptr[i]->timings.push_back(kernels_ptr[i]->stopwatch.getLastCycle(sw_name));
     896             :       }
     897           0 :       if(plumedStopCondition || signalReceived.load()) {
     898             :         fast_finish=true;
     899             :       }
     900             :     }
     901           0 :     auto elapsed=std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-initial_time).count();
     902           0 :     if(part==0) {
     903             :       part=1;
     904             :     }
     905           0 :     if(part<2) {
     906           0 :       if((maxtime>0 && elapsed>(long long int)(0.2*1e9*maxtime)) || (nf>0 && step+1>=nf/5) || (maxtime<0 && nf<0 && step+1>=100)) {
     907             :         part=2;
     908           0 :         log<<"Warm-up completed\n";
     909             :       }
     910             :     }
     911           0 :     if(part<3) {
     912           0 :       if((maxtime>0 && elapsed>(long long int)(0.6*1e9*maxtime)) || (nf>0 && step+1>=3*nf/5)) {
     913             :         part=3;
     914           0 :         log<<"60% completed\n";
     915             :       }
     916             :     }
     917             : 
     918           0 :     if(maxtime>0 && elapsed>(long long int)(1e9*maxtime)) {
     919             :       fast_finish=true;
     920             :     }
     921             : 
     922             :     {
     923           0 :       unsigned tmp=fast_finish;
     924           0 :       pc.Bcast(tmp,0);
     925           0 :       fast_finish=tmp;
     926             :     }
     927           0 :     if(fast_finish) {
     928             :       break;
     929             :     }
     930             :   }
     931             : 
     932             :   return 0;
     933           0 : }
     934             : 
     935             : } // namespace unnamed
     936             : } // namespace cltools
     937             : } // namespace PLMD

Generated by: LCOV version 1.16