Line data Source code
1 : /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 : Copyright (c) 2019-2023 The plumed team
3 : (see the PEOPLE file at the root of the distribution for a list of names)
4 :
5 : See http://www.plumed.org for more information.
6 :
7 : This file is part of plumed, version 2.
8 :
9 : plumed is free software: you can redistribute it and/or modify
10 : it under the terms of the GNU Lesser General Public License as published by
11 : the Free Software Foundation, either version 3 of the License, or
12 : (at your option) any later version.
13 :
14 : plumed is distributed in the hope that it will be useful,
15 : but WITHOUT ANY WARRANTY; without even the implied warranty of
16 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 : GNU Lesser General Public License for more details.
18 :
19 : You should have received a copy of the GNU Lesser General Public License
20 : along with plumed. If not, see <http://www.gnu.org/licenses/>.
21 : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
22 : #include "CLTool.h"
23 : #include "core/CLToolRegister.h"
24 : #include "tools/Communicator.h"
25 : #include "tools/Tools.h"
26 : #include "config/Config.h"
27 : #include "tools/PlumedHandle.h"
28 : #include "tools/Stopwatch.h"
29 : #include "tools/Log.h"
30 : #include "tools/DLLoader.h"
31 : #include "tools/Random.h"
32 :
33 : #include <cstdio>
34 : #include <string>
35 : #include <vector>
36 : #include <fstream>
37 : #include <atomic>
38 : #include <csignal>
39 : #include <cstdio>
40 : #include <random>
41 : #include <algorithm>
42 : #include <chrono>
43 : #include <string_view>
44 :
45 : namespace PLMD {
46 : namespace cltools {
47 :
48 : //+PLUMEDOC TOOLS benchmark
49 : /*
50 : benchmark is a lightweight reimplementation of driver focused on running benchmarks
51 :
52 : The main difference wrt driver is that it generates a trajectory in memory rather than reading it
53 : from a file. This allows to better time the overhead of the plumed library, without including
54 : the time needed to read the trajectory.
55 :
56 : It is also possible to load a separate version of the plumed kernel. This enables running
57 : benchmarks agaist previous plumed versions in a controlled setting, where systematic errors
58 : in the comparison are minimized.
59 :
60 : \par Examples
61 :
62 : First, you should create a sample `plumed.dat` file for testing. For instance:
63 :
64 : \plumedfile
65 : WHOLEMOLECULES ENTITY0=1-10000
66 : p: POSITION ATOM=1
67 : RESTRAINT ARG=p.x KAPPA=1 AT=0
68 : \endplumedfile
69 :
70 : Then you can test the performance of this input with the following command:
71 : \verbatim
72 : plumed benchmark
73 : \endverbatim
74 :
75 : You can also test a different (older) version of PLUMED with the same input. To do so,
76 : you should run
77 : \verbatim
78 : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so
79 : \endverbatim
80 :
81 : \warning It is necessary to use the `plumed-runtime` executable here to avoid conflicts between different
82 : plumed versions. You will find it in your path if you are using the non installed version of plumed,
83 : and in `$prefix/lib/plumed` if you installed plumed in $prefix,.
84 :
85 : \par Comparing multiple versions
86 :
87 : The best way to compare two versions of plumed on the same input is to pass multiple colon-separated kernels:
88 :
89 : \verbatim
90 : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:/path2/to/lib/libplumedKernel.so:this
91 : \endverbatim
92 :
93 : Here `this` means the kernel of the version with which you are running the benchmark. This comparison runs the three
94 : instances simultaneously (alternating them) so that systematic differences in the load of your machine will affect them
95 : to the same extent.
96 :
97 : In case the different versions require modified plumed.dat files, or if you simply want to compare
98 : two different plumed input files that compute the same thing, you can also use multiple plumed input files:
99 :
100 : \verbatim
101 : plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:this --plumed plumed1.dat:plumed2.dat
102 : \endverbatim
103 :
104 : Similarly, you might want to run two different inputs using the same kernel, which can be obtained with:
105 :
106 : \verbatim
107 : plumed-runtime benchmark --plumed plumed1.dat:plumed2.dat
108 : \endverbatim
109 :
110 : \par Profiling
111 :
112 : If you want to attach a profiler on the fly to the process, you might find it convenient to use `--nsteps -1`.
113 : The simulation will run forever and can be interrupted with CTRL-C. When interrupted, the result of the timers
114 : should be displayed anyway.
115 : You can also run setting a maximum time with `--maxtime`.
116 :
117 : If you run a profiler when testing multiple PLUMED versions you might be confused by which function is from
118 : each version. It is recommended to recompile separate instances with a separate C++ namespace (`-DPLMD=PLUMED_version_1`)
119 : so that you will be able to distinguish them. In addition, compiling with `CXXFLAGS="-g -O3"` will make the profiling
120 : report more complete, likely including code lines.
121 :
122 : \par MPI runs
123 :
124 : You can run emulating a domain decomposition. This is done automatically if plumed has been compiled with MPI
125 : and you run with `mpirun`
126 :
127 : \verbatim
128 : mpirun -np 4 plumed-runtime benchmark
129 : \endverbatim
130 :
131 : If you load separate PLUMED instances as discussed above, they should all be compiled against the same MPI version.
132 : Notice that when using MPI signals (CTRL-C) might not work.
133 :
134 : Since some of the data transfer could happen asynchronously, you might want to use the `--sleep` option
135 : to simulate a lag between the `prepareCalc` and `performCalc` actions. This part of the calculation will not contribute
136 : to timer, but will obviously slow down your test.
137 :
138 : \par Output
139 :
140 : In the output you will see the usual reports about timing produced by the internal
141 : timers of the tested plumed instances.
142 : In addition, this tool will monitor the timing externally, with some slightly different criterion:
143 : - First, the initialization (construction of the input) will be shown with a separate timer,
144 : as well as the timing for the first step.
145 : - Second, the timer corresponding to the calculation will be split in three parts, reporting
146 : execution of the first 20% (warm-up) and the next two blocks of 40% each.
147 : - Finally, you might notice some discrepancy because some of the actions that are usually
148 : not expensive are not included in the internal timers. The external timer will
149 : thus provide a better estimate of the total elapsed time, including everything.
150 :
151 : The internal timers are still useful to monitor what happens at the different stages
152 : and, with \ref DEBUG `DETAILED_TIMERS`, what happens in each action.
153 :
154 : When you run multiple version, a comparative analisys of the time spent within PLUMED in the various
155 : instances will be done, showing the ratio between the total time and the time measured on the first
156 : instance, which will act as a reference. Errors will be estimated with bootstrapping. The warm-up phase will be discarded for
157 : this analysis.
158 :
159 : */
160 : //+ENDPLUMEDOC
161 :
162 : // We use an anonymous namespace here to avoid clashes with variables
163 : // declared in other parts of the code
164 : namespace {
165 :
166 : //this is a sugar for changing idea faster about the rng
167 : using generator = std::mt19937;
168 :
169 : std::atomic<bool> signalReceived(false);
170 :
171 : class SignalHandlerGuard {
172 : public:
173 0 : SignalHandlerGuard(int signal, void (*newHandler)(int)) : signal_(signal) {
174 : // Store the current handler before setting the new one
175 0 : prevHandler_ = std::signal(signal, newHandler);
176 0 : if (prevHandler_ == SIG_ERR) {
177 0 : throw std::runtime_error("Failed to set signal handler");
178 : }
179 0 : }
180 :
181 : ~SignalHandlerGuard() {
182 : // Restore the previous handler upon destruction
183 0 : std::signal(signal_, prevHandler_);
184 0 : }
185 :
186 : // Delete copy constructor and assignment operator to prevent copying
187 : SignalHandlerGuard(const SignalHandlerGuard&) = delete;
188 : SignalHandlerGuard& operator=(const SignalHandlerGuard&) = delete;
189 :
190 : private:
191 : int signal_;
192 : void (*prevHandler_)(int);
193 : };
194 :
195 0 : extern "C" void signalHandler(int signal) {
196 0 : if (signal == SIGINT) {
197 : signalReceived.store(true);
198 0 : fprintf(stderr, "Signal interrupt received\n");
199 : }
200 0 : if (signal == SIGTERM) {
201 : signalReceived.store(true);
202 0 : fprintf(stderr, "Signal termination received\n");
203 : }
204 0 : }
205 :
206 : /// This base class contains members that are movable with default operations
207 : struct KernelBase {
208 : std::string path;
209 : std::string plumed_dat;
210 : PlumedHandle handle;
211 : Stopwatch stopwatch;
212 : std::vector<long long int> timings;
213 : double comparative_timing=-1.0;
214 : double comparative_timing_error=-1.0;
215 0 : KernelBase(const std::string & path_,const std::string & plumed_dat_, Log* log_):
216 0 : path(path_),
217 0 : plumed_dat(plumed_dat_),
218 0 : handle([&]() {
219 0 : if(path_=="this") return PlumedHandle();
220 0 : else return PlumedHandle::dlopen(path.c_str());
221 : }()),
222 0 : stopwatch(*log_)
223 : {
224 0 : }
225 : };
226 :
227 : /// Local structure handling a kernel and the related timers.
228 : /// This structure specifically contain the Log, which needs special treatment
229 : /// in move semantics
230 : struct Kernel :
231 : public KernelBase {
232 : Log* log=nullptr;
233 0 : Kernel(const std::string & path_,const std::string & the_plumed_dat, Log* log_):
234 : KernelBase(path_,the_plumed_dat,log_),
235 0 : log(log_)
236 : {
237 : }
238 :
239 0 : ~Kernel() {
240 0 : if(log) {
241 0 : (*log)<<"\n";
242 0 : (*log) <<"Kernel: "<<path<<"\n";
243 0 : (*log) <<"Input: "<<plumed_dat<<"\n";
244 0 : if(comparative_timing>0.0) {
245 0 : (*log).printf("Comparative: %.3f +- %.3f\n",comparative_timing,comparative_timing_error);
246 : }
247 : }
248 0 : }
249 :
250 0 : Kernel(Kernel && other) noexcept:
251 : KernelBase(std::move(other)),
252 0 : log(other.log)
253 : {
254 0 : other.log=nullptr; // ensure no log is done in the moved away object
255 : }
256 :
257 0 : Kernel & operator=(Kernel && other) noexcept
258 : {
259 0 : if(this != &other) {
260 0 : KernelBase::operator=(std::move(other));
261 0 : log=other.log;
262 0 : other.log=nullptr; // ensure no log is done in the moved away object
263 : }
264 0 : return *this;
265 : }
266 : };
267 :
268 : namespace {
269 :
270 : class UniformSphericalVector {
271 : //double rminCub;
272 : double rCub;
273 :
274 : public:
275 : //assuming rmin=0
276 0 : UniformSphericalVector(const double rmax):
277 0 : rCub (rmax*rmax*rmax/*-rminCub*/) {}
278 0 : PLMD::Vector operator()(Random& rng) {
279 0 : double rho = std::cbrt (/*rminCub + */rng.RandU01()*rCub);
280 0 : double theta =std::acos (2.0*rng.RandU01() -1.0);
281 0 : double phi = 2.0 * PLMD::pi * rng.RandU01();
282 : return Vector (
283 0 : rho * sin (theta) * cos (phi),
284 0 : rho * sin (theta) * sin (phi),
285 0 : rho * cos (theta));
286 : }
287 : };
288 :
289 : ///Acts as a template for any distribution
290 : struct AtomDistribution {
291 : virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&)=0;
292 0 : virtual void box(std::vector<double>& box, unsigned /*natoms*/, unsigned /*step*/, Random&) {
293 : std::fill(box.begin(), box.end(),0);
294 0 : };
295 : virtual ~AtomDistribution() noexcept {}
296 : };
297 :
298 0 : struct theLine:public AtomDistribution {
299 0 : void positions(std::vector<Vector>& posToUpdate, unsigned step, Random&rng) override {
300 : auto nat = posToUpdate.size();
301 : UniformSphericalVector usv(0.5);
302 :
303 0 : for (unsigned i=0; i<nat; ++i) {
304 0 : posToUpdate[i] = Vector(i, 0, 0) + usv(rng);
305 : }
306 0 : }
307 : };
308 :
309 0 : struct uniformSphere:public AtomDistribution {
310 0 : void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
311 :
312 : //giving more or less a cubic udm of volume for each atom: V=nat
313 0 : const double rmax= std::cbrt ((3.0/(4.0*PLMD::pi)) * posToUpdate.size());
314 :
315 : UniformSphericalVector usv(rmax);
316 : auto s=posToUpdate.begin();
317 : auto e=posToUpdate.end();
318 : //I am using the iterators:this is slightly faster,
319 : // enough to overcome the cost of the vtable that I added
320 0 : for (unsigned i=0; s!=e; ++s,++i) {
321 0 : *s = usv (rng);
322 : }
323 :
324 0 : }
325 0 : void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
326 0 : const double rmax= 2.0*std::cbrt((3.0/(4.0*PLMD::pi)) * natoms);
327 0 : box[0]=rmax; box[1]=0.0; box[2]=0.0;
328 0 : box[3]=0.0; box[4]=rmax; box[5]=0.0;
329 0 : box[6]=0.0; box[7]=0.0; box[8]=rmax;
330 :
331 0 : }
332 : };
333 :
334 0 : struct twoGlobs: public AtomDistribution {
335 0 : virtual void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random&rng) {
336 : //I am using two unigform spheres and 2V=n
337 0 : const double rmax= std::cbrt ((3.0/(8.0*PLMD::pi)) * posToUpdate.size());
338 :
339 : UniformSphericalVector usv(rmax);
340 : std::array<Vector,2> centers{
341 : PLMD::Vector{0.0,0.0,0.0},
342 : //so they do not overlap
343 : PLMD::Vector{2.0*rmax,2.0*rmax,2.0*rmax}
344 0 : };
345 0 : std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
346 : //RandInt is only declared
347 : // return usv (rng) + centers[rng.RandInt(1)];
348 0 : return usv (rng) + centers[rng.RandU01()>0.5];
349 : });
350 0 : }
351 :
352 0 : virtual void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) {
353 :
354 0 : const double rmax= 4.0 * std::cbrt ((3.0/(8.0*PLMD::pi)) * natoms);
355 0 : box[0]=rmax; box[1]=0.0; box[2]=0.0;
356 0 : box[3]=0.0; box[4]=rmax; box[5]=0.0;
357 0 : box[6]=0.0; box[7]=0.0; box[8]=rmax;
358 0 : };
359 : };
360 :
361 0 : struct uniformCube:public AtomDistribution {
362 0 : void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
363 : //giving more or less a cubic udm of volume for each atom: V = nat
364 0 : const double rmax = std::cbrt(static_cast<double>(posToUpdate.size()));
365 :
366 :
367 :
368 : // std::generate(posToUpdate.begin(),posToUpdate.end(),[&]() {
369 : // return Vector (rndR(rng),rndR(rng),rndR(rng));
370 : // });
371 : auto s=posToUpdate.begin();
372 : auto e=posToUpdate.end();
373 : //I am using the iterators:this is slightly faster,
374 : // enough to overcome the cost of the vtable that I added
375 0 : for (unsigned i=0; s!=e; ++s,++i) {
376 0 : *s = Vector (rng.RandU01()*rmax,rng.RandU01()*rmax,rng.RandU01()*rmax);
377 : }
378 0 : }
379 0 : void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
380 : //+0.05 to avoid overlap
381 0 : const double rmax= std::cbrt(natoms)+0.05;
382 0 : box[0]=rmax; box[1]=0.0; box[2]=0.0;
383 0 : box[3]=0.0; box[4]=rmax; box[5]=0.0;
384 0 : box[6]=0.0; box[7]=0.0; box[8]=rmax;
385 :
386 0 : }
387 : };
388 :
389 0 : struct tiledSimpleCubic:public AtomDistribution {
390 0 : void positions(std::vector<Vector>& posToUpdate, unsigned /*step*/, Random& rng) override {
391 : //Tiling the space in this way will not tests 100% the pbc, but
392 : //I do not think that write a spacefilling curve, like Hilbert, Peano or Morton
393 : //could be a good idea, in this case
394 0 : const unsigned rmax = std::ceil(std::cbrt(static_cast<double>(posToUpdate.size())));
395 :
396 : auto s=posToUpdate.begin();
397 : auto e=posToUpdate.end();
398 : //I am using the iterators:this is slightly faster,
399 : // enough to overcome the cost of the vtable that I added
400 0 : for (unsigned k=0; k<rmax&&s!=e; ++k) {
401 0 : for (unsigned j=0; j<rmax&&s!=e; ++j) {
402 0 : for (unsigned i=0; i<rmax&&s!=e; ++i) {
403 0 : *s = Vector (i,j,k);
404 : ++s;
405 : }
406 : }
407 : }
408 0 : }
409 0 : void box(std::vector<double>& box, unsigned natoms, unsigned /*step*/, Random&) override {
410 0 : const double rmax= std::ceil(std::cbrt(static_cast<double>(natoms)));;
411 0 : box[0]=rmax; box[1]=0.0; box[2]=0.0;
412 0 : box[3]=0.0; box[4]=rmax; box[5]=0.0;
413 0 : box[6]=0.0; box[7]=0.0; box[8]=rmax;
414 :
415 0 : }
416 : };
417 0 : std::unique_ptr<AtomDistribution> getAtomDistribution(std::string_view atomicDistr) {
418 0 : std::unique_ptr<AtomDistribution> distribution;
419 0 : if(atomicDistr == "line") {
420 0 : distribution = std::make_unique<theLine>();
421 0 : } else if (atomicDistr == "cube") {
422 0 : distribution = std::make_unique<uniformCube>();
423 0 : } else if (atomicDistr == "sphere") {
424 0 : distribution = std::make_unique<uniformSphere>();
425 0 : } else if (atomicDistr == "globs") {
426 0 : distribution = std::make_unique<twoGlobs>();
427 0 : } else if (atomicDistr == "sc") {
428 0 : distribution = std::make_unique<tiledSimpleCubic>();
429 : } else {
430 0 : plumed_error() << R"(The atomic distribution can be only "line", "cube", "sphere", "globs" and "sc", the input was ")"
431 0 : << atomicDistr <<'"';
432 : }
433 0 : return distribution;
434 0 : }
435 : } //anonymus namespace for benchmark distributions
436 : class Benchmark:
437 : public CLTool
438 : {
439 : public:
440 : static void registerKeywords( Keywords& keys );
441 : explicit Benchmark(const CLToolOptions& co );
442 : int main(FILE* in, FILE*out,Communicator& pc) override;
443 4 : std::string description()const override {
444 4 : return "run a calculation with a fixed trajectory to find bottlenecks in PLUMED";
445 : }
446 : };
447 :
448 15952 : PLUMED_REGISTER_CLTOOL(Benchmark,"benchmark")
449 :
450 5316 : void Benchmark::registerKeywords( Keywords& keys ) {
451 5316 : CLTool::registerKeywords( keys );
452 10632 : keys.add("compulsory","--plumed","plumed.dat","colon separated path(s) to the input file(s)");
453 10632 : keys.add("compulsory","--kernel","this","colon separated path(s) to kernel(s)");
454 10632 : keys.add("compulsory","--natoms","100000","the number of atoms to use for the simulation");
455 10632 : keys.add("compulsory","--nsteps","2000","number of steps of MD to perform (-1 means forever)");
456 10632 : keys.add("compulsory","--maxtime","-1","maximum number of seconds (-1 means forever)");
457 10632 : keys.add("compulsory","--sleep","0","number of seconds of sleep, mimicking MD calculation");
458 10632 : keys.add("compulsory","--atom-distribution","line","the kind of possible atomic displacement at each step");
459 10632 : keys.add("optional","--dump-trajectory","dump the trajectory to this file");
460 10632 : keys.addFlag("--domain-decomposition",false,"simulate domain decomposition, implies --shuffle");
461 10632 : keys.addFlag("--shuffled",false,"reshuffle atoms");
462 5316 : }
463 :
464 4 : Benchmark::Benchmark(const CLToolOptions& co ):
465 4 : CLTool(co)
466 : {
467 4 : inputdata=commandline;
468 4 : }
469 :
470 :
471 0 : int Benchmark::main(FILE* in, FILE*out,Communicator& pc) {
472 : // deterministic initializations to avoid issues with MPI
473 : generator rng;
474 0 : PLMD::Random atomicGenerator;
475 0 : std::unique_ptr<AtomDistribution> distribution;
476 :
477 : struct FileDeleter {
478 : void operator()(FILE*f) const noexcept {
479 0 : if(f) std::fclose(f);
480 0 : }
481 : };
482 :
483 0 : std::unique_ptr<FILE,FileDeleter> log_dev_null{std::fopen("/dev/null","w")};
484 :
485 0 : Log log;
486 0 : if(pc.Get_rank()==0) {
487 0 : log.link(out);
488 : } else {
489 0 : log.link(log_dev_null.get());
490 : }
491 0 : log.setLinePrefix("BENCH: ");
492 0 : log <<"Welcome to PLUMED benchmark\n";
493 : std::vector<Kernel> kernels;
494 :
495 : // perform comparative analysis
496 : // ensure that kernels vector is destroyed from last to first element upon exit
497 0 : auto kernels_deleter=[&log](auto f) {
498 0 : if(!f) {
499 0 : return;
500 : }
501 0 : if(f->empty()) {
502 : return;
503 : }
504 : generator bootstrapRng;
505 :
506 : const auto size=f->back().timings.size();
507 : //B are the bootstrap iterations
508 : constexpr int B=200;
509 0 : const size_t numblocks=size;
510 : // for some reasons, blocked bootstrap underestimates error
511 : // For now I keep it off. If I remove it, the code below could be simplified
512 : // if(numblocks>20) numblocks=20;
513 0 : const auto blocksize=size/numblocks;
514 :
515 0 : if(f->size()<2) {
516 0 : log<<"Single run, skipping comparative analysis\n";
517 0 : } else if(size<10) {
518 0 : log<<"Too small sample, skipping comparative analysis\n";
519 : } else try {
520 :
521 0 : log<<"Running comparative analysis, "<<numblocks<<" blocks with size "<<blocksize<<"\n";
522 :
523 0 : std::vector<std::size_t> choice(size);
524 0 : std::uniform_int_distribution<> distrib(0, numblocks-1);
525 0 : std::vector<std::vector<long long int>> blocks(f->size());
526 :
527 : { int i=0;
528 0 : for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
529 : size_t l=0;
530 0 : blocks[i].assign(numblocks,0);
531 0 : for(auto j=0ULL; j<numblocks; j++) {
532 0 : for(auto k=0ULL; k<blocksize; k++) {
533 0 : plumed_assert(l<it->timings.size());
534 0 : blocks[i][j]+=it->timings[l];
535 0 : l++;
536 : }
537 : }
538 : }
539 : }
540 :
541 0 : std::vector<std::vector<double>> ratios(f->size());
542 0 : for(auto & r : ratios) {
543 : //B are the bootstrap iterations
544 0 : r.resize(B);
545 : }
546 :
547 : //B are the bootstrap iterations
548 0 : for(unsigned b=0; b<B; b++) {
549 0 : for(auto & c : choice) c=distrib(bootstrapRng);
550 : long long int reference=0;
551 0 : for(auto & c : choice) {
552 0 : reference+=blocks[0][c];
553 : }
554 0 : for(auto i=0ULL; i<blocks.size(); i++) {
555 : long long int estimate=0;
556 : // this would lead to separate bootstrap samples for each estimate:
557 : // for(auto & c : choice){c=distrib(bootstrapRng);}
558 0 : for(auto & c : choice) {
559 0 : estimate+=blocks[i][c];
560 : }
561 0 : ratios[i][b]=double(estimate)/double(reference);
562 : }
563 : }
564 :
565 : {
566 : int i=0;
567 0 : for(auto it = f->rbegin(); it != f->rend(); ++it,++i) {
568 : double sum=0.0;
569 : double sum2=0.0;
570 0 : for(auto r : ratios[i]) {
571 0 : sum+=r;
572 0 : sum2+=r*r;
573 : }
574 : //B are the bootstrap iterations
575 0 : it->comparative_timing=sum/B;
576 0 : it->comparative_timing_error=std::sqrt(sum2/B-sum*sum/(B*B));
577 : }
578 : }
579 :
580 0 : } catch(std::exception & e) {
581 0 : log<<"Unexpected error during comparative analysis\n";
582 0 : log<<e.what()<<"\n";
583 : }
584 0 : while(!f->empty()) f->pop_back();
585 :
586 : };
587 : std::unique_ptr<decltype(kernels),decltype(kernels_deleter)> kernels_deleter_obj(&kernels,kernels_deleter);
588 :
589 :
590 : // construct the kernels vector:
591 : {
592 : std::vector<std::string> allpaths;
593 :
594 : {
595 : std::string paths;
596 0 : parse("--kernel",paths);
597 0 : log <<"Using --kernel=" << paths << "\n";
598 0 : allpaths=Tools::getWords(paths,":");
599 : }
600 :
601 : std::vector<std::string> allplumed;
602 : {
603 : std::string paths;
604 0 : parse("--plumed",paths);
605 0 : log <<"Using --plumed=" << paths << "\n";
606 0 : allplumed=Tools::getWords(paths,":");
607 : }
608 :
609 0 : plumed_assert(allplumed.size()>0 && allpaths.size()>0);
610 :
611 : // this check only works on MacOS
612 : #if defined(__APPLE__)
613 : // if any of the paths if different from "this", we check if libplumed was loaded locally to avoid conflicts.
614 : if(std::any_of(allpaths.begin(),allpaths.end(),[](auto value) {return value != "this";})) {
615 : if(DLLoader::isPlumedGlobal()) {
616 : plumed_error()<<"It looks like libplumed is loaded in the global namespace, you cannot load a different version of the kernel\n"
617 : <<"Please make sure you use the plumed-runtime executable and that the env var PLUMED_LOAD_NAMESPACE is not set to GLOBAL";
618 : }
619 : }
620 : #endif
621 :
622 0 : if(allplumed.size()>1 && allpaths.size()>1 && allplumed.size() != allpaths.size()) {
623 0 : plumed_error() << "--kernel and --plumed should have either one element or the same number of elements";
624 : }
625 :
626 0 : if(allplumed.size()>1 && allpaths.size()==1) for(unsigned i=1; i<allplumed.size(); i++) allpaths.push_back(allpaths[0]);
627 0 : if(allplumed.size()==1 && allpaths.size()>1) for(unsigned i=1; i<allpaths.size(); i++) allplumed.push_back(allplumed[0]);
628 :
629 0 : for(unsigned i=0; i<allpaths.size(); i++) kernels.emplace_back(allpaths[i],allplumed[i],&log);
630 0 : }
631 :
632 : // reverse order so that log happens in the forward order:
633 0 : std::reverse(kernels.begin(),kernels.end());
634 :
635 : // read other flags:
636 0 : bool shuffled=false;
637 0 : parseFlag("--shuffled",shuffled);
638 :
639 0 : int nf; parse("--nsteps",nf);
640 0 : log << "Using --nsteps=" << nf << "\n";
641 0 : unsigned natoms; parse("--natoms",natoms);
642 0 : log << "Using --natoms=" << natoms << "\n";
643 0 : double maxtime; parse("--maxtime",maxtime);
644 0 : log << "Using --maxtime=" << maxtime << "\n";
645 :
646 0 : bool domain_decomposition=false;
647 0 : parseFlag("--domain-decomposition",domain_decomposition);
648 :
649 0 : if(pc.Get_size()>1) domain_decomposition=true;
650 0 : if(domain_decomposition) shuffled=true;
651 :
652 0 : if (shuffled)
653 0 : log << "Using --shuffled\n";
654 0 : if (domain_decomposition)
655 0 : log << "Using --domain-decomposition\n";
656 :
657 : double timeToSleep;
658 0 : parse("--sleep",timeToSleep);
659 0 : log << "Using --sleep=" << timeToSleep << "\n";
660 :
661 : std::vector<int> shuffled_indexes;
662 :
663 : {
664 : std::string atomicDistr;
665 0 : parse("--atom-distribution",atomicDistr);
666 0 : distribution = getAtomDistribution(atomicDistr);
667 0 : log << "Using --atom-distribution=" << atomicDistr << "\n";
668 : }
669 :
670 : {
671 : std::string fileToDump;
672 0 : if(parse("--dump-trajectory",fileToDump)) {
673 0 : log << "Saving the trajectory to \"" << fileToDump << "\" and exiting\n";
674 0 : std::vector<double> cell(9);
675 0 : std::vector<Vector> pos(natoms);
676 0 : std::ofstream ofile(fileToDump);
677 0 : if (nf<0) {
678 : //if the user accidentally sets infinite steps, we set it to print only one
679 0 : nf=1;
680 : }
681 0 : for(int step=0; step<nf; ++step) {
682 0 : auto sw=kernels[0].stopwatch.startStop("TrajectoryGeneration");
683 0 : distribution->positions(pos,step,atomicGenerator);
684 0 : distribution->box(cell,natoms,step,atomicGenerator);
685 0 : ofile << natoms << "\n"
686 0 : << cell[0] << " " << cell[1] << " " << cell[2] << " "
687 0 : << cell[3] << " " << cell[4] << " " << cell[5] << " "
688 0 : << cell[6] << " " << cell[7] << " " << cell[8] << "\n";
689 0 : for(int i=0; i<natoms; ++i) {
690 0 : ofile << "X\t" << pos[i]<< "\n";
691 : }
692 0 : }
693 0 : ofile.close();
694 : return 0;
695 0 : }
696 : }
697 :
698 0 : log <<"Initializing the setup of the kernel(s)\n";
699 0 : const auto initial_time=std::chrono::high_resolution_clock::now();
700 :
701 0 : for(auto & k : kernels) {
702 : auto & p(k.handle);
703 0 : auto sw=k.stopwatch.startStop("A Initialization");
704 0 : if(Communicator::plumedHasMPI() && domain_decomposition) p.cmd("setMPIComm",&pc.Get_comm());
705 0 : p.cmd("setRealPrecision",(int)sizeof(double));
706 0 : p.cmd("setMDLengthUnits",1.0);
707 0 : p.cmd("setMDChargeUnits",1.0);
708 0 : p.cmd("setMDMassUnits",1.0);
709 0 : p.cmd("setMDEngine","benchmarks");
710 0 : p.cmd("setTimestep",1.0);
711 0 : p.cmd("setPlumedDat",k.plumed_dat.c_str());
712 0 : p.cmd("setLog",out);
713 0 : p.cmd("setNatoms",natoms);
714 0 : p.cmd("init");
715 0 : }
716 :
717 0 : std::vector<double> cell( 9 ), virial( 9 );
718 0 : std::vector<Vector> pos( natoms ), forces( natoms );
719 0 : std::vector<double> masses( natoms, 1 ), charges( natoms, 0 );
720 :
721 0 : if(shuffled) {
722 0 : shuffled_indexes.resize(natoms);
723 0 : for(unsigned i=0; i<natoms; i++) shuffled_indexes[i]=i;
724 0 : std::shuffle(shuffled_indexes.begin(),shuffled_indexes.end(),rng);
725 : }
726 :
727 : // non owning pointers, used for shuffling the execution order
728 : std::vector<Kernel*> kernels_ptr;
729 0 : for(unsigned i=0; i<kernels.size(); i++) kernels_ptr.push_back(&kernels[i]);
730 :
731 0 : int plumedStopCondition=0;
732 : bool fast_finish=false;
733 : int part=0;
734 :
735 0 : log<<"Starting MD loop\n";
736 0 : log<<"Use CTRL+C to stop at any time and collect timers (not working in MPI runs)\n";
737 : // trap signals:
738 0 : SignalHandlerGuard sigIntGuard(SIGINT, signalHandler);
739 0 : SignalHandlerGuard sigTermGuard(SIGTERM, signalHandler);
740 :
741 0 : for(int step=0; nf<0 || step<nf; ++step) {
742 0 : std::shuffle(kernels_ptr.begin(),kernels_ptr.end(),rng);
743 0 : distribution->positions(pos,step,atomicGenerator);
744 0 : distribution->box(cell,natoms,step,atomicGenerator);
745 : double* pos_ptr;
746 : double* for_ptr;
747 : double* charges_ptr;
748 : double* masses_ptr;
749 : int* indexes_ptr=nullptr;
750 : int n_local_atoms;
751 :
752 0 : if(domain_decomposition) {
753 0 : const auto nproc=pc.Get_size();
754 0 : const auto nn=natoms/nproc;
755 : //using int to remove warning, MPI don't work with unsigned
756 0 : int excess=natoms%nproc;
757 0 : const auto myrank=pc.Get_rank();
758 : auto shift=0;
759 0 : n_local_atoms=nn;
760 0 : if(myrank<excess) n_local_atoms+=1;
761 0 : for(int i=0; i<myrank; i++) {
762 0 : shift+=nn;
763 0 : if(i<excess) shift+=1;
764 : }
765 0 : pos_ptr=&pos[shift][0];
766 0 : for_ptr=&forces[shift][0];
767 : charges_ptr=&charges[shift];
768 : masses_ptr=&masses[shift];
769 0 : indexes_ptr=shuffled_indexes.data()+shift;
770 : } else {
771 0 : pos_ptr=&pos[0][0];
772 0 : for_ptr=&forces[0][0];
773 : charges_ptr=&charges[0];
774 : masses_ptr=&masses[0];
775 0 : n_local_atoms=natoms;
776 : indexes_ptr=shuffled_indexes.data();
777 : }
778 :
779 : const char* sw_name;
780 0 : if(part==0) sw_name="B0 First step";
781 0 : else if(part==1) sw_name="B1 Warm-up";
782 0 : else if(part==2) sw_name="B2 Calculation part 1";
783 : else sw_name="B3 Calculation part 2";
784 :
785 :
786 0 : for(unsigned i=0; i<kernels_ptr.size(); i++) {
787 0 : auto & p(kernels_ptr[i]->handle);
788 :
789 : {
790 0 : auto sw=kernels_ptr[i]->stopwatch.startPause(sw_name);
791 0 : p.cmd("setStep",step);
792 0 : p.cmd("setStopFlag",&plumedStopCondition);
793 0 : p.cmd("setForces",for_ptr, {n_local_atoms,3});
794 0 : p.cmd("setBox",&cell[0], {3,3});
795 0 : p.cmd("setVirial",&virial[0], {3,3});
796 0 : p.cmd("setPositions",pos_ptr, {n_local_atoms,3});
797 0 : p.cmd("setMasses",masses_ptr, {n_local_atoms});
798 0 : p.cmd("setCharges",charges_ptr, {n_local_atoms});
799 0 : if(shuffled) {
800 0 : p.cmd("setAtomsNlocal",n_local_atoms);
801 0 : p.cmd("setAtomsGatindex",indexes_ptr, {n_local_atoms});
802 : }
803 0 : p.cmd("prepareCalc");
804 0 : }
805 :
806 : // mimick MD calculation here
807 : {
808 : unsigned k=0;
809 0 : auto start=std::chrono::high_resolution_clock::now();
810 0 : while(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-start).count()<(long long int)1e9*timeToSleep) k+=i*i;
811 : std::fprintf(log_dev_null.get(),"%u",k);
812 : }
813 :
814 : {
815 0 : auto sw=kernels_ptr[i]->stopwatch.startStop(sw_name);
816 0 : p.cmd("performCalc");
817 0 : }
818 :
819 0 : if(kernels_ptr.size()>1 && part>1) kernels_ptr[i]->timings.push_back(kernels_ptr[i]->stopwatch.getLastCycle(sw_name));
820 0 : if(plumedStopCondition || signalReceived.load()) fast_finish=true;
821 : }
822 0 : auto elapsed=std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-initial_time).count();
823 0 : if(part==0) part=1;
824 0 : if(part<2) {
825 0 : if((maxtime>0 && elapsed>(long long int)(0.2*1e9*maxtime)) || (nf>0 && step+1>=nf/5) || (maxtime<0 && nf<0 && step+1>=100)) {
826 : part=2;
827 0 : log<<"Warm-up completed\n";
828 : }
829 : }
830 0 : if(part<3) {
831 0 : if((maxtime>0 && elapsed>(long long int)(0.6*1e9*maxtime)) || (nf>0 && step+1>=3*nf/5)) {
832 : part=3;
833 0 : log<<"60% completed\n";
834 : }
835 : }
836 :
837 0 : if(maxtime>0 && elapsed>(long long int)(1e9*maxtime)) fast_finish=true;
838 :
839 : {
840 0 : unsigned tmp=fast_finish;
841 0 : pc.Bcast(tmp,0);
842 0 : fast_finish=tmp;
843 : }
844 0 : if(fast_finish) break;
845 : }
846 :
847 : return 0;
848 0 : }
849 :
850 : } // namespace unnamed
851 : } // namespace cltools
852 : } // namespace PLMD
|