// Utility header: timing helpers, logging macros, CUDA error checking,
// and thread/core-affinity helpers.
//
// NOTE(review): this file was recovered from a copy in which every
// "<...>" span (include names, template arguments, stream-insertion
// tails inside macros) had been stripped as if it were an HTML tag.
// The eaten spans below were reconstructed from context — confirm
// against the original source.
#pragma once

#include <iostream>   // cout/endl used (unqualified) by the macros below
#include <chrono>     // start_time() / delta_usec()
#include <vector>     // analyze_times(), DEBUG_CORES2
#include <thread>     // std::thread::native_handle_type
#include <string>     // analyze_times() parameters
#include <cstdlib>    // exit() used by DIE
#include <sched.h>    // sched_getcpu() used by DEBUG_CORES2 (POSIX)

// Two functions for timing how fast your code runs. Use them as
//	auto t1 = start_time ();
//	... your code...
//	long int usec = delta_usec (t1);
// NOTE(review): the clock's template argument was lost in extraction;
// high_resolution_clock is the usual choice here — confirm it matches
// the definitions in the corresponding .cxx file.
std::chrono::time_point<std::chrono::high_resolution_clock> start_time ();
long int delta_usec (std::chrono::time_point<std::chrono::high_resolution_clock> start);

// Two simple macros for printing, if you want them.
// They use unqualified cout/endl, so the including file is expected to
// have 'using namespace std;' (or equivalent) in scope at the call site.
#define LOG(args) cout << args << endl
#define DIE(args) { cout << args << endl; exit(0); }

// This macro is for printing in a multithreaded environment. It uses a
// lock to ensure that output from different threads doesn't interleave with
// each other. The idea is that
//	1. You declare a mutex yourself.
//	2. You pass it as the first argument every time you call LOGM.
#define LOGM(mut, args) { mut.lock(); cout << args << endl; mut.unlock(); }

// Check the return status after a CUDA call.
// NOTE(review): the tail of this macro (from the first '<' inside the
// string onward) was eaten in extraction; reconstructed as the customary
// cudaGetErrorString() form — confirm against the original.
#define ERR_CHK(status, args) \
    if (status != cudaSuccess) \
	DIE (args << " (error code " << cudaGetErrorString(status) << ")")

// Occasionally print which core the calling thread is running on.
// Usage: in your thread function declare
//	static int print_counter = 0;
//	static vector<int> cores;
// then call DEBUG_CORES2(print_counter, n_loops_to_print, cores) inside
// your loop. Every n_loops_to_print iterations it samples sched_getcpu();
// after 10 samples it prints them and clears the list.
// NOTE(review): the print statement's tail after "Thread #" was eaten in
// extraction; reconstructed — confirm against the original.
#define DEBUG_CORES2(print_counter, n_loops_to_print, cores) \
    if (++print_counter==n_loops_to_print) { \
	print_counter=0; \
	cores.push_back (sched_getcpu()); \
	if (cores.size()==10) { \
	    cout <<"Thread #"<<std::this_thread::get_id()<<" on cores "; \
	    for (auto c : cores) \
		cout << c << " "; \
	    cout << endl; \
	    cores.clear(); \
	} \
    }

// Pin the thread with the given native handle to core #i.
// (Presumably implemented with pthread_setaffinity_np in the .cxx file —
// TODO confirm.)
void assign_to_core (std::thread::native_handle_type th_handle, int i);

// Given a vector of execution times (typically from running the same code
// multiple times & timing it each time), print statistics: average time and
// standard deviation. Ignore times[0]; it probably represents a cold cache.
void analyze_times (std::string message, std::vector<float> &times, std::string units);