Commit 316208ec authored by Misbah Mubarak
Browse files

Adding workloads provided by Intel

parents
This diff is collapsed.
#ifndef _LAMMPS_
#define _LAMMPS_
#include "app_base_swm_user_code.h"
#include <list>
// Internal LAMMPS parameters
// Skin cutoff for ghost neighbor exchange (on comm)
#define GHOST_SKIN_CUTOFF 12.0
// Skin cutoff for fft neighbor exchange (on commgrid)
#define FFT_SKIN_CUTOFF 2.0
// Number of atoms in a basic block
#define N_ATOMS_BASE 32000
// Neighbor check after NEIGH_DELAY, then every NEIGH_EVERY
#define NEIGH_DELAY 5
#define NEIGH_EVERY 1
// Dimensions of the basic block
#define XLO_BASE (-27.5)
#define XHI_BASE (27.5)
#define YLO_BASE (-38.5)
#define YHI_BASE (38.5)
#define ZLO_BASE (-36.3646)
#define ZHI_BASE (36.3615)
// lammps factors for determining required decomposition
#define GEWALD 0.243177
#define FFT_ACCURACY 0.033206
// number of transposes in fft
#define NUM_TRANSPOSE 13
// number of allreduces at the end of neighbor exchange
#define NUM_NEIGH_ALLREDUCE 5
#define PI 3.14159265358979323846
// LAMMPS workload model built on top of AppBaseSWMUserCode.
//
// Only declarations are visible in this header; every member function is
// defined elsewhere. The comments below describe what the declarations
// themselves show and mark anything inferred from naming as an assumption.
//
// NOTE(review): the class holds many raw int*/long* members and declares a
// destructor but no copy constructor / copy assignment. Unless
// AppBaseSWMUserCode already disables copying, the compiler-generated copies
// would duplicate the pointers shallowly — confirm instances are never copied
// and that ~LAMMPS_SWM() releases whatever these pointers refer to.
class LAMMPS_SWM : public AppBaseSWMUserCode
{
public:
// Constructor. Takes the framework user interface, the configuration
// property tree (by value) and a reference to an array of generic pointers.
// NOTE(review): presumably fills the protected configuration fields below
// from cfg (the sample job config uses keys such as num_x_replicas and
// num_time_steps) — confirm against the constructor definition.
LAMMPS_SWM(
SWMUserIF* user_if,
boost::property_tree::ptree cfg,
void**& generic_ptrs
);
~LAMMPS_SWM();
// Entry point of the workload; defined elsewhere.
void call();
protected:
// --- configuration values ---
uint32_t x_rep; // number of replicas in X dimension
uint32_t y_rep; // number of replicas in Y dimension
uint32_t z_rep; // number of replicas in Z dimension
uint32_t num_timesteps; // number of time steps to simulate
uint32_t req_vc; // request vc
uint32_t resp_vc; // response vc
double router_freq; // router frequency in Hz
double cpu_freq; // CPU frequency in Hz
double cpu_sim_speedup; // simulation speedup factor (makes CPU faster)
// NOTE(review): undocumented in the original; presumably a response size in bytes — confirm.
uint32_t rsp_bytes;
private:
// Three-element arrays sharing their names with the array parameters of
// proc_decomposition() / pppm_decomposition() declared below.
// NOTE(review): presumably per-axis box lengths, PPPM grid dimensions and
// process-grid dimensions respectively — confirm.
double prd[3];
double pppmGrid[3];
int procNums[3];
// k_* arrays: one slot per FFT transpose (NUM_TRANSPOSE entries each).
int *k_r_targets[NUM_TRANSPOSE];
int *k_s_targets[NUM_TRANSPOSE];
int *k_s_sizes[NUM_TRANSPOSE];
long k_cyc[NUM_TRANSPOSE];
int k_len[NUM_TRANSPOSE];
// Each group below has the same five-part shape as the doP2P() parameter
// list: (len, r_targets, s_targets, s_sizes, cyc).
// NOTE(review): presumably r_targets = ranks received from, s_targets =
// ranks sent to, s_sizes = send sizes, cyc = compute cycle counts, and the
// prefixes stand for ghost forward (gh_fw), ghost reverse (gh_rw), k-space
// pre/post (k_pre/k_post), fix, neighbor exchange (neigh_e) and neighbor
// border (neigh_b) — confirm against the definitions.
int *gh_fw_r_targets;
int *gh_fw_s_targets;
int *gh_fw_s_sizes;
long *gh_fw_cyc;
int gh_fw_len;
int *gh_rw_r_targets;
int *gh_rw_s_targets;
int *gh_rw_s_sizes;
long *gh_rw_cyc;
int gh_rw_len;
int *k_pre_r_targets;
int *k_pre_s_targets;
int *k_pre_s_sizes;
long *k_pre_cyc;
int k_pre_len;
int *k_post_r_targets;
int *k_post_s_targets;
int *k_post_s_sizes;
long *k_post_cyc;
int k_post_len;
int *fix_r_targets;
int *fix_s_targets;
int *fix_s_sizes;
long *fix_cyc;
int fix_len;
int *neigh_e_r_targets;
int *neigh_e_s_targets;
int *neigh_e_s_sizes;
long *neigh_e_cyc;
int neigh_e_len;
int *neigh_b_r_targets;
int *neigh_b_s_targets;
int *neigh_b_s_sizes;
long *neigh_b_cyc;
int neigh_b_len;
// Neighbor-check bookkeeping.
// NOTE(review): presumably used by neigh_check() together with the
// NEIGH_DELAY / NEIGH_EVERY macros — confirm.
long neigh_check_cyc;
double neigh_check_average;
double neigh_check_cumulative;
int neigh_check_count;
// One entry per allreduce at the end of neighbor exchange (NUM_NEIGH_ALLREDUCE).
long neigh_end_cyc[NUM_NEIGH_ALLREDUCE];
// Scalar cycle counts for individual phases (start, k-space energy, final).
// NOTE(review): phase meaning inferred from the names only — confirm.
long start_cyc;
long k_energy_cyc;
long final_cyc;
// Model setup; defined elsewhere.
void modelInit();
// Point-to-point phase driven by one (len, r_targets, s_targets, s_sizes,
// cyc_cnt) group such as the member groups declared above.
void doP2P(int len, int *r_targets, int *s_targets, int *s_sizes, long *cyc_cnt);
void doNeighExch();
void doFFT();
// Returns a bool; NOTE(review): presumably "is a neighbor rebuild due" — confirm.
bool neigh_check();
// process decomposition
void proc_decomposition(int n, double prd[], int procNums[]);
// PPPM decomposition
void pppm_decomposition(int n, double prd[], double pppmGrid[]);
double pppm_estimate_ik_error(double h, double prd, int n, double all_prd[]);
int pppm_factorable(int n);
// neighbor comm setup
void ghost_setup(double cutoff, int rank, double t_vol);
void k_pre_setup(double cutoff, int rank, double f_vol);
void k_post_setup(double cutoff, int rank, double f_vol);
void neigh_e_setup(double cutoff, int rank, double t_vol);
// k space parameters
void get_k_params(int rank, double f_vol);
// The get_nx_* helpers each fill a caller-provided 10-int array for a rank.
// NOTE(review): layout of the 10 entries is not visible here — confirm.
void get_nx_in(int rank, int nx[10]);
void get_nx_fft(int rank, int nx[10]);
void get_nx_mid1(int rank, int nx[10]);
void get_nx_mid2(int rank, int nx[10]);
int find_one_overlap(int a[6], int b[6], int s[3]);
void find_overlap(int all_in[], int in_shift, int all_out[], int out_shift, int rank, int r_r[], int *r_len, int s_r[], int s_rs[], int *s_len);
void best_2d_mapping(int *px, int *py, int nx, int ny);
void bifactor(int n, int *f1, int *f2);
// Rank <-> 3-D coordinate conversion and six-entry neighbor lookup.
void rank_to_xyz(int rank, int coord[3]);
int xyz_to_rank(int coord[3]);
void rank_to_neigh(int rank, int neighs[6]);
};
#endif
// Numeric constants for the LAMMPS workload model.
//
// Naming pattern visible in this table:
//   msg_<phase>       one scalar per communication phase
//   ins_<phase>_a[N]  \ paired arrays of equal length N for each phase
//   ins_<phase>_b[N]  /
//   ins_<phase>_cpi   one scalar per phase
//
// NOTE(review): presumably these are fitted values — msg_* a per-phase message
// size factor, (a, b) the slope/intercept of a linear instruction-count model,
// and cpi cycles-per-instruction — but the consuming code is not visible here;
// confirm before relying on that reading.
// NOTE(review): these are non-const definitions at namespace scope. If this
// file is ever included from more than one translation unit it will produce
// duplicate-symbol errors — confirm it is included exactly once.

// Per-phase message factors.
double msg_ghost_fw = 2.48839990371;
double msg_ghost_rw = 2.48841071356;
double msg_k_pre = 8.0;
double msg_k_post = 24.0;
double msg_fix = 2.48841071356;
double msg_neigh_exch = 3.08673789851;
double msg_neigh_border = 6.63563071593;
// Start phase (single entry).
double ins_start_a[1] = {8.51937488057};
double ins_start_b[1] = {1544.46231029};
double ins_start_cpi = 0.843141163755;
// Neighbor check (single entry).
double ins_neigh_check_a[1] = {89.6202085326};
double ins_neigh_check_b[1] = {195042.694781};
double ins_neigh_check_cpi = 0.951841661097;
// Neighbor exchange send/recv (3 entries).
double ins_neigh_exch_sr_a[3] = {11.5746361748, 1.3778877165, 1.34223584427};
double ins_neigh_exch_sr_b[3] = {438096.47233, 4800.95420873, 8838.30958016};
double ins_neigh_exch_sr_cpi = 1.58963777201;
// Neighbor border send/recv (6 entries).
double ins_neigh_border_sr_a[6] = {1.8243979135, 2.03810250649, 3.06679631198, 3.0870981696, 3.58608401984, 2.7521157202};
double ins_neigh_border_sr_b[6] = {32382.7816726, 51218.9714454, 83557.2150064, 99920.5231836, 248049.508775, 357653.369027};
double ins_neigh_border_sr_cpi = 1.75604132297;
// End of neighbor exchange: 5 entries, the same count as NUM_NEIGH_ALLREDUCE in the LAMMPS_SWM header.
double ins_neigh_end_a[5] = {1.21665755465, 6595.30712353, 29.6655250587, 58.3229990241, 1.69059035676};
double ins_neigh_end_b[5] = {139153.690154, 11183101.9944, 44150.0262654, 91071.0968296, 2057.50606924};
double ins_neigh_end_cpi = 0.784053776222;
// k-space pre phase (6 entries).
double ins_k_pre_a[6] = {43360.7612799, 0.780443563075, 0.999500801383, 1.23253340415, 1.11044737418, 0.813347233046};
double ins_k_pre_b[6] = {-8151826.36712, 12652.2538632, 10478.6380748, 5362.54935036, 4946.43943567, 2809.82745824};
double ins_k_pre_cpi = 0.897392796161;
// k-space FFT: 13 entries, the same count as NUM_TRANSPOSE in the LAMMPS_SWM header.
double ins_k_fft_a[13] = {12.7660165971, 49.5132610315, 36.7957959, 48.6517835605, 102.611869648, 36.7771213175, 48.6352484315, 71.1465535394, 36.7760281598, 48.624983362, 71.1461101858, 36.7661712493, 48.6196591605};
double ins_k_fft_b[13] = {2755.32405875, -14031.7206559, -268.944769389, -34509.6094468, -15768.811004, 468.975498509, -34540.223738, -23736.2087919, -119.26574367, -34526.5937504, -23716.4687588, 130.351010748, -34514.7315393};
double ins_k_fft_cpi = 0.700575655531;
// k-space post phase (6 entries).
double ins_k_post_a[6] = {15.7888010275, 1.98489719387, 2.63496119567, 3.00664450319, 2.41731560611, 1.9085386988};
double ins_k_post_b[6] = {390.234582372, 6358.92071557, 11170.1400931, 12661.0465342, 24102.3020575, 30456.6591775};
double ins_k_post_cpi = 1.72457235374;
// k-space energy (single entry).
double ins_k_energy_a[1] = {5476.95439615};
double ins_k_energy_b[1] = {-1073884.00556};
double ins_k_energy_cpi = 0.475585305054;
// Ghost forward exchange (6 entries).
double ins_ghost_fw_a[6] = {43.2730897193, 0.0, 9.23745386168e-09, 2.35234627328e-08, 2.00592476871e-08, 1.33019109126e-07};
double ins_ghost_fw_b[6] = {96380.315439, 313.368687371, 313.346132312, 313.327974814, 313.346666244, 313.3420572};
double ins_ghost_fw_cpi = 0.924608655408;
// Ghost reverse exchange (6 entries).
double ins_ghost_rw_a[6] = {0.0, 0.31849027582, 0.316224042474, 0.513934022608, 0.501449013036, 0.298131697301};
double ins_ghost_rw_b[6] = {566.574947244, 81855.4482201, 82072.2144336, 18124.8399648, 22463.1938402, 12639.7445334};
double ins_ghost_rw_cpi = 1.91339519762;
// Fix phase (6 entries).
double ins_fix_a[6] = {6.31029520441, 0.519042043438, 0.699422916624, 0.862615710189, 0.708782029108, 0.552896821411};
double ins_fix_b[6] = {25475.6674291, 22357.989938, 30827.99377, 38715.8802264, 89865.0435751, 140845.450888};
double ins_fix_cpi = 1.96053897728;
// Final phase (single entry).
double ins_final_a[1] = {43.3348975221};
double ins_final_b[1] = {141841.44285};
double ins_final_cpi = 1.56863134534;
// NOTE(review): presumably the average fraction of neighbor checks that trigger an exchange — confirm.
double neigh_check_avg = 0.196428571429;
{
"jobs": [
{
"name": "StandaloneSWM",
"app": "dll",
"dll_path": "apps/dll/lammps.so",
"size": 128,
"time": 0,
"cfg":
{
"num_x_replicas": 1, # number of replicas in X dimension
"num_y_replicas": 1, # number of replicas in Y dimension
"num_z_replicas": 1, # number of replicas in Z dimension
"num_time_steps": 30, # number of time steps to simulate
"req_vc" : 0, # request vc
"resp_vc" : 1, # response vc
"router_freq" : 800e6, # router frequency in Hz
"cpu_freq" : 1.2e9, # CPU frequency in Hz
"cpu_sim_speedup" : 1e6 # simulation speedup factor (makes CPU faster) (use this to shorten computation periods)
}
}
]
}
\ No newline at end of file
#include "nearest_neighbor_swm_user_code.h"
#include "boost_ptree_array_to_std_vector.h"
extern uint64_t global_cycle;
// Constructs the nearest-neighbor workload from its configuration subtree.
//
// user_if      - framework interface, forwarded to AppBaseSWMUserCode
// cfg          - configuration tree; the keys read here are dimension_cnt,
//                dimension_sizes, max_dimension_distance, synchronous,
//                iterations_per_sync and randomize_communication_order
// generic_ptrs - not used by this constructor
//
// After the members are initialised, two debug-build assertions check the
// configuration: there must be exactly one size per dimension, and the product
// of all dimension sizes must equal process_cnt.
NearestNeighborSWMUserCode::NearestNeighborSWMUserCode(
    SWMUserIF* user_if,
    boost::property_tree::ptree cfg,
    void**& generic_ptrs
) :
    AppBaseSWMUserCode(user_if,cfg,"nearest_neighbor"),
    dimension_cnt(cfg.get<uint32_t>("dimension_cnt",0)),
    dimension_sizes(boost_ptree_array_to_std_vector<uint32_t>(cfg,"dimension_sizes", {0})),
    max_dimension_distance(cfg.get<uint32_t>("max_dimension_distance",0)),
    synchronous(cfg.get<bool>("synchronous",false)),
    iterations_per_sync(cfg.get<uint32_t>("iterations_per_sync",1)),
    randomize_communication_order(cfg.get<bool>("randomize_communication_order",false))
{
    // One size entry is required for every configured dimension.
    assert(dimension_sizes.size() == dimension_cnt);

    // Total number of grid points = product of the per-dimension extents.
    size_t dim_product = 1;
    for(const auto extent : dimension_sizes)
    {
        dim_product *= extent;
    }

    std::cout << "dim_product is " << dim_product << " and process_cnt is " << process_cnt << std::endl;

    // Every process must map onto exactly one grid point.
    assert(dim_product == process_cnt);
}
// Converts a linear process id into grid coordinates.
//
// pid    - process id to decompose
// coords - output; cleared first, then filled with one coordinate per
//          dimension (dimension 0 first, dimension 0 varying fastest)
void
NearestNeighborSWMUserCode::xlat_pid_to_coords(
    uint32_t pid,
    std::vector<uint32_t>& coords
)
{
    coords.clear();

    // stride is the product of the extents of all dimensions already handled.
    uint32_t stride = 1;
    uint32_t dim = 0;
    while(dim < dimension_cnt)
    {
        const auto extent = dimension_sizes[dim];
        coords.push_back((pid / stride) % extent);
        stride *= extent;
        ++dim;
    }
}
// Converts grid coordinates back into a linear process id (the inverse of
// xlat_pid_to_coords for in-range coordinates).
//
// coords - one coordinate per dimension, dimension 0 first
// pid    - output; overwritten with sum(coords[d] * product of extents below d)
void
NearestNeighborSWMUserCode::xlat_coords_to_pid(
    std::vector<uint32_t> coords,
    uint32_t& pid
)
{
    /* debug aid, disabled:
    std::cout << "xlat_coords_to_pid on coords ";
    for(size_t coords_idx=0; coords_idx<coords.size(); coords_idx++) {
    std::cout << " " << coords[coords_idx];
    }
    std::cout << endl;
    */

    pid = 0;

    // stride is the weight of the current dimension in the linear id.
    uint32_t stride = 1;
    uint32_t dim = 0;
    while(dim < dimension_cnt)
    {
        pid += coords[dim] * stride;
        stride *= dimension_sizes[dim];
        ++dim;
    }
}
// Builds a textual label describing where neighbor_pid sits relative to my_pid.
//
// For every dimension c in which the two coordinates differ, the label gets
// "p<c>" when the neighbor is one step up (with wrap-around) and "m<c>" when
// it is one step down (with wrap-around). Dimensions with equal coordinates
// contribute nothing. Any other offset trips assert(0) in debug builds.
//
// Returns the concatenated label; empty when both pids share all coordinates.
std::string
NearestNeighborSWMUserCode::get_neighbor_string(
    uint32_t my_pid,
    uint32_t neighbor_pid
)
{
    std::vector<uint32_t> my_coords;
    std::vector<uint32_t> neighbor_coords;
    xlat_pid_to_coords(my_pid, my_coords);
    xlat_pid_to_coords(neighbor_pid, neighbor_coords);
    assert(my_coords.size() == neighbor_coords.size());

    std::ostringstream oss;
    for(size_t c = 0; c < my_coords.size(); ++c)
    {
        const uint32_t mine = my_coords[c];
        const uint32_t theirs = neighbor_coords[c];

        // Same coordinate in this dimension: nothing to report.
        if(mine == theirs)
        {
            continue;
        }

        // At coordinate 0 the "minus" neighbor wraps to the top of the dimension.
        const bool at_low_edge = (mine == 0);

        // The "plus" test is tried first, so in a dimension of extent 2 the
        // single other coordinate is always labelled "p".
        const bool is_plus = at_low_edge
            ? (theirs == (mine + 1))
            : (theirs == ((mine + 1) % dimension_sizes[c]));
        if(is_plus)
        {
            oss << "p" << c;
            continue;
        }

        const bool is_minus = at_low_edge
            ? (theirs == (dimension_sizes[c] - 1))
            : (theirs == (mine - 1));
        if(is_minus)
        {
            oss << "m" << c;
        }
        else
        {
            // More than one step apart in this dimension: not a nearest neighbor.
            assert(0);
        }
    }
    return oss.str();
}
// Recursively enumerates neighbor coordinates and appends them to `neighbors`.
//
// coords                         - coordinates built so far (taken by value)
// neighbors                      - output list of (pid, label) tuples
// dimension_to_vary              - index of the dimension this level varies
// accumulated_dimension_distance - number of dimensions already stepped in
//
// Each level tries three choices for the current dimension, in this order:
// one step down (wrapping), unchanged, one step up (wrapping). A step in
// either direction adds 1 to the accumulated distance. The recursion stops
// and records the current coordinates when the accumulated distance reaches
// max_dimension_distance; it also stops once all dimensions are consumed,
// recording the coordinates only if at least one dimension was stepped.
void
NearestNeighborSWMUserCode::derive_neighbors_recurse(
    std::vector<uint32_t> coords,
    std::vector<neighbor_tuple>& neighbors,
    uint32_t dimension_to_vary,
    uint32_t accumulated_dimension_distance
)
{
    const bool budget_spent = (accumulated_dimension_distance == max_dimension_distance);
    const bool dims_exhausted = (dimension_to_vary == dimension_cnt);

    if(budget_spent || dims_exhausted)
    {
        // With the budget spent we always record; otherwise only if something moved.
        if(budget_spent || accumulated_dimension_distance > 0)
        {
            uint32_t neighbor_pid;
            xlat_coords_to_pid(coords, neighbor_pid);
            std::string neighbor_string = get_neighbor_string(process_id, neighbor_pid);
            std::string regexed_string = GetFirstMatch(neighbor_string);
            neighbors.push_back( std::make_tuple(neighbor_pid,regexed_string) );
        }
        return;
    }

    const uint32_t next_dimension = dimension_to_vary + 1;
    const uint32_t stepped_distance = accumulated_dimension_distance + 1;
    const auto here = coords[dimension_to_vary];

    // Scratch copy; only the entry for the current dimension is ever modified.
    std::vector<uint32_t> shifted = coords;

    // negative: one step down, wrapping from 0 to the last index
    if(here == 0)
    {
        shifted[dimension_to_vary] = (dimension_sizes[dimension_to_vary] -1);
    }
    else
    {
        shifted[dimension_to_vary] = (here -1);
    }
    derive_neighbors_recurse(shifted, neighbors, next_dimension, stepped_distance);

    // none: leave this dimension untouched
    derive_neighbors_recurse(coords, neighbors, next_dimension, accumulated_dimension_distance);

    // positive: one step up, wrapping from the last index to 0
    if(here == (dimension_sizes[dimension_to_vary] -1))
    {
        shifted[dimension_to_vary] = 0;
    }
    else
    {
        shifted[dimension_to_vary] = (here +1);
    }
    derive_neighbors_recurse(shifted, neighbors, next_dimension, stepped_distance);
}
void
NearestNeighborSWMUserCode::call()
{
/*
if(process_id == 0) { //lets print every pid in coords and back again
std::vector<uint32_t> coords;
uint32_t pid_again;
for(uint32_t pid=0; pid<process_cnt; pid++) {
coords.clear();
pid_again=0;
xlat_pid_to_coords(pid, coords);
std::cout << "pid " << pid << " has coords.size " << coords.size() << " ";
for(size_t i=0; i<coords.size(); i++) {
std::cout << " " << coords[i];
}
std::cout << "; which have pid ";
xlat_coords_to_pid(coords, pid_again);
std::cout << pid_again << endl;
}
}
*/
std::vector<uint32_t> my_coords;
std::vector<uint32_t> neighbor_pids;
xlat_pid_to_coords(process_id, my_coords);
derive_neighbors_recurse(my_coords, neighbors);
/*
if(process_id == 0)
{
std::cout << "neighbors of pid " << process_id << " are: ";
for(size_t neighbors_idx=0; neighbors_idx<neighbors.size(); neighbors_idx++) {
std::cout << " " << std::get<0>(neighbors[neighbors_idx]) << "," << std::get<1>(neighbors[neighbors_idx]);
}
std::cout << "\n";
}
*/
uint32_t* send_handles = NULL;
uint32_t* recv_handles = NULL;
if(synchronous)
{
send_handles = new uint32_t[neighbors.size()*iterations_per_sync];
recv_handles = new uint32_t[neighbors.size()*iterations_per_sync];
}
uint32_t iter_before_sync = 0;
uint32_t neighbors_size=neighbors.size();
for(uint32_t iter=0; iter<iteration_cnt; iter++)
{
//shuffle the neighbors
if(randomize_communication_order)
{
std::default_random_engine e {rng_unique_seed->Get(INT_MAX)};
std::shuffle(neighbors.begin(), neighbors.end(), e);
}
//send to each neighbor
for(size_t neighbor_idx=0; neighbor_idx<neighbors.size(); neighbor_idx++)
{
msg_traffic_desc msg_desc;
GetMsgDetails(&msg_desc, std::get<1>(neighbors[neighbor_idx]));
if(synchronous)
{
//send/recv pair that we'll later wait on
SWM_Isend(
std::get<0>(neighbors[neighbor_idx]),
SWM_COMM_WORLD,
process_id,
msg_desc.msg_req_vc,
msg_desc.msg_rsp_vc,
NO_BUFFER,
msg_desc.msg_req_bytes,
msg_desc.pkt_rsp_bytes,
&(send_handles[neighbor_idx+iter_before_sync*neighbors_size]),
msg_desc.msg_req_routing_type,
msg_desc.msg_rsp_routing_type
);
SWM_Irecv(
std::get<0>(neighbors[neighbor_idx]),
SWM_COMM_WORLD,
std::get<0>(neighbors[neighbor_idx]),
NO_BUFFER,
&(recv_handles[neighbor_idx+iter_before_sync*neighbors_size])
);
for(uint32_t noop=0; noop<noop_cnt; noop++)
{
SWM_Noop();
}
}
else
{
//fire and forget
SWM_Synthetic(
std::get<0>(neighbors[neighbor_idx]), //dst
msg_desc.msg_req_vc,
msg_desc.msg_rsp_vc,
msg_desc.pkt_rsp_vc,
msg_desc.msg_req_bytes,
msg_desc.msg_rsp_bytes,
msg_desc.pkt_rsp_bytes,
msg_desc.msg_req_routing_type,
msg_desc.msg_rsp_routing_type,
msg_desc.pkt_rsp_routing_type,
NULL,
msg_desc.attribute
#ifdef FABSIM_EMULATION
, msg_desc.l2_encoding
#endif
);
for(uint32_t noop=0; noop<noop_cnt; noop++)
{
SWM_Noop();
}
}