/*
 * hotspot_openmp.cpp
 *
 * Transient thermal simulation of a chip modelled as a 2-D grid of cells.
 * Each time step updates every cell from its own temperature, the
 * temperatures of its direct neighbours, its dissipated power and the
 * ambient temperature.  The grid is processed in BLOCK_SIZE_R x BLOCK_SIZE_C
 * tiles that are distributed over OpenMP threads.
 */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

/* Only pull in the OpenMP runtime API when OpenMP is actually enabled, so
 * the file still builds (serially) without an OpenMP-capable compiler. */
#ifdef _OPENMP
#include <omp.h>
#endif

/* Returns the current system time in microseconds. */
long long get_time()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    /* Widen before multiplying so the product cannot overflow a 32-bit time_t/long. */
    return static_cast<long long>(tv.tv_sec) * 1000000LL + tv.tv_usec;
}

#define BLOCK_SIZE 16
#define BLOCK_SIZE_C BLOCK_SIZE
#define BLOCK_SIZE_R BLOCK_SIZE

#define STR_SIZE 256

/* maximum power density possible (say 300W for a 10mm x 10mm chip) */
#define MAX_PD (3.0e6)
/* required precision in degrees */
#define PRECISION 0.001
#define SPEC_HEAT_SI 1.75e6
#define K_SI 100
/* capacitance fitting factor */
#define FACTOR_CHIP 0.5
#define OPEN
//#define NUM_THREAD 4

typedef float FLOAT;

/* chip parameters */
const FLOAT t_chip = 0.0005;
const FLOAT chip_height = 0.016;
const FLOAT chip_width = 0.016;

#ifdef OMP_OFFLOAD
#pragma offload_attribute(push, target(mic))
#endif

/* ambient temperature, assuming no package at all */
const FLOAT amb_temp = 80.0;

/* Number of OpenMP threads requested on the command line (set by main). */
int num_omp_threads;

/* Temperature change of cell (r, c) over one time step.
 *
 * A neighbour that would lie outside the grid simply contributes nothing,
 * which yields the corner / edge / interior variants below.  The edge
 * variants keep the double-precision literal 2.0 and the term order of the
 * reference formulas so their rounding is unchanged.
 *
 * Requires row >= 2 and col >= 2 (a cell is never on two opposite borders).
 */
static inline FLOAT cell_delta(const FLOAT *temp, const FLOAT *power,
                               int r, int c, int row, int col,
                               FLOAT Cap_1, FLOAT Rx_1, FLOAT Ry_1, FLOAT Rz_1)
{
    const int idx = r * col + c;
    const FLOAT here = temp[idx];
    const bool top = (r == 0);
    const bool bottom = (r == row - 1);
    const bool left = (c == 0);
    const bool right = (c == col - 1);
    FLOAT delta;

    if (top && left) {              /* Corner 1 */
        delta = Cap_1 * (power[idx] +
                         (temp[idx + 1] - here) * Rx_1 +
                         (temp[idx + col] - here) * Ry_1 +
                         (amb_temp - here) * Rz_1);
    } else if (top && right) {      /* Corner 2 */
        delta = Cap_1 * (power[idx] +
                         (temp[idx - 1] - here) * Rx_1 +
                         (temp[idx + col] - here) * Ry_1 +
                         (amb_temp - here) * Rz_1);
    } else if (bottom && right) {   /* Corner 3 */
        delta = Cap_1 * (power[idx] +
                         (temp[idx - 1] - here) * Rx_1 +
                         (temp[idx - col] - here) * Ry_1 +
                         (amb_temp - here) * Rz_1);
    } else if (bottom && left) {    /* Corner 4 */
        delta = Cap_1 * (power[idx] +
                         (temp[idx + 1] - here) * Rx_1 +
                         (temp[idx - col] - here) * Ry_1 +
                         (amb_temp - here) * Rz_1);
    } else if (top) {               /* Edge 1 */
        delta = Cap_1 * (power[idx] +
                         (temp[idx + 1] + temp[idx - 1] - 2.0 * here) * Rx_1 +
                         (temp[idx + col] - here) * Ry_1 +
                         (amb_temp - here) * Rz_1);
    } else if (right) {             /* Edge 2 */
        delta = Cap_1 * (power[idx] +
                         (temp[idx + col] + temp[idx - col] - 2.0 * here) * Ry_1 +
                         (temp[idx - 1] - here) * Rx_1 +
                         (amb_temp - here) * Rz_1);
    } else if (bottom) {            /* Edge 3 */
        delta = Cap_1 * (power[idx] +
                         (temp[idx + 1] + temp[idx - 1] - 2.0 * here) * Rx_1 +
                         (temp[idx - col] - here) * Ry_1 +
                         (amb_temp - here) * Rz_1);
    } else if (left) {              /* Edge 4 */
        delta = Cap_1 * (power[idx] +
                         (temp[idx + col] + temp[idx - col] - 2.0 * here) * Ry_1 +
                         (temp[idx + 1] - here) * Rx_1 +
                         (amb_temp - here) * Rz_1);
    } else {
        /* Cell inside the grid that merely lives in a border tile: it needs
         * the full four-neighbour stencil, same as the interior tiles. */
        delta = Cap_1 * (power[idx] +
                         (temp[idx + col] + temp[idx - col] - 2.f * here) * Ry_1 +
                         (temp[idx + 1] + temp[idx - 1] - 2.f * here) * Rx_1 +
                         (amb_temp - here) * Rz_1);
    }
    return delta;
}

/* Single iteration of the transient solver in the grid model.
 * Advances the solution of the discretized difference equations
 * by one time step.
 *
 *   result - output grid (row * col cells), receives the new temperatures
 *   temp   - input grid with the current temperatures (not modified)
 *   power  - dissipated power of each cell (not modified)
 *   row    - number of grid rows
 *   col    - number of grid columns
 *   Cap_1  - time step divided by the cell capacitance
 *   Rx_1, Ry_1, Rz_1 - reciprocal thermal resistances
 *   step   - unused here (the caller already folds it into Cap_1); kept so
 *            the signature stays unchanged
 *
 * Any grid of at least 2 x 2 cells is supported; it does not have to be
 * square or a multiple of the tile size.  Smaller grids have no valid
 * stencil and are left untouched.
 */
void single_iteration(FLOAT *result, FLOAT *temp, FLOAT *power, int row, int col,
                      FLOAT Cap_1, FLOAT Rx_1, FLOAT Ry_1, FLOAT Rz_1,
                      FLOAT step)
{
    (void)step;

    if (row < 2 || col < 2)
        return;

    /* Round up so partial tiles on the right / bottom border are covered. */
    const int chunks_in_row = (col + BLOCK_SIZE_C - 1) / BLOCK_SIZE_C;
    const int chunks_in_col = (row + BLOCK_SIZE_R - 1) / BLOCK_SIZE_R;
    const int num_chunk = chunks_in_row * chunks_in_col;

#ifdef OPEN
#if defined(_OPENMP) && !defined(__MIC__)
    if (num_omp_threads > 0)
        omp_set_num_threads(num_omp_threads);
#endif
#pragma omp parallel for schedule(static)
#endif
    for (int chunk = 0; chunk < num_chunk; ++chunk)
    {
        /* Tiles are numbered row-major, so both coordinates are derived
         * from the number of tiles per row. */
        const int r_start = BLOCK_SIZE_R * (chunk / chunks_in_row);
        const int c_start = BLOCK_SIZE_C * (chunk % chunks_in_row);
        const int r_end = r_start + BLOCK_SIZE_R > row ? row : r_start + BLOCK_SIZE_R;
        const int c_end = c_start + BLOCK_SIZE_C > col ? col : c_start + BLOCK_SIZE_C;

        if (r_start == 0 || c_start == 0 || r_end == row || c_end == col)
        {
            /* Tile touches the grid border: choose the stencil per cell and
             * stay within the clamped tile bounds. */
            for (int r = r_start; r < r_end; ++r) {
                for (int c = c_start; c < c_end; ++c) {
                    const int idx = r * col + c;
                    result[idx] = temp[idx] +
                        cell_delta(temp, power, r, c, row, col, Cap_1, Rx_1, Ry_1, Rz_1);
                }
            }
            continue;
        }

        /* Interior tile: every cell has all four neighbours, so the inner
         * loop is branch-free and can be vectorised. */
        for (int r = r_start; r < r_end; ++r) {
#pragma omp simd
            for (int c = c_start; c < c_end; ++c) {
                /* Update Temperatures */
                result[r*col+c] = temp[r*col+c] +
                    ( Cap_1 * (power[r*col+c] +
                    (temp[(r+1)*col+c] + temp[(r-1)*col+c] - 2.f*temp[r*col+c]) * Ry_1 +
                    (temp[r*col+c+1] + temp[r*col+c-1] - 2.f*temp[r*col+c]) * Rx_1 +
                    (amb_temp - temp[r*col+c]) * Rz_1));
            }
        }
    }
}

#ifdef OMP_OFFLOAD
#pragma offload_attribute(pop)
#endif

/* Transient solver driver routine: simply converts the heat
 * transfer differential equations to difference equations
 * and solves the difference equations by iterating.
 *
 * The two grids are used as ping-pong buffers: after every iteration the
 * roles of `result` and `temp` are swapped.  Consequently the final
 * temperatures end up in `result` when num_iterations is odd and in `temp`
 * when it is even.
 */
void compute_tran_temp(FLOAT *result, int num_iterations, FLOAT *temp, FLOAT *power, int row, int col)
{
    FLOAT grid_height = chip_height / row;
    FLOAT grid_width = chip_width / col;

    FLOAT Cap = FACTOR_CHIP * SPEC_HEAT_SI * t_chip * grid_width * grid_height;
    FLOAT Rx = grid_width / (2.0 * K_SI * t_chip * grid_height);
    FLOAT Ry = grid_height / (2.0 * K_SI * t_chip * grid_width);
    FLOAT Rz = t_chip / (K_SI * grid_height * grid_width);

    FLOAT max_slope = MAX_PD / (FACTOR_CHIP * t_chip * SPEC_HEAT_SI);
    FLOAT step = PRECISION / max_slope / 1000.0;

    /* Precompute reciprocals so the per-cell update only multiplies. */
    FLOAT Rx_1 = 1.f / Rx;
    FLOAT Ry_1 = 1.f / Ry;
    FLOAT Rz_1 = 1.f / Rz;
    FLOAT Cap_1 = step / Cap;

#ifdef VERBOSE
    fprintf(stdout, "total iterations: %d\tstep size: %g s\n", num_iterations, step);
    fprintf(stdout, "Rx: %g\tRy: %g\tRz: %g\tCap: %g\n", Rx, Ry, Rz, Cap);
#endif

#ifdef OMP_OFFLOAD
    int array_size = row*col;
#pragma omp target \
    map(temp[0:array_size]) \
    map(to: power[0:array_size], row, col, Cap_1, Rx_1, Ry_1, Rz_1, step, num_iterations) \
    map( result[0:array_size])
#endif
    {
        FLOAT *dst = result;
        FLOAT *src = temp;
        for (int i = 0; i < num_iterations; i++)
        {
#ifdef VERBOSE
            /* Print only; the loop header is the sole place that advances i. */
            fprintf(stdout, "iteration %d\n", i);
#endif
            single_iteration(dst, src, power, row, col, Cap_1, Rx_1, Ry_1, Rz_1, step);

            /* The freshly written grid becomes the input of the next step. */
            FLOAT *tmp = src;
            src = dst;
            dst = tmp;
        }
    }
}

/* Prints an error message to stderr and terminates the program with status 1. */
void fatal(const char *s)
{
    fprintf(stderr, "error: %s\n", s);
    exit(1);
}

/* Writes the grid to `file`, one "<index>\t<value>" line per cell in
 * row-major order.  Terminates the program if the file cannot be written. */
void writeoutput(FLOAT *vect, int grid_rows, int grid_cols, const char *file)
{
    FILE *fp = fopen(file, "w");
    if (!fp)
        fatal("The file was not opened");

    const int total = grid_rows * grid_cols;
    for (int index = 0; index < total; index++)
        fprintf(fp, "%d\t%g\n", index, vect[index]);

    if (fclose(fp) != 0)
        fatal("error while writing the output file");
}

/* Reads grid_rows * grid_cols values from `file` (one value per line) into
 * `vect`.  Terminates the program if the file cannot be opened, is too
 * short, or contains a line that does not start with a number. */
void read_input(FLOAT *vect, int grid_rows, int grid_cols, const char *file)
{
    FILE *fp = fopen(file, "r");
    if (!fp)
        fatal("file could not be opened for reading");

    char str[STR_SIZE];
    const int total = grid_rows * grid_cols;
    for (int i = 0; i < total; i++) {
        /* Test the read itself rather than feof(): a final line without a
         * trailing newline is read successfully yet also sets EOF. */
        if (!fgets(str, STR_SIZE, fp))
            fatal("not enough lines in file");

        FLOAT val;
        if (sscanf(str, "%f", &val) != 1)
            fatal("invalid file format");
        vect[i] = val;
    }

    fclose(fp);
}

/* Prints the command-line synopsis to stderr and terminates with status 1. */
void usage(int argc, char **argv)
{
    (void)argc;
    fprintf(stderr, "Usage: %s <grid_rows> <grid_cols> <sim_time> <no. of threads> <temp_file> <power_file> <output_file>\n", argv[0]);
    fprintf(stderr, "\t<grid_rows>  - number of rows in the grid (positive integer)\n");
    fprintf(stderr, "\t<grid_cols>  - number of columns in the grid (positive integer)\n");
    fprintf(stderr, "\t<sim_time>   - number of iterations\n");
    fprintf(stderr, "\t<no. of threads>   - number of threads\n");
    fprintf(stderr, "\t<temp_file>  - name of the file containing the initial temperature values of each cell\n");
    fprintf(stderr, "\t<power_file> - name of the file containing the dissipated power values of each cell\n");
    fprintf(stderr, "\t<output_file> - name of the output file\n");
    exit(1);
}

/* Program entry point: parses the arguments, loads the temperature and
 * power grids, runs the simulation, reports the elapsed time and writes the
 * final temperatures to the output file. */
int main(int argc, char **argv)
{
    /* check validity of inputs */
    if (argc != 8)
        usage(argc, argv);

    const int grid_rows = atoi(argv[1]);
    const int grid_cols = atoi(argv[2]);
    const int sim_time = atoi(argv[3]);
    num_omp_threads = atoi(argv[4]);
    if (grid_rows <= 0 || grid_cols <= 0 || sim_time <= 0 || num_omp_threads <= 0)
        usage(argc, argv);

    /* The stencil needs a neighbour in each direction, and cells are
     * addressed with int indices throughout. */
    if (grid_rows < 2 || grid_cols < 2)
        fatal("grid must have at least 2 rows and 2 columns");
    if (grid_cols > INT_MAX / grid_rows)
        fatal("grid is too large");

    /* allocate memory for the temperature and power arrays */
    const size_t cells = static_cast<size_t>(grid_rows) * static_cast<size_t>(grid_cols);
    FLOAT *temp = (FLOAT *) calloc(cells, sizeof(FLOAT));
    FLOAT *power = (FLOAT *) calloc(cells, sizeof(FLOAT));
    FLOAT *result = (FLOAT *) calloc(cells, sizeof(FLOAT));
    if (!temp || !power || !result)
        fatal("unable to allocate memory");

    /* read initial temperatures and input power */
    const char *tfile = argv[5];
    const char *pfile = argv[6];
    const char *ofile = argv[7];

    read_input(temp, grid_rows, grid_cols, tfile);
    read_input(power, grid_rows, grid_cols, pfile);

    printf("Start computing the transient temperature\n");

    const long long start_time = get_time();

    compute_tran_temp(result, sim_time, temp, power, grid_rows, grid_cols);

    const long long end_time = get_time();

    printf("Ending simulation\n");
    printf("Total time: %.3f seconds\n", static_cast<double>(end_time - start_time) / (1000 * 1000));

    /* The solver swaps its two buffers every iteration, so the final state
     * is in `result` after an odd number of iterations and in `temp` after
     * an even number. */
    FLOAT *final_temp = (sim_time & 1) ? result : temp;

    writeoutput(final_temp, grid_rows, grid_cols, ofile);

    /* output results */
#ifdef VERBOSE
    fprintf(stdout, "Final Temperatures:\n");
#endif

#ifdef OUTPUT
    for (size_t i = 0; i < cells; i++)
        fprintf(stdout, "%d\t%g\n", static_cast<int>(i), final_temp[i]);
#endif

    /* cleanup */
    free(temp);
    free(power);
    free(result);

    return 0;
}
/* vim: set ts=4 sw=4 sts=4 et si ai: */