[v0.98][Refactoring] Unify the use of get_str_peak_performance

This commit is contained in:
Dr-Noob
2021-08-06 10:26:07 +02:00
parent 7e1dde3c71
commit 6953d8dda5
9 changed files with 73 additions and 83 deletions

View File

@@ -210,37 +210,25 @@ char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_soc
return string;
}
char* get_str_peak_performance(struct cpuInfo* cpu) {
//7 for GFLOP/s and 6 for digits,eg 412.14
uint32_t size = 7+6+1+1;
assert(strlen(STRING_UNKNOWN)+1 <= size);
char* string = emalloc(sizeof(char)*size);
bool get_peak_performance(struct cpuInfo* cpu, double* flops) {
struct cpuInfo* ptr = cpu;
//First check we have consistent data
for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) {
if(get_freq(ptr->freq) == UNKNOWN_FREQ) {
snprintf(string, strlen(STRING_UNKNOWN)+1, STRING_UNKNOWN);
return string;
return false;
}
}
double flops = 0.0;
*flops = 0.0;
ptr = cpu;
for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) {
flops += ptr->topo->total_cores * (get_freq(ptr->freq) * 1000000);
*flops += ptr->topo->total_cores * (get_freq(ptr->freq) * 1000000);
}
if(cpu->feat->NEON) flops = flops * 4;
if(cpu->feat->NEON) *flops = *flops * 4;
if(flops >= (double)1000000000000.0)
snprintf(string,size,"%.2f TFLOP/s",flops/1000000000000);
else if(flops >= 1000000000.0)
snprintf(string,size,"%.2f GFLOP/s",flops/1000000000);
else
snprintf(string,size,"%.2f MFLOP/s",flops/1000000);
return string;
return true;
}
char* get_str_features(struct cpuInfo* cpu) {

View File

@@ -7,9 +7,10 @@ struct cpuInfo* get_cpu_info();
uint32_t get_nsockets(struct topology* topo);
char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_socket);
char* get_str_peak_performance(struct cpuInfo* cpu);
char* get_str_features(struct cpuInfo* cpu);
bool get_peak_performance(struct cpuInfo* cpu, double* flops);
void print_debug(struct cpuInfo* cpu);
void free_topo_struct(struct topology* topo);

View File

@@ -151,6 +151,28 @@ char* get_str_freq(struct frequency* freq) {
return string;
}
/*
 * Formats a peak-performance figure as a human-readable string.
 *
 * flops:       peak performance in FLOP/s; only read when valid_flops is true.
 * valid_flops: false when the peak performance could not be computed.
 *
 * Returns a string obtained from emalloc/ecalloc: a copy of STRING_UNKNOWN
 * when valid_flops is false, otherwise the value scaled to TFLOP/s, GFLOP/s
 * or MFLOP/s and printed with two decimals.
 */
char* get_str_peak_performance(double flops, bool valid_flops) {
  if(!valid_flops) {
    size_t unknown_size = strlen(STRING_UNKNOWN) + 1;
    char* unknown = emalloc(sizeof(char) * unknown_size);
    memcpy(unknown, STRING_UNKNOWN, unknown_size);
    // Return right away: falling through would leak this buffer and format
    // a flops value the caller never set, instead of reporting "unknown".
    return unknown;
  }

  // 7 for digits (e.g., XXXX.XX), 1 for the space, 7 for XFLOP/s, 1 for '\0'
  uint32_t max_size = 7+1+7+1;
  char* str = ecalloc(max_size, sizeof(char));

  // Pick the largest unit that keeps the printed value >= 1
  if(flops >= (double)1000000000000.0)
    snprintf(str, max_size, "%.2f TFLOP/s", flops/1000000000000);
  else if(flops >= 1000000000.0)
    snprintf(str, max_size, "%.2f GFLOP/s", flops/1000000000);
  else
    snprintf(str, max_size, "%.2f MFLOP/s", flops/1000000);

  return str;
}
void init_topology_struct(struct topology* topo, struct cache* cach) {
topo->total_cores = 0;
topo->cach = cach;

View File

@@ -161,6 +161,7 @@ char* get_str_l1d(struct cache* cach);
char* get_str_l2(struct cache* cach);
char* get_str_l3(struct cache* cach);
char* get_str_freq(struct frequency* freq);
char* get_str_peak_performance(double flops, bool valid_pp);
void init_topology_struct(struct topology* topo, struct cache* cach);
void init_cache_struct(struct cache* cach);

View File

@@ -446,6 +446,9 @@ bool print_cpufetch_x86(struct cpuInfo* cpu, STYLE s, struct colors* cs) {
if(art == NULL)
return false;
double flops;
bool valid_pp = get_peak_performance(cpu, cpu->topo, get_freq(cpu->freq), &flops);
char* uarch = get_str_uarch(cpu);
char* manufacturing_process = get_str_process(cpu);
char* sockets = get_str_sockets(cpu->topo);
@@ -456,12 +459,11 @@ bool print_cpufetch_x86(struct cpuInfo* cpu, STYLE s, struct colors* cs) {
char* avx = get_str_avx(cpu);
char* fma = get_str_fma(cpu);
char* l1i = get_str_l1i(cpu->cach);
char* l1d = get_str_l1d(cpu->cach);
char* l2 = get_str_l2(cpu->cach);
char* l3 = get_str_l3(cpu->cach);
char* pp = get_str_peak_performance(cpu,cpu->topo,get_freq(cpu->freq));
char* pp = get_str_peak_performance(flops, valid_pp);
setAttribute(art,ATTRIBUTE_NAME,cpu_name);
if(cpu->hv->present) {
@@ -568,6 +570,9 @@ bool print_cpufetch_ppc(struct cpuInfo* cpu, STYLE s, struct colors* cs) {
if(art == NULL)
return false;
double flops;
bool valid_pp = get_peak_performance(cpu, cpu->topo, get_freq(cpu->freq), &flops);
char* uarch = get_str_uarch(cpu);
char* manufacturing_process = get_str_process(cpu);
char* sockets = get_str_sockets(cpu->topo);
@@ -581,7 +586,7 @@ bool print_cpufetch_ppc(struct cpuInfo* cpu, STYLE s, struct colors* cs) {
char* l1d = get_str_l1d(cpu->cach);
char* l2 = get_str_l2(cpu->cach);
char* l3 = get_str_l3(cpu->cach);
char* pp = get_str_peak_performance(cpu,cpu->topo,get_freq(cpu->freq));
char* pp = get_str_peak_performance(flops, valid_pp);
if(cpu_name != NULL) {
setAttribute(art,ATTRIBUTE_NAME,cpu_name);
@@ -787,7 +792,9 @@ bool print_cpufetch_arm(struct cpuInfo* cpu, STYLE s, struct colors* cs) {
}
}
}
char* pp = get_str_peak_performance(cpu);
double flops;
bool valid_pp = get_peak_performance(cpu, &flops);
char* pp = get_str_peak_performance(flops, valid_pp);
setAttribute(art,ATTRIBUTE_PEAK,pp);
if(art->n_attributes_set > NUMBER_OF_LINES) {

View File

@@ -170,43 +170,30 @@ char* get_str_altivec(struct cpuInfo* cpu) {
return string;
}
char* get_str_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq) {
bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops) {
/*
* Not sure about this
* PP(SP) = N_CORES * FREQUENCY * 4(If altivec)
*/
//7 for GFLOP/s and 6 for digits,eg 412.14
uint32_t size = 7+6+1+1;
assert(strlen(STRING_UNKNOWN)+1 <= size);
char* string = emalloc(sizeof(char)*size);
//First check we have consistent data
if(freq == UNKNOWN_FREQ) {
snprintf(string,strlen(STRING_UNKNOWN)+1,STRING_UNKNOWN);
return string;
return false;
}
struct features* feat = cpu->feat;
double flops = topo->physical_cores * topo->sockets * (freq*1000000);
if(feat->altivec) flops = flops*4;
*flops = topo->physical_cores * topo->sockets * (freq*1000000);
if(feat->altivec) *flops = *flops * 4;
// POWER9 has the concept called "slices". Each SMT4 core has two super-slices,
// and each super-slice is capable of doing two FLOPS per cycle. In the case of
// SMT8, it has 4 super-slices, thus four FLOPS per cycle.
if(is_power9(cpu->arch)) {
int threads_per_core = topo->logical_cores / topo->physical_cores;
flops = flops * (threads_per_core / 2);
*flops = *flops * (threads_per_core / 2);
}
if(flops >= (double)1000000000000.0)
snprintf(string,size,"%.2f TFLOP/s",flops/1000000000000);
else if(flops >= 1000000000.0)
snprintf(string,size,"%.2f GFLOP/s",flops/1000000000);
else
snprintf(string,size,"%.2f MFLOP/s",flops/1000000);
return string;
return true;
}
char* get_str_topology(struct topology* topo, bool dual_socket) {

View File

@@ -6,7 +6,7 @@
struct cpuInfo* get_cpu_info();
char* get_str_altivec(struct cpuInfo* cpu);
char* get_str_topology(struct topology* topo, bool dual_socket);
char* get_str_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq);
bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops);
void print_debug(struct cpuInfo* cpu);
#endif

View File

@@ -655,65 +655,48 @@ struct frequency* get_frequency_info(struct cpuInfo* cpu) {
return freq;
}
/*** STRING FUNCTIONS ***/
// STRING FUNCTIONS
bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops) {
/*
* PP = PeakPerformance
* SP = SinglePrecision
*
* PP(SP) =
* N_CORES *
* FREQUENCY *
* 2(Two vector units) *
* 2(If cpu has fma) *
* 16(If AVX512), 8(If AVX), 4(If SSE) *
*/
char* get_str_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq) {
/***
PP = PeakPerformance
SP = SinglePrecision
PP(SP) =
N_CORES *
FREQUENCY *
2(Two vector units) *
2(If cpu has fma) *
16(If AVX512), 8(If AVX), 4(If SSE) *
***/
//7 for GFLOP/s and 6 for digits,eg 412.14
uint32_t size = 7+6+1+1;
assert(strlen(STRING_UNKNOWN)+1 <= size);
char* string = emalloc(sizeof(char)*size);
//First check we have consistent data
//First, check we have consistent data
if(freq == UNKNOWN_FREQ) {
snprintf(string,strlen(STRING_UNKNOWN)+1,STRING_UNKNOWN);
return string;
return false;
}
struct features* feat = cpu->feat;
double flops = topo->physical_cores * topo->sockets * (freq*1000000);
int vpus = get_number_of_vpus(cpu);
flops = flops * vpus;
*flops = topo->physical_cores * topo->sockets * (freq*1000000) * vpus;
if(feat->FMA3 || feat->FMA4)
flops = flops*2;
*flops = *flops*2;
// Ice Lake has AVX512, but it has 1 VPU for AVX512, while
// it has 2 for AVX2. If this is a Ice Lake CPU, we are computing
// the peak performance supposing AVX2, not AVX512
if(feat->AVX512 && vpus_are_AVX512(cpu))
flops = flops*16;
*flops = *flops*16;
else if(feat->AVX || feat->AVX2)
flops = flops*8;
*flops = *flops*8;
else if(feat->SSE)
flops = flops*4;
*flops = *flops*4;
// See https://sites.utexas.edu/jdm4372/2018/01/22/a-peculiar-
// throughput-limitation-on-intels-xeon-phi-x200-knights-landing/
if(is_knights_landing(cpu))
flops = flops * 6 / 7;
*flops = *flops * 6 / 7;
if(flops >= (double)1000000000000.0)
snprintf(string,size,"%.2f TFLOP/s",flops/1000000000000);
else if(flops >= 1000000000.0)
snprintf(string,size,"%.2f GFLOP/s",flops/1000000000);
else
snprintf(string,size,"%.2f MFLOP/s",flops/1000000);
return string;
return true;
}
// TODO: Refactoring

View File

@@ -12,7 +12,8 @@ char* get_str_avx(struct cpuInfo* cpu);
char* get_str_sse(struct cpuInfo* cpu);
char* get_str_fma(struct cpuInfo* cpu);
char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_socket);
char* get_str_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq);
bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops);
void print_debug(struct cpuInfo* cpu);
void print_raw(struct cpuInfo* cpu);