From c24dd7cbb6ee9028f0eecbcc5359653ec20c6ef7 Mon Sep 17 00:00:00 2001 From: Dr-Noob Date: Fri, 6 Aug 2021 11:04:29 +0200 Subject: [PATCH] [v0.98][Refactoring] Use int for peak performance, which makes code cleaner --- src/arm/midr.c | 43 +++++++++++----------- src/arm/midr.h | 2 - src/common/cpu.c | 15 ++++---- src/common/cpu.h | 3 +- src/common/printer.c | 14 ++----- src/ppc/ppc.c | 54 +++++++++++++-------------- src/ppc/ppc.h | 1 - src/x86/cpuid.c | 87 ++++++++++++++++++++++---------------------- src/x86/cpuid.h | 2 - 9 files changed, 106 insertions(+), 115 deletions(-) diff --git a/src/arm/midr.c b/src/arm/midr.c index 809e9f9..c9d32f1 100644 --- a/src/arm/midr.c +++ b/src/arm/midr.c @@ -61,6 +61,27 @@ struct topology* get_topology_info(struct cpuInfo* cpu, struct cache* cach, uint return topo; } +int64_t get_peak_performance(struct cpuInfo* cpu) { + struct cpuInfo* ptr = cpu; + + //First check we have consistent data + for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) { + if(get_freq(ptr->freq) == UNKNOWN_FREQ) { + return -1; + } + } + + int64_t flops = 0; + + ptr = cpu; + for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) { + flops += ptr->topo->total_cores * (get_freq(ptr->freq) * 1000000); + } + if(cpu->feat->NEON) flops = flops * 4; + + return flops; +} + bool cores_are_equal(int c1pos, int c2pos, uint32_t* midr_array, int32_t* freq_array) { return midr_array[c1pos] == midr_array[c2pos] && freq_array[c1pos] == freq_array[c2pos]; } @@ -198,6 +219,7 @@ struct cpuInfo* get_cpu_info() { cpu->hv = emalloc(sizeof(struct hypervisor)); cpu->hv->present = false; cpu->soc = get_soc(); + cpu->peak_performance = get_peak_performance(cpu); return cpu; } @@ -210,27 +232,6 @@ char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_soc return string; } -bool get_peak_performance(struct cpuInfo* cpu, double* flops) { - struct cpuInfo* ptr = cpu; - - //First check we have consistent data - for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) { - if(get_freq(ptr->freq) == UNKNOWN_FREQ) { - return false; - } - } - - *flops = 0.0; - - ptr = cpu; - for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) { - *flops += ptr->topo->total_cores * (get_freq(ptr->freq) * 1000000); - } - if(cpu->feat->NEON) *flops = *flops * 4; - - return true; -} - char* get_str_features(struct cpuInfo* cpu) { struct features* feat = cpu->feat; char* string = emalloc(sizeof(char) * 25); diff --git a/src/arm/midr.h b/src/arm/midr.h index 1854ca8..9da3bb6 100644 --- a/src/arm/midr.h +++ b/src/arm/midr.h @@ -9,8 +9,6 @@ uint32_t get_nsockets(struct topology* topo); char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_socket); char* get_str_features(struct cpuInfo* cpu); -bool get_peak_performance(struct cpuInfo* cpu, double* flops); - void print_debug(struct cpuInfo* cpu); void free_topo_struct(struct topology* topo); diff --git a/src/common/cpu.c b/src/common/cpu.c index 9a1f197..b5c1d7e 100644 --- a/src/common/cpu.c +++ b/src/common/cpu.c @@ -151,24 +151,25 @@ char* get_str_freq(struct frequency* freq) { return string; } -char* get_str_peak_performance(double flops, bool valid_flops) { +char* get_str_peak_performance(int64_t flops) { char* str; - if(!valid_flops) { + if(flops == -1) { str = emalloc(sizeof(char) * (strlen(STRING_UNKNOWN) + 1)); strncpy(str, STRING_UNKNOWN, strlen(STRING_UNKNOWN) + 1); } // 7 for digits (e.g, XXXX.XX), 7 for XFLOP/s + double flopsd = (double) flops; uint32_t max_size = 7+1+7+1; str = ecalloc(max_size, sizeof(char)); - if(flops >= (double)1000000000000.0) - snprintf(str, max_size, "%.2f TFLOP/s", flops/1000000000000); - else if(flops >= 1000000000.0) - snprintf(str, max_size, "%.2f GFLOP/s", flops/1000000000); + if(flopsd >= (double)1000000000000.0) + snprintf(str, max_size, "%.2f TFLOP/s", flopsd/1000000000000); + else if(flopsd >= 1000000000.0) + snprintf(str, max_size, "%.2f GFLOP/s", flopsd/1000000000); else - snprintf(str, max_size, "%.2f MFLOP/s", flops/1000000); + snprintf(str, max_size, "%.2f MFLOP/s", flopsd/1000000); return str; } diff --git a/src/common/cpu.h b/src/common/cpu.h index a45f060..a0ab4f9 100644 --- a/src/common/cpu.h +++ b/src/common/cpu.h @@ -115,6 +115,7 @@ struct cpuInfo { struct cache* cach; struct topology* topo; struct features* feat; + int64_t peak_performance; #if defined(ARCH_X86) || defined(ARCH_PPC) // CPU name from model @@ -161,7 +162,7 @@ char* get_str_l1d(struct cache* cach); char* get_str_l2(struct cache* cach); char* get_str_l3(struct cache* cach); char* get_str_freq(struct frequency* freq); -char* get_str_peak_performance(double flops, bool valid_pp); +char* get_str_peak_performance(int64_t flops); void init_topology_struct(struct topology* topo, struct cache* cach); void init_cache_struct(struct cache* cach); diff --git a/src/common/printer.c b/src/common/printer.c index 17cf6d5..dd5a4b4 100644 --- a/src/common/printer.c +++ b/src/common/printer.c @@ -446,9 +446,6 @@ bool print_cpufetch_x86(struct cpuInfo* cpu, STYLE s, struct colors* cs) { if(art == NULL) return false; - double flops; - bool valid_pp = get_peak_performance(cpu, cpu->topo, get_freq(cpu->freq), &flops); - char* uarch = get_str_uarch(cpu); char* manufacturing_process = get_str_process(cpu); char* sockets = get_str_sockets(cpu->topo); @@ -463,7 +460,7 @@ bool print_cpufetch_x86(struct cpuInfo* cpu, STYLE s, struct colors* cs) { char* l1d = get_str_l1d(cpu->cach); char* l2 = get_str_l2(cpu->cach); char* l3 = get_str_l3(cpu->cach); - char* pp = get_str_peak_performance(flops, valid_pp); + char* pp = get_str_peak_performance(cpu->peak_performance); setAttribute(art,ATTRIBUTE_NAME,cpu_name); if(cpu->hv->present) { @@ -570,9 +567,6 @@ bool print_cpufetch_ppc(struct cpuInfo* cpu, STYLE s, struct colors* cs) { if(art == NULL) return false; - double flops; - bool valid_pp = get_peak_performance(cpu, cpu->topo, get_freq(cpu->freq), &flops); - char* uarch = get_str_uarch(cpu); char* manufacturing_process = get_str_process(cpu); char* sockets = get_str_sockets(cpu->topo); @@ -586,7 +580,7 @@ bool print_cpufetch_ppc(struct cpuInfo* cpu, STYLE s, struct colors* cs) { char* l1d = get_str_l1d(cpu->cach); char* l2 = get_str_l2(cpu->cach); char* l3 = get_str_l3(cpu->cach); - char* pp = get_str_peak_performance(flops, valid_pp); + char* pp = get_str_peak_performance(cpu->peak_performance); if(cpu_name != NULL) { setAttribute(art,ATTRIBUTE_NAME,cpu_name); @@ -792,9 +786,7 @@ bool print_cpufetch_arm(struct cpuInfo* cpu, STYLE s, struct colors* cs) { } } } - double flops; - bool valid_pp = get_peak_performance(cpu, &flops); - char* pp = get_str_peak_performance(flops, valid_pp); + char* pp = get_str_peak_performance(cpu->peak_performance); setAttribute(art,ATTRIBUTE_PEAK,pp); if(art->n_attributes_set > NUMBER_OF_LINES) { diff --git a/src/ppc/ppc.c b/src/ppc/ppc.c index 24aa062..5fddf4c 100644 --- a/src/ppc/ppc.c +++ b/src/ppc/ppc.c @@ -132,6 +132,32 @@ struct frequency* get_frequency_info() { return freq; } +int64_t get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq) { + /* + * Not sure about this + * PP(SP) = N_CORES * FREQUENCY * 4(If altivec) + */ + + //First check we have consistent data + if(freq == UNKNOWN_FREQ) { + return -1; + } + + struct features* feat = cpu->feat; + int64_t flops = topo->physical_cores * topo->sockets * (freq * 1000000); + if(feat->altivec) flops = flops * 4; + + // POWER9 has the concept called "slices". Each SMT4 core has two super-slices, + // and each super-slice is capable of doing two FLOPS per cycle. In the case of + // SMT8, it has 4 super-slices, thus four FLOPS per cycle. + if(is_power9(cpu->arch)) { + int threads_per_core = topo->logical_cores / topo->physical_cores; + flops = flops * (threads_per_core / 2); + } + + return flops; +} + struct cpuInfo* get_cpu_info() { struct cpuInfo* cpu = emalloc(sizeof(struct cpuInfo)); struct features* feat = emalloc(sizeof(struct features)); @@ -152,8 +178,8 @@ struct cpuInfo* get_cpu_info() { cpu->freq = get_frequency_info(); cpu->topo = get_topology_info(cpu->cach); cpu->cach = get_cache_info(cpu); - feat->altivec = has_altivec(cpu->arch); + cpu->peak_performance = get_peak_performance(cpu, cpu->topo, get_freq(cpu->freq)); if(cpu->cach == NULL || cpu->topo == NULL) { return NULL; @@ -170,32 +196,6 @@ char* get_str_altivec(struct cpuInfo* cpu) { return string; } -bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops) { - /* - * Not sure about this - * PP(SP) = N_CORES * FREQUENCY * 4(If altivec) - */ - - //First check we have consistent data - if(freq == UNKNOWN_FREQ) { - return false; - } - - struct features* feat = cpu->feat; - *flops = topo->physical_cores * topo->sockets * (freq*1000000); - if(feat->altivec) *flops = *flops * 4; - - // POWER9 has the concept called "slices". Each SMT4 core has two super-slices, - // and each super-slice is capable of doing two FLOPS per cycle. In the case of - // SMT8, it has 4 super-slices, thus four FLOPS per cycle. - if(is_power9(cpu->arch)) { - int threads_per_core = topo->logical_cores / topo->physical_cores; - *flops = *flops * (threads_per_core / 2); - } - - return true; -} - char* get_str_topology(struct topology* topo, bool dual_socket) { char* string; if(topo->smt_supported > 1) { diff --git a/src/ppc/ppc.h b/src/ppc/ppc.h index 9e0fc41..7181b6f 100644 --- a/src/ppc/ppc.h +++ b/src/ppc/ppc.h @@ -6,7 +6,6 @@ struct cpuInfo* get_cpu_info(); char* get_str_altivec(struct cpuInfo* cpu); char* get_str_topology(struct topology* topo, bool dual_socket); -bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops); void print_debug(struct cpuInfo* cpu); #endif diff --git a/src/x86/cpuid.c b/src/x86/cpuid.c index 6ed8769..66bc1fa 100644 --- a/src/x86/cpuid.c +++ b/src/x86/cpuid.c @@ -133,6 +133,49 @@ struct uarch* get_cpu_uarch(struct cpuInfo* cpu) { return get_uarch_from_cpuid(cpu, efamily, family, emodel, model, (int)stepping); } +int64_t get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq) { + /* + * PP = PeakPerformance + * SP = SinglePrecision + * + * PP(SP) = + * N_CORES * + * FREQUENCY * + * 2(Two vector units) * + * 2(If cpu has fma) * + * 16(If AVX512), 8(If AVX), 4(If SSE) * + */ + + //First, check we have consistent data + if(freq == UNKNOWN_FREQ) { + return -1; + } + + struct features* feat = cpu->feat; + int vpus = get_number_of_vpus(cpu); + int64_t flops = topo->physical_cores * topo->sockets * (freq*1000000) * vpus; + + if(feat->FMA3 || feat->FMA4) + flops = flops*2; + + // Ice Lake has AVX512, but it has 1 VPU for AVX512, while + // it has 2 for AVX2. If this is a Ice Lake CPU, we are computing + // the peak performance supposing AVX2, not AVX512 + if(feat->AVX512 && vpus_are_AVX512(cpu)) + flops = flops*16; + else if(feat->AVX || feat->AVX2) + flops = flops*8; + else if(feat->SSE) + flops = flops*4; + + // See https://sites.utexas.edu/jdm4372/2018/01/22/a-peculiar- + // throughput-limitation-on-intels-xeon-phi-x200-knights-landing/ + if(is_knights_landing(cpu)) + flops = flops * 6 / 7; + + return flops; +} + struct hypervisor* get_hp_info(bool hv_present) { struct hypervisor* hv = emalloc(sizeof(struct hypervisor)); if(!hv_present) { @@ -289,6 +332,7 @@ struct cpuInfo* get_cpu_info() { cpu->freq = get_frequency_info(cpu); cpu->cach = get_cache_info(cpu); cpu->topo = get_topology_info(cpu, cpu->cach); + cpu->peak_performance = get_peak_performance(cpu, cpu->topo, get_freq(cpu->freq)); if(cpu->cach == NULL || cpu->topo == NULL) { return NULL; @@ -656,49 +700,6 @@ struct frequency* get_frequency_info(struct cpuInfo* cpu) { } // STRING FUNCTIONS -bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops) { - /* - * PP = PeakPerformance - * SP = SinglePrecision - * - * PP(SP) = - * N_CORES * - * FREQUENCY * - * 2(Two vector units) * - * 2(If cpu has fma) * - * 16(If AVX512), 8(If AVX), 4(If SSE) * - */ - - //First, check we have consistent data - if(freq == UNKNOWN_FREQ) { - return false; - } - - struct features* feat = cpu->feat; - int vpus = get_number_of_vpus(cpu); - *flops = topo->physical_cores * topo->sockets * (freq*1000000) * vpus; - - if(feat->FMA3 || feat->FMA4) - *flops = *flops*2; - - // Ice Lake has AVX512, but it has 1 VPU for AVX512, while - // it has 2 for AVX2. If this is a Ice Lake CPU, we are computing - // the peak performance supposing AVX2, not AVX512 - if(feat->AVX512 && vpus_are_AVX512(cpu)) - *flops = *flops*16; - else if(feat->AVX || feat->AVX2) - *flops = *flops*8; - else if(feat->SSE) - *flops = *flops*4; - - // See https://sites.utexas.edu/jdm4372/2018/01/22/a-peculiar- - // throughput-limitation-on-intels-xeon-phi-x200-knights-landing/ - if(is_knights_landing(cpu)) - *flops = *flops * 6 / 7; - - return true; -} - // TODO: Refactoring char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_socket) { char* string; diff --git a/src/x86/cpuid.h b/src/x86/cpuid.h index bfe0f6e..3dc6202 100644 --- a/src/x86/cpuid.h +++ b/src/x86/cpuid.h @@ -13,8 +13,6 @@ char* get_str_sse(struct cpuInfo* cpu); char* get_str_fma(struct cpuInfo* cpu); char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_socket); -bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops); - void print_debug(struct cpuInfo* cpu); void print_raw(struct cpuInfo* cpu);