From c24dd7cbb6ee9028f0eecbcc5359653ec20c6ef7 Mon Sep 17 00:00:00 2001
From: Dr-Noob <peibolms@gmail.com>
Date: Fri, 6 Aug 2021 11:04:29 +0200
Subject: [PATCH] [v0.98][Refactoring] Use int for peak performance, which
 makes code cleaner

---
 src/arm/midr.c       | 43 +++++++++++-----------
 src/arm/midr.h       |  2 -
 src/common/cpu.c     | 15 ++++----
 src/common/cpu.h     |  3 +-
 src/common/printer.c | 14 ++-----
 src/ppc/ppc.c        | 54 +++++++++++++--------------
 src/ppc/ppc.h        |  1 -
 src/x86/cpuid.c      | 87 ++++++++++++++++++++++----------------------
 src/x86/cpuid.h      |  2 -
 9 files changed, 106 insertions(+), 115 deletions(-)

diff --git a/src/arm/midr.c b/src/arm/midr.c
index 809e9f9..c9d32f1 100644
--- a/src/arm/midr.c
+++ b/src/arm/midr.c
@@ -61,6 +61,27 @@ struct topology* get_topology_info(struct cpuInfo* cpu, struct cache* cach, uint
   return topo;
 }
 
+int64_t get_peak_performance(struct cpuInfo* cpu) {
+  struct cpuInfo* ptr = cpu;
+  // Estimates peak FLOP/s over the cpu->num_cpus entries linked through next_cpu; returns -1 when it cannot be computed
+  // First pass: give up if any entry reports an unknown frequency
+  for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) {
+    if(get_freq(ptr->freq) == UNKNOWN_FREQ) {
+      return -1;
+    }
+  }
+
+  int64_t flops = 0;
+  // Second pass: add cores * frequency for every entry (the 1000000 factor presumably converts MHz to Hz - confirm)
+  ptr = cpu;
+  for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) {
+    flops += ptr->topo->total_cores * (get_freq(ptr->freq) * 1000000);
+  }
+  if(cpu->feat->NEON) flops = flops * 4; // NEON flag is read from the cpu argument only and scales the whole sum
+
+  return flops;
+}
+
 bool cores_are_equal(int c1pos, int c2pos, uint32_t* midr_array, int32_t* freq_array) {
   return midr_array[c1pos] == midr_array[c2pos] && freq_array[c1pos] == freq_array[c2pos];
 }
@@ -198,6 +219,7 @@ struct cpuInfo* get_cpu_info() {
   cpu->hv = emalloc(sizeof(struct hypervisor));
   cpu->hv->present = false;
   cpu->soc = get_soc();
+  cpu->peak_performance = get_peak_performance(cpu);
 
   return cpu;
 }
@@ -210,27 +232,6 @@ char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_soc
   return string;
 }
 
-bool get_peak_performance(struct cpuInfo* cpu, double* flops) {
-  struct cpuInfo* ptr = cpu;
-
-  //First check we have consistent data
-  for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) {
-    if(get_freq(ptr->freq) == UNKNOWN_FREQ) {
-      return false;
-    }
-  }
-
-  *flops = 0.0;
-
-  ptr = cpu;
-  for(int i=0; i < cpu->num_cpus; ptr = ptr->next_cpu, i++) {
-    *flops += ptr->topo->total_cores * (get_freq(ptr->freq) * 1000000);
-  }
-  if(cpu->feat->NEON) *flops = *flops * 4;
-
-  return true;
-}
-
 char* get_str_features(struct cpuInfo* cpu) {
   struct features* feat = cpu->feat;
   char* string = emalloc(sizeof(char) * 25);
diff --git a/src/arm/midr.h b/src/arm/midr.h
index 1854ca8..9da3bb6 100644
--- a/src/arm/midr.h
+++ b/src/arm/midr.h
@@ -9,8 +9,6 @@ uint32_t get_nsockets(struct topology* topo);
 char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_socket);
 char* get_str_features(struct cpuInfo* cpu);
 
-bool get_peak_performance(struct cpuInfo* cpu, double* flops);
-
 void print_debug(struct cpuInfo* cpu);
 void free_topo_struct(struct topology* topo);
 
diff --git a/src/common/cpu.c b/src/common/cpu.c
index 9a1f197..b5c1d7e 100644
--- a/src/common/cpu.c
+++ b/src/common/cpu.c
@@ -151,24 +151,30 @@ char* get_str_freq(struct frequency* freq) {
   return string;
 }
 
-char* get_str_peak_performance(double flops, bool valid_flops) {
+// Formats a peak performance figure given in FLOP/s as a newly allocated string,
+// scaled to TFLOP/s, GFLOP/s or MFLOP/s. The sentinel -1 means "unknown" and
+// produces a copy of STRING_UNKNOWN instead.
+char* get_str_peak_performance(int64_t flops) {
   char* str;
 
-  if(!valid_flops) {
+  if(flops == -1) {
     str = emalloc(sizeof(char) * (strlen(STRING_UNKNOWN) + 1));
     strncpy(str, STRING_UNKNOWN, strlen(STRING_UNKNOWN) + 1);
+    // Stop here: falling through would leak this buffer and print a bogus value
+    return str;
   }
 
   // 7 for digits (e.g, XXXX.XX), 7 for XFLOP/s
+  double flopsd = (double) flops;
   uint32_t max_size = 7+1+7+1;
   str = ecalloc(max_size, sizeof(char));
 
-  if(flops >= (double)1000000000000.0)
-    snprintf(str, max_size, "%.2f TFLOP/s", flops/1000000000000);
-  else if(flops >= 1000000000.0)
-    snprintf(str, max_size, "%.2f GFLOP/s", flops/1000000000);
+  if(flopsd >= (double)1000000000000.0)
+    snprintf(str, max_size, "%.2f TFLOP/s", flopsd/1000000000000);
+  else if(flopsd >= 1000000000.0)
+    snprintf(str, max_size, "%.2f GFLOP/s", flopsd/1000000000);
   else
-    snprintf(str, max_size, "%.2f MFLOP/s", flops/1000000);
+    snprintf(str, max_size, "%.2f MFLOP/s", flopsd/1000000);
 
   return str;
 }
diff --git a/src/common/cpu.h b/src/common/cpu.h
index a45f060..a0ab4f9 100644
--- a/src/common/cpu.h
+++ b/src/common/cpu.h
@@ -115,6 +115,7 @@ struct cpuInfo {
   struct cache* cach;
   struct topology* topo;
   struct features* feat;
+  int64_t peak_performance;
 
 #if defined(ARCH_X86) || defined(ARCH_PPC)
   // CPU name from model
@@ -161,7 +162,7 @@ char* get_str_l1d(struct cache* cach);
 char* get_str_l2(struct cache* cach);
 char* get_str_l3(struct cache* cach);
 char* get_str_freq(struct frequency* freq);
-char* get_str_peak_performance(double flops, bool valid_pp);
+char* get_str_peak_performance(int64_t flops);
 
 void init_topology_struct(struct topology* topo, struct cache* cach);
 void init_cache_struct(struct cache* cach);
diff --git a/src/common/printer.c b/src/common/printer.c
index 17cf6d5..dd5a4b4 100644
--- a/src/common/printer.c
+++ b/src/common/printer.c
@@ -446,9 +446,6 @@ bool print_cpufetch_x86(struct cpuInfo* cpu, STYLE s, struct colors* cs) {
   if(art == NULL)
     return false;
 
-  double flops;
-  bool valid_pp = get_peak_performance(cpu, cpu->topo, get_freq(cpu->freq), &flops);
-
   char* uarch = get_str_uarch(cpu);
   char* manufacturing_process = get_str_process(cpu);
   char* sockets = get_str_sockets(cpu->topo);
@@ -463,7 +460,7 @@ bool print_cpufetch_x86(struct cpuInfo* cpu, STYLE s, struct colors* cs) {
   char* l1d = get_str_l1d(cpu->cach);
   char* l2 = get_str_l2(cpu->cach);
   char* l3 = get_str_l3(cpu->cach);
-  char* pp = get_str_peak_performance(flops, valid_pp);
+  char* pp = get_str_peak_performance(cpu->peak_performance);
 
   setAttribute(art,ATTRIBUTE_NAME,cpu_name);
   if(cpu->hv->present) {
@@ -570,9 +567,6 @@ bool print_cpufetch_ppc(struct cpuInfo* cpu, STYLE s, struct colors* cs) {
   if(art == NULL)
     return false;
 
-  double flops;
-  bool valid_pp = get_peak_performance(cpu, cpu->topo, get_freq(cpu->freq), &flops);
-
   char* uarch = get_str_uarch(cpu);
   char* manufacturing_process = get_str_process(cpu);
   char* sockets = get_str_sockets(cpu->topo);
@@ -586,7 +580,7 @@ bool print_cpufetch_ppc(struct cpuInfo* cpu, STYLE s, struct colors* cs) {
   char* l1d = get_str_l1d(cpu->cach);
   char* l2 = get_str_l2(cpu->cach);
   char* l3 = get_str_l3(cpu->cach);
-  char* pp = get_str_peak_performance(flops, valid_pp);
+  char* pp = get_str_peak_performance(cpu->peak_performance);
 
   if(cpu_name != NULL) {
     setAttribute(art,ATTRIBUTE_NAME,cpu_name);
@@ -792,9 +786,7 @@ bool print_cpufetch_arm(struct cpuInfo* cpu, STYLE s, struct colors* cs) {
       }
     }
   }
-  double flops;
-  bool valid_pp = get_peak_performance(cpu, &flops);
-  char* pp = get_str_peak_performance(flops, valid_pp);
+  char* pp = get_str_peak_performance(cpu->peak_performance);
   setAttribute(art,ATTRIBUTE_PEAK,pp);
 
   if(art->n_attributes_set > NUMBER_OF_LINES) {
diff --git a/src/ppc/ppc.c b/src/ppc/ppc.c
index 24aa062..5fddf4c 100644
--- a/src/ppc/ppc.c
+++ b/src/ppc/ppc.c
@@ -132,6 +132,32 @@ struct frequency* get_frequency_info() {
   return freq;
 }
 
+int64_t get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq) {
+  /*
+   * Best-effort estimate in FLOP/s (not sure about this); returns -1 when topo is NULL or freq is unknown
+   * PP(SP) = N_CORES * FREQUENCY * 4(If altivec)
+   */
+
+  // First check we have consistent data. get_cpu_info calls this before validating cpu->topo, so it may be NULL here
+  if(topo == NULL || freq == UNKNOWN_FREQ) {
+    return -1;
+  }
+
+  struct features* feat = cpu->feat;
+  int64_t flops = topo->physical_cores * topo->sockets * (freq * 1000000);
+  if(feat->altivec) flops = flops * 4;
+
+  // POWER9 has the concept called "slices". Each SMT4 core has two super-slices,
+  // and each super-slice is capable of doing two FLOPS per cycle. In the case of
+  // SMT8, it has 4 super-slices, thus four FLOPS per cycle.
+  if(is_power9(cpu->arch)) {
+    int threads_per_core = topo->logical_cores / topo->physical_cores;
+    flops = flops * (threads_per_core / 2);
+  }
+
+  return flops;
+}
+
 struct cpuInfo* get_cpu_info() {
   struct cpuInfo* cpu = emalloc(sizeof(struct cpuInfo));
   struct features* feat = emalloc(sizeof(struct features));
@@ -152,8 +178,8 @@ struct cpuInfo* get_cpu_info() {
   cpu->freq = get_frequency_info();
   cpu->topo = get_topology_info(cpu->cach);
   cpu->cach = get_cache_info(cpu);
-
   feat->altivec = has_altivec(cpu->arch);
+  cpu->peak_performance = get_peak_performance(cpu, cpu->topo, get_freq(cpu->freq));
 
   if(cpu->cach == NULL || cpu->topo == NULL) {
     return NULL;
@@ -170,32 +196,6 @@ char* get_str_altivec(struct cpuInfo* cpu) {
   return string;
 }
 
-bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops) {
-  /*
-   * Not sure about this
-   * PP(SP) = N_CORES * FREQUENCY * 4(If altivec)
-   */
-
-  //First check we have consistent data
-  if(freq == UNKNOWN_FREQ) {
-    return false;
-  }
-
-  struct features* feat = cpu->feat;
-  *flops = topo->physical_cores * topo->sockets * (freq*1000000);
-  if(feat->altivec) *flops = *flops * 4;
-
-  // POWER9 has the concept called "slices". Each SMT4 core has two super-slices,
-  // and each super-slice is capable of doing two FLOPS per cycle. In the case of
-  // SMT8, it has 4 super-slices, thus four FLOPS per cycle.
-  if(is_power9(cpu->arch)) {
-    int threads_per_core = topo->logical_cores / topo->physical_cores;
-    *flops = *flops * (threads_per_core / 2);
-  }
-
-  return true;
-}
-
 char* get_str_topology(struct topology* topo, bool dual_socket) {
   char* string;
   if(topo->smt_supported > 1) {
diff --git a/src/ppc/ppc.h b/src/ppc/ppc.h
index 9e0fc41..7181b6f 100644
--- a/src/ppc/ppc.h
+++ b/src/ppc/ppc.h
@@ -6,7 +6,6 @@
 struct cpuInfo* get_cpu_info();
 char* get_str_altivec(struct cpuInfo* cpu);
 char* get_str_topology(struct topology* topo, bool dual_socket);
-bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops);
 void print_debug(struct cpuInfo* cpu);
 
 #endif
diff --git a/src/x86/cpuid.c b/src/x86/cpuid.c
index 6ed8769..66bc1fa 100644
--- a/src/x86/cpuid.c
+++ b/src/x86/cpuid.c
@@ -133,6 +133,49 @@ struct uarch* get_cpu_uarch(struct cpuInfo* cpu) {
   return get_uarch_from_cpuid(cpu, efamily, family, emodel, model, (int)stepping);
 }
 
+int64_t get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq) {
+  /*
+   * PP = PeakPerformance
+   * SP = SinglePrecision
+   * Returns PP(SP) in FLOP/s, or -1 when topo is NULL or freq is unknown
+   * PP(SP) =
+   * N_CORES                             *
+   * FREQUENCY                           *
+   * N_VPUS(get_number_of_vpus)          *
+   * 2(If cpu has fma)                   *
+   * 16(If AVX512), 8(If AVX), 4(If SSE) *
+   */
+
+  // First, check we have consistent data. get_cpu_info calls this before validating cpu->topo, so it may be NULL here
+  if(topo == NULL || freq == UNKNOWN_FREQ) {
+    return -1;
+  }
+
+  struct features* feat = cpu->feat;
+  int vpus = get_number_of_vpus(cpu);
+  int64_t flops = topo->physical_cores * topo->sockets * (freq*1000000) * vpus;
+
+  if(feat->FMA3 || feat->FMA4)
+    flops = flops*2;
+
+  // Ice Lake has AVX512, but it has 1 VPU for AVX512, while
+  // it has 2 for AVX2. If this is a Ice Lake CPU, we are computing
+  // the peak performance supposing AVX2, not AVX512
+  if(feat->AVX512 && vpus_are_AVX512(cpu))
+    flops = flops*16;
+  else if(feat->AVX || feat->AVX2)
+    flops = flops*8;
+  else if(feat->SSE)
+    flops = flops*4;
+
+  // See https://sites.utexas.edu/jdm4372/2018/01/22/a-peculiar-
+  // throughput-limitation-on-intels-xeon-phi-x200-knights-landing/
+  if(is_knights_landing(cpu))
+    flops = flops * 6 / 7;
+
+  return flops;
+}
+
 struct hypervisor* get_hp_info(bool hv_present) {
   struct hypervisor* hv = emalloc(sizeof(struct hypervisor));
   if(!hv_present) {
@@ -289,6 +332,7 @@ struct cpuInfo* get_cpu_info() {
   cpu->freq = get_frequency_info(cpu);
   cpu->cach = get_cache_info(cpu);
   cpu->topo = get_topology_info(cpu, cpu->cach);
+  cpu->peak_performance = get_peak_performance(cpu, cpu->topo, get_freq(cpu->freq));
 
   if(cpu->cach == NULL || cpu->topo == NULL) {
     return NULL;
@@ -656,49 +700,6 @@ struct frequency* get_frequency_info(struct cpuInfo* cpu) {
 }
 
 // STRING FUNCTIONS
-bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops) {
-  /*
-   * PP = PeakPerformance
-   * SP = SinglePrecision
-   *
-   * PP(SP) =
-   * N_CORES                             *
-   * FREQUENCY                           *
-   * 2(Two vector units)                 *
-   * 2(If cpu has fma)                   *
-   * 16(If AVX512), 8(If AVX), 4(If SSE) *
-   */
-
-  //First, check we have consistent data
-  if(freq == UNKNOWN_FREQ) {
-    return false;
-  }
-
-  struct features* feat = cpu->feat;
-  int vpus = get_number_of_vpus(cpu);
-  *flops = topo->physical_cores * topo->sockets * (freq*1000000) * vpus;
-
-  if(feat->FMA3 || feat->FMA4)
-    *flops = *flops*2;
-
-  // Ice Lake has AVX512, but it has 1 VPU for AVX512, while
-  // it has 2 for AVX2. If this is a Ice Lake CPU, we are computing
-  // the peak performance supposing AVX2, not AVX512
-  if(feat->AVX512 && vpus_are_AVX512(cpu))
-    *flops = *flops*16;
-  else if(feat->AVX || feat->AVX2)
-    *flops = *flops*8;
-  else if(feat->SSE)
-    *flops = *flops*4;
-
-  // See https://sites.utexas.edu/jdm4372/2018/01/22/a-peculiar-
-  // throughput-limitation-on-intels-xeon-phi-x200-knights-landing/
-  if(is_knights_landing(cpu))
-    *flops = *flops * 6 / 7;
-
-  return true;
-}
-
 // TODO: Refactoring
 char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_socket) {
   char* string;
diff --git a/src/x86/cpuid.h b/src/x86/cpuid.h
index bfe0f6e..3dc6202 100644
--- a/src/x86/cpuid.h
+++ b/src/x86/cpuid.h
@@ -13,8 +13,6 @@ char* get_str_sse(struct cpuInfo* cpu);
 char* get_str_fma(struct cpuInfo* cpu);
 char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_socket);
 
-bool get_peak_performance(struct cpuInfo* cpu, struct topology* topo, int64_t freq, double* flops);
-
 void print_debug(struct cpuInfo* cpu);
 void print_raw(struct cpuInfo* cpu);