#ifdef _WIN32
  #define NOMINMAX
  #include <windows.h>
#else
  #include "../common/udev.h"
  #include <unistd.h>
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>

#include "cpuid.h"
#include "cpuid_asm.h"
#include "../common/global.h"
#include "../common/args.h"
#include "apic.h"
#include "uarch.h"
#include "freq/freq.h"

#define CPU_VENDOR_INTEL_STRING "GenuineIntel"
#define CPU_VENDOR_AMD_STRING   "AuthenticAMD"

// Hypervisor signature strings, indexed by HV_VENDOR_* id.
// get_hp_info() compares the string unpacked from cpuid leaf 0x40000000
// (ebx, ecx, edx) against these entries. A NULL entry is skipped by that
// lookup, so the corresponding vendor can never be matched by signature.
static const char *hv_vendors_string[] = {
  [HV_VENDOR_KVM]       = "KVMKVMKVM",
  [HV_VENDOR_QEMU]      = "TCGTCGTCGTCG",
  [HV_VENDOR_HYPERV]    = "Microsoft Hv",
  [HV_VENDOR_VMWARE]    = "VMwareVMware",
  [HV_VENDOR_XEN]       = "XenVMMXenVMM",
  [HV_VENDOR_PARALLELS] = "lrpepyh vr",
  [HV_VENDOR_PHYP]      = NULL
};

// Human readable hypervisor names, indexed by HV_VENDOR_* id.
// get_hp_info() stores a pointer into this table in hv->hv_name, so the
// strings are static and must not be freed. HV_VENDOR_INVALID maps to
// STRING_UNKNOWN and is used when the signature is empty or not recognized.
static char *hv_vendors_name[] = {
  [HV_VENDOR_KVM]       = "KVM",
  [HV_VENDOR_QEMU]      = "QEMU",
  [HV_VENDOR_HYPERV]    = "Microsoft Hyper-V",
  [HV_VENDOR_VMWARE]    = "VMware",
  [HV_VENDOR_XEN]       = "Xen",
  [HV_VENDOR_PARALLELS] = "Parallels",
  [HV_VENDOR_PHYP]      = "pHyp",
  [HV_VENDOR_INVALID]   = STRING_UNKNOWN
};

#define MASK 0xFF

/*
 * cpuid reference: http://www.sandpile.org/x86/cpuid.htm
 * cpuid amd: https://www.amd.com/system/files/TechDocs/25481.pdf
 */

/*
 * Unpacks three 32-bit registers into 12 consecutive bytes of 'name',
 * least significant byte first, in the order reg1, reg2, reg3.
 *
 * Exactly 12 bytes are written and no terminator is appended: the caller
 * must provide at least 12 bytes and is responsible for the trailing '\0'
 * (callers in this file pass a zeroed 13-byte buffer).
 */
void get_name_cpuid(char* name, uint32_t reg1, uint32_t reg2, uint32_t reg3) {
  const uint32_t regs[3] = { reg1, reg2, reg3 };

  for(uint32_t r = 0; r < 3; r++) {
    for(uint32_t b = 0; b < 4; b++) {
      // Byte 'b' of register 'r' lands at offset 4*r + b
      name[r * 4 + b] = (regs[r] >> (8 * b)) & MASK;
    }
  }
}

/*
 * Reads the CPU name string from cpuid leaves 0x80000002..0x80000004
 * (16 bytes per leaf, taken from eax, ebx, ecx, edx in that order) and
 * returns it in a newly allocated buffer of CPU_NAME_MAX_LENGTH bytes.
 *
 * Leading spaces are dropped and every run of consecutive spaces is
 * collapsed to a single space. The caller owns the returned buffer.
 */
char* get_str_cpu_name_internal(void) {
  uint32_t eax = 0;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;
  uint32_t pos = 0;

  char* name = emalloc(sizeof(char) * CPU_NAME_MAX_LENGTH);
  memset(name, 0, CPU_NAME_MAX_LENGTH);

  for(uint32_t leaf = 0x80000002; leaf <= 0x80000004; leaf++) {
    eax = leaf;
    cpuid(&eax, &ebx, &ecx, &edx);

    const uint32_t regs[4] = { eax, ebx, ecx, edx };
    for(int r = 0; r < 4; r++) {
      for(int shift = 0; shift < 32; shift += 8) {
        name[pos++] = (regs[r] >> shift) & MASK;
      }
    }
  }
  name[pos] = '\0';

  // Compact the string in place: 'src' reads, 'dst' writes
  char* src = name;
  char* dst = name;

  // Skip every leading space
  while(*src == ' ') src++;

  while(*src != '\0') {
    // A space followed by another space is dropped, so only the
    // last space of each run is copied
    if(src[0] == ' ' && src[1] == ' ') {
      src++;
      continue;
    }
    *dst++ = *src++;
  }
  *dst = '\0';

  return name;
}

/*
 * Rewrites an Intel CPU name of the form
 *   "...Intel(R) <brand>(<mark>) <model> @ <freq>"
 * into the shorter "Intel <brand> <model>", additionally removing the
 * substrings " CPU", " Dual" and " 0" via strremove().
 *
 * On success the old string in *name is freed, *name is replaced by a newly
 * allocated string and true is returned. On failure (the expected markers
 * "Intel(R)", "(", ")" or "@" are not found) *name is left untouched, the
 * temporary buffer is released and false is returned.
 */
bool abbreviate_intel_cpu_name(char** name) {
  char* old_name = *name;
  // The abbreviated name is never longer than the original one.
  // ecalloc is expected to return zeroed memory, which keeps new_name
  // terminated after every partial copy below.
  char* new_name = ecalloc(strlen(old_name) + 1, sizeof(char));

  char* old_name_ptr = old_name;
  char* new_name_ptr = new_name;
  char* aux_ptr = NULL;
  size_t len = 0;

  // 1. Remove "(R)"
  old_name_ptr = strstr(old_name_ptr, "Intel(R)");
  if(old_name_ptr == NULL) goto fail;
  len = strlen("Intel");
  memcpy(new_name_ptr, "Intel", len);
  new_name_ptr += len;
  old_name_ptr += strlen("Intel(R)");

  // 2. Remove "(R)" or "(TM)"
  aux_ptr = strstr(old_name_ptr, "(");
  if(aux_ptr == NULL) goto fail;
  len = (size_t)(aux_ptr - old_name_ptr);
  memcpy(new_name_ptr, old_name_ptr, len);
  new_name_ptr += len;

  *new_name_ptr++ = ' ';
  old_name_ptr = strstr(aux_ptr, ")");
  if(old_name_ptr == NULL) goto fail;
  old_name_ptr++;
  while(*old_name_ptr == ' ') old_name_ptr++;

  // 3. Copy the CPU name (everything up to the character preceding '@',
  // which is expected to be a space)
  aux_ptr = strstr(old_name_ptr, "@");
  if(aux_ptr == NULL) goto fail;
  // If '@' immediately follows the removed marker there is nothing to copy.
  // Without this check the length would be -1, i.e. a huge size_t.
  if(aux_ptr - old_name_ptr > 1) {
    memcpy(new_name_ptr, old_name_ptr, (size_t)((aux_ptr - 1) - old_name_ptr));
  }

  // 4. Remove dummy strings in Intel CPU names
  strremove(new_name, " CPU");
  strremove(new_name, " Dual");
  strremove(new_name, " 0");

  free(old_name);
  *name = new_name;

  return true;

fail:
  // The original name stays in place, only the scratch buffer is released
  free(new_name);
  return false;
}

/*
 * Queries cpuid leaf 0x00000001, splits eax into its family / model /
 * stepping fields and hands them to get_uarch_from_cpuid(), whose result
 * is returned unchanged.
 */
struct uarch* get_cpu_uarch(struct cpuInfo* cpu) {
  uint32_t eax = 0x00000001;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;

  cpuid(&eax, &ebx, &ecx, &edx);

  // eax layout used here:
  //   [3:0] stepping, [7:4] model, [11:8] family,
  //   [19:16] extended model, [27:20] extended family
  uint32_t efamily  = (eax >> 20) & 0xFF;
  uint32_t emodel   = (eax >> 16) & 0xF;
  uint32_t family   = (eax >> 8)  & 0xF;
  uint32_t model    = (eax >> 4)  & 0xF;
  uint32_t stepping = eax         & 0xF;

  return get_uarch_from_cpuid(cpu, eax, efamily, family, emodel, model, (int)stepping);
}

/*
 * Estimates the single precision peak performance (FLOP/s) of the whole
 * CPU by adding up the contribution of each of the cpu->num_cpus entries
 * in the cpu->next_cpu list:
 *
 *   physical cores * sockets * frequency (MHz * 10^6) * number of VPUs
 *   * 2 (if FMA3 or FMA4 is available)
 *   * 16 (AVX512 with AVX512 VPUs), 8 (AVX/AVX2) or 4 (SSE)
 *
 * On Linux, when accurate_pp is true the frequency is measured with
 * measure_frequency() instead of using the value returned by get_freq().
 *
 * Returns -1 as soon as an entry has an unknown frequency or an unknown
 * number of logical cores.
 */
int64_t get_peak_performance(struct cpuInfo* cpu, bool accurate_pp) {
  int64_t total_flops = 0;
  struct cpuInfo* cur = cpu;

  for(int idx = 0; idx < cpu->num_cpus; idx++, cur = cur->next_cpu) {
    struct topology* topo = cur->topo;
    int64_t max_freq = get_freq(cur->freq);

    int64_t freq;
  #ifdef __linux__
    if(accurate_pp) {
      freq = measure_frequency(cur);
    }
    else {
      freq = max_freq;
    }
  #else
    // Silence compiler warning
    (void)(accurate_pp);
    freq = max_freq;
  #endif

    // Without consistent data there is nothing meaningful to compute
    if(freq == UNKNOWN_DATA || topo->logical_cores == UNKNOWN_DATA) {
      return -1;
    }

    struct features* feat = cur->feat;
    int vpus = get_number_of_vpus(cur);
    int64_t flops = topo->physical_cores * topo->sockets * (freq*1000000) * vpus;

    // A fused multiply-add counts as two operations
    if(feat->FMA3 || feat->FMA4) {
      flops *= 2;
    }

    // NOTE:
    // Some CPUs (Ice Lake, Zen 4) have AVX512, but they have only
    // 1 VPU for AVX512, while they have 2 for AVX2. In such cases,
    // we are computing the peak performance supposing AVX2, not AVX512
    if(feat->AVX512 && vpus_are_AVX512(cur)) {
      flops *= 16;
    }
    else if(feat->AVX || feat->AVX2) {
      flops *= 8;
    }
    else if(feat->SSE) {
      flops *= 4;
    }

    // See https://sites.utexas.edu/jdm4372/2018/01/22/a-peculiar-
    // throughput-limitation-on-intels-xeon-phi-x200-knights-landing/
    if(is_knights_landing(cur)) {
      flops = flops * 6 / 7;
    }

    total_flops += flops;
  }

  return total_flops;
}

/*
 * Allocates and fills a struct hypervisor.
 *
 * hv_present: whether the CPU reports that it runs under a hypervisor.
 *
 * When no hypervisor is present, the vendor is HV_VENDOR_INVALID and the
 * name is the matching "unknown" entry, so every field of the returned
 * struct is always initialized. Otherwise the vendor signature is read
 * from cpuid leaf 0x40000000 (ebx, ecx, edx) and looked up in
 * hv_vendors_string; an empty or unrecognized signature also yields
 * HV_VENDOR_INVALID.
 *
 * hv_name points into the static hv_vendors_name table and must not be
 * freed. The caller owns the returned struct.
 */
struct hypervisor* get_hp_info(bool hv_present) {
  struct hypervisor* hv = emalloc(sizeof(struct hypervisor));

  // Defaults, so that no field is left uninitialized on any path
  hv->present = hv_present;
  hv->hv_vendor = HV_VENDOR_INVALID;
  hv->hv_name = hv_vendors_name[HV_VENDOR_INVALID];

  if(!hv_present) {
    return hv;
  }

  uint32_t eax = 0x40000000;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;

  cpuid(&eax, &ebx, &ecx, &edx);

  if(ebx == 0x0 && ecx == 0x0 && edx == 0x0) {
    printWarn("Hypervisor vendor is empty");
  }
  else {
    // 12 signature bytes plus the terminator
    char name[13];
    memset(name, 0, 13);
    get_name_cpuid(name, ebx, ecx, edx);

    bool found = false;
    uint8_t len = sizeof(hv_vendors_string) / sizeof(hv_vendors_string[0]);

    for(uint8_t v=0; v < len && !found; v++) {
      // NULL entries have no signature to compare against
      if(hv_vendors_string[v] != NULL && strcmp(hv_vendors_string[v], name) == 0) {
        hv->hv_vendor = v;
        found = true;
      }
    }

    if(!found) {
      printBug("Unknown hypervisor vendor: %s", name);
    }
  }

  hv->hv_name = hv_vendors_name[hv->hv_vendor];

  return hv;
}

/*
 * Allocates a struct features and fills it with the instruction set
 * extensions reported by cpuid:
 *   - leaf 0x00000001: SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AES, AVX, FMA3
 *   - leaf 0x00000007 (subleaf 0): AVX2, SHA and AVX512 (set if any of the
 *     checked AVX512 bits in ebx is set)
 *   - leaf 0x80000001: SSE4a, FMA4
 * Features whose leaf is not available stay false and a warning is printed.
 *
 * Side effect: cpu->hv is always set. It is filled from the hypervisor bit
 * of leaf 0x00000001 when that leaf is available, and set to a
 * "not present" hypervisor otherwise.
 *
 * Returns the new struct (owned by the caller), or NULL if the hypervisor
 * information could not be obtained.
 */
struct features* get_features_info(struct cpuInfo* cpu) {
  uint32_t eax = 0;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;

  struct features* feat = emalloc(sizeof(struct features));

  // Every feature defaults to false. Clearing the whole struct does not
  // depend on the order or the type of its members.
  memset(feat, 0, sizeof(struct features));

  //Fill instructions support
  if (cpu->maxLevels >= 0x00000001){
    eax = 0x00000001;
    cpuid(&eax, &ebx, &ecx, &edx);
    feat->SSE    = (edx & (1U << 25)) != 0;
    feat->SSE2   = (edx & (1U << 26)) != 0;
    feat->SSE3   = (ecx & (1U <<  0)) != 0;

    feat->SSSE3  = (ecx & (1U <<  9)) != 0;
    feat->SSE4_1 = (ecx & (1U << 19)) != 0;
    feat->SSE4_2 = (ecx & (1U << 20)) != 0;

    feat->AES    = (ecx & (1U << 25)) != 0;

    feat->AVX    = (ecx & (1U << 28)) != 0;
    feat->FMA3   = (ecx & (1U << 12)) != 0;

    bool hv_present = (ecx & (1U << 31)) != 0;
    if((cpu->hv = get_hp_info(hv_present)) == NULL) {
      // Do not leak the partially filled struct
      free(feat);
      return NULL;
    }
  }
  else {
    printWarn("Can't read features information from cpuid (needed level is 0x%.8X, max is 0x%.8X)", 0x00000001, cpu->maxLevels);
    // The hypervisor bit cannot be read; leave cpu->hv in a defined state
    cpu->hv = get_hp_info(false);
  }

  if (cpu->maxLevels >= 0x00000007){
    eax = 0x00000007;
    ecx = 0x00000000;
    cpuid(&eax, &ebx, &ecx, &edx);
    feat->AVX2         = (ebx & (1U <<  5)) != 0;
    feat->SHA          = (ebx & (1U << 29)) != 0;
    feat->AVX512       = (((ebx & (1U << 16)) != 0) ||
                        ((ebx & (1U << 28)) != 0)  ||
                        ((ebx & (1U << 26)) != 0)  ||
                        ((ebx & (1U << 27)) != 0)  ||
                        ((ebx & (1U << 31)) != 0)  ||
                        ((ebx & (1U << 30)) != 0)  ||
                        ((ebx & (1U << 17)) != 0)  ||
                        ((ebx & (1U << 21)) != 0));
  }
  else {
    printWarn("Can't read features information from cpuid (needed level is 0x%.8X, max is 0x%.8X)", 0x00000007, cpu->maxLevels);
  }

  if (cpu->maxExtendedLevels >= 0x80000001){
    eax = 0x80000001;
    cpuid(&eax, &ebx, &ecx, &edx);
    feat->SSE4a = (ecx & (1U <<  6)) != 0;
    feat->FMA4  = (ecx & (1U << 16)) != 0;
  }
  else {
    printWarn("Can't read features information from cpuid (needed extended level is 0x%.8X, max is 0x%.8X)", 0x80000001, cpu->maxExtendedLevels);
  }

  return feat;
}

/*
 * Binds the calling thread to the first core of module 'm'.
 *
 * m:             index of the requested module (0 <= m < total_modules)
 * total_modules: number of modules (distinct core types) of the CPU
 * first_core:    output, id of the first core that belongs to module m
 *
 * With a single module nothing is bound and *first_core is set to 0.
 * With several modules (hybrid architecture) cores are visited in
 * increasing order, reading the core type from cpuid leaf 0x0000001A
 * (eax[31:24]); the m-th distinct core type found identifies the module.
 *
 * Returns false if m is out of range, if binding to a core fails, or
 * always on macOS for hybrid CPUs. *first_core is only written on success.
 */
bool set_cpu_module(int m, int total_modules, int32_t* first_core) {
  if(total_modules > 1) {
    #ifdef __APPLE__
    UNUSED(m);
    printBug("Hybrid architectures are not supported under macOS");
    return false;
    #else
    // core_types holds total_modules entries, so the module index must be
    // in range or the search below would write past the end of the array
    if(m < 0 || m >= total_modules) {
      printBug("Invalid module index %d (total modules: %d)", m, total_modules);
      return false;
    }

    // We have a hybrid architecture.
    // 1. Find the first core from module m
    int32_t core_id = -1;
    int32_t current_module_idx = -1;
    int32_t* core_types = emalloc(sizeof(int32_t) * total_modules);
    for(int i=0; i < total_modules; i++) core_types[i] = -1;
    int i = 0;

    while(core_id == -1) {
      if(!bind_to_cpu(i)) {
        free(core_types);
        return false;
      }
      uint32_t eax = 0x0000001A;
      uint32_t ebx = 0;
      uint32_t ecx = 0;
      uint32_t edx = 0;
      cpuid(&eax, &ebx, &ecx, &edx);
      int32_t core_type = eax >> 24 & 0xFF;
      bool found = false;

      // Has this core type been seen on a previous core?
      for(int j=0; j < total_modules && !found; j++) {
        if(core_types[j] == core_type) found = true;
      }
      if(!found) {
        // New core type: it starts the next module
        current_module_idx++;
        core_types[current_module_idx] = core_type;
        if(current_module_idx == m) {
          core_id = i;
        }
      }

      i++;
    }

    free(core_types);
    *first_core = core_id;

    //printf("Module %d: Core %d\n", m, core_id);
    // 2. Now bind to that core
    if(!bind_to_cpu(core_id)) {
      return false;
    }
    #endif
  }
  else {
    // This is a normal architecture
    *first_core = 0;
  }

  return true;
}

/*
 * Reads the type of the core the caller is currently running on from
 * cpuid leaf 0x0000001A (eax[31:24]).
 *
 * Returns CORE_TYPE_EFFICIENCY for 0x20, CORE_TYPE_PERFORMANCE for 0x40,
 * and CORE_TYPE_UNKNOWN (after printing an error) for any other value.
 */
int32_t get_core_type(void) {
  uint32_t eax = 0x0000001A;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;

  cpuid(&eax, &ebx, &ecx, &edx);

  int32_t type = eax >> 24 & 0xFF;

  switch(type) {
    case 0x20:
      return CORE_TYPE_EFFICIENCY;
    case 0x40:
      return CORE_TYPE_PERFORMANCE;
    default:
      printErr("Found invalid core type: 0x%.8X\n", type);
      return CORE_TYPE_UNKNOWN;
  }
}

struct cpuInfo* get_cpu_info(void) {
  struct cpuInfo* cpu = emalloc(sizeof(struct cpuInfo));
  cpu->peak_performance = -1;
  cpu->next_cpu = NULL;
  cpu->topo = NULL;
  cpu->cach = NULL;
  cpu->feat = NULL;

  uint32_t modules = 1;
  uint32_t eax = 0;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;

  //Get max cpuid level
  cpuid(&eax, &ebx, &ecx, &edx);
  cpu->maxLevels = eax;

  //Fill vendor
  char name[13];
  memset(name,0,13);
  get_name_cpuid(name, ebx, edx, ecx);

  if(strcmp(CPU_VENDOR_INTEL_STRING,name) == 0)
    cpu->cpu_vendor = CPU_VENDOR_INTEL;
  else if (strcmp(CPU_VENDOR_AMD_STRING,name) == 0)
    cpu->cpu_vendor = CPU_VENDOR_AMD;
  else {
    cpu->cpu_vendor = CPU_VENDOR_INVALID;
    printErr("Unknown CPU vendor: %s", name);
    return NULL;
  }

  //Get max extended level
  eax = 0x80000000;
  ebx = 0;
  ecx = 0;
  edx = 0;
  cpuid(&eax, &ebx, &ecx, &edx);
  cpu->maxExtendedLevels = eax;

  if (cpu->maxExtendedLevels >= 0x80000004){
    cpu->cpu_name = get_str_cpu_name_internal();
  }
  else {
    cpu->cpu_name = emalloc(sizeof(char) * (strlen(STRING_UNKNOWN) + 1));
    strcpy(cpu->cpu_name, STRING_UNKNOWN);
    printWarn("Can't read cpu name from cpuid (needed extended level is 0x%.8X, max is 0x%.8X)", 0x80000004, cpu->maxExtendedLevels);
  }

  cpu->topology_extensions = false;
  if(cpu->cpu_vendor == CPU_VENDOR_AMD && cpu->maxExtendedLevels >= 0x80000001) {
    eax = 0x80000001;
    cpuid(&eax, &ebx, &ecx, &edx);
    cpu->topology_extensions = (ecx >> 22) & 1;
  }

  cpu->hybrid_flag = false;
  if(cpu->cpu_vendor == CPU_VENDOR_INTEL && cpu->maxLevels >= 0x00000007) {
    eax = 0x00000007;
    ecx = 0x00000000;
    cpuid(&eax, &ebx, &ecx, &edx);
    cpu->hybrid_flag = (edx >> 15) & 0x1;
  }

  if(cpu->hybrid_flag) modules = 2;

  struct cpuInfo* ptr = cpu;
  for(uint32_t i=0; i < modules; i++) {
    int32_t first_core;
    set_cpu_module(i, modules, &first_core);

    if(i > 0) {
      ptr->next_cpu = emalloc(sizeof(struct cpuInfo));
      ptr = ptr->next_cpu;
      ptr->next_cpu = NULL;
      ptr->peak_performance = -1;
      ptr->topo = NULL;
      ptr->cach = NULL;
      ptr->feat = NULL;
      // We assume that this cores have the
      // same cpuid capabilities
      ptr->cpu_vendor = cpu->cpu_vendor;
      ptr->maxLevels = cpu->maxLevels;
      ptr->maxExtendedLevels = cpu->maxExtendedLevels;
      ptr->hybrid_flag = cpu->hybrid_flag;
    }

    if(cpu->hybrid_flag) {
      // Detect core type
      eax = 0x0000001A;
      cpuid(&eax, &ebx, &ecx, &edx);
      ptr->core_type = get_core_type();
    }
    ptr->first_core_id = first_core;
    ptr->feat = get_features_info(ptr);

    // If any field of the struct is NULL,
    // return inmideately, as further functions
    // require valid fields (cach, topo, etc)
    ptr->arch = get_cpu_uarch(ptr);
    ptr->freq = get_frequency_info(ptr);

    ptr->cach = get_cache_info(ptr);
    if(ptr->cach == NULL) return cpu;

    if(cpu->hybrid_flag) {
      ptr->topo = get_topology_info(ptr, ptr->cach, i);
    }
    else {
      ptr->topo = get_topology_info(ptr, ptr->cach, -1);
    }
    if(cpu->topo == NULL) return cpu;
  }

  cpu->num_cpus = modules;
  cpu->peak_performance = get_peak_performance(cpu, accurate_pp());

  return cpu;
}

/*
 * Fills the num_caches field of every cache level in topo->cach for AMD
 * CPUs.
 *
 * If cpuid leaf 0x8000001D is available and topology extensions are
 * supported, each subleaf is queried until one reports cache type 0, and
 * the number of caches is computed as
 *   logical cores / number of logical cores sharing the cache.
 * Otherwise the values are guessed from the number of physical cores.
 *
 * Returns false if an unexpected cache type or level is reported, true
 * otherwise.
 */
bool get_cache_topology_amd(struct cpuInfo* cpu, struct topology* topo) {
  if(cpu->maxExtendedLevels < 0x8000001D || !cpu->topology_extensions) {
    printWarn("Can't read topology information from cpuid (needed extended level is 0x%.8X, max is 0x%.8X and topology_extensions=%s). Guessing cache topology", 0x8000001D, cpu->maxExtendedLevels, cpu->topology_extensions ? "true" : "false");

    // Guess: one L1i / L1d per physical core
    topo->cach->L1i->num_caches = topo->physical_cores;
    topo->cach->L1d->num_caches = topo->physical_cores;

    if(topo->cach->L3->exists) {
      // L2 private to each core, a single shared L3
      topo->cach->L2->num_caches = topo->physical_cores;
      topo->cach->L3->num_caches = 1;
    }
    else {
      // Without L3, assume a single shared L2
      topo->cach->L2->num_caches = 1;
    }

    return true;
  }

  for(uint32_t i = 0; ; i++) {
    uint32_t eax = 0x8000001D;
    uint32_t ebx = 0;
    uint32_t ecx = i; // cache id
    uint32_t edx = 0;

    cpuid(&eax, &ebx, &ecx, &edx);

    uint32_t cache_type = eax & 0x1F;

    // Type 0 means that there are no more caches to enumerate
    if(cache_type == 0) break;

    uint32_t num_sharing_cache = ((eax >> 14) & 0xFFF) + 1;
    uint32_t cache_level = (eax >> 5) & 0x7;

    switch (cache_type) {
      case 1: // Data Cache (We assume this is L1d)
        if(cache_level != 1) {
          printBug("Found data cache at level %d (expected 1)", cache_level);
          return false;
        }
        topo->cach->L1d->num_caches = topo->logical_cores / num_sharing_cache;
        break;

      case 2: // Instruction Cache (We assume this is L1i)
        if(cache_level != 1) {
          printBug("Found instruction cache at level %d (expected 1)", cache_level);
          return false;
        }
        topo->cach->L1i->num_caches = topo->logical_cores / num_sharing_cache;
        break;

      case 3: // Unified Cache (This may be L2 or L3)
        if(cache_level == 2) {
          topo->cach->L2->num_caches = topo->logical_cores / num_sharing_cache;
        }
        else if(cache_level == 3) {
          topo->cach->L3->num_caches = topo->logical_cores / num_sharing_cache;
        }
        else {
          printWarn("Found unknown unified cache at level %d", cache_level);
        }
        break;

      default: // Unknown cache type
        printBug("Unknown cache type %d with level %d found at i=%d", cache_type, cache_level, i);
        return false;
    }
  }

  return true;
}

/*
 * Fallback topology detection: takes the number of cores returned by
 * get_ncores_from_cpuinfo() and assumes a single socket without SMT,
 * i.e. total, logical and physical core counts are all the same.
 */
void get_topology_from_udev(struct topology* topo) {
  // TODO: To be improved in the future
  topo->total_cores = get_ncores_from_cpuinfo();

  // One socket, one thread per core
  topo->sockets = 1;
  topo->smt_supported = 1;
  topo->smt_available = 1;

  topo->physical_cores = topo->total_cores;
  topo->logical_cores = topo->total_cores;
}

// Main reference: https://software.intel.com/content/www/us/en/develop/articles/intel-64-architecture-processor-topology-enumeration.html
// Very interesting resource: https://wiki.osdev.org/Detecting_CPU_Topology_(80x86)
/*
 * Allocates and fills the topology (cores, threads, sockets) of 'cpu'.
 *
 * cpu:    CPU (or module of a hybrid CPU) being described
 * cach:   cache information, attached to the topology by init_topology_struct
 * module: module index for hybrid CPUs; only used on Linux, where it is
 *         passed to get_total_cores_module()
 *
 * The total number of cores is asked to the OS. The rest depends on the
 * vendor:
 *   - Intel: get_topology_from_apic() (needs cpuid level 0x00000004), with
 *     udev as fallback on Linux and unknown values elsewhere.
 *   - AMD: cpuid leaves 0x80000008 and 0x8000001E, is_smt_enabled_amd()
 *     and get_cache_topology_amd().
 *
 * Returns the new struct (owned by the caller), or NULL if the vendor is
 * neither Intel nor AMD.
 */
struct topology* get_topology_info(struct cpuInfo* cpu, struct cache* cach, int module) {
  struct topology* topo = emalloc(sizeof(struct topology));
  init_topology_struct(topo, cach);

  uint32_t eax = 0;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;

  // Ask the OS the total number of cores it sees
  // If we have one socket, it will be same as the cpuid,
  // but in dual socket it will not!
  // TODO: Replace by apic?
  #ifdef _WIN32
    SYSTEM_INFO info;
    GetSystemInfo(&info);
    topo->total_cores = info.dwNumberOfProcessors;
  #else
    if((topo->total_cores = sysconf(_SC_NPROCESSORS_ONLN)) == -1) {
      printWarn("sysconf(_SC_NPROCESSORS_ONLN): %s", strerror(errno));
      topo->total_cores = topo->logical_cores; // fallback
    }
  #endif

  if(cpu->hybrid_flag) {
    #ifdef __linux__
      topo->total_cores_module = get_total_cores_module(topo->total_cores, module);
    #else
      UNUSED(module);
      topo->total_cores_module = topo->total_cores;
    #endif
  }
  else {
    topo->total_cores_module = topo->total_cores;
  }

  switch(cpu->cpu_vendor) {
    case CPU_VENDOR_INTEL:
      if (cpu->maxLevels >= 0x00000004) {
        bool toporet = get_topology_from_apic(cpu, topo);
        if(!toporet) {
          #ifdef __linux__
            printWarn("Failed to retrieve topology from APIC, using udev...\n");
            get_topology_from_udev(topo);
          #else
            printErr("Failed to retrieve topology from APIC, assumming default values...\n");
            topo->logical_cores = UNKNOWN_DATA;
            topo->physical_cores = UNKNOWN_DATA;
            topo->smt_available = 1;
            topo->smt_supported = 1;
          #endif
        }
      }
      else {
        // Report the level that is actually required by the check above
        printWarn("Can't read topology information from cpuid (needed level is 0x%.8X, max is 0x%.8X)", 0x00000004, cpu->maxLevels);
        topo->physical_cores = 1;
        topo->logical_cores = 1;
        topo->smt_available = 1;
        topo->smt_supported = 1;
      }
      break;
    case CPU_VENDOR_AMD:
      if (cpu->maxExtendedLevels >= 0x80000008) {
        eax = 0x80000008;
        cpuid(&eax, &ebx, &ecx, &edx);
        topo->logical_cores = (ecx & 0xFF) + 1;

        if (cpu->maxExtendedLevels >= 0x8000001E && cpu->topology_extensions) {
          eax = 0x8000001E;
          cpuid(&eax, &ebx, &ecx, &edx);
          topo->smt_supported = ((ebx >> 8) & 0x03) + 1;
        }
        else {
          printWarn("Can't read topology information from cpuid (needed extended level is 0x%.8X, max is 0x%.8X and topology_extensions=%s)", 0x8000001E, cpu->maxExtendedLevels, cpu->topology_extensions ? "true" : "false");
          topo->smt_supported = 1;
        }
      }
      else {
        printWarn("Can't read topology information from cpuid (needed extended level is 0x%.8X, max is 0x%.8X)", 0x80000008, cpu->maxExtendedLevels);
        topo->physical_cores = 1;
        topo->logical_cores = 1;
        topo->smt_supported = 1;
      }

      if (cpu->maxLevels >= 0x00000001) {
        if(topo->smt_supported > 1)
          topo->smt_available = is_smt_enabled_amd(topo);
        else
          topo->smt_available = 1;
      }
      else {
        // Report the level that is actually required by the check above
        printWarn("Can't read topology information from cpuid (needed level is 0x%.8X, max is 0x%.8X)", 0x00000001, cpu->maxLevels);
        topo->smt_available = 1;
      }
      topo->physical_cores = topo->logical_cores / topo->smt_available;

      if(topo->smt_supported > 1)
        topo->sockets = topo->total_cores / topo->smt_supported / topo->physical_cores; // Idea borrowed from lscpu
      else
        topo->sockets = topo->total_cores / topo->physical_cores;

      get_cache_topology_amd(cpu, topo);

      break;

    default:
      printBug("Cant get topology because VENDOR is empty");
      return NULL;
  }

  return topo;
}

/*
 * Legacy cache detection for AMD CPUs, based on cpuid leaves 0x80000005
 * (L1d / L1i sizes) and 0x80000006 (L2 / L3 sizes). A cache is considered
 * to exist when its size is greater than 0.
 *
 * max_cache_level is set to 4 when an L3 is found and to 3 otherwise.
 * Returns the same 'cach' pointer it receives.
 */
struct cache* get_cache_info_amd_fallback(struct cache* cach) {
  uint32_t eax = 0x80000005;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;

  // L1: sizes are taken from the top byte of ecx / edx, in KB
  cpuid(&eax, &ebx, &ecx, &edx);
  cach->L1d->size = (ecx >> 24) * 1024;
  cach->L1i->size = (edx >> 24) * 1024;

  // L2 (ecx[31:16], in KB) and L3 (edx[31:18], in 512 KB units)
  eax = 0x80000006;
  cpuid(&eax, &ebx, &ecx, &edx);
  cach->L2->size = (ecx >> 16) * 1024;
  cach->L3->size = (edx >> 18) * 512 * 1024;

  cach->L1d->exists = cach->L1d->size > 0;
  cach->L1i->exists = cach->L1i->size > 0;
  cach->L2->exists = cach->L2->size > 0;
  cach->L3->exists = cach->L3->size > 0;

  cach->max_cache_level = cach->L3->exists ? 4 : 3;

  return cach;
}

/*
 * Enumerates the caches reported by cpuid leaf 'level' (one subleaf per
 * cache, until a subleaf reports cache type 0) and stores size and
 * existence of L1d, L1i, L2 and L3 in 'cach'.
 *
 * The size of each cache is
 *   ways * partitions * line size * sets
 * max_cache_level is incremented once per recognized cache.
 *
 * Returns 'cach', or NULL if an unexpected cache type or an L1 cache at a
 * level other than 1 is found.
 */
struct cache* get_cache_info_general(struct cache* cach, uint32_t level) {
  for(int i = 0; ; i++) {
    uint32_t eax = level; // get cache info
    uint32_t ebx = 0;
    uint32_t ecx = i; // cache id
    uint32_t edx = 0;

    cpuid(&eax, &ebx, &ecx, &edx);

    int32_t cache_type = eax & 0x1F;

    // If its 0, we tried fetching a non existing cache
    if(cache_type == 0) break;

    int32_t cache_level = (eax >> 5) & 0x7;

    // ebx: [11:0] line size, [21:12] partitions, [31:22] ways (all minus 1)
    // ecx: number of sets minus 1
    uint32_t line_size  = (ebx & 0xFFF) + 1;
    uint32_t partitions = ((ebx >> 12) & 0x3FF) + 1;
    uint32_t ways       = ((ebx >> 22) & 0x3FF) + 1;
    uint32_t sets       = ecx + 1;

    int32_t cache_total_size = ways * partitions * line_size * sets;
    cach->max_cache_level++;

    switch (cache_type) {
      case 1: // Data Cache (We assume this is L1d)
        if(cache_level != 1) {
          printBug("Found data cache at level %d (expected 1)", cache_level);
          return NULL;
        }
        cach->L1d->size = cache_total_size;
        cach->L1d->exists = true;
        break;

      case 2: // Instruction Cache (We assume this is L1i)
        if(cache_level != 1) {
          printBug("Found instruction cache at level %d (expected 1)", cache_level);
          return NULL;
        }
        cach->L1i->size = cache_total_size;
        cach->L1i->exists = true;
        break;

      case 3: // Unified Cache (This may be L2 or L3)
        if(cache_level == 2) {
          cach->L2->size = cache_total_size;
          cach->L2->exists = true;
        }
        else if(cache_level == 3) {
          cach->L3->size = cache_total_size;
          cach->L3->exists = true;
        }
        else {
          printWarn("Found unknown unified cache at level %d (size is %d bytes)", cache_level, cache_total_size);
          // Not a cache we keep track of: undo the increment above
          cach->max_cache_level--;
        }
        break;

      default: // Unknown cache type
        printBug("Unknown cache type %d with level %d found at i=%d", cache_type, cache_level, i);
        return NULL;
    }
  }

  return cach;
}

/*
 * Allocates and fills the cache information of 'cpu'.
 *
 * The detection method depends on the vendor:
 *   - Intel: standard cpuid leaf 0x00000004
 *   - AMD:   extended leaf 0x8000001D (requires topology extensions), or
 *            the old method based on 0x80000005 / 0x80000006 otherwise
 *
 * Returns the new struct (owned by the caller), or NULL if the required
 * cpuid level is not available or the detection itself fails. The struct
 * is only allocated once it is known that a detection method is available,
 * so nothing is leaked when the required level is missing.
 */
struct cache* get_cache_info(struct cpuInfo* cpu) {
  uint32_t level;
  bool use_amd_fallback = false;

  // We use standard 0x00000004 for Intel
  // We use extended 0x8000001D for AMD
  // or 0x80000005/6 for old AMD
  if(cpu->cpu_vendor == CPU_VENDOR_INTEL) {
    level = 0x00000004;
    if(cpu->maxLevels < level) {
      printWarn("Can't read cache information from cpuid (needed level is 0x%.8X, max is 0x%.8X)", level, cpu->maxLevels);
      return NULL;
    }
  }
  else {
    level = 0x8000001D;
    if(cpu->maxExtendedLevels < level || !cpu->topology_extensions) {
      printWarn("Can't read cache information from cpuid (needed extended level is 0x%.8X, max is 0x%.8X and topology_extensions=%s)", level, cpu->maxExtendedLevels, cpu->topology_extensions ? "true" : "false");
      level = 0x80000006;
      if(cpu->maxExtendedLevels < level) {
        printWarn("Can't read cache information from cpuid using old method (needed extended level is 0x%.8X, max is 0x%.8X)", level, cpu->maxExtendedLevels);
        return NULL;
      }
      printWarn("Fallback to old method using 0x%.8X and 0x%.8X", level-1, level);
      use_amd_fallback = true;
    }
  }

  struct cache* cach = emalloc(sizeof(struct cache));
  init_cache_struct(cach);

  if(use_amd_fallback) {
    return get_cache_info_amd_fallback(cach);
  }
  return get_cache_info_general(cach, level);
}

/*
 * Allocates and fills the base and max frequency of 'cpu'.
 *
 * If cpuid leaf 0x00000016 is available, base and max are read from eax
 * and ebx. A value of 0 is treated as unknown; on Linux the max frequency
 * is then read with get_max_freq_from_file(0) instead.
 * If the leaf is not available, the base frequency is unknown and the max
 * frequency is read with get_max_freq_from_file(0), except on Windows and
 * macOS where it is unknown too.
 *
 * The caller owns the returned struct.
 */
struct frequency* get_frequency_info(struct cpuInfo* cpu) {
  struct frequency* freq = emalloc(sizeof(struct frequency));

  if(cpu->maxLevels >= 0x00000016) {
    uint32_t eax = 0x00000016;
    uint32_t ebx = 0;
    uint32_t ecx = 0;
    uint32_t edx = 0;

    cpuid(&eax, &ebx, &ecx, &edx);

    freq->base = eax;
    freq->max = ebx;

    if(freq->base == 0) {
      printWarn("Read base CPU frequency from CPUID and got 0 MHz");
      freq->base = UNKNOWN_DATA;
    }
    if(freq->max == 0) {
      printWarn("Read max CPU frequency from CPUID and got 0 MHz");
      #ifdef __linux__
        printWarn("Using udev to detect frequency");
        freq->max = get_max_freq_from_file(0);

        if(freq->max == 0) {
          printWarn("Read max CPU frequency from udev and got 0 MHz");
          freq->max = UNKNOWN_DATA;
        }
      #else
        freq->max = UNKNOWN_DATA;
      #endif
    }

    return freq;
  }

  // cpuid cannot report the frequency on this CPU
  #if defined (_WIN32) || defined (__APPLE__)
    printWarn("Can't read frequency information from cpuid (needed level is 0x%.8X, max is 0x%.8X)", 0x00000016, cpu->maxLevels);
    freq->base = UNKNOWN_DATA;
    freq->max = UNKNOWN_DATA;
  #else
    printWarn("Can't read frequency information from cpuid (needed level is 0x%.8X, max is 0x%.8X). Using udev", 0x00000016, cpu->maxLevels);
    freq->base = UNKNOWN_DATA;
    freq->max = get_max_freq_from_file(0);

    if(freq->max == 0) {
      printWarn("Read max CPU frequency from udev and got 0 MHz");
      freq->max = UNKNOWN_DATA;
    }
  #endif

  return freq;
}

// STRING FUNCTIONS
/*
 * Returns cpu->cpu_name, abbreviating it in place first when the CPU is
 * from Intel. If the abbreviation fails, a warning is printed and the
 * name is returned as it is. The string is still owned by 'cpu'.
 */
char* get_str_cpu_name_abbreviated(struct cpuInfo* cpu) {
  bool is_intel = cpu->cpu_vendor == CPU_VENDOR_INTEL;

  if(is_intel && !abbreviate_intel_cpu_name(&cpu->cpu_name)) {
    printWarn("Failed to abbreviate CPU name");
  }

  return cpu->cpu_name;
}

/*
 * Builds a newly allocated, human readable description of the topology:
 *   - STRING_UNKNOWN if the number of logical cores is unknown
 *   - "N cores (M threads)" if SMT is supported and enabled
 *   - "N cores (SMT disabled)" (AMD) or "N cores (HT disabled)" (other
 *     vendors) if SMT is supported but not enabled
 *   - "N cores" if SMT is not supported
 *
 * When dual_socket is true, the counts are multiplied by the number of
 * sockets. The caller owns the returned string.
 */
char* get_str_topology(struct cpuInfo* cpu, struct topology* topo, bool dual_socket) {
  int topo_sockets = 1;
  if(dual_socket) topo_sockets = topo->sockets;

  if(topo->logical_cores == UNKNOWN_DATA) {
    char* unknown = emalloc(sizeof(char) * (strlen(STRING_UNKNOWN) + 1));
    strcpy(unknown, STRING_UNKNOWN);
    return unknown;
  }

  if(topo->smt_supported <= 1) {
    // 4 for digits, 7 for ' cores ', 1 for the terminator
    uint32_t plain_size = 4+7+1;
    char* plain = emalloc(sizeof(char) * plain_size);
    snprintf(plain, plain_size, "%d cores",topo->physical_cores * topo_sockets);
    return plain;
  }

  // 4 for digits, 21 for ' cores (SMT disabled)' which is the longest possible output
  uint32_t max_size = 4+21+1;
  char* string = emalloc(sizeof(char) * max_size);

  if(topo->smt_available > 1) {
    snprintf(string, max_size, "%d cores (%d threads)", topo->physical_cores * topo_sockets, topo->logical_cores * topo_sockets);
  }
  else if(cpu->cpu_vendor == CPU_VENDOR_AMD) {
    snprintf(string, max_size, "%d cores (SMT disabled)", topo->physical_cores * topo_sockets);
  }
  else {
    snprintf(string, max_size, "%d cores (HT disabled)", topo->physical_cores * topo_sockets);
  }

  return string;
}

char* get_str_avx(struct cpuInfo* cpu) {
  //If all AVX are available, it will use up to 15
  char* string = emalloc(sizeof(char)*17+1);
  if(!cpu->feat->AVX)
    snprintf(string,2+1,"No");
  else if(!cpu->feat->AVX2)
    snprintf(string,3+1,"AVX");
  else if(!cpu->feat->AVX512)
    snprintf(string,8+1,"AVX,AVX2");
  else
    snprintf(string,15+1,"AVX,AVX2,AVX512");

  return string;
}

char* get_str_sse(struct cpuInfo* cpu) {
  uint32_t last = 0;
  uint32_t SSE_sl = 4;
  uint32_t SSE2_sl = 5;
  uint32_t SSE3_sl = 5;
  uint32_t SSSE3_sl = 6;
  uint32_t SSE4a_sl = 6;
  uint32_t SSE4_1_sl = 7;
  uint32_t SSE4_2_sl = 7;
  char* string = emalloc(sizeof(char)*SSE_sl+SSE2_sl+SSE3_sl+SSSE3_sl+SSE4a_sl+SSE4_1_sl+SSE4_2_sl+1);

  if(cpu->feat->SSE) {
      snprintf(string+last,SSE_sl+1,"SSE,");
      last+=SSE_sl;
  }
  if(cpu->feat->SSE2) {
      snprintf(string+last,SSE2_sl+1,"SSE2,");
      last+=SSE2_sl;
  }
  if(cpu->feat->SSE3) {
      snprintf(string+last,SSE3_sl+1,"SSE3,");
      last+=SSE3_sl;
  }
  if(cpu->feat->SSSE3) {
      snprintf(string+last,SSSE3_sl+1,"SSSE3,");
      last+=SSSE3_sl;
  }
  if(cpu->feat->SSE4a) {
      snprintf(string+last,SSE4a_sl+1,"SSE4a,");
      last+=SSE4a_sl;
  }
  if(cpu->feat->SSE4_1) {
      snprintf(string+last,SSE4_1_sl+1,"SSE4.1,");
      last+=SSE4_1_sl;
  }
  if(cpu->feat->SSE4_2) {
      snprintf(string+last,SSE4_2_sl+1,"SSE4.2,");
      last+=SSE4_2_sl;
  }

  //Purge last comma
  string[last-1] = '\0';
  return string;
}

/*
 * Returns a newly allocated string listing the supported FMA extensions:
 * "No", "FMA3", "FMA4" or "FMA3,FMA4".
 *
 * FMA3 and FMA4 are independent features (they are read from different
 * cpuid leaves), so a CPU that supports FMA4 without FMA3 is reported as
 * "FMA4" rather than "No". The caller owns the returned string.
 */
char* get_str_fma(struct cpuInfo* cpu) {
  // "FMA3,FMA4" is the longest possible output (9 characters)
  char* string = emalloc(sizeof(char)*9+1);
  bool fma3 = cpu->feat->FMA3;
  bool fma4 = cpu->feat->FMA4;

  if(fma3 && fma4)
    snprintf(string,9+1,"FMA3,FMA4");
  else if(fma3)
    snprintf(string,4+1,"FMA3");
  else if(fma4)
    snprintf(string,4+1,"FMA4");
  else
    snprintf(string,2+1,"No");

  return string;
}

/*
 * Prints debugging information about 'cpu' to stdout: name, hypervisor
 * (if present), max standard and extended cpuid levels, vendor specific
 * flags and the raw value of eax for cpuid leaf 0x00000001.
 *
 * NOTE: 'cpu' is released with free_cpuinfo_struct() before returning,
 * so it must not be used by the caller afterwards.
 */
void print_debug(struct cpuInfo* cpu) {
  printf("%s\n", cpu->cpu_name);

  if(cpu->hv->present) {
    printf("- Hypervisor: %s\n", cpu->hv->hv_name);
  }

  printf("- Max standard level: 0x%.8X\n", cpu->maxLevels);
  printf("- Max extended level: 0x%.8X\n", cpu->maxExtendedLevels);

  // Vendor specific flags
  if(cpu->cpu_vendor == CPU_VENDOR_AMD) {
    printf("- AMD topology extensions: %d\n", cpu->topology_extensions);
  }
  if(cpu->cpu_vendor == CPU_VENDOR_INTEL) {
    printf("- Hybrid Flag: %d\n", cpu->hybrid_flag);
  }

  // Raw eax of leaf 0x00000001 (family / model / stepping)
  uint32_t eax = 0x00000001;
  uint32_t ebx = 0;
  uint32_t ecx = 0;
  uint32_t edx = 0;
  cpuid(&eax, &ebx, &ecx, &edx);
  printf("- CPUID dump: 0x%.8X\n", eax);

  free_cpuinfo_struct(cpu);
}

/*
 * Runs cpuid for leaf 'reg' (with ecx = 0) and prints the leaf, subleaf
 * 0x00 and the resulting eax, ebx, ecx, edx as one line of the raw dump.
 */
void print_raw_level(uint32_t reg) {
  // out[0..3] = eax, ebx, ecx, edx
  uint32_t out[4] = { reg, 0, 0, 0 };

  cpuid(&out[0], &out[1], &out[2], &out[3]);

  printf("  0x%.8X 0x%.2X: 0x%.8X 0x%.8X 0x%.8X 0x%.8X\n", reg, 0x00, out[0], out[1], out[2], out[3]);
}

/*
 * Runs cpuid for leaf 'reg', subleaf 'reg2' (passed in ecx) and prints the
 * leaf, the subleaf and the resulting eax, ebx, ecx, edx as one line of
 * the raw dump.
 */
void print_raw_sublevel(uint32_t reg, uint32_t reg2) {
  // out[0..3] = eax, ebx, ecx, edx
  uint32_t out[4] = { reg, 0, reg2, 0 };

  cpuid(&out[0], &out[1], &out[2], &out[3]);

  printf("  0x%.8X 0x%.2X: 0x%.8X 0x%.8X 0x%.8X 0x%.8X\n", reg, reg2, out[0], out[1], out[2], out[3]);
}

/*
 * Prints a raw dump of cpuid for every core counted in
 * cpu->topo->total_cores: all standard leaves up to cpu->maxLevels, the
 * hypervisor leaves 0x40000000..0x40000006 and all extended leaves up to
 * cpu->maxExtendedLevels.
 *
 * Leaves 0x00000004 and 0x8000001D are printed once per subleaf below
 * cpu->cach->max_cache_level, leaf 0x0000000B once per subleaf below
 * cpu->topo->smt_supported.
 *
 * Except on macOS, the thread is bound to each core before dumping it;
 * the function returns early if binding fails.
 */
void print_raw(struct cpuInfo* cpu) {
  printf("%s\n\n", cpu->cpu_name);
  printf("  CPUID leaf sub   EAX        EBX        ECX        EDX       \n");
  printf("--------------------------------------------------------------\n");

  for(int core=0; core < cpu->topo->total_cores; core++) {
    #ifndef __APPLE__
    if(!bind_to_cpu(core)) {
      printErr("Failed binding to CPU %d", core);
      return;
    }
    #endif

    printf("CPU %d:\n", core);

    // Standard levels
    for(uint32_t leaf=0x00000000; leaf <= cpu->maxLevels; leaf++) {
      switch(leaf) {
        case 0x00000004:
          for(uint32_t sub=0x00000000; sub < cpu->cach->max_cache_level; sub++) {
            print_raw_sublevel(leaf, sub);
          }
          break;

        case 0x0000000B:
          for(uint32_t sub=0x00000000; sub < cpu->topo->smt_supported; sub++) {
            print_raw_sublevel(leaf, sub);
          }
          break;

        default:
          print_raw_level(leaf);
          break;
      }
    }

    // Hypervisor levels
    for(uint32_t leaf=0x40000000; leaf <= 0x40000006; leaf++) {
      print_raw_level(leaf);
    }

    // Extended levels
    for(uint32_t leaf=0x80000000; leaf <= cpu->maxExtendedLevels; leaf++) {
      if(leaf != 0x8000001D) {
        print_raw_level(leaf);
        continue;
      }

      for(uint32_t sub=0x00000000; sub < cpu->cach->max_cache_level; sub++) {
        print_raw_sublevel(leaf, sub);
      }
    }

  }
}

/*
 * Releases a struct topology together with its apic member and the two
 * arrays owned by it. 'topo' and 'topo->apic' must not be NULL.
 */
void free_topo_struct(struct topology* topo) {
  // Arrays owned by the apic struct go first
  free(topo->apic->cache_id_apic);
  free(topo->apic->cache_select_mask);

  // Then the containers, from the inside out
  free(topo->apic);
  free(topo);
}