/* Copyright (C) 2024 Wildfire Games. * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* * CPU-specific routines common to 32 and 64-bit x86 */ #include "precompiled.h" #include "lib/sysdep/arch/x86_x64/x86_x64.h" #include #include #include #include #include #include "lib/posix/posix_pthread.h" #include "lib/bits.h" #include "lib/timer.h" #include "lib/module_init.h" #include "lib/sysdep/cpu.h" #include "lib/sysdep/os_cpu.h" #if MSC_VERSION # include // __rdtsc #endif namespace x86_x64 { #if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // VC10+ and VC9 SP1: __cpuidex is already available #elif GCC_VERSION # define __cpuidex(regsArray, level, index)\ __asm__ __volatile__ ("cpuid"\ : "=a" ((regsArray)[0]), "=b" ((regsArray)[1]), "=c" ((regsArray)[2]), "=d" ((regsArray)[3])\ : "0" (level), "2" (index)); #else # error "compiler not supported" #endif // some of this module's functions are frequently called but require // non-trivial initialization, so caching is helpful. isInitialized // flags aren't thread-safe, so we use ModuleInit. calling it from // every function is a bit wasteful, but it is convenient to avoid // requiring users to pass around a global state object. // one big Init() would be prone to deadlock if its subroutines also // call a public function (that re-enters ModuleInit), so each // function gets its own initState. //----------------------------------------------------------------------------- // CPUID static void Invoke_cpuid(CpuidRegs* regs) { cassert(sizeof(regs->eax) == sizeof(int)); cassert(sizeof(*regs) == 4*sizeof(int)); __cpuidex((int*)regs, regs->eax, regs->ecx); } static u32 cpuid_maxFunction; static u32 cpuid_maxExtendedFunction; static Status InitCpuid() { CpuidRegs regs = { 0 }; regs.eax = 0; Invoke_cpuid(®s); cpuid_maxFunction = regs.eax; regs.eax = 0x80000000; Invoke_cpuid(®s); cpuid_maxExtendedFunction = regs.eax; return INFO::OK; } bool cpuid(CpuidRegs* regs) { static ModuleInitState initState{ 0 }; ModuleInit(&initState, InitCpuid); const u32 function = regs->eax; if(function > cpuid_maxExtendedFunction) return false; if(function < 0x80000000 && function > cpuid_maxFunction) return false; Invoke_cpuid(regs); return true; } //----------------------------------------------------------------------------- // capability bits // treated as 128 bit field; order: std ecx, std edx, ext ecx, ext edx // keep in sync with enum Cap! static u32 caps[4]; static ModuleInitState capsInitState{ 0 }; static Status InitCaps() { CpuidRegs regs = { 0 }; regs.eax = 1; if(cpuid(®s)) { caps[0] = regs.ecx; caps[1] = regs.edx; } regs.eax = 0x80000001; if(cpuid(®s)) { caps[2] = regs.ecx; caps[3] = regs.edx; } return INFO::OK; } bool Cap(Caps cap) { ModuleInit(&capsInitState, InitCaps); const size_t index = cap >> 5; const size_t bit = cap & 0x1F; if(index >= ARRAY_SIZE(caps)) { DEBUG_WARN_ERR(ERR::INVALID_PARAM); return false; } return IsBitSet(caps[index], bit); } void GetCapBits(u32* d0, u32* d1, u32* d2, u32* d3) { ModuleInit(&capsInitState, InitCaps); *d0 = caps[0]; *d1 = caps[1]; *d2 = caps[2]; *d3 = caps[3]; } //----------------------------------------------------------------------------- // vendor static Vendors vendor; static Status InitVendor() { CpuidRegs regs = { 0 }; regs.eax = 0; if(!cpuid(®s)) DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING); // copy regs to string // note: 'strange' ebx,edx,ecx reg order is due to ModR/M encoding order. char vendorString[13]; memcpy(&vendorString[0], ®s.ebx, 4); memcpy(&vendorString[4], ®s.edx, 4); memcpy(&vendorString[8], ®s.ecx, 4); vendorString[12] = '\0'; // 0-terminate if(!strcmp(vendorString, "AuthenticAMD")) vendor = x86_x64::VENDOR_AMD; else if(!strcmp(vendorString, "GenuineIntel")) vendor = x86_x64::VENDOR_INTEL; else { DEBUG_WARN_ERR(ERR::CPU_UNKNOWN_VENDOR); vendor = x86_x64::VENDOR_UNKNOWN; } return INFO::OK; } Vendors Vendor() { static ModuleInitState initState{ 0 }; ModuleInit(&initState, InitVendor); return vendor; } //----------------------------------------------------------------------------- // signature static size_t m_Model; static size_t m_Family; static ModuleInitState signatureInitState{ 0 }; static Status InitSignature() { CpuidRegs regs = { 0 }; regs.eax = 1; if(!cpuid(®s)) DEBUG_WARN_ERR(ERR::CPU_FEATURE_MISSING); m_Model = bits(regs.eax, 4, 7); m_Family = bits(regs.eax, 8, 11); const size_t extendedModel = bits(regs.eax, 16, 19); const size_t extendedFamily = bits(regs.eax, 20, 27); if(m_Family == 0xF) m_Family += extendedFamily; if(m_Family == 0xF || (Vendor() == x86_x64::VENDOR_INTEL && m_Family == 6)) m_Model += extendedModel << 4; return INFO::OK; } size_t Model() { ModuleInit(&signatureInitState, InitSignature); return m_Model; } size_t Family() { ModuleInit(&signatureInitState, InitSignature); return m_Family; } //----------------------------------------------------------------------------- // identifier string /// functor to remove substrings from the CPU identifier string class StringStripper { public: StringStripper(char* string, size_t max_chars) : m_string(string), m_max_chars(max_chars) { } // remove all instances of substring from m_string void operator()(const char* substring) { const size_t substring_length = strlen(substring); for(;;) { char* substring_pos = strstr(m_string, substring); if(!substring_pos) break; const size_t substring_ofs = substring_pos - m_string; const size_t num_chars = m_max_chars - substring_ofs - substring_length; memmove(substring_pos, substring_pos+substring_length, num_chars); } } private: char* m_string; size_t m_max_chars; }; // 3 calls x 4 registers x 4 bytes = 48 + 0-terminator static char identifierString[48+1]; static Status InitIdentifierString() { // get brand string (if available) char* pos = identifierString; bool gotBrandString = true; for(u32 function = 0x80000002; function <= 0x80000004; function++) { CpuidRegs regs = { 0 }; regs.eax = function; gotBrandString &= cpuid(®s); memcpy(pos, ®s, 16); pos += 16; } // fall back to manual detect of CPU type because either: // - CPU doesn't support brand string (we use a flag to indicate this // rather than comparing against a default value because it is safer); // - the brand string is useless, e.g. "Unknown". this happens on // some older boards whose BIOS reprograms the string for CPUs it // doesn't recognize. if(!gotBrandString || strncmp(identifierString, "Unknow", 6) == 0) { const size_t family = Family(); const size_t model = Model(); switch(Vendor()) { case x86_x64::VENDOR_AMD: // everything else is either too old, or should have a brand string. if(family == 6) { if(model == 3 || model == 7) strcpy_s(identifierString, ARRAY_SIZE(identifierString), "AMD Duron"); else if(model <= 5) strcpy_s(identifierString, ARRAY_SIZE(identifierString), "AMD Athlon"); else { if(Cap(x86_x64::CAP_AMD_MP)) strcpy_s(identifierString, ARRAY_SIZE(identifierString), "AMD Athlon MP"); else strcpy_s(identifierString, ARRAY_SIZE(identifierString), "AMD Athlon XP"); } } break; case x86_x64::VENDOR_INTEL: // everything else is either too old, or should have a brand string. if(family == 6) { if(model == 1) strcpy_s(identifierString, ARRAY_SIZE(identifierString), "Intel Pentium Pro"); else if(model == 3 || model == 5) strcpy_s(identifierString, ARRAY_SIZE(identifierString), "Intel Pentium II"); else if(model == 6) strcpy_s(identifierString, ARRAY_SIZE(identifierString), "Intel Celeron"); else strcpy_s(identifierString, ARRAY_SIZE(identifierString), "Intel Pentium III"); } break; default: strcpy_s(identifierString, ARRAY_SIZE(identifierString), "Unknown, non-Intel/AMD"); break; } } // identifierString already holds a valid brand string; pretty it up. else { const char* const undesiredStrings[] = { "(tm)", "(TM)", "(R)", "CPU ", " " }; std::for_each(undesiredStrings, undesiredStrings+ARRAY_SIZE(undesiredStrings), StringStripper(identifierString, strlen(identifierString)+1)); // note: Intel brand strings include a frequency, but we can't rely // on it because the CPU may be overclocked. we'll leave it in the // string to show measurement accuracy and if SpeedStep is active. } return INFO::OK; } static const char* IdentifierString() { static ModuleInitState initState{ 0 }; ModuleInit(&initState, InitIdentifierString); return identifierString; } //----------------------------------------------------------------------------- // miscellaneous stateless functions #if !MSC_VERSION // ensure not already defined in header u64 rdtsc() { #if GCC_VERSION // GCC supports "portable" assembly for both x86 and x64 volatile u32 lo, hi; __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); return u64_from_u32(hi, lo); #endif } #endif void DebugBreak() { #if MSC_VERSION __debugbreak(); #elif GCC_VERSION // note: this probably isn't necessary, since unix_debug_break // (SIGTRAP) is most probably available if GCC_VERSION. // we include it for completeness, though. __asm__ __volatile__ ("int $3"); #endif } //----------------------------------------------------------------------------- // CPU frequency // set scheduling priority and restore when going out of scope. class ScopedSetPriority { public: ScopedSetPriority(int newPriority) { // get current scheduling policy and priority pthread_getschedparam(pthread_self(), &m_oldPolicy, &m_oldParam); // set new priority sched_param newParam = {0}; newParam.sched_priority = newPriority; pthread_setschedparam(pthread_self(), SCHED_FIFO, &newParam); } ~ScopedSetPriority() { // restore previous policy and priority. pthread_setschedparam(pthread_self(), m_oldPolicy, &m_oldParam); } private: int m_oldPolicy; sched_param m_oldParam; }; // note: this function uses timer.cpp!timer_Time, which is implemented via // whrt.cpp on Windows. double ClockFrequency() { // if the TSC isn't available, there's really no good way to count the // actual CPU clocks per known time interval, so bail. // note: loop iterations ("bogomips") are not a reliable measure due // to differing IPC and compiler optimizations. if(!Cap(x86_x64::CAP_TSC)) return -1.0; // impossible value // increase priority to reduce interference while measuring. const int priority = sched_get_priority_max(SCHED_FIFO)-1; ScopedSetPriority ssp(priority); // note: no need to "warm up" cpuid - it will already have been // called several times by the time this code is reached. // (background: it's used in rdtsc() to serialize instruction flow; // the first call is documented to be slower on Intel CPUs) size_t numSamples = 16; // if clock is low-res, do less samples so it doesn't take too long. // balance measuring time (~ 10 ms) and accuracy (< 0.1% error - // ok for using the TSC as a time reference) if(timer_Resolution() >= 1e-3) numSamples = 8; std::vector samples(numSamples); for(size_t i = 0; i < numSamples; i++) { double dt; i64 dc; // (i64 instead of u64 for faster conversion to double) // count # of clocks in max{1 tick, 1 ms}: // .. wait for start of tick. const double t0 = timer_Time(); u64 c1; double t1; do { // note: timer_Time effectively has a long delay (up to 5 us) // before returning the time. we call it before rdtsc to // minimize the delay between actually sampling time / TSC, // thus decreasing the chance for interference. // (if unavoidable background activity, e.g. interrupts, // delays the second reading, inaccuracy is introduced). t1 = timer_Time(); c1 = rdtsc(); } while(t1 == t0); // .. wait until start of next tick and at least 1 ms elapsed. do { const double t2 = timer_Time(); const u64 c2 = rdtsc(); dc = (i64)(c2 - c1); dt = t2 - t1; } while(dt < 1e-3); // .. freq = (delta_clocks) / (delta_seconds); // rdtsc/timer overhead is negligible. const double freq = dc / dt; samples[i] = freq; } std::sort(samples.begin(), samples.end()); // median filter (remove upper and lower 25% and average the rest). // note: don't just take the lowest value! it could conceivably be // too low, if background processing delays reading c1 (see above). double sum = 0.0; const size_t lo = numSamples/4, hi = 3*numSamples/4; for(size_t i = lo; i < hi; i++) sum += samples[i]; const double clockFrequency = sum / (hi-lo); return clockFrequency; } } // namespace x86_x64 const char* cpu_IdentifierString() { return x86_x64::IdentifierString(); }