// Micro-benchmark comparing scalar integer division against several
// floating-point based substitutes. Each kernel processes kElementCount
// elements and is timed with the x86 time-stamp counter (RDTSC).
//
// Build notes carried over from the original author:
//   * do not use -ffast-math: it flushes the subnormal operands used by the
//     "magic" kernels to zero;
//   * the "magic" kernels are stated to require the "truncation" FP rounding
//     mode (this program does not set it);
//   * divisors must be non-zero.

#include <stdint.h>  // kept from the original; <cstdint> is the preferred C++ spelling

#include <cmath>
#include <cstdint>
#include <cstring>
#include <ctime>
#include <iostream>

#ifdef _MSC_VER
#  include <intrin.h>
#else
#  include <cpuid.h>      // GCC/Clang-provided __cpuid macro
#  include <x86intrin.h>
#endif

// Number of elements every kernel processes; all arrays passed to the
// kernels must hold at least this many elements.
constexpr int kElementCount = 1024;

// Number of leading results echoed after each kernel run.
constexpr int kPreviewCount = 10;

// The "magic" kernels copy integer bit patterns straight into floating-point
// objects, which only makes sense when the sizes match.
static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits wide");
static_assert(sizeof(double) == sizeof(uint64_t), "double must be 64 bits wide");

// Optional wrapper if you don't want to just use __rdtsc() everywhere.
// Returns the current value of the time-stamp counter.
inline uint64_t readTSC()
{
    // _mm_lfence();  // optionally wait for earlier insns to retire before reading the clock
    const uint64_t tsc = __rdtsc();
    // _mm_lfence();  // optionally block later instructions until rdtsc retires
    return tsc;
}

// Baseline: element-wise 32-bit unsigned division, val3[i] = val1[i] / val2[i].
// val2 must not contain zeros.
void intTest(uint32_t * const __restrict val1,
             uint32_t * const __restrict val2,
             uint32_t * const __restrict val3)
{
    for (int i = 0; i < kElementCount; i++) {
        val3[i] = val1[i] / val2[i];  // scalar idiv
    }
}

// Baseline: element-wise 64-bit unsigned division, val3[i] = val1[i] / val2[i].
// val2 must not contain zeros.
void int64Test(uint64_t * const __restrict val1,
               uint64_t * const __restrict val2,
               uint64_t * const __restrict val3)
{
    for (int i = 0; i < kElementCount; i++) {
        val3[i] = val1[i] / val2[i];  // scalar idiv
    }
}

// Emulates 32-bit division through double precision: both operands are
// converted to double, divided, nudged by 0.01 when the fractional part of
// the quotient is below 0.99, and converted back (truncating) to uint32_t.
void intEmulationTest(uint32_t * const __restrict val1,
                      uint32_t * const __restrict val2,
                      uint32_t * const __restrict val3)
{
    for (int i = 0; i < kElementCount; i++) {
        const double numerator = val1[i];
        const double denominator = val2[i];
        double quotient = numerator / denominator;
        const double fraction = quotient - static_cast<uint32_t>(quotient);
        quotient += fraction < 0.99 ? 0.01 : 0.0;
        // Author's note: 42-instruction code-bloat, 2x faster than 1 idiv >:c
        val3[i] = static_cast<uint32_t>(quotient);
    }
}

// "Magic" 32-bit division: the bits of each integer are written directly into
// the bits of a float. An integer below 2^23 lands entirely in the mantissa
// field and therefore reads back as a subnormal float proportional to the
// integer, so the float quotient equals the integer ratio. Larger inputs set
// exponent bits and are not handled by this trick.
//
// Author's notes: up to 23 bits should be OK; do not use -ffast-math (flushes
// these subnormals to zero); "fp rounding mode: truncation" is required; do
// not divide by zero; roughly 10x speedup observed on Zen2.
void intMagicTest(uint32_t * const __restrict val1,
                  uint32_t * const __restrict val2,
                  uint32_t * const __restrict val3)
{
    for (int i = 0; i < kElementCount; i++) {
        float numerator;
        float denominator;
        // Copy all four bytes; only the 23 least significant bits are
        // expected to be populated (see the function comment).
        std::memcpy(&numerator, &val1[i], sizeof(float));
        std::memcpy(&denominator, &val2[i], sizeof(float));

        // The quotient is an ordinary (normalized) float, so the compiler's
        // float -> integer conversion is still needed. If the result could be
        // "de-normalized" directly there would be no conversion latency.
        val3[i] = static_cast<uint32_t>(numerator / denominator);  // vdivps with only 1 conversion
    }
}

// "Magic" division on 32-bit values held in 64-bit storage: the bits of each
// integer are written directly into the bits of a double (whose mantissa
// field is 52 bits wide), giving a subnormal double proportional to the
// integer.
//
// Author's notes: do not use -ffast-math (flushes these subnormals to zero);
// "fp rounding mode: truncation" is required; do not divide by zero; roughly
// 10x speedup observed on Zen2.
void intMagicTestDouble(uint64_t * const __restrict val1,
                        uint64_t * const __restrict val2,
                        uint64_t * const __restrict val3)
{
    for (int i = 0; i < kElementCount; i++) {
        double numerator;
        double denominator;
        // Copy all eight bytes; only the low mantissa bits are expected to be
        // populated.
        std::memcpy(&numerator, &val1[i], sizeof(double));
        std::memcpy(&denominator, &val2[i], sizeof(double));

        // The quotient is an ordinary (normalized) double, so one
        // double -> integer conversion remains.
        val3[i] = static_cast<uint64_t>(numerator / denominator);  // vdivpd with only 1 conversion
    }
}

// Same trick as intMagicTestDouble, but for 32-bit arrays: each operand is
// first widened into a temporary 64-bit integer whose bits are then copied
// into a double.
//
// Author's notes: do not use -ffast-math (flushes these subnormals to zero);
// "fp rounding mode: truncation" is required; do not divide by zero; roughly
// 10x speedup observed on Zen2.
void intMagicTestDoubleTmp(uint32_t * const __restrict val1,
                           uint32_t * const __restrict val2,
                           uint32_t * const __restrict val3)
{
    for (int i = 0; i < kElementCount; i++) {
        const uint64_t wideNumerator = val1[i];
        const uint64_t wideDenominator = val2[i];
        double numerator;
        double denominator;
        std::memcpy(&numerator, &wideNumerator, sizeof(double));
        std::memcpy(&denominator, &wideDenominator, sizeof(double));

        // The quotient is an ordinary (normalized) double, so one
        // double -> integer conversion remains.
        val3[i] = static_cast<uint32_t>(numerator / denominator);  // vdivpd with only 1 conversion
    }
}

// Executes CPUID for `leaf` and stores EAX, EBX, ECX, EDX in regs[0..3].
// Hides the difference between the GCC/Clang macro and the MSVC intrinsic.
static void queryCpuid(unsigned int leaf, unsigned int regs[4])
{
#ifdef _MSC_VER
    int raw[4];
    __cpuid(raw, static_cast<int>(leaf));
    std::memcpy(regs, raw, sizeof(raw));
#else
    __cpuid(leaf, regs[0], regs[1], regs[2], regs[3]);
#endif
}

// Prints the first kPreviewCount entries of `values`, one per line.
template <typename T>
static void printFirstResults(const T *values)
{
    for (int i = 0; i < kPreviewCount; i++) {
        std::cout << values[i] << " " << std::endl;
    }
}

// Runs each kernel once, echoes the first few results of each, then prints
// the CPU brand string and the elapsed TSC ticks per kernel.
int main()
{
    uint32_t a[kElementCount];
    uint32_t b[kElementCount];
    uint32_t c[kElementCount];

    // Fill the 32-bit inputs repeatedly; mixing in clock() makes the data
    // depend on a run-time value.
    for (int k = 0; k < 1000; k++) {
        for (int i = 0; i < kElementCount; i++) {
            a[i] = static_cast<uint32_t>(1 + i * i + std::clock());
            b[i] = static_cast<uint32_t>(1 + i);
        }
    }

    uint64_t a64[kElementCount];
    uint64_t b64[kElementCount];
    uint64_t c64[kElementCount];
    for (int i = 0; i < kElementCount; i++) {
        a64[i] = static_cast<uint64_t>(1 + i * i);
        b64[i] = static_cast<uint64_t>(1 + i);
    }

    std::cout << "emulation:" << std::endl;
    const uint64_t t1 = readTSC();
    intEmulationTest(a, b, c);
    const uint64_t t2 = readTSC();
    printFirstResults(c);

    std::cout << "magic:" << std::endl;
    const uint64_t t3 = readTSC();
    intMagicTest(a, b, c);
    const uint64_t t4 = readTSC();
    printFirstResults(c);

    // NOTE: the "int:" row measures int64Test (64-bit division); intTest is
    // defined above but not exercised by this driver.
    std::cout << "int:" << std::endl;
    const uint64_t t5 = readTSC();
    int64Test(a64, b64, c64);
    const uint64_t t6 = readTSC();
    printFirstResults(c64);  // results live in c64, not c

    std::cout << "magic double:" << std::endl;
    const uint64_t t7 = readTSC();
    intMagicTestDouble(a64, b64, c64);
    const uint64_t t8 = readTSC();
    printFirstResults(c64);  // results live in c64, not c

    std::cout << "magic double tmp:" << std::endl;
    const uint64_t t9 = readTSC();
    intMagicTestDoubleTmp(a, b, c);
    const uint64_t t10 = readTSC();
    printFirstResults(c);

    // Assemble the processor brand string from extended CPUID leaves
    // 0x80000002..0x80000004 (16 bytes each). The buffer is zero-filled and
    // larger than the 48 bytes written, so it is always NUL-terminated.
    char cpuBrandString[0x40];
    std::memset(cpuBrandString, 0, sizeof(cpuBrandString));

    unsigned int cpuInfo[4] = {0, 0, 0, 0};
    queryCpuid(0x80000000u, cpuInfo);
    const unsigned int maxExtendedLeaf = cpuInfo[0];

    const unsigned int firstBrandLeaf = 0x80000002u;
    const unsigned int lastBrandLeaf = 0x80000004u;
    for (unsigned int leaf = firstBrandLeaf;
         leaf <= lastBrandLeaf && leaf <= maxExtendedLeaf;
         ++leaf) {
        queryCpuid(leaf, cpuInfo);
        std::memcpy(cpuBrandString + 16 * (leaf - firstBrandLeaf), cpuInfo, sizeof(cpuInfo));
    }
    std::cout << "CPU Type: " << cpuBrandString << std::endl;

    std::cout << "emulation: " << t2 - t1 << " cycles" << std::endl;
    std::cout << "magic: " << t4 - t3 << " cycles" << std::endl;
    std::cout << "int: " << t6 - t5 << " cycles" << std::endl;
    std::cout << "magic double: " << t8 - t7 << " cycles" << std::endl;
    std::cout << "magic double tmp: " << t10 - t9 << " cycles" << std::endl;
    return 0;
}
Write, run, and share C++ code online using OneCompiler's C++ online compiler for free. It's one of the most robust, feature-rich online compilers for the C++ language, running on the latest version, C++17. Getting started with OneCompiler's C++ compiler is simple and fast. The editor shows sample boilerplate code when you choose C++ as the language,
so you can start coding right away!
OneCompiler's C++ online compiler supports stdin, and users can give input to their programs using the STDIN textbox under the I/O tab. The following sample program takes a name as input and prints a greeting with that name.
#include <iostream>
#include <string>
using namespace std;
int main()
{
string name;
cout << "Enter name:";
getline (cin, name);
cout << "Hello " << name;
return 0;
}
C++ is a widely used middle-level programming language.
Whenever you want to perform a set of operations based on a condition, If-Else is used.
if(conditional-expression) {
//code
}
else {
//code
}
You can also use if-else for nested ifs and for an if-else-if ladder when multiple conditions are to be checked on a single variable.
Switch is an alternative to If-Else-If ladder.
switch(conditional-expression){
case value1:
// code
break; // optional
case value2:
// code
break; // optional
......
default:
// code to be executed when none of the above cases match
}
For loop is used to iterate a set of statements based on a condition.
for(Initialization; Condition; Increment/decrement){
//code
}
While is also used to iterate a set of statements based on a condition. Usually while is preferred when the number of iterations is not known in advance.
while (condition) {
// code
}
Do-while is also used to iterate a set of statements based on a condition. It is mostly used when you need to execute the statements at least once.
do {
// code
} while (condition);
A function is a subroutine that contains a set of statements. Functions are usually written when the same set of statements needs to be called multiple times, which increases reusability and modularity. A function runs only when it is called.
return_type function_name(parameters);
function_name (parameters)
return_type function_name(parameters) {
// code
}