#include <cmath>
#include <cstdint>
#include <cstring>
#include <ctime>
#include <stdint.h> // <cstdint> is preferred in C++, but stdint.h works.
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif
// optional wrapper if you don't want to just use __rdtsc() everywhere
inline uint64_t readTSC()
{
    // Returns the value of the __rdtsc() intrinsic directly.
    // No fence is issued: an _mm_lfence() before the read would wait for earlier
    // instructions to retire, and one after it would hold back later instructions
    // until rdtsc retires. Both remain optional and are left out here.
    return __rdtsc();
}
/// Element-wise unsigned 32-bit division: val3[i] = val1[i] / val2[i]
/// for the first 1024 elements of each array.
///
/// The pointers are declared __restrict__, so the three arrays must not overlap.
/// Divisors are not checked; a zero in val2 is the caller's responsibility.
void intTest(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    constexpr int kElementCount = 1024;
    for (int idx = 0; idx != kElementCount; ++idx)
    {
        const uint32_t numerator = val1[idx];
        const uint32_t denominator = val2[idx];
        val3[idx] = numerator / denominator; // native integer division
    }
}
/// Element-wise unsigned 64-bit division: val3[i] = val1[i] / val2[i]
/// for the first 1024 elements of each array.
///
/// The pointers are declared __restrict__, so the three arrays must not overlap.
/// Divisors are not checked; a zero in val2 is the caller's responsibility.
void int64Test(uint64_t * const __restrict__ val1, uint64_t * const __restrict__ val2, uint64_t * const __restrict__ val3)
{
    constexpr int kElementCount = 1024;
    for (int idx = 0; idx != kElementCount; ++idx)
    {
        const uint64_t numerator = val1[idx];
        const uint64_t denominator = val2[idx];
        val3[idx] = numerator / denominator; // native integer division
    }
}
/// Computes val3[i] = val1[i] / val2[i] for 1024 elements by dividing in double
/// precision instead of using integer division.
///
/// Each operand is widened to double, the quotient is nudged upward by 0.01
/// unless its fractional part is already 0.99 or more, and the result is
/// converted back to uint32_t on the store.
/// The pointers are declared __restrict__, so the three arrays must not overlap.
/// Divisors are not checked for zero.
void intEmulationTest(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    constexpr int kElementCount = 1024;
    for (int idx = 0; idx != kElementCount; ++idx)
    {
        const double numerator = val1[idx];
        const double denominator = val2[idx];
        double quotient = numerator / denominator;

        // Fractional part of the quotient; the integer cast truncates toward zero.
        const double fraction = quotient - static_cast<uint32_t>(quotient);

        // Original author's fudge factor: push the value up a little, but never
        // far enough to cross into the next integer.
        if (fraction < 0.99)
        {
            quotient += 0.01;
        }

        // The author measured this longer sequence (~42 instructions) as about
        // 2x faster than a single integer divide.
        val3[idx] = quotient; // implicit double -> uint32_t conversion truncates
    }
}
// Divides small unsigned integers with floating-point division while skipping
// the int -> float conversion of the inputs: each integer's bit pattern is
// copied unchanged into a float, so (assuming IEEE-754 binary32) a small value
// lands entirely in the mantissa bits.
// Caveats carried over from the original author's notes:
//  - inputs of up to 23 bits should be OK;
//  - do not build with -ffast-math: it flushes these denormals to zero;
//  - "fp rounding mode: truncation" is required;
//  - do not divide by zero;
//  - warning: 10x speedup reported on the Zen2 architecture.
// The copy assumes float is 4 bytes, the same as uint32_t.
void intMagicTest(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    constexpr int kElementCount = 1024;
    for (int idx = 0; idx != kElementCount; ++idx)
    {
        float numeratorBits;
        float denominatorBits;

        // Reinterpret the integer bits as floats; all sizeof(float) bytes are copied.
        std::memcpy(&numeratorBits, val1 + idx, sizeof numeratorBits);
        std::memcpy(&denominatorBits, val2 + idx, sizeof denominatorBits);

        // The author did not know how to de-normalize the float quotient by hand,
        // so the compiler's float -> integer conversion on this store is the one
        // remaining conversion in the loop.
        val3[idx] = numeratorBits / denominatorBits;
    }
}
// Same trick as the float variant, for 32-bit integer values held in 64-bit
// storage: each uint64_t bit pattern is copied unchanged into a double, so
// (assuming IEEE-754 binary64) a small value lands entirely in the mantissa
// bits. The original note asks whether the double's 53 bits are enough.
// Caveats carried over from the original author's notes:
//  - do not build with -ffast-math: it flushes these denormals to zero;
//  - "fp rounding mode: truncation" is required;
//  - do not divide by zero;
//  - warning: 10x speedup reported on the Zen2 architecture.
// The copy assumes double is 8 bytes, the same as uint64_t.
void intMagicTestDouble(uint64_t * const __restrict__ val1, uint64_t * const __restrict__ val2, uint64_t * const __restrict__ val3)
{
    constexpr int kElementCount = 1024;
    for (int idx = 0; idx != kElementCount; ++idx)
    {
        double numeratorBits;
        double denominatorBits;

        // Reinterpret the integer bits as doubles; all sizeof(double) bytes are copied.
        std::memcpy(&numeratorBits, val1 + idx, sizeof numeratorBits);
        std::memcpy(&denominatorBits, val2 + idx, sizeof denominatorBits);

        // The author did not know how to de-normalize the quotient by hand, so the
        // compiler's double -> integer conversion on this store is the one
        // remaining conversion in the loop.
        val3[idx] = numeratorBits / denominatorBits;
    }
}
// Same trick as the double variant, but for uint32_t arrays: each 32-bit
// operand is first widened into a temporary uint64_t, whose bit pattern is then
// copied unchanged into a double, so (assuming IEEE-754 binary64) the value
// lands entirely in the mantissa bits. The original note asks whether the
// double's 53 bits are enough.
// Caveats carried over from the original author's notes:
//  - do not build with -ffast-math: it flushes these denormals to zero;
//  - "fp rounding mode: truncation" is required;
//  - do not divide by zero;
//  - warning: 10x speedup reported on the Zen2 architecture.
// The copy assumes double is 8 bytes, the same as uint64_t.
void intMagicTestDoubleTmp(uint32_t * const __restrict__ val1, uint32_t * const __restrict__ val2, uint32_t * const __restrict__ val3)
{
    constexpr int kElementCount = 1024;
    for (int idx = 0; idx != kElementCount; ++idx)
    {
        // Zero-extend to 64 bits so the bit copy below fills a whole double.
        const uint64_t wideNumerator = val1[idx];
        const uint64_t wideDenominator = val2[idx];

        double numeratorBits;
        double denominatorBits;
        std::memcpy(&numeratorBits, &wideNumerator, sizeof numeratorBits);
        std::memcpy(&denominatorBits, &wideDenominator, sizeof denominatorBits);

        // The author did not know how to de-normalize the quotient by hand, so the
        // compiler's double -> integer conversion on this store is the one
        // remaining conversion in the loop.
        val3[idx] = numeratorBits / denominatorBits;
    }
}
#include <iostream>
#include <cpuid.h> // GCC-provided
// Benchmark driver: fills the input arrays, runs each division variant once
// while timing it with readTSC(), prints the first 10 results of every run,
// then prints the CPU brand string and the elapsed TSC ticks per variant.
// Note: the "int:" measurement runs int64Test on the 64-bit arrays; the 32-bit
// intTest is not called from here.
int main()
{
    // 32-bit inputs (a, b) and output (c).
    uint32_t a[1024],b[1024],c[1024];
    // The fill is repeated 1000 times and mixes clock() into the dividend, so the
    // input depends on a run-time value (presumably also a warm-up; the
    // original does not say).
    for(int k=0;k<1000;k++)
        for(int i=0;i<1024;i++)
        {
            a[i]=1+i*i+clock(); b[i]=1+i;
        }

    // 64-bit inputs (a64, b64) and output (c64).
    uint64_t a64[1024],b64[1024],c64[1024];
    for(int i=0;i<1024;i++)
    {
        a64[i]=1+i*i; b64[i]=1+i;
    }

    std::cout<<"emulation:"<<std::endl;
    auto t1 = readTSC() ;
    intEmulationTest(a,b,c);
    auto t2 = readTSC() ;
    for(int i=0;i<10;i++)
        std::cout<<c[i]<<" "<<std::endl;

    std::cout<<"magic:"<<std::endl;
    auto t3 = readTSC() ;
    intMagicTest(a,b,c);
    auto t4 = readTSC() ;
    for(int i=0;i<10;i++)
        std::cout<<c[i]<<" "<<std::endl;

    std::cout<<"int:"<<std::endl;
    auto t5 = readTSC() ;
    int64Test(a64,b64,c64);
    auto t6 = readTSC() ;
    // This run writes c64, so print c64 (the original printed the stale 32-bit c).
    for(int i=0;i<10;i++)
        std::cout<<c64[i]<<" "<<std::endl;

    std::cout<<"magic double:"<<std::endl;
    auto t7 = readTSC() ;
    intMagicTestDouble(a64,b64,c64);
    auto t8 = readTSC() ;
    // Same here: the results of this run are in c64.
    for(int i=0;i<10;i++)
        std::cout<<c64[i]<<" "<<std::endl;

    std::cout<<"magic double tmp:"<<std::endl;
    auto t9 = readTSC() ;
    intMagicTestDoubleTmp(a,b,c);
    auto t10 = readTSC() ;
    for(int i=0;i<10;i++)
        std::cout<<c[i]<<" "<<std::endl;

    // Assemble the CPU brand string from CPUID leaves 0x80000002..0x80000004
    // (16 bytes each). The buffer is zeroed first so the result is NUL-terminated.
    char CPUBrandString[0x40];
    unsigned int CPUInfo[4] = {0,0,0,0};
    // Leaf 0x80000000 reports the highest supported extended leaf in EAX.
    __cpuid(0x80000000, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
    unsigned int nExIds = CPUInfo[0];
    std::memset(CPUBrandString, 0, sizeof(CPUBrandString));
    for (unsigned int i = 0x80000000; i <= nExIds; ++i)
    {
        __cpuid(i, CPUInfo[0], CPUInfo[1], CPUInfo[2], CPUInfo[3]);
        if (i == 0x80000002)
            std::memcpy(CPUBrandString, CPUInfo, sizeof(CPUInfo));
        else if (i == 0x80000003)
            std::memcpy(CPUBrandString + 16, CPUInfo, sizeof(CPUInfo));
        else if (i == 0x80000004)
            std::memcpy(CPUBrandString + 32, CPUInfo, sizeof(CPUInfo));
    }
    std::cout << "CPU Type: " << CPUBrandString << std::endl;

    // Elapsed time-stamp-counter ticks for each variant.
    std::cout<<"emulation: "<<t2-t1<<" cycles"<<std::endl;
    std::cout<<"magic: "<<t4-t3<<" cycles"<<std::endl;
    std::cout<<"int: "<<t6-t5<<" cycles"<<std::endl;
    std::cout<<"magic double: "<<t8-t7<<" cycles"<<std::endl;
    std::cout<<"magic double tmp: "<<t10-t9<<" cycles"<<std::endl;
    return 0;
}
Write, Run & Share C++ code online using OneCompiler's C++ online compiler for free. It's one of the robust, feature-rich online compilers for the C++ language, running on the latest version 17. Getting started with OneCompiler's C++ compiler is simple and pretty fast. The editor shows sample boilerplate code when you choose C++ as the language and start coding!
OneCompiler's C++ online compiler supports stdin, and users can give inputs to programs using the STDIN textbox under the I/O tab. The following is a sample program that takes a name as input and prints a greeting with that name.
#include <iostream>
#include <string>
using namespace std;
int main()
{
    // Prompt for a name, read one full line from standard input, then greet the user.
    std::string name;
    std::cout << "Enter name:";
    std::getline(std::cin, name);
    std::cout << "Hello " << name;
    return 0;
}
C++ is a widely used middle-level programming language.
Whenever you want to perform a set of operations based on a condition, if-else is used.
if(conditional-expression) {
//code
}
else {
//code
}
You can also use if-else for nested ifs and for an if-else-if ladder when multiple conditions are to be checked on a single variable.
Switch is an alternative to the if-else-if ladder.
switch(conditional-expression){
case value1:
// code
break; // optional
case value2:
// code
break; // optional
......
default:
code to be executed when all the above cases are not matched;
}
A for loop is used to iterate over a set of statements based on a condition.
for(Initialization; Condition; Increment/decrement){
//code
}
A while loop is also used to iterate over a set of statements based on a condition. Usually, while is preferred when the number of iterations is not known in advance.
while (condition) {
// code
}
A do-while loop is also used to iterate over a set of statements based on a condition. It is mostly used when you need to execute the statements at least once.
do {
// code
} while (condition);
A function is a subroutine that contains a set of statements. Functions are usually written when the same set of statements needs to be called multiple times, which increases reusability and modularity. A function runs only when it is called.
return_type function_name(parameters);
function_name (parameters)
return_type function_name(parameters) {
// code
}