Monthly Archive: February 2018

Evaluation of std::chrono

If you have a look around in the web, a solution to correctly measure time is to use a new C++ package: std::chrono , which is part of the standard C++ library.
So the aim of this article is to investigate if this solution can be used to have a very high resolution timer. If you remember well as we are doing small improvement we want to be able to measure the improvement (or degradation). of optimization.

First step is to

#include <chrono>
#include <ratio>
#include <climits>
#include <algorithm>    // std::max
int main()
{
  long long  value = 0;
  double max = LONG_MIN ;
  double min = LONG_MAX;
  for (int i=  1;i<100;i++){

    auto startInitial = std::chrono::high_resolution_clock::now();              
    auto endInitial = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::nano > elapsedInitial = (endInitial - startInitial) ;
    max = std::max(max,elapsedInitial.count());
    min = std::min(min,elapsedInitial.count());
    value=value+elapsedInitial.count();
  }
  std::cout <<"Sum for 100 loop"<<value<<" " <<value/100<<"ns"std::endl;
  std::cout<<" Max:" <<max <<"ns Min:"<<min<<"ns"<<std::endl;
}
fflayol@:/tmp$ g++ test1.c  -std=c++11;./a.out 
Sum for 100 loop: 2235
Mean: 22ns
Max : 53ns
Min : 21ns

This example shows that the call last at means 20ns which is quite too long for our purpose.
Indeed if we are trying to be more accurate:

#include <iostream>
#include <chrono>
#include <ratio>
#include <climits>
#include <algorithm>    // std::max
int main()
{
  {  
    long long  value = 0;
    double max = LONG_MIN ;
    double min = LONG_MAX;
    for (int i=  1;i<100;i++){
  
      auto startInitial = std::chrono::high_resolution_clock::now();              
      auto endInitial = std::chrono::high_resolution_clock::now();
      std::chrono::duration<double, std::nano > elapsedInitial = (endInitial - startInitial) ;
      max = std::max(max,elapsedInitial.count());
      min = std::min(min,elapsedInitial.count());
      value=value+elapsedInitial.count();
    }
    std::cout <<"Sum for 100 loop"<<value<<" " <<value/100<<"ns"<<std::endl;
    std::cout<<" Max:" <<max <<"ns Min:"<<min<<"ns"<<std::endl;
  }
  std::cout <<"Second function"<<std::endl;
  { 
    long long  value = 0;
    double max = LONG_MIN ;
    double min = LONG_MAX;
    for (int i=  1;i<100;i++){
 
      auto startInitial = std::chrono::high_resolution_clock::now();

      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      asm("nop");
      auto endInitial = std::chrono::high_resolution_clock::now();
      std::chrono::duration<double, std::nano > elapsedInitial = (endInitial - startInitial) ;
      max = std::max(max,elapsedInitial.count());
      min = std::min(min,elapsedInitial.count());
      value=value+elapsedInitial.count();
    }
    std::cout <<"Sum for 100 loop"<<value<<" " <<value/100<<"ns"<<std::endl;
    std::cout<<" Max:" <<max <<"ns Min:"<<min<<"ns"<<std::endl;
  }
}

Std Chrono, a high resolution timer ?

#include <iostream>
#include <string>
#include <vector>
#include <functional>
#include <chrono>
#include <smmintrin.h>
#include <unistd.h>
#include <glm.hpp>
#include <gtx/simd_vec4.hpp>
#include <gtx/simd_mat4.hpp>
#include <gtc/type_ptr.hpp>
#include <immintrin.h>


namespace ch = std::chrono;

const int Iter = 1<<28;

void RunBench_GLM()
{
	glm::vec4 v(1.0f);
	glm::vec4 v2;
	glm::mat4 m(1.0f);
	
	for (int i = 0; i < Iter; i++)
	{
		v2 += m * v;
	}

	auto t = v2;
	std::cout << t.x << " " << t.y << " " << t.z << " " << t.w << std::endl;
}


void RunBench_GLM_SIMD()
{
	glm::detail::fvec4SIMD v(1.0f);
	glm::detail::fvec4SIMD v2(0.0f);
	glm::detail::fmat4x4SIMD m(1.0f);

	for (int i = 0; i < Iter; i++)
	{
		v2 += v * m;
	}

	auto t = glm::vec4_cast(v2);
	std::cout << t.x << " " << t.y << " " << t.z << " " << t.w << std::endl;
}



void RunBench_Double_GLM()
{
	glm::dvec4 v(1.0);
	glm::dvec4 v2;
	glm::dmat4 m(1.0);

	for (int i = 0; i < Iter; i++)
	{
		v2 += v * m;
	}

	auto t = v2;
	std::cout << t.x << " " << t.y << " " << t.z << " " << t.w << std::endl;
}

void RunBench_Double_AVX()
{
	__m256d v = _mm256_set_pd(1, 1, 1, 1);
	__m256d s = _mm256_setzero_pd();
	__m256d m[4] =
	{
		_mm256_set_pd(1, 0, 0, 0),
		_mm256_set_pd(0, 1, 0, 0),
		_mm256_set_pd(0, 0, 1, 0),
		_mm256_set_pd(0, 0, 0, 1)
	};

	for (int i = 0; i < Iter; i++)
	{
		__m256d v0 = _mm256_shuffle_pd(v, v, _MM_SHUFFLE(0, 0, 0, 0));
		__m256d v1 = _mm256_shuffle_pd(v, v, _MM_SHUFFLE(1, 1, 1, 1));
		__m256d v2 = _mm256_shuffle_pd(v, v, _MM_SHUFFLE(2, 2, 2, 2));
		__m256d v3 = _mm256_shuffle_pd(v, v, _MM_SHUFFLE(3, 3, 3, 3));

		__m256d m0 = _mm256_mul_pd(m[0], v0);
		__m256d m1 = _mm256_mul_pd(m[1], v1);
		__m256d m2 = _mm256_mul_pd(m[2], v2);
		__m256d m3 = _mm256_mul_pd(m[3], v3);

		__m256d a0 = _mm256_add_pd(m0, m1);
		__m256d a1 = _mm256_add_pd(m2, m3);
		__m256d a2 = _mm256_add_pd(a0, a1);

		s = _mm256_add_pd(s, a2);
	}

	double t[4];
	_mm256_store_pd(t, s);
	std::cout << t[0] << " " << t[1] << " " << t[2] << " " << t[3] << std::endl;
}

int main()
{
	std::vector<std::pair<std::string, std::function<void ()>>> benches;
	benches.push_back(std::make_pair("GLM", RunBench_GLM));
	benches.push_back(std::make_pair("GLM_SIMD", RunBench_GLM_SIMD));
	benches.push_back(std::make_pair("Double_GLM", RunBench_Double_GLM));
	benches.push_back(std::make_pair("Double_AVX", RunBench_Double_AVX));
	auto startInitial = ch::high_resolution_clock::now();
        
        for (int i=0;i<500000;i++){
          asm("NOP");
        }
        
        auto endInitial = ch::high_resolution_clock::now();

	double elapsedInitial = (double)ch::duration_cast<ch::milliseconds>(endInitial - startInitial).count() ;
	std::cout << "resolution :" <<elapsedInitial <<std::endl;
	for (auto& bench : benches)
	{
		std::cout << "Begin [ " << bench.first << " ]" << std::endl;

		auto start = ch::high_resolution_clock::now();
		bench.second();
		auto end = ch::high_resolution_clock::now();	

		double elapsed = (double)ch::duration_cast<ch::milliseconds>(end - start).count() / 1000.0;
		std::cout << "End [ " << bench.first << " ] : " << elapsed << " seconds" << std::endl;
	}
	
	std::cin.get();
	return 0;
}

Shuffle in SSE

#include <immintrin.h>
#include <stdio.h>
void test(int32_t *Y, int32_t *X)
{
        __m128i *v1  __attribute__((aligned (16)));

        __m128i *v2 __attribute__((aligned (16)));
        __m128i v3 __attribute__((aligned (16)));
        __m128i  v4 __attribute__((aligned (16)));
        int32_t * rslt;
        int64_t * rslt64;
        v1 = (__m128i *) X;
        v2 = (__m128i *) Y;
        rslt = (int32_t * ) v1;
      printf("In test, V1  after MUL  SHUFFLE: %d\t%d\t%d\t%d\t\n", rslt[0], rslt[1], rslt[2], rslt[3]);
      rslt = (int32_t * ) v2;
      printf("In test, V2 before  MUL SHUFFLE: %d\t%d\t%d\t%d\t\n", rslt[0], rslt[1], rslt[2], rslt[3]);
        v3 =  _mm_mul_epi32(*v1, *v2);
        v4 = _mm_mul_epi32(_mm_shuffle_epi32(*v1, _MM_SHUFFLE(2, 3, 0, 1)), _mm_shuffle_epi32(*v2, _MM_SHUFFLE(2, 3, 0, 1)));

        rslt64 = (int64_t * ) &v3;
        printf("In REDC, product before SHUFFLE: %ldt%ldn", rslt64[0], rslt64[1]);

        rslt64 = (int64_t * ) &v4;
        printf("In REDC, product after  SHUFFLE: %ldt%ldn", rslt64[0], rslt64[1]);

        rslt = (int32_t * ) v1;
        printf("In REDC, 4-way vect before  SHUFFLE: %dt%dt%dt%dtn", rslt[0], rslt[1], rslt[2], rslt[3]);

        *v1 = _mm_shuffle_epi32(*v1, _MM_SHUFFLE(2, 3, 0, 1));
        rslt = (int32_t * ) v1;
        printf("In REDC, 4-way vect after  SHUFFLE: %dt%dt%dt%dtn", rslt[0], rslt[1], rslt[2], rslt[3]);
}
int main (int nb, char** argv){
   int32_t a = (int32_t)1234;
   int32_t b = (int32_t)5678;
    test(&a,&b);
}

Quest for the ultimate timer framework

A lot stuff on this blog talks about code optimization, and sometime very small improvement that have performance minimal impacts but in case they are called a lot of time it became difficult to ensure optimization are useful. Let’s take an example. Imagine you have a function that lasts one millisecond. You are optimizing this function and as a result you found two solutions to optimize your code . But if you are using a timer that lasts 0.5 milliseconds, you won’t be able to choose one of this other. The aim of this article is to help you to understand ???

Throughput of the algorithm

In this case

Pro:

  • Easy to implement
  • Can cover a full range of service, or the lifetime

Con

  • Can be difficult to implement as you invert the timing (aka how many cycles I’ve done in one minute for instance)
  • Initial conditions can be impossible to reproduce
  • Program should maintain this feature

Example:

  •  Our ethminer with -m option.

Time with linux command

The time command, is an Unix/Linux standard line command. It will use the internal timer to return the time elapsed by the command.

time ls -l 

real	0m0.715s
user	0m0.000s
sys	0m0.004s

You know that in this blog I like to use example. Imagine you have a 3D application that do a lot complex mathematical calculus function (square root, cosines,..). You know that theses functions are called a lot of time (billion per second). As we have already seen a small improvement in this function can have very strong impact on all the program. Now you know two way to implement this calculus in C by using the standard mathematic library or in assembler which is a bit complex to do but you might achieve better performance. The method I’m gonna to present can be also used when you have two implementations of the same feature and you don’t know which one to choose. If you have to choose how fast is the new version, and do it worth the pain to do it in assembler as the code becomes difficult to maintain.

#include <math.h>
inline double Calc_c(double x,double y,double z){
double tmpx = sqrt(x)*cos(x)/sin(x);
double tmpy = sqrt(y)*cos(y)/sin(y);
double tmpz = sqrt(z)*cos(z)/sin(z);
return (tmpx+tmpy+tmpz)*tmpx+tmpy+tmpz
}

inline double Calc_as(double x,double y,double z){

    __m512d a1  _mm512_set4_pd(x,y,z,0.0);
    
}

We know that the assembler version will be faster but to which value ?

 

SHA3

Preimage attack on Keccak-512 reduced to 8 rounds, requiring 2511.5 time and 2508 memory[2]
Zero-sum distinguishers exist for the full 24-round Keccak-f[1600], though they cannot be used to attack the hash function itself[3]

SHA-3 (Secure Hash Algorithm 3) is the latest member of the Secure Hash Algorithm family of standards, released by NIST on August 5, 2015.[4][5] The reference implementation source code was dedicated to public domain via CC0 waiver.[6] Although part of the same series of standards, SHA-3 is internally quite different from the MD5-like structure of SHA-1 and SHA-2.

SHA-3 is a subset of the broader cryptographic primitive family Keccak (/ˈkɛtʃæk/, or /ˈkɛtʃɑːk/),[7][8] designed by Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van Assche,

Keccak is based on a novel approach called sponge construction. Sponge construction is based on a wide random function or random permutation, and allows inputting (“absorbing” in sponge terminology) any amount of data, and outputting (“squeezing”) any amount of data, while acting as a pseudorandom function with regard to all previous inputs. This leads to great flexibility.

SHA-3 uses the sponge construction,[13][23] in which data is “absorbed” into the sponge, then the result is “squeezed” out. In the absorbing phase, message blocks are XORed into a subset of the state, which is then transformed as a whole using a permutation function f. In the “squeeze” phase, output blocks are read from the same subset of the state, alternated with the state transformation function f. The size of the part of the state that is written and read is called the “rate” (denoted r), and the size of the part that is untouched by input/output is called the “capacity” (denoted c). The capacity determines the security of the scheme. The maximum security level is half the capacity.

Given an input bit string N, a padding function pad, a permutation function f that operates on bit blocks of width b, a rate r and an output length d, we have capacity c = b − r and the sponge construction Z = sponge[f,pad,r](N,d), yielding a bit string Z of length d, works as follows:[24]:18

pad the input N using the pad function, yielding a padded bit string P with a length divisible by r (such that n = len(P)/r is integer),
break P into n consecutive r-bit pieces P0, …, Pn-1
initialize the state S to a string of b 0 bits.
absorb the input into the state: For each block Pi,
extend Pi at the end by a string of c 0 bits, yielding one of length b,
XOR that with S and
apply the block permutation f to the result, yielding a new state S
initialize Z to be the empty string
while the length of Z is less than d:
append the first r bits of S to Z
if Z is still less than d bits long, apply f to S, yielding a new state S.
truncate Z to d bits

The fact that the internal state S contains c additional bits of information in addition to what is output to Z prevents the length extension attacks that SHA-2, SHA-1, MD5 and other hashes based on the Merkle–Damgård construction are susceptible to.

In SHA-3, the state S consists of a 5 × 5 array of w = 64-bit words, b = 5 × 5 × w = 5 × 5 × 64 = 1600 bits total. Keccak is also defined for smaller power-of-2 word sizes w down to 1 bit (25 bits total state). Small state sizes can be used to test cryptanalytic attacks, and intermediate state sizes (from w = 8, 200 bits, to w = 32, 800 bits) can be used in practical, lightweight applications.[11][12]

For SHA-3-224, SHA-3-256, SHA-3-384, and SHA-3-512 instances, r is greater than d, so there is no need for additional block permutations in the squeezing phase; the leading d bits of the state are the desired hash. However, SHAKE-128 and SHAKE-256 allow an arbitrary output length, which is useful in applications such as optimal asymmetric encryption padding.
Padding

To ensure the message can be evenly divided into r-bit blocks, padding is required. SHA-3 uses the pattern 10*1 in its padding function: a 1 bit, followed by zero or more 0 bits (maximum r − 1) and a final 1 bit.

The maximum of r − 1 0 bits occurs when the last message block is r − 1 bits long. Then another block is added after the initial 1 bit, containing r − 1 0 bits before the final 1 bit.

The two 1 bits will be added even if the length of the message is already divisible by r.[24]:5.1 In this case, another block is added to the message, containing a 1 bit, followed by a block of r − 2 0 bits and another 1 bit. This is necessary so that a message with length divisible by r ending in something that looks like padding does not produce the same hash as the message with those bits removed.

The initial 1 bit is required so messages differing only in a few additional 0 bits at the end do not produce the same hash.

The position of the final 1 bit indicates which rate r was used (multi-rate padding), which is required for the security proof to work for different hash variants. Without it, different hash variants of the same short message would be the same up to truncation.
The block permutation

The block transformation f, which is Keccak-f[1600] for SHA-3, is a permutation that uses xor, and and not operations, and is designed for easy implementation in both software and hardware.

It is defined for any power-of-two word size, w = 2ℓ bits. The main SHA-3 submission uses 64-bit words, ℓ = 6.

The state can be considered to be a 5 × 5 × w array of bits. Let a[i][ j][k] be bit (5i + j) × w + k of the input, using a little-endian bit numbering convention and row-major indexing. I.e. i selects the row, j the column, and k the bit.

Index arithmetic is performed modulo 5 for the first two dimensions and modulo w for the third.

The basic block permutation function consists of 12 + 2ℓ rounds of five steps, each individually very simple:

θ
Compute the parity of each of the 5w (320, when w = 64) 5-bit columns, and exclusive-or that into two nearby columns in a regular pattern. To be precise, a[i][ j][k] ← a[i][ j][k] ⊕ parity(a[0…4][ j−1][k]) ⊕ parity(a[0…4][ j+1][k−1])
ρ
Bitwise rotate each of the 25 words by a different triangular number 0, 1, 3, 6, 10, 15, …. To be precise, a[0][0] is not rotated, and for all 0 ≤ t < 24, a[i][ j][k] ← a[i][ j][k−(t+1)(t+2)/2], where ( i j ) = ( 3 2 1 0 ) t ( 0 1 ) {\displaystyle {\begin{pmatrix}i\\j\end{pmatrix}}={\begin{pmatrix}3&2\\1&0\end{pmatrix}}^{t}{\begin{pmatrix}0\\1\end{pmatrix}}} {\begin{pmatrix}i\\j\end{pmatrix}}={\begin{pmatrix}3&2\\1&0\end{pmatrix}}^{t}{\begin{pmatrix}0\\1\end{pmatrix}}. π Permute the 25 words in a fixed pattern. a[j][2i+3j] ← a[ i][j]. χ Bitwise combine along rows, using x ← x ⊕ (¬y & z). To be precise, a[i][ j][k] ← a[i][ j][k] ⊕ (¬a[i][ j+1][k] & a[i][ j+2][k]). This is the only non-linear operation in SHA-3. ι Exclusive-or a round constant into one word of the state. To be precise, in round n, for 0 ≤ m ≤ ℓ, a[0][0][2m−1] is exclusive-ORed with bit m + 7n of a degree-8 LFSR sequence. This breaks the symmetry that is preserved by the other steps. Speed The speed of SHA-3 hashing of long messages is dominated by the computation of f = Keccak-f[1600] and XORing S with the extended Pi, an operation on b = 1600 bits. However, since the last c bits of the extended Pi are 0 anyway, and XOR with 0 is a noop, it is sufficient to perform XOR operations only for r bits (r = 1600 − 2 × 224 = 1152 bits for SHA3-224, 1088 bits for SHA3-256, 832 bits for SHA3-384 and 576 bits for SHA3-512). The lower r is (and, conversely, the higher c = b − r = 1600 − r), the less efficient but more secure the hashing becomes since fewer bits of the message can be XORed into the state (a quick operation) before each application of the computationally expensive f. Keccak[c](N, d) = sponge[Keccak-f[1600], pad10*1, r](N, d)[24]:20 Keccak-f[1600] = Keccak-p[1600, 24][24]:17 c is the capacity r is the rate = 1600 − c N is the input bit string