initial commit

2019-10-30 09:02:41 +01:00 · 2019-10-30 09:02:41 +01:00 · 0bee7c5ead
commit 0bee7c5ead
7 changed files with 859 additions and 0 deletions
--- a/ex0.1/README.md
+++ b/ex0.1/README.md
@ -0,0 +1,35 @@
 C++ versus Python performance
 =============================
 This exercise is intended to show explicitly the great difference that can exist between interpreted and compiled software.
 To illustrate the point, a few benchmarks have been lifted from the [benchmarks game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/fastest/gpp-python3.html) site.
 The benchmarks have been coded in such a way as to be algorithmically identical in the different programming languages.
 # Performance measurement #
 The task here is to carry out a performance measurement using the `time` program, running both the C++ and the Python version of each code. A general way to determine the time needed for a program to run is the following:
    time ./myexe args
 where `args` are any run-time arguments to the program. (The `./` prefix is not needed if you have the "current working directory" `.` in your `PATH`; however, doing so is generally considered bad practice, as it is insecure.) Note that all three programs take arguments that will affect the computing time needed. In the Python case you would replace `myexe` with `python3 mypython.py` (note that Python2 will likely fail to work).
 # How to run: C++ #
 The general way to use (on Linux machines) the command-line g++ compiler illustrated below with a fictive source file called mysource.cc .
    g++ mysource.cc -o myexe
 Note that the C++ language standard is (still) evolving, and different programs adhere to different standards. The programs used here conform to the c++11 standard (which appears to be the default setting for the g++ version 7 compiler). Other standards could be specified using the `-std=...` option to g++. Many options to g++ exist; extensive information can be obtained by doing
    man g++
 Note also that instead of a command line tool, other (graphical) tools or *Integrated Development Environments* or IDEs may be used for your code development. As a general principle, the important thing is that whatever you do should work with a straight compilation using g++.
 # Specifics #
 * The `spectral-norm` code uses the OpenMP library to enable *multi-threading*, a topic that we will get to later. To compile this code, the `-fopenmp` compilation flag is needed.
 * The `mandelbrot` code produces a NetPBM graphics output file, but does so by writing it to its standard output. One way to redirect the output is to specify
        time ( ./myexe args > mandelbrot.pbm )
 (The parentheses here are needed to create a sub-shell; without them, the outputs from the `time` command and from `myexe` would get mixed up.)
--- a/ex0.1/mandelbrot/mandelbrot.cc
+++ b/ex0.1/mandelbrot/mandelbrot.cc
@ -0,0 +1,208 @@
 // The Computer Language Benchmarks Game
 // https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 //
 // Contributed by Kevin Miller ( as C code )
 //
 // Ported to C++ with minor changes by Dave Compton
 // Optimized to x86 by Kenta Yoshimura
 //
 // Compile with following g++ flags
 // Use '-O3 -ffp-contract=off -fno-expensive-optimizations' instead of '-Ofast',
 // because FMA is fast, but different precision to original version
 //   -Wall -O3 -ffp-contract=off -fno-expensive-optimizations -march=native -fopenmp --std=c++14 mandelbrot.cpp
 #include <immintrin.h>
 #include <stdio.h>
 #include <stdint.h>
 using namespace std;
 namespace {
 #if defined(__AVX512BW__)
    typedef __m512d Vec;
    Vec vec_init(double value)       { return _mm512_set1_pd(value); }
    bool vec_is_any_le(Vec v, Vec f) { return bool(_mm512_cmp_pd_mask(v, f, _CMP_LE_OS)); }
    int vec_is_le(Vec v1, Vec v2)    { return _mm512_cmp_pd_mask(v1, v2, _CMP_LE_OS); }
    const uint8_t k_bit_rev[] =
    {
        0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
        0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
        0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
        0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
        0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
        0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
        0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
        0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
        0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
        0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
        0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
        0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
        0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
        0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
        0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
        0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
    };
 #elif defined(__AVX__)
    typedef __m256d Vec;
    Vec vec_init(double value)       { return _mm256_set1_pd(value); }
    bool vec_is_any_le(Vec v, Vec f) { Vec m = v<=f; return ! _mm256_testz_pd(m, m); }
    int vec_is_le(Vec v1, Vec v2)    { return _mm256_movemask_pd(v1 <= v2); }
    const uint8_t k_bit_rev[] =
    {
        0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
        0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111
    };
 #elif defined(__SSE4_1__)
    typedef __m128d Vec;
    Vec vec_init(double value)       { return _mm_set1_pd(value); }
    bool vec_is_any_le(Vec v, Vec f) { __m128i m = __m128i(v<=f); return ! _mm_testz_si128(m, m); }
    int vec_is_le(Vec v1, Vec v2)    { return _mm_movemask_pd(v1 <= v2); }
    const uint8_t k_bit_rev[] = { 0b00, 0b10, 0b01, 0b11 };
 #elif defined(__SSSE3__)
    typedef __m128d Vec;
    Vec vec_init(double value)       { return _mm_set1_pd(value); }
    bool vec_is_any_le(Vec v, Vec f) { return bool(_mm_movemask_pd(v<=f)); }
    int vec_is_le(Vec v1, Vec v2)    { return _mm_movemask_pd(v1 <= v2); }
    const uint8_t k_bit_rev[] = { 0b00, 0b10, 0b01, 0b11 };
 #endif
    constexpr int k_vec_size = sizeof(Vec) / sizeof(double);
    // Return true iff all of 8 members of vector v1 is
    // NOT less than or equal to v2.
    bool vec_all_nle(const Vec* v1, Vec v2)
    {
        for ( auto i = 0; i < 8/k_vec_size; i++ ) {
            if ( vec_is_any_le(v1[i], v2) ) {
                return false;
            }
        }
        return true;
    }
    // Return 8 bit value with bits set iff cooresponding
    // member of vector value is less than or equal to limit.
    unsigned pixels(const Vec* value, Vec limit)
    {
        unsigned res = 0;
        for ( auto i = 0; i < 8/k_vec_size; i++ ) {
            res <<= k_vec_size;
            res |= k_bit_rev[vec_is_le(value[i], limit)];
        }
        return res;
    }
    //
    // Do one iteration of mandelbrot calculation for a vector of eight
    // complex values.  Using Vec to work with groups of doubles speeds
    // up computations.
    //
    void calcSum(Vec* real, Vec* imag, Vec* sum, const Vec* init_real, Vec init_imag)
    {
        for ( auto vec = 0; vec < 8/k_vec_size; vec++ ) {
            auto r2 = real[vec] * real[vec];
            auto i2 = imag[vec] * imag[vec];
            auto ri = real[vec] * imag[vec];
            sum[vec] = r2 + i2;
            real[vec]=r2 - i2 + init_real[vec];
            imag[vec]=ri + ri + init_imag;
        }
    }
    //
    // Do 50 iterations of mandelbrot calculation for a vector of eight
    // complex values.  Check occasionally to see if the iterated results
    // have wandered beyond the point of no return (> 4.0).
    //
    unsigned mand8(bool to_prune, const Vec* init_real, Vec init_imag)
    {
        Vec k4_0 = vec_init(4.0);
        Vec real[8 / k_vec_size];
        Vec imag[8 / k_vec_size];
        Vec sum[8 / k_vec_size];
        for ( auto k = 0; k < 8/k_vec_size; k++ ) {
            real[k] = init_real[k];
            imag[k] = init_imag;
        }
        if ( to_prune ) {
            // 4*12 + 2 = 50
            for ( auto j = 0; j < 12; j++ ) {
                for ( auto k = 0; k < 4; k++ ) {
                    calcSum(real, imag, sum, init_real, init_imag);
                }
                if ( vec_all_nle(sum, k4_0) ) {
                    return 0; // prune
                }
            }
            calcSum(real, imag, sum, init_real, init_imag);
            calcSum(real, imag, sum, init_real, init_imag);
        } else {
            // 6*8 + 2 = 50
            for ( auto j = 0; j < 8; j++ ) {
                for ( auto k = 0; k < 6; k++ ) {
                    calcSum(real, imag, sum, init_real, init_imag);
                }
            }
            calcSum(real, imag, sum, init_real, init_imag);
            calcSum(real, imag, sum, init_real, init_imag);
        }
        return pixels(sum, k4_0);
    }
 } // namespace
 int main(int argc, char ** argv)
 {
    // get width/height from arguments
    auto wid_ht = 16000;
    if ( argc >= 2 ) {
        wid_ht = atoi(argv[1]);
    }
    // round up to multiple of 8
    wid_ht = -(-wid_ht & -8);
    auto width = wid_ht;
    auto height = wid_ht;
    // allocate memory for pixels.
    auto dataLength = height*(width>>3);
    auto pixels = new uint8_t[dataLength];
    // calculate initial x values, store in r0
    Vec r0[width / k_vec_size];
    double* r0_ = reinterpret_cast<double*>(r0);
    for ( auto x = 0; x < width; x++ ) {
        r0_[x] = 2.0 / width * x - 1.5;
    }
    // generate the bitmap
    // process 8 pixels (one byte) at a time
    #pragma omp parallel for schedule(guided)
    for ( auto y = 0; y < height; y++ ) {
        // all 8 pixels have same y value (iy).
        auto iy = 2.0 / height *  y - 1.0;
        Vec init_imag = vec_init(iy);
        auto rowstart = y*width/8;
        bool to_prune = false;
        for ( auto x = 0; x < width; x += 8 ) {
            auto res = mand8(to_prune, &r0[x/k_vec_size], init_imag);
            pixels[rowstart + x/8] = res;
            to_prune = ! res;
        }
    }
    // write the data
    printf("P4\n%d %d\n", width, height);
    fwrite(pixels, 1, dataLength, stdout);
    delete[] pixels;
    return 0;
 }
--- a/ex0.1/mandelbrot/mandelbrot.py
+++ b/ex0.1/mandelbrot/mandelbrot.py
@ -0,0 +1,75 @@
 # The Computer Language Benchmarks Game
 # https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 #
 # contributed by Joerg Baumann
 from contextlib import closing
 from itertools import islice
 from os import cpu_count
 from sys import argv, stdout
 def pixels(y, n, abs):
    range7 = bytearray(range(7))
    pixel_bits = bytearray(128 >> pos for pos in range(8))
    c1 = 2. / float(n)
    c0 = -1.5 + 1j * y * c1 - 1j
    x = 0
    while True:
        pixel = 0
        c = x * c1 + c0
        for pixel_bit in pixel_bits:
            z = c
            for _ in range7:
                for _ in range7:
                    z = z * z + c
                if abs(z) >= 2.: break
            else:
                pixel += pixel_bit
            c += c1
        yield pixel
        x += 8
 def compute_row(p):
    y, n = p
    result = bytearray(islice(pixels(y, n, abs), (n + 7) // 8))
    result[-1] &= 0xff << (8 - n % 8)
    return y, result
 def ordered_rows(rows, n):
    order = [None] * n
    i = 0
    j = n
    while i < len(order):
        if j > 0:
            row = next(rows)
            order[row[0]] = row
            j -= 1
        if order[i]:
            yield order[i]
            order[i] = None
            i += 1
 def compute_rows(n, f):
    row_jobs = ((y, n) for y in range(n))
    if cpu_count() < 2:
        yield from map(f, row_jobs)
    else:
        from multiprocessing import Pool
        with Pool() as pool:
            unordered_rows = pool.imap_unordered(f, row_jobs)
            yield from ordered_rows(unordered_rows, n)
 def mandelbrot(n):
    write = stdout.buffer.write
    with closing(compute_rows(n, compute_row)) as rows:
        write("P4\n{0} {0}\n".format(n).encode())
        for row in rows:
            write(row[1])
 if __name__ == '__main__':
    mandelbrot(int(argv[1]))
--- a/ex0.1/nbody/nbody.cc
+++ b/ex0.1/nbody/nbody.cc
@ -0,0 +1,224 @@
 /* The Computer Language Benchmarks Game
   https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
   contributed by Mark C. Lewis
   modified slightly by Chad Whipkey
   converted from java to c++,added sse support, by Branimir Maksimovic
   modified by Vaclav Zeman
   modified by Vaclav Haisman to use explicit SSE2 intrinsics
 */
 #include <cstdio>
 #include <cmath>
 #include <cstdlib>
 #include <array>
 #include <immintrin.h>
 static const double PI = 3.141592653589793;
 static const double SOLAR_MASS = 4 * PI * PI;
 static const double DAYS_PER_YEAR = 365.24;
 class Body {
 public:
    double x, y, z, filler, vx, vy, vz, mass;
    Body(){}
    static Body& jupiter(){
        static Body p;
        p.x = 4.84143144246472090e+00;
        p.y = -1.16032004402742839e+00;
        p.z = -1.03622044471123109e-01;
        p.vx = 1.66007664274403694e-03 * DAYS_PER_YEAR;
        p.vy = 7.69901118419740425e-03 * DAYS_PER_YEAR;
        p.vz = -6.90460016972063023e-05 * DAYS_PER_YEAR;
        p.mass = 9.54791938424326609e-04 * SOLAR_MASS;
        return p;
    }
    static Body& saturn(){
        static Body p;
        p.x = 8.34336671824457987e+00;
        p.y = 4.12479856412430479e+00;
        p.z = -4.03523417114321381e-01;
        p.vx = -2.76742510726862411e-03 * DAYS_PER_YEAR;
        p.vy = 4.99852801234917238e-03 * DAYS_PER_YEAR;
        p.vz = 2.30417297573763929e-05 * DAYS_PER_YEAR;
        p.mass = 2.85885980666130812e-04 * SOLAR_MASS;
        return p;
    }
    static Body& uranus(){
        static Body p;
        p.x = 1.28943695621391310e+01;
        p.y = -1.51111514016986312e+01;
        p.z = -2.23307578892655734e-01;
        p.vx = 2.96460137564761618e-03 * DAYS_PER_YEAR;
        p.vy = 2.37847173959480950e-03 * DAYS_PER_YEAR;
        p.vz = -2.96589568540237556e-05 * DAYS_PER_YEAR;
        p.mass = 4.36624404335156298e-05 * SOLAR_MASS;
        return p;
    }
    static Body& neptune(){
        static Body p;
        p.x = 1.53796971148509165e+01;
        p.y = -2.59193146099879641e+01;
        p.z = 1.79258772950371181e-01;
        p.vx = 2.68067772490389322e-03 * DAYS_PER_YEAR;
        p.vy = 1.62824170038242295e-03 * DAYS_PER_YEAR;
        p.vz = -9.51592254519715870e-05 * DAYS_PER_YEAR;
        p.mass = 5.15138902046611451e-05 * SOLAR_MASS;
        return p;
    }
    static Body& sun(){
        static Body p;
        p.mass = SOLAR_MASS;
        return p;
    }
    Body& offsetMomentum(double px, double py, double pz){
        vx = -px / SOLAR_MASS;
        vy = -py / SOLAR_MASS;
        vz = -pz / SOLAR_MASS;
        return *this;
    }
 };
 class NBodySystem {
 private:
    std::array<Body, 5> bodies;
 public:
    NBodySystem()
        :  bodies {{
            Body::sun(),
            Body::jupiter(),
            Body::saturn(),
            Body::uranus(),
            Body::neptune()
            }}
 	{
        double px = 0.0;
        double py = 0.0;
        double pz = 0.0;
        for(unsigned i=0; i < bodies.size(); ++i) {
            px += bodies[i].vx * bodies[i].mass;
            py += bodies[i].vy * bodies[i].mass;
            pz += bodies[i].vz * bodies[i].mass;
        }
        bodies[0].offsetMomentum(px,py,pz);
    }
    void advance(double dt) {
        const unsigned N = (bodies.size()-1)*bodies.size()/2;
        struct __attribute__((aligned(16))) R {
            double dx,dy,dz,filler;
        };
        static R r[1000];
        static __attribute__((aligned(16))) double mag[1000];
        for(unsigned i=0,k=0; i < bodies.size()-1; ++i) {
            Body& iBody = bodies[i];
            for(unsigned j=i+1; j < bodies.size(); ++j,++k) {
                r[k].dx = iBody.x - bodies[j].x;
                r[k].dy = iBody.y - bodies[j].y;
                r[k].dz = iBody.z - bodies[j].z;
            }
        }
        for(unsigned i=0; i < N; i+=2) {
            __m128d dx,dy,dz;
            dx = _mm_loadl_pd(dx,&r[i].dx);
            dy = _mm_loadl_pd(dy,&r[i].dy);
            dz = _mm_loadl_pd(dz,&r[i].dz);
            dx = _mm_loadh_pd(dx,&r[i+1].dx);
            dy = _mm_loadh_pd(dy,&r[i+1].dy);
            dz = _mm_loadh_pd(dz,&r[i+1].dz);
            //__m128d dSquared = dx*dx + dy*dy + dz*dz;
            __m128d dSquared = _mm_add_pd(
                _mm_add_pd(_mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy)),
                _mm_mul_pd(dz, dz));
            __m128d distance =
                _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(dSquared)));
            for(unsigned j=0;j<2;++j)
            {
                distance = _mm_sub_pd(
                    _mm_mul_pd(distance, _mm_set1_pd(1.5)),
                    _mm_mul_pd(
                        _mm_mul_pd(
                            _mm_mul_pd(_mm_set1_pd(0.5), dSquared),
                             distance),
                        _mm_mul_pd(distance, distance)
                    )
                );
            }
            __m128d dmag = _mm_mul_pd(
                _mm_div_pd(_mm_set1_pd(dt), dSquared), distance);
            _mm_store_pd(&mag[i],dmag);
        }
        for(unsigned i=0,k=0; i < bodies.size()-1; ++i) {
            Body& iBody = bodies[i];
            for(unsigned j=i+1; j < bodies.size(); ++j,++k) {
                iBody.vx -= r[k].dx * bodies[j].mass * mag[k];
                iBody.vy -= r[k].dy * bodies[j].mass * mag[k];
                iBody.vz -= r[k].dz * bodies[j].mass * mag[k];
                bodies[j].vx += r[k].dx * iBody.mass * mag[k];
                bodies[j].vy += r[k].dy * iBody.mass * mag[k];
                bodies[j].vz += r[k].dz * iBody.mass * mag[k];
            }
        }
        for (unsigned i = 0; i < bodies.size(); ++i) {
            bodies[i].x += dt * bodies[i].vx;
            bodies[i].y += dt * bodies[i].vy;
            bodies[i].z += dt * bodies[i].vz;
        }
    }
    double energy(){
        double e = 0.0;
        for (unsigned i=0; i < bodies.size(); ++i) {
            Body const & iBody = bodies[i];
            double dx, dy, dz, distance;
            e += 0.5 * iBody.mass *
                ( iBody.vx * iBody.vx
                    + iBody.vy * iBody.vy
                    + iBody.vz * iBody.vz );
            for (unsigned j=i+1; j < bodies.size(); ++j) {
                Body const & jBody = bodies[j];
                dx = iBody.x - jBody.x;
                dy = iBody.y - jBody.y;
                dz = iBody.z - jBody.z;
                distance = sqrt(dx*dx + dy*dy + dz*dz);
                e -= (iBody.mass * jBody.mass) / distance;
            }
        }
        return e;
    }
 };
 int main(int argc, char** argv) {
    int n = atoi(argv[1]);
    NBodySystem bodies;
    printf("%.9f\n", bodies.energy());
    for (int i=0; i<n; ++i)
        bodies.advance(0.01);
    printf("%.9f\n", bodies.energy());
 }
--- a/ex0.1/nbody/nbody.py
+++ b/ex0.1/nbody/nbody.py
@ -0,0 +1,117 @@
 # The Computer Language Benchmarks Game
 # https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 #
 # originally by Kevin Carson
 # modified by Tupteq, Fredrik Johansson, and Daniel Nanz
 # modified by Maciej Fijalkowski
 # 2to3
 import sys 
 def combinations(l):
    result = []
    for x in range(len(l) - 1):
        ls = l[x+1:]
        for y in ls:
            result.append((l[x],y))
    return result
 PI = 3.14159265358979323
 SOLAR_MASS = 4 * PI * PI
 DAYS_PER_YEAR = 365.24
 BODIES = {
    'sun': ([0.0, 0.0, 0.0], [0.0, 0.0, 0.0], SOLAR_MASS),
    'jupiter': ([4.84143144246472090e+00,
                 -1.16032004402742839e+00,
                 -1.03622044471123109e-01],
                [1.66007664274403694e-03 * DAYS_PER_YEAR,
                 7.69901118419740425e-03 * DAYS_PER_YEAR,
                 -6.90460016972063023e-05 * DAYS_PER_YEAR],
                9.54791938424326609e-04 * SOLAR_MASS),
    'saturn': ([8.34336671824457987e+00,
                4.12479856412430479e+00,
                -4.03523417114321381e-01],
               [-2.76742510726862411e-03 * DAYS_PER_YEAR,
                4.99852801234917238e-03 * DAYS_PER_YEAR,
                2.30417297573763929e-05 * DAYS_PER_YEAR],
               2.85885980666130812e-04 * SOLAR_MASS),
    'uranus': ([1.28943695621391310e+01,
                -1.51111514016986312e+01,
                -2.23307578892655734e-01],
               [2.96460137564761618e-03 * DAYS_PER_YEAR,
                2.37847173959480950e-03 * DAYS_PER_YEAR,
                -2.96589568540237556e-05 * DAYS_PER_YEAR],
               4.36624404335156298e-05 * SOLAR_MASS),
    'neptune': ([1.53796971148509165e+01,
                 -2.59193146099879641e+01,
                 1.79258772950371181e-01],
                [2.68067772490389322e-03 * DAYS_PER_YEAR,
                 1.62824170038242295e-03 * DAYS_PER_YEAR,
                 -9.51592254519715870e-05 * DAYS_PER_YEAR],
                5.15138902046611451e-05 * SOLAR_MASS) }
 SYSTEM = list(BODIES.values())
 PAIRS = combinations(SYSTEM)
 def advance(dt, n, bodies=SYSTEM, pairs=PAIRS):
    for i in range(n):
        for (([x1, y1, z1], v1, m1),
             ([x2, y2, z2], v2, m2)) in pairs:
            dx = x1 - x2
            dy = y1 - y2
            dz = z1 - z2
            mag = dt * ((dx * dx + dy * dy + dz * dz) ** (-1.5))
            b1m = m1 * mag
            b2m = m2 * mag
            v1[0] -= dx * b2m
            v1[1] -= dy * b2m
            v1[2] -= dz * b2m
            v2[0] += dx * b1m
            v2[1] += dy * b1m
            v2[2] += dz * b1m
        for (r, [vx, vy, vz], m) in bodies:
            r[0] += dt * vx
            r[1] += dt * vy
            r[2] += dt * vz
 def report_energy(bodies=SYSTEM, pairs=PAIRS, e=0.0):
    for (((x1, y1, z1), v1, m1),
         ((x2, y2, z2), v2, m2)) in pairs:
        dx = x1 - x2
        dy = y1 - y2
        dz = z1 - z2
        e -= (m1 * m2) / ((dx * dx + dy * dy + dz * dz) ** 0.5)
    for (r, [vx, vy, vz], m) in bodies:
        e += m * (vx * vx + vy * vy + vz * vz) / 2.
    print("%.9f" % e)
 def offset_momentum(ref, bodies=SYSTEM, px=0.0, py=0.0, pz=0.0):
    for (r, [vx, vy, vz], m) in bodies:
        px -= vx * m
        py -= vy * m
        pz -= vz * m
    (r, v, m) = ref
    v[0] = px / m
    v[1] = py / m
    v[2] = pz / m
 def main(n, ref='sun'):
    offset_momentum(BODIES[ref])
    report_energy()
    advance(0.01, n)
    report_energy()
 if __name__ == '__main__':
    main(int(sys.argv[1]))
--- a/ex0.1/spectral-norm/spectral-norm.cc
+++ b/ex0.1/spectral-norm/spectral-norm.cc
@ -0,0 +1,136 @@
 // The Computer Language Benchmarks Game
 // https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 //
 // Original C contributed by Sebastien Loisel
 // Conversion to C++ by Jon Harrop
 // OpenMP parallelize by The Anh Tran
 // Add SSE by The Anh Tran
 // Additional SSE optimization by Krzysztof Jakubowski
 // g++ -pipe -O3 -march=native -fopenmp -mfpmath=sse -msse2 \
 //     ./spec.c++ -o ./spec.run
 #include <cmath>
 #include <cstdlib>
 #include <cstdio>
 #include <sched.h>
 #include <omp.h>
 #include <emmintrin.h>
 template <bool modei> int Index(int i, int j) {
    return (((i + j) * (i + j + 1)) >> 1) + (modei? i : j) + 1;
 }
 template <bool modei>
 void EvalPart(double *__restrict__ src, double *__restrict__ dst,
                int begin, int end, int length) {
    int i = begin;
    for(; i + 1 < end; i += 2) {
        __m128d sum = _mm_set_pd(
 			src[0] / double(Index<modei>(i + 1, 0)),
 			src[0] / double(Index<modei>(i + 0, 0)));
 		__m128d ti = modei?
 			_mm_set_pd(i + 1, i + 0) :
 			_mm_set_pd(i + 2, i + 1);	
 		__m128d last = _mm_set_pd(
 			Index<modei>(i + 1, 0),
 			Index<modei>(i + 0, 0));
        for(int j = 1; j < length; j++) {
 			__m128d idx = last + ti + _mm_set1_pd(j);
 			last = idx;
            sum = sum + _mm_set1_pd(src[j]) / idx;
        }
        _mm_storeu_pd(dst + i + 0, sum);
    }
    for(; i < end; i++) {
        double sum = 0;
        for (int j = 0; j < length; j++)
            sum += src[j] / double(Index<modei>(i, j));
        dst[i] = sum;
    }
 }
 void EvalATimesU(double *src, double *dst, int begin, int end, int N) {
    EvalPart<1>(src, dst, begin, end, N);
 }
 void EvalAtTimesU(double *src, double *dst, int begin, int end, int N) {
    EvalPart<0>(src, dst, begin, end, N);
 }
 void EvalAtATimesU(double *src, double *dst, double *tmp,
                   int begin, int end, int N) {
    EvalATimesU (src, tmp, begin, end, N);
    #pragma omp barrier
    EvalAtTimesU(tmp, dst, begin, end, N);
    #pragma omp barrier
 }
 int GetThreadCount() {
    cpu_set_t cs;
    CPU_ZERO(&cs);
    sched_getaffinity(0, sizeof(cs), &cs);
    int count = 0;
    for (int i = 0; i < CPU_SETSIZE; ++i)
        if (CPU_ISSET(i, &cs))
            ++count;
    return count;
 }
 double spectral_game(int N) {
    __attribute__((aligned(16))) double u[N];
    __attribute__((aligned(16))) double v[N], tmp[N];
    double vBv = 0.0;
    double vv = 0.0;
    #pragma omp parallel default(shared) num_threads(GetThreadCount())
    {
        // this block will be executed by NUM_THREADS
        // variable declared in this block is private for each thread
        int threadid = omp_get_thread_num();
        int threadcount = omp_get_num_threads();
        int chunk = N / threadcount;
        // calculate each thread's working range [r1 .. r2) => static schedule
        int begin = threadid * chunk;
        int end = (threadid < (threadcount -1)) ? (begin + chunk) : N;
        for(int i = begin; i < end; i++)
            u[i] = 1.0;
        #pragma omp barrier
        for (int ite = 0; ite < 10; ++ite) {
            EvalAtATimesU(u, v, tmp, begin, end, N);
            EvalAtATimesU(v, u, tmp, begin, end, N);
        }
        double sumvb = 0.0, sumvv = 0.0;
        for (int i = begin; i < end; i++) {
            sumvv += v[i] * v[i];
            sumvb += u[i] * v[i];
        }
        #pragma omp critical
        {
            vBv += sumvb;
            vv += sumvv;
        }
    }
    return sqrt(vBv / vv);
 }
 int main(int argc, char *argv[]) {
    int N = ((argc >= 2) ? atoi(argv[1]) : 2000);
    printf("%.9f\n", spectral_game(N));
    return 0;
 }
--- a/ex0.1/spectral-norm/spectral-norm.py
+++ b/ex0.1/spectral-norm/spectral-norm.py
@ -0,0 +1,64 @@
 # The Computer Language Benchmarks Game
 # https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 #
 # Contributed by Sebastien Loisel
 # Fixed by Isaac Gouy
 # Sped up by Josh Goldfoot
 # Dirtily sped up by Simon Descarpentries
 # Used list comprehension by Vadim Zelenin
 # 2to3
 from math      import sqrt
 from sys       import argv
 def eval_A(i, j):
    ij = i+j
    return 1.0 / (ij * (ij + 1) / 2 + i + 1)
 def eval_A_times_u(u):
    local_eval_A = eval_A
    return [ sum([ local_eval_A(i, j) * u_j
                   for j, u_j in enumerate(u)
                 ]
                )
             for i in range(len(u))
           ]
 def eval_At_times_u(u):
    local_eval_A = eval_A
    return [ sum([ local_eval_A(j, i) * u_j
                   for j, u_j in enumerate(u)
                 ]
                )
             for i in range(len(u))
           ]
 def eval_AtA_times_u(u):
    return eval_At_times_u(eval_A_times_u(u))
 def main():
    n = int(argv[1])
    u = [1] * n
    local_eval_AtA_times_u = eval_AtA_times_u
    for dummy in range(10):
        v = local_eval_AtA_times_u(u)
        u = local_eval_AtA_times_u(v)
    vBv = vv = 0
    for ue, ve in zip(u, v):
        vBv += ue * ve
        vv  += ve * ve
    print("%0.9f" % (sqrt(vBv/vv)))
 main()