Browse Source

initial commit

master
Frank Filthaut 2 years ago
commit
0bee7c5ead
  1. 35
      ex0.1/README.md
  2. 208
      ex0.1/mandelbrot/mandelbrot.cc
  3. 75
      ex0.1/mandelbrot/mandelbrot.py
  4. 224
      ex0.1/nbody/nbody.cc
  5. 117
      ex0.1/nbody/nbody.py
  6. 136
      ex0.1/spectral-norm/spectral-norm.cc
  7. 64
      ex0.1/spectral-norm/spectral-norm.py

35
ex0.1/README.md

@ -0,0 +1,35 @@
C++ versus Python performance
=============================
This exercise is intended to show explicitly the great difference that can exist between interpreted and compiled software.
To illustrate the point, a few benchmarks have been lifted from the [benchmarks game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/fastest/gpp-python3.html) site.
The benchmarks have been coded in such a way as to be algorithmically identical in the different programming languages.
# Performance measurement #
The task here is to carry out a performance measurement using the `time` program, running both the C++ and the Python version of each code. A general way to determine the time needed for a program to run is the following:
time ./myexe args
where `args` are any run-time arguments to the program. (The `./` prefix is not needed if you have the "current working directory" `.` in your `PATH`; however, doing so is generally considered bad practice, as it is insecure.) Note that all three programs take arguments that will affect the computing time needed. In the Python case you would replace `myexe` with `python3 mypython.py` (note that Python2 will likely fail to work).
# How to run: C++ #
The general way to use (on Linux machines) the command-line g++ compiler illustrated below with a fictive source file called mysource.cc .
g++ mysource.cc -o myexe
Note that the C++ language standard is (still) evolving, and different programs adhere to different standards. The programs used here conform to the c++11 standard (which appears to be the default setting for the g++ version 7 compiler). Other standards could be specified using the `-std=...` option to g++. Many options to g++ exist; extensive information can be obtained by doing
man g++
Note also that instead of a command line tool, other (graphical) tools or *Integrated Development Environments* or IDEs may be used for your code development. As a general principle, the important thing is that whatever you do should work with a straight compilation using g++.
# Specifics #
* The `spectral-norm` code uses the OpenMP library to enable *multi-threading*, a topic that we will get to later. To compile this code, the `-fopenmp` compilation flag is needed.
* The `mandelbrot` code produces a NetPBM graphics output file, but does so by writing it to its standard output. One way to redirect the output is to specify
time ( ./myexe args > mandelbrot.pbm )
(The parentheses here are needed to create a sub-shell; without them, the outputs from the `time` command and from `myexe` would get mixed up.)

208
ex0.1/mandelbrot/mandelbrot.cc

@ -0,0 +1,208 @@
// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// Contributed by Kevin Miller ( as C code )
//
// Ported to C++ with minor changes by Dave Compton
// Optimized to x86 by Kenta Yoshimura
//
// Compile with following g++ flags
// Use '-O3 -ffp-contract=off -fno-expensive-optimizations' instead of '-Ofast',
// because FMA is fast, but different precision to original version
// -Wall -O3 -ffp-contract=off -fno-expensive-optimizations -march=native -fopenmp --std=c++14 mandelbrot.cpp
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>
using namespace std;
namespace {
#if defined(__AVX512BW__)
typedef __m512d Vec;
Vec vec_init(double value) { return _mm512_set1_pd(value); }
bool vec_is_any_le(Vec v, Vec f) { return bool(_mm512_cmp_pd_mask(v, f, _CMP_LE_OS)); }
int vec_is_le(Vec v1, Vec v2) { return _mm512_cmp_pd_mask(v1, v2, _CMP_LE_OS); }
const uint8_t k_bit_rev[] =
{
0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};
#elif defined(__AVX__)
typedef __m256d Vec;
Vec vec_init(double value) { return _mm256_set1_pd(value); }
bool vec_is_any_le(Vec v, Vec f) { Vec m = v<=f; return ! _mm256_testz_pd(m, m); }
int vec_is_le(Vec v1, Vec v2) { return _mm256_movemask_pd(v1 <= v2); }
const uint8_t k_bit_rev[] =
{
0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111
};
#elif defined(__SSE4_1__)
typedef __m128d Vec;
Vec vec_init(double value) { return _mm_set1_pd(value); }
bool vec_is_any_le(Vec v, Vec f) { __m128i m = __m128i(v<=f); return ! _mm_testz_si128(m, m); }
int vec_is_le(Vec v1, Vec v2) { return _mm_movemask_pd(v1 <= v2); }
const uint8_t k_bit_rev[] = { 0b00, 0b10, 0b01, 0b11 };
#elif defined(__SSSE3__)
typedef __m128d Vec;
Vec vec_init(double value) { return _mm_set1_pd(value); }
bool vec_is_any_le(Vec v, Vec f) { return bool(_mm_movemask_pd(v<=f)); }
int vec_is_le(Vec v1, Vec v2) { return _mm_movemask_pd(v1 <= v2); }
const uint8_t k_bit_rev[] = { 0b00, 0b10, 0b01, 0b11 };
#endif
constexpr int k_vec_size = sizeof(Vec) / sizeof(double);
// Return true iff all of 8 members of vector v1 is
// NOT less than or equal to v2.
bool vec_all_nle(const Vec* v1, Vec v2)
{
for ( auto i = 0; i < 8/k_vec_size; i++ ) {
if ( vec_is_any_le(v1[i], v2) ) {
return false;
}
}
return true;
}
// Return 8 bit value with bits set iff cooresponding
// member of vector value is less than or equal to limit.
unsigned pixels(const Vec* value, Vec limit)
{
unsigned res = 0;
for ( auto i = 0; i < 8/k_vec_size; i++ ) {
res <<= k_vec_size;
res |= k_bit_rev[vec_is_le(value[i], limit)];
}
return res;
}
//
// Do one iteration of mandelbrot calculation for a vector of eight
// complex values. Using Vec to work with groups of doubles speeds
// up computations.
//
void calcSum(Vec* real, Vec* imag, Vec* sum, const Vec* init_real, Vec init_imag)
{
for ( auto vec = 0; vec < 8/k_vec_size; vec++ ) {
auto r2 = real[vec] * real[vec];
auto i2 = imag[vec] * imag[vec];
auto ri = real[vec] * imag[vec];
sum[vec] = r2 + i2;
real[vec]=r2 - i2 + init_real[vec];
imag[vec]=ri + ri + init_imag;
}
}
//
// Do 50 iterations of mandelbrot calculation for a vector of eight
// complex values. Check occasionally to see if the iterated results
// have wandered beyond the point of no return (> 4.0).
//
unsigned mand8(bool to_prune, const Vec* init_real, Vec init_imag)
{
Vec k4_0 = vec_init(4.0);
Vec real[8 / k_vec_size];
Vec imag[8 / k_vec_size];
Vec sum[8 / k_vec_size];
for ( auto k = 0; k < 8/k_vec_size; k++ ) {
real[k] = init_real[k];
imag[k] = init_imag;
}
if ( to_prune ) {
// 4*12 + 2 = 50
for ( auto j = 0; j < 12; j++ ) {
for ( auto k = 0; k < 4; k++ ) {
calcSum(real, imag, sum, init_real, init_imag);
}
if ( vec_all_nle(sum, k4_0) ) {
return 0; // prune
}
}
calcSum(real, imag, sum, init_real, init_imag);
calcSum(real, imag, sum, init_real, init_imag);
} else {
// 6*8 + 2 = 50
for ( auto j = 0; j < 8; j++ ) {
for ( auto k = 0; k < 6; k++ ) {
calcSum(real, imag, sum, init_real, init_imag);
}
}
calcSum(real, imag, sum, init_real, init_imag);
calcSum(real, imag, sum, init_real, init_imag);
}
return pixels(sum, k4_0);
}
} // namespace
int main(int argc, char ** argv)
{
// get width/height from arguments
auto wid_ht = 16000;
if ( argc >= 2 ) {
wid_ht = atoi(argv[1]);
}
// round up to multiple of 8
wid_ht = -(-wid_ht & -8);
auto width = wid_ht;
auto height = wid_ht;
// allocate memory for pixels.
auto dataLength = height*(width>>3);
auto pixels = new uint8_t[dataLength];
// calculate initial x values, store in r0
Vec r0[width / k_vec_size];
double* r0_ = reinterpret_cast<double*>(r0);
for ( auto x = 0; x < width; x++ ) {
r0_[x] = 2.0 / width * x - 1.5;
}
// generate the bitmap
// process 8 pixels (one byte) at a time
#pragma omp parallel for schedule(guided)
for ( auto y = 0; y < height; y++ ) {
// all 8 pixels have same y value (iy).
auto iy = 2.0 / height * y - 1.0;
Vec init_imag = vec_init(iy);
auto rowstart = y*width/8;
bool to_prune = false;
for ( auto x = 0; x < width; x += 8 ) {
auto res = mand8(to_prune, &r0[x/k_vec_size], init_imag);
pixels[rowstart + x/8] = res;
to_prune = ! res;
}
}
// write the data
printf("P4\n%d %d\n", width, height);
fwrite(pixels, 1, dataLength, stdout);
delete[] pixels;
return 0;
}

75
ex0.1/mandelbrot/mandelbrot.py

@ -0,0 +1,75 @@
# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# contributed by Joerg Baumann
from contextlib import closing
from itertools import islice
from os import cpu_count
from sys import argv, stdout
def pixels(y, n, abs):
range7 = bytearray(range(7))
pixel_bits = bytearray(128 >> pos for pos in range(8))
c1 = 2. / float(n)
c0 = -1.5 + 1j * y * c1 - 1j
x = 0
while True:
pixel = 0
c = x * c1 + c0
for pixel_bit in pixel_bits:
z = c
for _ in range7:
for _ in range7:
z = z * z + c
if abs(z) >= 2.: break
else:
pixel += pixel_bit
c += c1
yield pixel
x += 8
def compute_row(p):
y, n = p
result = bytearray(islice(pixels(y, n, abs), (n + 7) // 8))
result[-1] &= 0xff << (8 - n % 8)
return y, result
def ordered_rows(rows, n):
order = [None] * n
i = 0
j = n
while i < len(order):
if j > 0:
row = next(rows)
order[row[0]] = row
j -= 1
if order[i]:
yield order[i]
order[i] = None
i += 1
def compute_rows(n, f):
row_jobs = ((y, n) for y in range(n))
if cpu_count() < 2:
yield from map(f, row_jobs)
else:
from multiprocessing import Pool
with Pool() as pool:
unordered_rows = pool.imap_unordered(f, row_jobs)
yield from ordered_rows(unordered_rows, n)
def mandelbrot(n):
write = stdout.buffer.write
with closing(compute_rows(n, compute_row)) as rows:
write("P4\n{0} {0}\n".format(n).encode())
for row in rows:
write(row[1])
if __name__ == '__main__':
mandelbrot(int(argv[1]))

224
ex0.1/nbody/nbody.cc

@ -0,0 +1,224 @@
/* The Computer Language Benchmarks Game
https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
contributed by Mark C. Lewis
modified slightly by Chad Whipkey
converted from java to c++,added sse support, by Branimir Maksimovic
modified by Vaclav Zeman
modified by Vaclav Haisman to use explicit SSE2 intrinsics
*/
#include <cstdio>
#include <cmath>
#include <cstdlib>
#include <array>
#include <immintrin.h>
static const double PI = 3.141592653589793;
static const double SOLAR_MASS = 4 * PI * PI;
static const double DAYS_PER_YEAR = 365.24;
class Body {
public:
double x, y, z, filler, vx, vy, vz, mass;
Body(){}
static Body& jupiter(){
static Body p;
p.x = 4.84143144246472090e+00;
p.y = -1.16032004402742839e+00;
p.z = -1.03622044471123109e-01;
p.vx = 1.66007664274403694e-03 * DAYS_PER_YEAR;
p.vy = 7.69901118419740425e-03 * DAYS_PER_YEAR;
p.vz = -6.90460016972063023e-05 * DAYS_PER_YEAR;
p.mass = 9.54791938424326609e-04 * SOLAR_MASS;
return p;
}
static Body& saturn(){
static Body p;
p.x = 8.34336671824457987e+00;
p.y = 4.12479856412430479e+00;
p.z = -4.03523417114321381e-01;
p.vx = -2.76742510726862411e-03 * DAYS_PER_YEAR;
p.vy = 4.99852801234917238e-03 * DAYS_PER_YEAR;
p.vz = 2.30417297573763929e-05 * DAYS_PER_YEAR;
p.mass = 2.85885980666130812e-04 * SOLAR_MASS;
return p;
}
static Body& uranus(){
static Body p;
p.x = 1.28943695621391310e+01;
p.y = -1.51111514016986312e+01;
p.z = -2.23307578892655734e-01;
p.vx = 2.96460137564761618e-03 * DAYS_PER_YEAR;
p.vy = 2.37847173959480950e-03 * DAYS_PER_YEAR;
p.vz = -2.96589568540237556e-05 * DAYS_PER_YEAR;
p.mass = 4.36624404335156298e-05 * SOLAR_MASS;
return p;
}
static Body& neptune(){
static Body p;
p.x = 1.53796971148509165e+01;
p.y = -2.59193146099879641e+01;
p.z = 1.79258772950371181e-01;
p.vx = 2.68067772490389322e-03 * DAYS_PER_YEAR;
p.vy = 1.62824170038242295e-03 * DAYS_PER_YEAR;
p.vz = -9.51592254519715870e-05 * DAYS_PER_YEAR;
p.mass = 5.15138902046611451e-05 * SOLAR_MASS;
return p;
}
static Body& sun(){
static Body p;
p.mass = SOLAR_MASS;
return p;
}
Body& offsetMomentum(double px, double py, double pz){
vx = -px / SOLAR_MASS;
vy = -py / SOLAR_MASS;
vz = -pz / SOLAR_MASS;
return *this;
}
};
class NBodySystem {
private:
std::array<Body, 5> bodies;
public:
NBodySystem()
: bodies {{
Body::sun(),
Body::jupiter(),
Body::saturn(),
Body::uranus(),
Body::neptune()
}}
{
double px = 0.0;
double py = 0.0;
double pz = 0.0;
for(unsigned i=0; i < bodies.size(); ++i) {
px += bodies[i].vx * bodies[i].mass;
py += bodies[i].vy * bodies[i].mass;
pz += bodies[i].vz * bodies[i].mass;
}
bodies[0].offsetMomentum(px,py,pz);
}
void advance(double dt) {
const unsigned N = (bodies.size()-1)*bodies.size()/2;
struct __attribute__((aligned(16))) R {
double dx,dy,dz,filler;
};
static R r[1000];
static __attribute__((aligned(16))) double mag[1000];
for(unsigned i=0,k=0; i < bodies.size()-1; ++i) {
Body& iBody = bodies[i];
for(unsigned j=i+1; j < bodies.size(); ++j,++k) {
r[k].dx = iBody.x - bodies[j].x;
r[k].dy = iBody.y - bodies[j].y;
r[k].dz = iBody.z - bodies[j].z;
}
}
for(unsigned i=0; i < N; i+=2) {
__m128d dx,dy,dz;
dx = _mm_loadl_pd(dx,&r[i].dx);
dy = _mm_loadl_pd(dy,&r[i].dy);
dz = _mm_loadl_pd(dz,&r[i].dz);
dx = _mm_loadh_pd(dx,&r[i+1].dx);
dy = _mm_loadh_pd(dy,&r[i+1].dy);
dz = _mm_loadh_pd(dz,&r[i+1].dz);
//__m128d dSquared = dx*dx + dy*dy + dz*dz;
__m128d dSquared = _mm_add_pd(
_mm_add_pd(_mm_mul_pd(dx, dx), _mm_mul_pd(dy, dy)),
_mm_mul_pd(dz, dz));
__m128d distance =
_mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(dSquared)));
for(unsigned j=0;j<2;++j)
{
distance = _mm_sub_pd(
_mm_mul_pd(distance, _mm_set1_pd(1.5)),
_mm_mul_pd(
_mm_mul_pd(
_mm_mul_pd(_mm_set1_pd(0.5), dSquared),
distance),
_mm_mul_pd(distance, distance)
)
);
}
__m128d dmag = _mm_mul_pd(
_mm_div_pd(_mm_set1_pd(dt), dSquared), distance);
_mm_store_pd(&mag[i],dmag);
}
for(unsigned i=0,k=0; i < bodies.size()-1; ++i) {
Body& iBody = bodies[i];
for(unsigned j=i+1; j < bodies.size(); ++j,++k) {
iBody.vx -= r[k].dx * bodies[j].mass * mag[k];
iBody.vy -= r[k].dy * bodies[j].mass * mag[k];
iBody.vz -= r[k].dz * bodies[j].mass * mag[k];
bodies[j].vx += r[k].dx * iBody.mass * mag[k];
bodies[j].vy += r[k].dy * iBody.mass * mag[k];
bodies[j].vz += r[k].dz * iBody.mass * mag[k];
}
}
for (unsigned i = 0; i < bodies.size(); ++i) {
bodies[i].x += dt * bodies[i].vx;
bodies[i].y += dt * bodies[i].vy;
bodies[i].z += dt * bodies[i].vz;
}
}
double energy(){
double e = 0.0;
for (unsigned i=0; i < bodies.size(); ++i) {
Body const & iBody = bodies[i];
double dx, dy, dz, distance;
e += 0.5 * iBody.mass *
( iBody.vx * iBody.vx
+ iBody.vy * iBody.vy
+ iBody.vz * iBody.vz );
for (unsigned j=i+1; j < bodies.size(); ++j) {
Body const & jBody = bodies[j];
dx = iBody.x - jBody.x;
dy = iBody.y - jBody.y;
dz = iBody.z - jBody.z;
distance = sqrt(dx*dx + dy*dy + dz*dz);
e -= (iBody.mass * jBody.mass) / distance;
}
}
return e;
}
};
int main(int argc, char** argv) {
int n = atoi(argv[1]);
NBodySystem bodies;
printf("%.9f\n", bodies.energy());
for (int i=0; i<n; ++i)
bodies.advance(0.01);
printf("%.9f\n", bodies.energy());
}

117
ex0.1/nbody/nbody.py

@ -0,0 +1,117 @@
# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# originally by Kevin Carson
# modified by Tupteq, Fredrik Johansson, and Daniel Nanz
# modified by Maciej Fijalkowski
# 2to3
import sys
def combinations(l):
result = []
for x in range(len(l) - 1):
ls = l[x+1:]
for y in ls:
result.append((l[x],y))
return result
PI = 3.14159265358979323
SOLAR_MASS = 4 * PI * PI
DAYS_PER_YEAR = 365.24
BODIES = {
'sun': ([0.0, 0.0, 0.0], [0.0, 0.0, 0.0], SOLAR_MASS),
'jupiter': ([4.84143144246472090e+00,
-1.16032004402742839e+00,
-1.03622044471123109e-01],
[1.66007664274403694e-03 * DAYS_PER_YEAR,
7.69901118419740425e-03 * DAYS_PER_YEAR,
-6.90460016972063023e-05 * DAYS_PER_YEAR],
9.54791938424326609e-04 * SOLAR_MASS),
'saturn': ([8.34336671824457987e+00,
4.12479856412430479e+00,
-4.03523417114321381e-01],
[-2.76742510726862411e-03 * DAYS_PER_YEAR,
4.99852801234917238e-03 * DAYS_PER_YEAR,
2.30417297573763929e-05 * DAYS_PER_YEAR],
2.85885980666130812e-04 * SOLAR_MASS),
'uranus': ([1.28943695621391310e+01,
-1.51111514016986312e+01,
-2.23307578892655734e-01],
[2.96460137564761618e-03 * DAYS_PER_YEAR,
2.37847173959480950e-03 * DAYS_PER_YEAR,
-2.96589568540237556e-05 * DAYS_PER_YEAR],
4.36624404335156298e-05 * SOLAR_MASS),
'neptune': ([1.53796971148509165e+01,
-2.59193146099879641e+01,
1.79258772950371181e-01],
[2.68067772490389322e-03 * DAYS_PER_YEAR,
1.62824170038242295e-03 * DAYS_PER_YEAR,
-9.51592254519715870e-05 * DAYS_PER_YEAR],
5.15138902046611451e-05 * SOLAR_MASS) }
SYSTEM = list(BODIES.values())
PAIRS = combinations(SYSTEM)
def advance(dt, n, bodies=SYSTEM, pairs=PAIRS):
for i in range(n):
for (([x1, y1, z1], v1, m1),
([x2, y2, z2], v2, m2)) in pairs:
dx = x1 - x2
dy = y1 - y2
dz = z1 - z2
mag = dt * ((dx * dx + dy * dy + dz * dz) ** (-1.5))
b1m = m1 * mag
b2m = m2 * mag
v1[0] -= dx * b2m
v1[1] -= dy * b2m
v1[2] -= dz * b2m
v2[0] += dx * b1m
v2[1] += dy * b1m
v2[2] += dz * b1m
for (r, [vx, vy, vz], m) in bodies:
r[0] += dt * vx
r[1] += dt * vy
r[2] += dt * vz
def report_energy(bodies=SYSTEM, pairs=PAIRS, e=0.0):
for (((x1, y1, z1), v1, m1),
((x2, y2, z2), v2, m2)) in pairs:
dx = x1 - x2
dy = y1 - y2
dz = z1 - z2
e -= (m1 * m2) / ((dx * dx + dy * dy + dz * dz) ** 0.5)
for (r, [vx, vy, vz], m) in bodies:
e += m * (vx * vx + vy * vy + vz * vz) / 2.
print("%.9f" % e)
def offset_momentum(ref, bodies=SYSTEM, px=0.0, py=0.0, pz=0.0):
for (r, [vx, vy, vz], m) in bodies:
px -= vx * m
py -= vy * m
pz -= vz * m
(r, v, m) = ref
v[0] = px / m
v[1] = py / m
v[2] = pz / m
def main(n, ref='sun'):
offset_momentum(BODIES[ref])
report_energy()
advance(0.01, n)
report_energy()
if __name__ == '__main__':
main(int(sys.argv[1]))

136
ex0.1/spectral-norm/spectral-norm.cc

@ -0,0 +1,136 @@
// The Computer Language Benchmarks Game
// https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
//
// Original C contributed by Sebastien Loisel
// Conversion to C++ by Jon Harrop
// OpenMP parallelize by The Anh Tran
// Add SSE by The Anh Tran
// Additional SSE optimization by Krzysztof Jakubowski
// g++ -pipe -O3 -march=native -fopenmp -mfpmath=sse -msse2 \
// ./spec.c++ -o ./spec.run
#include <cmath>
#include <cstdlib>
#include <cstdio>
#include <sched.h>
#include <omp.h>
#include <emmintrin.h>
template <bool modei> int Index(int i, int j) {
return (((i + j) * (i + j + 1)) >> 1) + (modei? i : j) + 1;
}
template <bool modei>
void EvalPart(double *__restrict__ src, double *__restrict__ dst,
int begin, int end, int length) {
int i = begin;
for(; i + 1 < end; i += 2) {
__m128d sum = _mm_set_pd(
src[0] / double(Index<modei>(i + 1, 0)),
src[0] / double(Index<modei>(i + 0, 0)));
__m128d ti = modei?
_mm_set_pd(i + 1, i + 0) :
_mm_set_pd(i + 2, i + 1);
__m128d last = _mm_set_pd(
Index<modei>(i + 1, 0),
Index<modei>(i + 0, 0));
for(int j = 1; j < length; j++) {
__m128d idx = last + ti + _mm_set1_pd(j);
last = idx;
sum = sum + _mm_set1_pd(src[j]) / idx;
}
_mm_storeu_pd(dst + i + 0, sum);
}
for(; i < end; i++) {
double sum = 0;
for (int j = 0; j < length; j++)
sum += src[j] / double(Index<modei>(i, j));
dst[i] = sum;
}
}
void EvalATimesU(double *src, double *dst, int begin, int end, int N) {
EvalPart<1>(src, dst, begin, end, N);
}
void EvalAtTimesU(double *src, double *dst, int begin, int end, int N) {
EvalPart<0>(src, dst, begin, end, N);
}
void EvalAtATimesU(double *src, double *dst, double *tmp,
int begin, int end, int N) {
EvalATimesU (src, tmp, begin, end, N);
#pragma omp barrier
EvalAtTimesU(tmp, dst, begin, end, N);
#pragma omp barrier
}
int GetThreadCount() {
cpu_set_t cs;
CPU_ZERO(&cs);
sched_getaffinity(0, sizeof(cs), &cs);
int count = 0;
for (int i = 0; i < CPU_SETSIZE; ++i)
if (CPU_ISSET(i, &cs))
++count;
return count;
}
double spectral_game(int N) {
__attribute__((aligned(16))) double u[N];
__attribute__((aligned(16))) double v[N], tmp[N];
double vBv = 0.0;
double vv = 0.0;
#pragma omp parallel default(shared) num_threads(GetThreadCount())
{
// this block will be executed by NUM_THREADS
// variable declared in this block is private for each thread
int threadid = omp_get_thread_num();
int threadcount = omp_get_num_threads();
int chunk = N / threadcount;
// calculate each thread's working range [r1 .. r2) => static schedule
int begin = threadid * chunk;
int end = (threadid < (threadcount -1)) ? (begin + chunk) : N;
for(int i = begin; i < end; i++)
u[i] = 1.0;
#pragma omp barrier
for (int ite = 0; ite < 10; ++ite) {
EvalAtATimesU(u, v, tmp, begin, end, N);
EvalAtATimesU(v, u, tmp, begin, end, N);
}
double sumvb = 0.0, sumvv = 0.0;
for (int i = begin; i < end; i++) {
sumvv += v[i] * v[i];
sumvb += u[i] * v[i];
}
#pragma omp critical
{
vBv += sumvb;
vv += sumvv;
}
}
return sqrt(vBv / vv);
}
int main(int argc, char *argv[]) {
int N = ((argc >= 2) ? atoi(argv[1]) : 2000);
printf("%.9f\n", spectral_game(N));
return 0;
}

64
ex0.1/spectral-norm/spectral-norm.py

@ -0,0 +1,64 @@
# The Computer Language Benchmarks Game
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
#
# Contributed by Sebastien Loisel
# Fixed by Isaac Gouy
# Sped up by Josh Goldfoot
# Dirtily sped up by Simon Descarpentries
# Used list comprehension by Vadim Zelenin
# 2to3
from math import sqrt
from sys import argv
def eval_A(i, j):
ij = i+j
return 1.0 / (ij * (ij + 1) / 2 + i + 1)
def eval_A_times_u(u):
local_eval_A = eval_A
return [ sum([ local_eval_A(i, j) * u_j
for j, u_j in enumerate(u)
]
)
for i in range(len(u))
]
def eval_At_times_u(u):
local_eval_A = eval_A
return [ sum([ local_eval_A(j, i) * u_j
for j, u_j in enumerate(u)
]
)
for i in range(len(u))
]
def eval_AtA_times_u(u):
return eval_At_times_u(eval_A_times_u(u))
def main():
n = int(argv[1])
u = [1] * n
local_eval_AtA_times_u = eval_AtA_times_u
for dummy in range(10):
v = local_eval_AtA_times_u(u)
u = local_eval_AtA_times_u(v)
vBv = vv = 0
for ue, ve in zip(u, v):
vBv += ue * ve
vv += ve * ve
print("%0.9f" % (sqrt(vBv/vv)))
main()