# Lösungsvorschlag

Zur Zusatzfrage: Der Aufruf der Zugriffsfunktion ist auf der CPU-Seite nicht zulässig, da die Methode mit __device__ gekannzeichnet ist. Deswegen ist x.data statt &x(0) zu verwenden.

#include <cstdio>
#include <cmath>
#include <hpc/cuda/check.h>
#include <hpc/cuda/densevector.h>
#include <hpc/matvec/densevector.h>
#include <hpc/cuda/copy.h>

#define N 8192

template<typename Index, typename TX, typename TY>
TX dot(Index n, const TX* x, Index incX, TY* y, Index incY) {

TX res = 0;
for (Index index = 0; index < n; ++index) {
res += x[index * incX] * y[index * incY];
}
return res;
}

template<typename Index, typename T>
__global__ void asum(Index n, const T* x, T* sums) {
Index index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < n) {
sums_per_block[me] = x[index];
} else {
sums_per_block[me] = 0;
}
/* aggregate sum within a block */
index = blockDim.x / 2;
while (index) {
if (me < index) {
sums_per_block[me] += sums_per_block[me + index];
}
index /= 2;
}
if (me == 0) {
sums[blockIdx.x] = sums_per_block[0];
}
}

template<typename Index, typename TX, typename TY, typename T>
__global__ void dot(Index n,
const TX* x, Index incX, TY* y, Index incY, T* sums) {
Index index = threadIdx.x + blockIdx.x * blockDim.x;

T res;
if (index < n) {
res = x[index * incX] * y[index * incY];
} else {
res = 0;
}

sums_per_block[me] = res;

/* aggregate sum within a block */
index = blockDim.x / 2;
while (index) {
if (me < index) {
sums_per_block[me] += sums_per_block[me + index];
}
index /= 2;
}
if (me == 0) {
sums[blockIdx.x] = sums_per_block[0];
}
}

int main() {
typedef double T;
hpc::matvec::DenseVector<T> a(N);
hpc::matvec::DenseVector<T> b(a.length);
for (unsigned int i = 0; i < a.length; ++i) {
a(i) = i; b(i) = i * i;
}

/* transfer vectors to GPU memory */
hpc::cuda::DenseVector<T> dev_a(a.length);
hpc::cuda::DenseVector<T> dev_b(b.length);
copy(a, dev_a); copy(b, dev_b);

hpc::cuda::DenseVector<T>* dev_sums =
new hpc::cuda::DenseVector<T>(NUM_BLOCKS);

/* execute kernel function on GPU */
dev_a.inc, dev_b.data, dev_b.inc, dev_sums->data);
unsigned int len = NUM_BLOCKS;
while (len > 1) {
hpc::cuda::DenseVector<T>* dev_sums2 =
new hpc::cuda::DenseVector<T>(num_blocks);
delete dev_sums; dev_sums = dev_sums2;
len = num_blocks;
}

/* transfer result vector from GPU to host memory */
hpc::matvec::DenseVector<T> result(1);
copy(*dev_sums, result); delete dev_sums;

/* print difference to local result */
double local_sum = dot(a.length, a.data, a.inc, b.data, b.inc);
std::printf("diff: %12.4lg\n", std::abs(result(0) - local_sum));
}