...
 
Commits (3)
......@@ -254,22 +254,22 @@ template <int N, typename Iter> GPUCounter<N> create_GPUCounter(int n, int m, It
offset += p.base_->nodeList_[xi].r_ * bitvectorSize;
}
cudaMemcpy(bvPtr, tempBvPtr, sizeof(uint64_t) * bitvectorWordCount, cudaMemcpyHostToDevice);
cucheck_dev( cudaMemcpy(bvPtr, tempBvPtr, sizeof(uint64_t) * bitvectorWordCount, cudaMemcpyHostToDevice) );
delete[] tempBvPtr;
// expected size = (number of configurations in the query) * sizeof(uint64_t)
cudaMallocManaged(&p.resultList_, sizeof(uint64_t) * MAX_COUNTS_PER_QUERY * STREAM_COUNT);
cudaMallocManaged(&p.resultListPa_, sizeof(uint64_t) * MAX_COUNTS_PER_QUERY * STREAM_COUNT);
cucheck_dev( cudaMallocManaged(&p.resultList_, sizeof(uint64_t) * MAX_COUNTS_PER_QUERY * STREAM_COUNT) );
cucheck_dev( cudaMallocManaged(&p.resultListPa_, sizeof(uint64_t) * MAX_COUNTS_PER_QUERY * STREAM_COUNT) );
cudaMalloc(&p.intermediaResult_, sizeof(uint64_t) * bitvectorSize * 32 * STREAM_COUNT);
cucheck_dev( cudaMalloc(&p.intermediaResult_, sizeof(uint64_t) * bitvectorSize * 32 * STREAM_COUNT) );
p.streams.resize(STREAM_COUNT);
for (int i = 0; i < STREAM_COUNT; ++i) {
cudaStreamCreate(&p.streams[i]);
cucheck_dev( cudaStreamCreate(&p.streams[i]) );
}
cudaDeviceSynchronize();
cucheck_dev( cudaDeviceSynchronize() );
return p;
} // create_GPUCounter
......
......@@ -14,6 +14,16 @@
#define CUDA_CALLABLE
#endif
#define cucheck_dev(call) \
{ \
cudaError_t cucheck_err = (call); \
if(cucheck_err != cudaSuccess) { \
const char *err_str = cudaGetErrorString(cucheck_err); \
printf("%s (%d): %s\n", __FILE__, __LINE__, err_str); \
assert(0); \
} \
}
void copyAritiesToDevice(
int streamId,
const std::vector<uint64_t>& pArities,
......
......@@ -48,16 +48,6 @@ __constant__ uint64_t aritiesPtr_[4][10];
__constant__ uint64_t aritiesPrefixProdPtr_[4][11];
__constant__ uint64_t aritiesPrefixSumPtr_[4][10];
#define cucheck_dev(call) \
{ \
cudaError_t cucheck_err = (call); \
if(cucheck_err != cudaSuccess) { \
const char *err_str = cudaGetErrorString(cucheck_err); \
printf("%s (%d): %s\n", __FILE__, __LINE__, err_str); \
assert(0); \
} \
}
template <class T, unsigned int blockSize, bool nIsPow2, bool isSecondStage>
__global__ void counts(const T* inputData,
T* outputData,
......@@ -253,9 +243,12 @@ __host__ void copyAritiesToDevice(int streamId,
const std::vector<uint64_t>& pArities,
const std::vector<uint64_t>& pAritiesPrefixProd,
const std::vector<uint64_t>& pAritiesPrefixSum) {
cudaMemcpyToSymbol(aritiesPtr_[streamId], pArities.data(), pArities.size() * sizeof(uint64_t));
cudaMemcpyToSymbol(aritiesPrefixProdPtr_[streamId], pAritiesPrefixProd.data(), pAritiesPrefixProd.size() * sizeof(uint64_t));
cudaMemcpyToSymbol(aritiesPrefixSumPtr_[streamId], pAritiesPrefixSum.data(), pAritiesPrefixSum.size() * sizeof(uint64_t));
cucheck_dev( cudaMemcpyToSymbol(aritiesPtr_, pArities.data(),
pArities.size() * sizeof(uint64_t), streamId * sizeof(uint64_t) * 10) );
cucheck_dev( cudaMemcpyToSymbol(aritiesPrefixProdPtr_,
pAritiesPrefixProd.data(), pAritiesPrefixProd.size() * sizeof(uint64_t), streamId * sizeof(uint64_t) * 11) );
cucheck_dev( cudaMemcpyToSymbol(aritiesPrefixSumPtr_,
pAritiesPrefixSum.data(), pAritiesPrefixSum.size() * sizeof(uint64_t), streamId * sizeof(uint64_t) * 10) );
} // m_copyAritiesToDevice__
......@@ -490,7 +483,7 @@ void cudaCallBlockCount(const uint block_count,
startKernel<false>(bvectorsPtr, results, resultsPa, intermediateData, words_per_vector,
variablesCount, configs_per_query, 0, streamId, threadCount, -1);
cudaStreamSynchronize(0);
cucheck_dev( cudaStreamSynchronize(0) );
} // cudaCallBlockCount
......