Commit 4fb666d2 authored by Jack Doerner

Floram can now automatically choose the correct number of threads at runtime....

Floram can now automatically choose the correct number of threads at runtime. This works well for CPRG, but for regular FSSL it’s not optimal.
parent 7ea32e84
......@@ -34,6 +34,7 @@ void bitpropagator_offline_push_Z(bitpropagator_offline * bpo, uint8_t * Z, bool
void bitpropagator_offline_readblockvector(uint8_t * local_output, bool * local_bit_output, bitpropagator_offline * bpo) {
/* Size the thread pool for the data touched below: BLOCKSIZE times
 * (2^endlevel + 2^(endlevel-1)). Each shift must be parenthesized because
 * '+' binds tighter than '<<' in C; without the parentheses the expression
 * parses as (1ll << (endlevel + 1ll)) << (endlevel - 1). */
floram_set_procs_for_data_size(BLOCKSIZE * ((1ll<<bpo->endlevel) + (1ll<<(bpo->endlevel-1))));
#pragma omp parallel
{
size_t thislevel = bpo->startlevel;
......@@ -201,7 +202,7 @@ void bitpropagator_offline_readblockvector(uint8_t * local_output, bool * local_
b2 = a2; b = a; b_bits = a_bits;
a2 = t2; a = t; a_bits = t_bits;
#pragma omp for
#pragma omp for schedule(guided)
for (size_t ii = 0; ii < thislevelblocks; ii++) {
if (b == local_output) {
if (ii%2 == 0) {
......
......@@ -111,7 +111,8 @@ void bitpropagator_cprg_offline_process_round(uint8_t * accumulator_L, uint8_t *
expansion_stride = BLOCKSIZE;
}
#pragma omp parallel for reduction(^:accL,accR)
floram_set_procs_for_data_size(BLOCKSIZE * (bpo->nextlevelblocks + bpo->thislevelblocks));
#pragma omp parallel for reduction(^:accL,accR) schedule(guided)
for (size_t ii = 0; ii < 4*(bpo->nextlevelblocks/8); ii+=4) {
bpo->lba[ii] = (bpo->lda2[ii*BLOCKSIZE] & 1) ^ (bpo->lbb[ii/2] & advicebit_l);
bpo->lba[ii+1] = (bpo->lda2[(ii+1)*BLOCKSIZE] & 1) ^ (bpo->lbb[ii/2] & advicebit_r);
......@@ -212,10 +213,12 @@ void bitpropagator_cprg_offline_finalize(uint8_t * accumulator, uint8_t * z, boo
block_t acc = {0};
floram_set_procs_for_data_size(BLOCKSIZE * bpo->thislevelblocks * bpo->blockmultiple);
if (bpo->thislevel%2==0) {
local_output = bpo->ldb2;
#pragma omp parallel for reduction(^:acc)
#pragma omp parallel for reduction(^:acc) schedule(guided)
for (size_t ii = 0; ii < bpo->thislevelblocks; ii++) {
if (ii%2 == 0) {
bpo->lba[ii] = (bpo->lda2[ii*BLOCKSIZE] & 1) ^ (bpo->lbb[ii/2] & advicebit_l);
......@@ -243,7 +246,7 @@ void bitpropagator_cprg_offline_finalize(uint8_t * accumulator, uint8_t * z, boo
} else {
local_output = bpo->lda2;
#pragma omp parallel for reduction(^:acc)
#pragma omp parallel for reduction(^:acc) schedule(guided)
for (size_t ii = 0; ii < bpo->thislevelblocks; ii++) {
if (ii%2 == 0) {
bpo->lba[ii] = (bpo->lda2[ii*(BLOCKSIZE*bpo->blockmultiple)] & 1) ^ (bpo->lbb[ii/2] & advicebit_l);
......@@ -273,7 +276,7 @@ void bitpropagator_cprg_offline_finalize(uint8_t * accumulator, uint8_t * z, boo
for (size_t jj = 1; jj < bpo->blockmultiple; jj++) {
for (size_t ii = 0; ii < BLOCKSIZE/sizeof(uint64_t); ii++) acc.data[ii] = 0;
#pragma omp parallel for reduction(^:acc)
#pragma omp parallel for reduction(^:acc) schedule(guided)
for (size_t ii = 0; ii < 8*(bpo->thislevelblocks/8); ii+=8) {
offline_prg_oct(
&local_output[(ii+0) * (BLOCKSIZE*bpo->blockmultiple) + (jj * BLOCKSIZE)],
......
......@@ -10,7 +10,10 @@
/* Key and block sizes; presumably in bytes -- TODO confirm against the cipher code. */
#define KEYSIZE 16
#define BLOCKSIZE 16
/* Data size handled per thread: floram_set_procs_for_data_size() divides the
 * requested data size by this value (rounding up) to pick a thread count. */
#define CACHE_PER_CORE (1024*1024)
/* Commented-out build switches. Only the effect of
 * FLORAM_DISABLE_AUTO_THREAD_COUNT is documented here: when it is defined,
 * floram_set_procs_for_data_size() compiles to an empty function and the
 * OpenMP thread count is left unchanged. */
//#define SCANROM_DISABLE_ENCRYPTION
//#define FLORAM_DISABLE_AUTO_THREAD_COUNT
//#define ORAM_PROFILE_SCHEDULING
#endif
\ No newline at end of file
......@@ -24,6 +24,14 @@ int floram_zpma(void** dst, size_t alignment, size_t size) {
return res;
}
/*
 * Chooses the OpenMP thread count for subsequent parallel regions from the
 * size of the data they will process: one thread per CACHE_PER_CORE units of
 * dsize (rounded up), never fewer than one and never more than
 * omp_get_num_procs() reports.
 *
 * dsize: size of the working set, in the same units as CACHE_PER_CORE.
 *
 * Compiles to an empty function when FLORAM_DISABLE_AUTO_THREAD_COUNT is
 * defined.
 */
void floram_set_procs_for_data_size(size_t dsize) {
#ifndef FLORAM_DISABLE_AUTO_THREAD_COUNT
	/* Ceiling division: how many CACHE_PER_CORE-sized chunks cover dsize. */
	size_t wanted = (dsize + CACHE_PER_CORE - 1) / CACHE_PER_CORE;
	if (wanted < 1) {
		/* A zero-sized request still needs one thread. */
		wanted = 1;
	}

	/* Cap the request at the number of processors OpenMP can see. */
	size_t available = omp_get_num_procs();
	size_t chosen = (available < wanted) ? available : wanted;

	omp_set_num_threads(chosen);
#endif
}
#ifdef __AES__
#include <wmmintrin.h>
......
......@@ -11,6 +11,7 @@ void scanrom_read_with_bitvector_offline(uint8_t * data, uint8_t * local_data, b
uint64_t ** sums;
size_t threadcount;
floram_set_procs_for_data_size(memblocksize * blockcount);
#pragma omp parallel
{
threadcount = omp_get_num_threads();
......@@ -53,6 +54,7 @@ void scanrom_encrypt_offline(uint8_t * out, uint8_t * in, uint8_t* key, size_t i
void scanrom_encrypt_offline(uint8_t * out, uint8_t * in, uint8_t* key, size_t index, size_t blockmultiple, size_t blockcount) {
offline_expand_from(out, key, index*blockmultiple, blockcount * blockmultiple);
if (in != NULL) {
floram_set_procs_for_data_size(BLOCKSIZE * blockmultiple * blockcount);
#pragma omp parallel for simd schedule(guided)
for (size_t ii = 0; ii < blockcount * blockmultiple * BLOCKSIZE / sizeof(uint64_t); ii++) {
((uint64_t *)out)[ii] ^= ((uint64_t *)in)[ii];
......@@ -66,6 +68,7 @@ void scanwrom_write_with_blockvector_offline(uint8_t * local_data, uint8_t * blo
uint64_t * b = blockvector;
uint64_t * z = Zblock;
floram_set_procs_for_data_size(memblocksize * blockcount);
#pragma omp parallel for schedule(guided)
for (size_t ii = 0; ii< blockcount; ii++) {
#pragma omp simd aligned(d,b,bitvector,z:16)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment