// kernelrunner.cpp
#include "kernelrunner.h"

#include <stdexcept>

#ifndef NDEBUG
#include <iostream>
#endif

#define THREADS_PER_LANE 32

namespace argon2 {
namespace opencl {

// Number of reference entries that fit into one ARGON2_BLOCK_SIZE-byte block,
// with each entry occupying two cl_uint values. precomputeRefs() uses it to
// round a segment's block count up to whole blocks of references.
enum {
    ARGON2_REFS_PER_BLOCK = ARGON2_BLOCK_SIZE / (2 * sizeof(cl_uint)),
};

/**
 * Prepares everything needed to launch the Argon2 kernel for a batch of jobs:
 * a profiling-enabled command queue, the device memory buffer, optionally the
 * precomputed reference table, and the kernel object together with the
 * arguments that do not change between launches.
 *
 * @param programContext  provides the OpenCL context, the built program and
 *                        the Argon2 type
 * @param params          Argon2 parameters (time cost, lanes, segment blocks,
 *                        per-job memory size)
 * @param device          device on which the command queue is created
 * @param batchSize       number of jobs per run; memoryBuffer holds
 *                        getMemorySize() bytes for each of them
 * @param bySegment       selects the "segment" kernel instead of "oneshot"
 * @param precompute      selects the "_precompute" kernel variant; for
 *                        ARGON2_I / ARGON2_ID the reference table is also
 *                        allocated and filled here
 */
KernelRunner::KernelRunner(const ProgramContext *programContext,
                           const Argon2Params *params, const Device *device,
                           std::uint32_t batchSize, bool bySegment, bool precompute)
    : programContext(programContext), params(params), batchSize(batchSize),
      bySegment(bySegment), precompute(precompute),
      // Widen before multiplying so the total cannot wrap in a narrower type.
      memorySize(static_cast<std::size_t>(params->getMemorySize()) * batchSize)
{
    auto context = programContext->getContext();
    const std::uint32_t passes = params->getTimeCost();
    const std::uint32_t lanes = params->getLanes();
    const std::uint32_t segmentBlocks = params->getSegmentBlocks();

    // Profiling is required by finish(), which reads event timestamps.
    queue = cl::CommandQueue(context, device->getCLDevice(),
                             CL_QUEUE_PROFILING_ENABLE);
    memoryBuffer = cl::Buffer(context, CL_MEM_READ_WRITE, memorySize);

    // NOTE(review): when precompute is true but the type is neither ARGON2_I
    // nor ARGON2_ID, refsBuffer stays default-constructed while the
    // "_precompute" kernel is still selected below - confirm that callers
    // never request this combination.
    const Type type = programContext->getArgon2Type();
    if ((type == ARGON2_I || type == ARGON2_ID) && precompute) {
        const std::uint32_t segments =
                type == ARGON2_ID
                ? lanes * (ARGON2_SYNC_POINTS / 2)
                : passes * lanes * ARGON2_SYNC_POINTS;

        // Two cl_uint values per block of every segment. Computed in size_t:
        // the 32-bit product segments * segmentBlocks can wrap for large
        // parameter sets and would silently under-allocate the buffer.
        const std::size_t refsSize =
                static_cast<std::size_t>(segments) * segmentBlocks
                * sizeof(cl_uint) * 2;

#ifndef NDEBUG
        std::cerr << "[INFO] Allocating " << refsSize << " bytes for refs..."
                  << std::endl;
#endif

        // The buffer is never written from the host in this file; it is
        // handed to argon2_precompute_kernel in precomputeRefs(), which is
        // presumably what fills it. A buffer created with CL_MEM_READ_ONLY
        // must not be written by a kernel, so it is created read-write.
        refsBuffer = cl::Buffer(context, CL_MEM_READ_WRITE, refsSize);

        precomputeRefs();
    }

    // Indexed as [precompute][bySegment].
    static const char *KERNEL_NAMES[2][2] = {
        {
            "argon2_kernel_oneshot",
            "argon2_kernel_segment",
        },
        {
            "argon2_kernel_oneshot_precompute",
            "argon2_kernel_segment_precompute",
        }
    };

    kernel = cl::Kernel(programContext->getProgram(),
                        KERNEL_NAMES[precompute][bySegment]);

    // Argument 0 (local memory) is set per launch in run(). The precompute
    // variants take the refs buffer as argument 2, which shifts every
    // following argument by one.
    kernel.setArg<cl::Buffer>(1, memoryBuffer);
    if (precompute) {
        kernel.setArg<cl::Buffer>(2, refsBuffer);
        kernel.setArg<cl_uint>(3, passes);
        kernel.setArg<cl_uint>(4, lanes);
        kernel.setArg<cl_uint>(5, segmentBlocks);
    } else {
        kernel.setArg<cl_uint>(2, passes);
        kernel.setArg<cl_uint>(3, lanes);
        kernel.setArg<cl_uint>(4, segmentBlocks);
    }
}

void KernelRunner::precomputeRefs()
{
    std::uint32_t passes = params->getTimeCost();
    std::uint32_t lanes = params->getLanes();
    std::uint32_t segmentBlocks = params->getSegmentBlocks();
    std::uint32_t segmentAddrBlocks =
            (segmentBlocks + ARGON2_REFS_PER_BLOCK - 1)
            / ARGON2_REFS_PER_BLOCK;
    std::uint32_t segments = programContext->getArgon2Type() == ARGON2_ID
            ? lanes * (ARGON2_SYNC_POINTS / 2)
            : passes * lanes * ARGON2_SYNC_POINTS;

    std::uint32_t shmemSize = THREADS_PER_LANE * sizeof(cl_uint) * 2;

    cl::Kernel kernel = cl::Kernel(programContext->getProgram(),
                                   "argon2_precompute_kernel");
    kernel.setArg<cl::LocalSpaceArg>(0, { shmemSize });
    kernel.setArg<cl::Buffer>(1, refsBuffer);
    kernel.setArg<cl_uint>(2, passes);
    kernel.setArg<cl_uint>(3, lanes);
    kernel.setArg<cl_uint>(4, segmentBlocks);

    cl::NDRange globalRange { THREADS_PER_LANE * segments * segmentAddrBlocks };
    cl::NDRange localRange { THREADS_PER_LANE };
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalRange, localRange);
    queue.finish();
}

// Host-side mapping of per-job regions of memoryBuffer.
/**
 * Maps the beginning of one job's region of memoryBuffer into host memory
 * for writing.
 *
 * Each job occupies getMemorySize() bytes; the mapping starts at the job's
 * first byte and covers getLanes() * 2 * ARGON2_BLOCK_SIZE bytes. The map is
 * enqueued as a blocking operation.
 *
 * @param jobId  index of the job whose region is mapped
 * @return       pointer returned by enqueueMapBuffer
 */
void *KernelRunner::mapInputMemory(std::uint32_t jobId)
{
    const std::size_t bytesPerJob = params->getMemorySize();
    const std::size_t inputBytes = params->getLanes() * 2 * ARGON2_BLOCK_SIZE;
    const std::size_t jobOffset = bytesPerJob * jobId;

    return queue.enqueueMapBuffer(memoryBuffer, true, CL_MAP_WRITE,
                                  jobOffset, inputBytes);
}

// Enqueues an unmap of `memory` on memoryBuffer. Intended for pointers
// returned by mapInputMemory(); the pointer is not validated here, and no
// wait for the unmap to complete is performed in this function.
void KernelRunner::unmapInputMemory(void *memory)
{
    queue.enqueueUnmapMemObject(memoryBuffer, memory);
}

void *KernelRunner::mapOutputMemory(std::uint32_t jobId)
{
    std::size_t memorySize = params->getMemorySize();
    std::size_t mappedSize = params->getLanes() * ARGON2_BLOCK_SIZE;
    std::size_t mappedOffset = memorySize * (jobId + 1) - mappedSize;
    return queue.enqueueMapBuffer(memoryBuffer, true, CL_MAP_WRITE,
                                  mappedOffset, mappedSize);
}

// Enqueues an unmap of `memory` on memoryBuffer. Intended for pointers
// returned by mapOutputMemory(); the pointer is not validated here, and no
// wait for the unmap to complete is performed in this function.
void KernelRunner::unmapOutputMemory(void *memory)
{
    queue.enqueueUnmapMemObject(memoryBuffer, memory);
}

// Kernel execution and timing.
void KernelRunner::run(std::uint32_t lanesPerBlock, std::uint32_t jobsPerBlock)
{
    std::uint32_t lanes = params->getLanes();
    std::uint32_t passes = params->getTimeCost();

    if (bySegment) {
        if (lanesPerBlock > lanes || lanes % lanesPerBlock != 0) {
            throw std::logic_error("Invalid lanesPerBlock!");
        }
    } else {
        if (lanesPerBlock != lanes) {
            throw std::logic_error("Invalid lanesPerBlock!");
        }
    }

    if (jobsPerBlock > batchSize || batchSize % jobsPerBlock != 0) {
        throw std::logic_error("Invalid jobsPerBlock!");
    }

    cl::NDRange globalRange { THREADS_PER_LANE * lanes, batchSize };
    cl::NDRange localRange { THREADS_PER_LANE * lanesPerBlock, jobsPerBlock };

156 157 158 159 160 161 162 163 164 165 166 167 168
    queue.enqueueMarker(&start);

    std::uint32_t shmemSize =
            THREADS_PER_LANE * lanesPerBlock * jobsPerBlock *
            sizeof(cl_uint) * 2;
    kernel.setArg<cl::LocalSpaceArg>(0, { shmemSize });
    if (bySegment) {
        for (std::uint32_t pass = 0; pass < passes; pass++) {
            for (std::uint32_t slice = 0; slice < ARGON2_SYNC_POINTS; slice++) {
                kernel.setArg<cl_uint>(precompute ? 6 : 5, pass);
                kernel.setArg<cl_uint>(precompute ? 7 : 6, slice);
                queue.enqueueNDRangeKernel(kernel, cl::NullRange,
                                           globalRange, localRange);
169 170
            }
        }
171 172 173
    } else {
        queue.enqueueNDRangeKernel(kernel, cl::NullRange,
                                   globalRange, localRange);
174 175
    }

176
    queue.enqueueMarker(&end);
177 178 179 180 181 182
}

float KernelRunner::finish()
{
    end.wait();

183 184
    cl_ulong nsStart = start.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    cl_ulong nsEnd   = end.getProfilingInfo<CL_PROFILING_COMMAND_START>();
185 186 187 188 189 190

    return (nsEnd - nsStart) / (1000.0F * 1000.0F);
}

} // namespace opencl
} // namespace argon2