/**
 * @brief Computes one layer of the perceptron given the previous one and the
 *        weights.
 *
 * The kernel is run once for each layer.
 * Each work item computes the output of a single neuron of the out layer,
 * selected by get_global_id(0).
 *
 * @param in_layer_size
 *   Pointer to the number of elements of the input layer.
 * @param out_layer_size
 *   Pointer to the size of the output layer (number of elements in the
 *   out_values array that will contain the result for each neuron).
 * @param in_value
 *   Values of the neurons in the previous layer.
 * @param in_weights
 *   Array containing the weights for each input neuron. It is organised as a
 *   two dimensional matrix, written by concatenating each line in the array
 *     [ w11, w12, w13, ...
 *       w21, w22, w23, ...
 *       ..., ..., ..., ...
 *     ]
 *   where wij is the weight linking neuron i of the input layer to neuron j
 *   of the output layer; with 0-based i and j it is read from index
 *   i * out_layer_size + j.
 * @param out_values
 *   Computed values for the current layer (one float per output neuron).
 */
void kernel perceptron(global const int* in_layer_size,
                       global const int* out_layer_size,
                       global const float* in_value,
                       global const float* in_weights,
                       global float* out_values)
{
    const int neuron = get_global_id(0);
    const int out_count = *out_layer_size;
    const int in_count = *in_layer_size;

    // Work items beyond the layer (e.g. when the global range is padded up to
    // a multiple of the work-group size) must not touch the buffers.
    if (neuron >= out_count) {
        return;
    }

    // Float literal: keeps the accumulation in single precision.
    float sum = 0.0f;
    for (int i = 0; i < in_count; ++i) {
        // Column `neuron` of row `i` in the row-major weight matrix.
        sum += in_weights[i * out_count + neuron] * in_value[i];
    }

    // The raw weighted sum is stored; the activation call is left disabled.
    //out_values[neuron] = sigma(sum);
    out_values[neuron] = sum;
}
// Enqueue the kernel over a 1-D range: the arguments after `kernel` are the
// global offset, the global size and the local (work-group) size. The global
// size is one work item per neuron of the layer; `number of neurons within
// layer` is a placeholder for that count, not a real identifier. Offset and
// local size are both passed as cl::NullRange.
queue.enqueueNDRangeKernel(kernel, cl::NullRange,cl::NDRange(number of neurons within layer),cl::NullRange);
为了让您对性能有个概念(在 Nvidia GTX 660M 上测得),下面给出我测到的一些耗时。每组数值依次表示各层的神经元数量:
> 2500, 10000, 2500:0.018 s,约 60 FPS。这比我的处理器(主频 2.40 GHz 的 Intel Core i7)快 4 到 5 倍。
> 100 000, 150 000, 500:140 s。我认为这并不令人意外,因为第二层中的每个神经元都必须对 100 000 个元素求加权和。在我的处理器上运行,得到的结果也大致相同。
/*
 * Sketch of a tiled kernel: the input values are staged through a
 * work-group-local buffer, one fixed-size window (SOME_SIZE floats) at a time.
 * The parts marked "//..." are intentionally left out of the sketch.
 *
 * in_layer_size  - number of input values (scalar, passed by value)
 * out_layer_size - number of output values (scalar, passed by value)
 * in_value       - input values, read window by window
 * in_weights     - weights (not used by the visible part of the sketch)
 * out_values     - output values
 *
 * NOTE(review): buffer[local_id] is indexed by the local id, so this appears
 * to assume the work-group size equals SOME_SIZE - confirm against the host
 * code.
 */
void kernel Kernel(const int in_layer_size,   /* scalar kernel arguments cannot */
                   const int out_layer_size,  /* carry the __global qualifier   */
                   __global const float *in_value,
                   __global const float *in_weights,
                   __global float *out_values)
{
    __local float buffer[SOME_SIZE];
    __global const float* p_in = in_value;
    __global float* p_out = out_values;
    const int global_id = get_global_id(0),
              local_id = get_local_id(0),
              // Integer division: a trailing partial window (when in_layer_size
              // is not a multiple of SOME_SIZE) is not visited by the loop below.
              num_buffers = in_layer_size / SOME_SIZE,
              offset = out_layer_size * global_id;
    float sum = 0.0f;

    for (int i = 0; i < num_buffers; i++) {
        // Each work item loads one element of the current window.
        buffer[local_id] = p_in[local_id];
        // Wait until the whole window has been written to local memory.
        barrier(CLK_LOCAL_MEM_FENCE);

        //Process all data inside buffer by every WI in WG
        //...

        // Wait until every work item is done reading the window before the
        // next iteration overwrites it.
        barrier(CLK_LOCAL_MEM_FENCE);

        p_in += SOME_SIZE;
        // NOTE(review): this advances the out_values parameter itself while
        // p_out is left untouched; kept as written because the elided code is
        // not visible - confirm which cursor was meant to move.
        out_values += SOME_SIZE;
    }
    //...
    return;
}
所以,这相当于用一个固定大小的窗口在数据上滑动:先计算窗口内的数据,然后移到下一个窗口。所有数据操作都是各自独立完成的,工作项只是在同一时刻使用同一份数据。本地组(工作组)的最佳大小取决于设备和内核。