c++ - Small sized binary searches on CUDA GPUs


I have a large device array inputValues of type int64_t. Every 32 elements of the array are sorted in ascending order. I also have an unsorted search array removeValues.

My intention is to find the elements of removeValues that are present inside inputValues and mark them as -1. What is an efficient method to achieve this? I am using a compute capability 3.5 CUDA device, if that helps.

I am not looking for a higher level solution, i.e. I do not want to use Thrust or CUB; I want to write this using CUDA kernels.

My initial approach was to load every 32 values into shared memory per thread block. Every thread then loads a single value from removeValues and does an independent binary search on the shared memory array. If the value is found, it is set accordingly using an if condition.

Wouldn't this approach involve a lot of bank conflicts and branch divergence? Do you think the branch divergence can be addressed by using ternary operators while implementing the binary search? If so, how can the bank conflicts be eliminated? Since the size of each sorted array is 32, is it possible to implement the binary search using shuffle instructions? Would that help?
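Roughly what I have in mind is the untested sketch below. It assumes the kernel is launched with one 32-thread block per sorted group; the kernel and variable names are just placeholders:

// Sketch of the shared-memory idea: each block caches one sorted 32-element
// group, then every thread strides over removeValues and does a branchless
// binary search against the cached group.
__global__ void markFound(const int64_t * __restrict__ inputValues,
                          int64_t *removeValues, int numRemove)
{
    __shared__ int64_t grp[32];
    grp[threadIdx.x] = inputValues[blockIdx.x * 32 + threadIdx.x];  // needs blockDim.x == 32
    __syncthreads();

    for (int i = threadIdx.x; i < numRemove; i += blockDim.x) {
        int64_t v = removeValues[i];
        int pos = 0;
        // 5 fixed steps cover 32 elements; the ternary keeps the search branch-free
        for (int step = 16; step > 0; step >>= 1)
            pos = (grp[pos + step - 1] < v) ? pos + step : pos;
        // different blocks may all write -1 to the same slot, which is benign
        if (grp[pos] == v) removeValues[i] = -1;
    }
}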

Edit: I have added an example to show what I intend to achieve.

Let's say inputValues is a vector in which every 32 elements are sorted:
[2, 4, 6, ... , 64], [95, 97, ... , 157], [1, 3, ... , 63], [...]

The typical size of this array can range between 32*2 and 32*32. The values can range from 0 to INT64_MAX.

An example of removeValues would be:
[7, 75, 95, 106]

The typical size of this array can range from 1 to 1024.

After the operation, removeValues would be: [-1, 75, -1, 106]

The values in inputValues remain unchanged.
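For reference, a plain CPU version of what I want would look something like this (untested; it assumes the size of inputValues is a multiple of 32):

#include <cstdint>
#include <vector>
#include <algorithm>

// Mark every element of removeValues that occurs in any sorted
// 32-element group of inputValues with -1.
void markRemovedCPU(const std::vector<int64_t> &inputValues,
                    std::vector<int64_t> &removeValues)
{
    for (auto &v : removeValues)
        for (size_t g = 0; g < inputValues.size(); g += 32)
            if (std::binary_search(inputValues.begin() + g,
                                   inputValues.begin() + g + 32, v)) {
                v = -1;
                break;
            }
}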

I concur with the answer (now deleted) and the comment by @harrism. Since I put some effort into the non-thrust approach, I'll present my findings.

I tried to naively implement a binary search at the warp level using __shfl(), and then repeat that binary search across the data set, passing the data set through each 32-element group.

It's embarrassing, but my code is around 20x slower than thrust (and in fact it may be worse than that if you do careful timing with nvprof).

I made the data sizes a little larger than what was proposed in the question, because the data sizes in the question are so small that the timing gets lost in the noise.

Here's a worked example of 2 approaches:

  1. What is approximately outlined in the question, i.e. create a binary search using warp shuffle that can search up to 32 elements against a 32-element ordered array. Repeat this process for as many 32-element ordered arrays as there are, passing the entire data set through each ordered array (hopefully you can start to see some of the inefficiency now).

  2. Use thrust, essentially the same as what was outlined by @harrism, i.e. sort the grouped data set, and then run a vectorized thrust::binary_search on that.

Here's a fully worked example of both:

$ cat t1030.cu
#include <stdio.h>
#include <assert.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>

typedef long mytype;

const int gsize = 32;
const int ngrp = 512;
const int dsize = ngrp*gsize;

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start){

  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

template <typename T>
__device__ T my_shfl32(T val, unsigned lane){
  return __shfl(val, lane);
}

template <typename T>
__device__ T my_shfl64(T val, unsigned lane){
  T retval = val;
  int2 t1 = *(reinterpret_cast<int2 *>(&retval));
  t1.x = __shfl(t1.x, lane);
  t1.y = __shfl(t1.y, lane);
  retval = *(reinterpret_cast<T *>(&t1));
  return retval;
}

template <typename T>
__device__ bool bsearch_shfl(T grp_val, T my_val){
  int src_lane = gsize>>1;
  bool return_val = false;
  T test_val;
  int shift = gsize>>2;
  for (int i = 0; i <= gsize>>3; i++){
    if (sizeof(T)==4){
      test_val = my_shfl32(grp_val, src_lane);}
    else if (sizeof(T)==8){
      test_val = my_shfl64(grp_val, src_lane);}
    else assert(0);
    if (test_val == my_val) return_val = true;
    src_lane += (((test_val<my_val)*2)-1)*shift;
    shift>>=1;
    assert ((src_lane < gsize)&&(src_lane > 0));}
  if (sizeof(T)==4){
    test_val = my_shfl32(grp_val, 0);}
  else if (sizeof(T)==8){
    test_val = my_shfl64(grp_val, 0);}
  else assert(0);
  if (test_val == my_val) return_val = true;
  return return_val;
}

template <typename T>
__global__ void bsearch_grp(const T * __restrict__ search_grps, T *data){

  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  int tid = threadIdx.x;
  if (idx < gsize*ngrp){
    T grp_val = search_grps[idx];
    while (tid < dsize){
      T my_val = data[tid];
      if (bsearch_shfl(grp_val, my_val)) data[tid] = -1;
      tid += blockDim.x;}
  }
}

int main(){

  // data setup
  assert(gsize == 32);      // mandatory (warp size)
  assert((dsize % 32)==0);  // needed to preserve shfl capability
  thrust::host_vector<mytype> grps(gsize*ngrp);
  thrust::host_vector<mytype> data(dsize);
  thrust::host_vector<mytype> result(dsize);
  for (int i = 0; i < gsize*ngrp; i++) grps[i] = i;
  for (int i = 0; i < dsize; i++) data[i] = i;
  // method 1: individual shfl-based binary searches on each group
  mytype *d_grps, *d_data;
  cudaMalloc(&d_grps, gsize*ngrp*sizeof(mytype));
  cudaMalloc(&d_data, dsize*sizeof(mytype));
  cudaMemcpy(d_grps, &(grps[0]), gsize*ngrp*sizeof(mytype), cudaMemcpyHostToDevice);
  cudaMemcpy(d_data, &(data[0]), dsize*sizeof(mytype), cudaMemcpyHostToDevice);
  unsigned long long my_time = dtime_usec(0);
  bsearch_grp<<<ngrp, gsize>>>(d_grps, d_data);
  cudaDeviceSynchronize();
  my_time = dtime_usec(my_time);
  cudaMemcpy(&(result[0]), d_data, dsize*sizeof(mytype), cudaMemcpyDeviceToHost);
  for (int i = 0; i < dsize; i++) if (result[i] != -1) {printf("method 1 mismatch at %d, was %d, should be -1\n", i, (int)(result[i])); return 1;}
  printf("method 1 time: %fs\n", my_time/(float)USECPSEC);
  // method 2: thrust sort, followed by thrust binary search
  thrust::device_vector<mytype> t_grps = grps;
  thrust::device_vector<mytype> t_data = data;
  thrust::device_vector<bool> t_rslt(t_data.size());
  my_time = dtime_usec(0);
  thrust::sort(t_grps.begin(), t_grps.end());
  thrust::binary_search(t_grps.begin(), t_grps.end(), t_data.begin(), t_data.end(), t_rslt.begin());
  cudaDeviceSynchronize();
  my_time = dtime_usec(my_time);
  thrust::host_vector<bool> rslt = t_rslt;
  for (int i = 0; i < dsize; i++) if (rslt[i] != true) {printf("method 2 mismatch at %d, was %d, should be 1\n", i, (int)(rslt[i])); return 1;}
  printf("method 2 time: %fs\n", my_time/(float)USECPSEC);

  // method 3: multiple thrust merges, followed by thrust binary search

  return 0;
}
$ nvcc -O3 -arch=sm_35 t1030.cu -o t1030
$ ./t1030
method 1 time: 0.009075s
method 2 time: 0.000516s
$

I was running this on Linux, CUDA 7.5, on a GT640 GPU. The performance will certainly be different on different GPUs, but I'd be surprised if any GPU significantly closed the gap.

In short, you'd be well advised to use a well-tuned library like thrust or cub. If you don't like the monolithic nature of thrust, you could try cub. I don't know if cub has a binary search, but a single binary search against the whole sorted data set is not a difficult thing to write, and it's the smaller part of the time involved (for method 2 -- identifiable using nvprof or additional timing code).
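For reference, if you did want to write that single binary search yourself, a per-thread search of each data element against the fully sorted group array might look roughly like the untested sketch below. It reuses the naming from method 2, and it marks found entries with -1 in place (like method 1 does) rather than producing a bool vector:

// Each thread binary-searches one data element against the fully sorted
// group array (e.g. the result of the thrust::sort in method 2).
template <typename T>
__global__ void bsearch_full(const T * __restrict__ grps, int grps_size,
                             T *data, int data_size)
{
  int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx < data_size) {
    T val = data[idx];
    int lo = 0, hi = grps_size - 1;
    bool found = false;
    while (lo <= hi) {
      int mid = (lo + hi) >> 1;
      T m = grps[mid];
      if (m == val) { found = true; break; }
      if (m < val) lo = mid + 1;
      else         hi = mid - 1;
    }
    if (found) data[idx] = -1;
  }
}

// example launch, after the thrust::sort of t_grps:
// bsearch_full<<<(dsize + 255) / 256, 256>>>(
//     thrust::raw_pointer_cast(t_grps.data()), gsize * ngrp,
//     thrust::raw_pointer_cast(t_data.data()), dsize);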

Since your 32-element grouped ranges are already sorted, I also pondered the idea of using multiple thrust::merge operations rather than a single sort. I'm not sure whether that would be faster, but since the thrust method is already so much faster than the 32-element shuffle search method, I think thrust (or cub) is the obvious choice.
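For completeness, that merge idea might look roughly like the untested sketch below (it builds on the method 2 variables above and needs #include <thrust/merge.h>). Note that it merges the groups one after another, which serializes the work, so it may well lose to the single sort:

// Combine the already-sorted gsize-element groups with repeated
// thrust::merge calls instead of one thrust::sort, then run the same
// vectorized binary search as in method 2.
thrust::device_vector<mytype> merged(t_grps.begin(), t_grps.begin() + gsize);
thrust::device_vector<mytype> tmp;
for (int g = 1; g < ngrp; g++) {
  tmp.resize(merged.size() + gsize);
  thrust::merge(merged.begin(), merged.end(),
                t_grps.begin() +  g      * gsize,
                t_grps.begin() + (g + 1) * gsize,
                tmp.begin());
  merged.swap(tmp);
}
thrust::binary_search(merged.begin(), merged.end(),
                      t_data.begin(), t_data.end(), t_rslt.begin());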

