diff --git a/baselines/pfsp/lib/Pool.c b/baselines/pfsp/lib/Pool.c
index 326f8c38d792790c214950d7fac7bac69620b7be..c04397b000021f955dd5efa5546738e871464e72 100644
--- a/baselines/pfsp/lib/Pool.c
+++ b/baselines/pfsp/lib/Pool.c
@@ -1,6 +1,8 @@
 #include "Pool.h"
 #include <stdlib.h>
 
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
 void initSinglePool(SinglePool* pool)
 {
   pool->elements = (Node*)malloc(CAPACITY * sizeof(Node));
@@ -33,6 +35,20 @@ Node popBack(SinglePool* pool, int* hasWork)
   return (Node){0};
 }
 
+// Bulk removal from the end of the deque.
+int popBackBulk(SinglePool* pool, const int m, const int M, Node* parents) {
+  if (pool->size >= m) {
+    const int poolSize = MIN(pool->size, M);
+    pool->size -= poolSize;
+    for (int i = 0; i < poolSize; i++) {
+      parents[i] = pool->elements[pool->front + pool->size + i];
+    }
+    return poolSize;
+  }
+
+  return 0;
+}
+
 // Removal from the front of the deque.
 Node popFront(SinglePool* pool, int* hasWork)
 {
diff --git a/baselines/pfsp/lib/Pool.h b/baselines/pfsp/lib/Pool.h
index a93556d4739eded990887469fb8a244fc2d38172..acb647644cf293941405ebf587df2b812fa58a29 100644
--- a/baselines/pfsp/lib/Pool.h
+++ b/baselines/pfsp/lib/Pool.h
@@ -30,6 +30,8 @@ void pushBack(SinglePool* pool, Node node);
 
 Node popBack(SinglePool* pool, int* hasWork);
 
+int popBackBulk(SinglePool* pool, const int m, const int M, Node* parents);
+
 Node popFront(SinglePool* pool, int* hasWork);
 
 void deleteSinglePool(SinglePool* pool);
diff --git a/baselines/pfsp/lib/evaluate.cu b/baselines/pfsp/lib/evaluate.cu
index 822b9631fb4e488d1f3e000f47f7b6a3d5fb24a9..682976e047e8d9cac46c07da77902276661b919a 100644
--- a/baselines/pfsp/lib/evaluate.cu
+++ b/baselines/pfsp/lib/evaluate.cu
@@ -6,113 +6,114 @@ extern "C" {
 #include "evaluate.h"
 #include "c_bounds_gpu.cu"
 
-  __device__ void swap_cuda(int* a, int* b)
-  {
-    int tmp = *b;
-    *b = *a;
-    *a = tmp;
-  }
+__device__ void swap_cuda(int* a, int* b)
+{
+  int tmp = *b;
+  *b = *a;
+  *a = tmp;
+}
 
-  void printDims(dim3 gridDim, dim3 blockDim) {
-    printf("Grid Dimensions : [%d, %d, %d] blocks. \n",
-      gridDim.x, gridDim.y, gridDim.z);
+void printDims(dim3 gridDim, dim3 blockDim) {
+  printf("Grid Dimensions : [%d, %d, %d] blocks. \n",
+    gridDim.x, gridDim.y, gridDim.z);
 
-    printf("Block Dimensions : [%d, %d, %d] threads.\n",
-      blockDim.x, blockDim.y, blockDim.z);
-  }
+  printf("Block Dimensions : [%d, %d, %d] threads.\n",
+    blockDim.x, blockDim.y, blockDim.z);
+}
 
-  // Evaluate a bulk of parent nodes on GPU using lb1
-  __global__ void evaluate_gpu_lb1(const int jobs, const int size, Node* parents_d, const lb1_bound_data  lbound1_d, int* bounds)
-  {
-    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (threadId < size) {
-      const int parentId = threadId / jobs;
-      const int k = threadId % jobs;
-      Node parent =  parents_d[parentId];
-      int depth = parent.depth;
-      int limit1 = parent.limit1;
-
-      // We evaluate all permutations by varying index k from limit1 forward
-      if (k >= limit1+1) {
-        swap_cuda(&parent.prmu[depth],&parent.prmu[k]);
-        lb1_bound_gpu(lbound1_d, parent.prmu, limit1+1, jobs, &bounds[threadId]);
-        swap_cuda(&parent.prmu[depth],&parent.prmu[k]);
-      }
+// Evaluate a bulk of parent nodes on GPU using lb1
+__global__ void evaluate_gpu_lb1(const int jobs, const int size, Node* parents_d, const lb1_bound_data  lbound1_d, int* bounds)
+{
+  int threadId = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (threadId < size) {
+    const int parentId = threadId / jobs;
+    const int k = threadId % jobs;
+    Node parent =  parents_d[parentId];
+    int depth = parent.depth;
+    int limit1 = parent.limit1;
+
+    // We evaluate all permutations by varying index k from limit1 forward
+    if (k >= limit1+1) {
+      swap_cuda(&parent.prmu[depth], &parent.prmu[k]);
+      lb1_bound_gpu(lbound1_d, parent.prmu, limit1+1, jobs, &bounds[threadId]);
+      swap_cuda(&parent.prmu[depth], &parent.prmu[k]);
     }
   }
+}
+
+/*
+  NOTE: This lower bound evaluates all the children of a given parent at the same time.
+  Therefore, the GPU loop is on the parent nodes and not on the children ones, in contrast
+  to the other lower bounds.
+*/
+// Evaluate a bulk of parent nodes on GPU using lb1_d.
+__global__ void evaluate_gpu_lb1_d(const int jobs, const int size, Node* parents_d, const lb1_bound_data lbound1_d, int* bounds)
+{
+  int parentId = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (parentId < size) {
+    Node parent = parents_d[parentId];
 
-  /*
-    NOTE: This lower bound evaluates all the children of a given parent at the same time.
-    Therefore, the GPU loop is on the parent nodes and not on the children ones, in contrast
-    to the other lower bounds.
-  */
-  // Evaluate a bulk of parent nodes on GPU using lb1_d.
-  __global__ void evaluate_gpu_lb1_d(const int jobs, const int size, Node* parents_d, const lb1_bound_data lbound1_d, int* bounds)
-  {
-    int parentId = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if(parentId < size){
-      Node parent = parents_d[parentId];
-
-      // Vector of integers of size MAX_JOBS
-      int lb_begin[MAX_JOBS];
-
-      lb1_children_bounds_gpu(lbound1_d, parent.prmu, parent.limit1, jobs, lb_begin);
-
-      for(int k = 0; k < jobs; k++) {
-        if (k >= parent.limit1+1) {
-          const int job = parent.prmu[k];
-          bounds[parentId*jobs+k] = lb_begin[job];
-        }
+    // Vector of integers of size MAX_JOBS
+    int lb_begin[MAX_JOBS];
+
+    lb1_children_bounds_gpu(lbound1_d, parent.prmu, parent.limit1, jobs, lb_begin);
+
+    for (int k = 0; k < jobs; k++) {
+      if (k >= parent.limit1+1) {
+        const int job = parent.prmu[k];
+        bounds[parentId*jobs+k] = lb_begin[job];
       }
     }
   }
+}
 
-  // Evaluate a bulk of parent nodes on GPU using lb2.
-  __global__ void evaluate_gpu_lb2(const int jobs, const int size, int best, Node* parents_d, const lb1_bound_data lbound1_d, const lb2_bound_data lbound2_d, int* bounds)
-  {
-    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (threadId < size) {
-      const int parentId = threadId / jobs;
-      const int k = threadId % jobs;
-      Node parent =  parents_d[parentId];
-      int depth = parent.depth;
-      int limit1 = parent.limit1;
-
-      // We evaluate all permutations by varying index k from limit1 forward
-      if (k >= limit1+1) {
-        swap_cuda(&parent.prmu[depth],&parent.prmu[k]);
-        lb2_bound_gpu(lbound1_d, lbound2_d, parent.prmu, limit1+1, jobs, best, &bounds[threadId]);
-        swap_cuda(&parent.prmu[depth],&parent.prmu[k]);
-      }
+// Evaluate a bulk of parent nodes on GPU using lb2.
+__global__ void evaluate_gpu_lb2(const int jobs, const int size, int best, Node* parents_d, const lb1_bound_data lbound1_d, const lb2_bound_data lbound2_d, int* bounds)
+{
+  int threadId = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (threadId < size) {
+    const int parentId = threadId / jobs;
+    const int k = threadId % jobs;
+    Node parent =  parents_d[parentId];
+    int depth = parent.depth;
+    int limit1 = parent.limit1;
+
+    // We evaluate all permutations by varying index k from limit1 forward
+    if (k >= limit1+1) {
+      swap_cuda(&parent.prmu[depth], &parent.prmu[k]);
+      lb2_bound_gpu(lbound1_d, lbound2_d, parent.prmu, limit1+1, jobs, best, &bounds[threadId]);
+      swap_cuda(&parent.prmu[depth], &parent.prmu[k]);
     }
   }
+}
 
-  void evaluate_gpu(const int jobs, const int lb, const int size, const int nbBlocks,
-    int* best, const lb1_bound_data lbound1, const lb2_bound_data lbound2, Node* parents, int* bounds)
-  {
-    // 1D grid of 1D nbBlocks(_lb1_d) blocks with block size BLOCK_SIZE
-    int nbBlocks_lb1_d;
-    switch (lb) {
-    case 0: // lb1_d
-      nbBlocks_lb1_d = ceil((double)nbBlocks/jobs);
-      evaluate_gpu_lb1_d<<<nbBlocks_lb1_d, BLOCK_SIZE>>>(jobs, size, parents, lbound1, bounds);
-      return;
-      break;
-
-    case 1: // lb1
-      evaluate_gpu_lb1<<<nbBlocks, BLOCK_SIZE>>>(jobs, size, parents, lbound1, bounds);
-      return;
-      break;
-
-    case 2: // lb2
-      evaluate_gpu_lb2<<<nbBlocks, BLOCK_SIZE>>>(jobs, size, *best, parents, lbound1, lbound2, bounds);
-      return;
-      break;
-    }
+void evaluate_gpu(const int jobs, const int lb, const int size, const int nbBlocks,
+  int* best, const lb1_bound_data lbound1, const lb2_bound_data lbound2, Node* parents, int* bounds)
+{
+  // 1D grid of 1D nbBlocks(_lb1_d) blocks with block size BLOCK_SIZE
+  int nbBlocks_lb1_d;
+  switch (lb) {
+  case 0: // lb1_d
+    nbBlocks_lb1_d = ceil((double)nbBlocks/jobs);
+    evaluate_gpu_lb1_d<<<nbBlocks_lb1_d, BLOCK_SIZE>>>(jobs, size, parents, lbound1, bounds);
+    return;
+    break;
+
+  case 1: // lb1
+    evaluate_gpu_lb1<<<nbBlocks, BLOCK_SIZE>>>(jobs, size, parents, lbound1, bounds);
+    return;
+    break;
+
+  case 2: // lb2
+    evaluate_gpu_lb2<<<nbBlocks, BLOCK_SIZE>>>(jobs, size, *best, parents, lbound1, lbound2, bounds);
+    return;
+    break;
   }
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/baselines/pfsp/pfsp_gpu_cuda.c b/baselines/pfsp/pfsp_gpu_cuda.c
index 9f5079071c7a1e3a8f79d45a4936d24fbaf9d45c..b8bea6d49d10ba06d54e3edc6f8cee8d5a522d0a 100644
--- a/baselines/pfsp/pfsp_gpu_cuda.c
+++ b/baselines/pfsp/pfsp_gpu_cuda.c
@@ -162,10 +162,10 @@ void decompose_lb1(const int jobs, const lb1_bound_data* const lbound1, const No
 {
   for (int i = parent.limit1+1; i < jobs; i++) {
     Node child;
-    memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
-    swap(&child.prmu[parent.depth], &child.prmu[i]);
     child.depth = parent.depth + 1;
     child.limit1 = parent.limit1 + 1;
+    memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
+    swap(&child.prmu[parent.depth], &child.prmu[i]);
 
     int lowerbound = lb1_bound(lbound1, child.prmu, child.limit1, jobs);
 
@@ -204,9 +204,9 @@ void decompose_lb1_d(const int jobs, const lb1_bound_data* const lbound1, const
     } else { // if not leaf
       if (lb < *best) { // if child feasible
         Node child;
-        memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
         child.depth = parent.depth + 1;
         child.limit1 = parent.limit1 + 1;
+        memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
         swap(&child.prmu[child.limit1], &child.prmu[i]);
 
         pushBack(pool, child);
@@ -224,10 +224,10 @@ void decompose_lb2(const int jobs, const lb1_bound_data* const lbound1, const lb
 {
   for (int i = parent.limit1+1; i < jobs; i++) {
     Node child;
-    memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
-    swap(&child.prmu[parent.depth], &child.prmu[i]);
     child.depth = parent.depth + 1;
     child.limit1 = parent.limit1 + 1;
+    memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
+    swap(&child.prmu[parent.depth], &child.prmu[i]);
 
     int lowerbound = lb2_bound(lbound1, lbound2, child.prmu, child.limit1, jobs, *best);
 
@@ -277,19 +277,19 @@ void generate_children(Node* parents, const int size, const int jobs, int* bound
       const int lowerbound = bounds[j + i * jobs];
 
       // If child leaf
-      if(depth + 1 == jobs){
+      if (depth + 1 == jobs) {
         *exploredSol += 1;
 
         // If child feasible
-        if(lowerbound < *best) *best = lowerbound;
+        if (lowerbound < *best) *best = lowerbound;
 
       } else { // If not leaf
-        if(lowerbound < *best) {
+        if (lowerbound < *best) {
           Node child;
-          memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
-          swap(&child.prmu[depth], &child.prmu[j]);
           child.depth = depth + 1;
           child.limit1 = parent.limit1 + 1;
+          memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
+          swap(&child.prmu[depth], &child.prmu[j]);
 
           pushBack(pool, child);
           *exploredTree += 1;
@@ -316,11 +316,14 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
 
   pushBack(&pool, root);
 
-  // Timer
   struct timespec start, end;
+
+  /*
+    Step 1: We perform a partial breadth-first search on CPU in order to create
+    a sufficiently large amount of work for GPU computation.
+  */
   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
-  // Bounding data
   lb1_bound_data* lbound1;
   lbound1 = new_bound_data(jobs, machines);
   taillard_get_processing_times(lbound1->p_times, inst);
@@ -332,19 +335,14 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
   fill_lags(lbound1->p_times, lbound2);
   fill_johnson_schedules(lbound1->p_times, lbound2);
 
-  /*
-    Step 1: We perform a partial breadth-first search on CPU in order to create
-    a sufficiently large amount of work for GPU computation.
-  */
-
-  while(pool.size < m) {
-    // CPU side
+  while (pool.size < m) {
     int hasWork = 0;
     Node parent = popFront(&pool, &hasWork);
     if (!hasWork) break;
 
     decompose(jobs, lb, best, lbound1, lbound2, parent, exploredTree, exploredSol, &pool);
   }
+
   clock_gettime(CLOCK_MONOTONIC_RAW, &end);
   double t1 = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
 
@@ -357,7 +355,6 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
     Step 2: We continue the search on GPU in a depth-first manner, until there
     is not enough work.
   */
-
   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
   // TODO: add function 'copyBoundsDevice' to perform the deep copy of bounding data
@@ -413,27 +410,18 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
   lbound2_d.nb_jobs = lbound2->nb_jobs;
   lbound2_d.nb_machines = lbound2->nb_machines;
 
-  // Allocating parents vector on CPU and GPU
   Node* parents = (Node*)malloc(M * sizeof(Node));
-  Node* parents_d;
-  cudaMalloc((void**)&parents_d, M * sizeof(Node));
-
-  // Allocating bounds vector on CPU and GPU
   int* bounds = (int*)malloc((jobs*M) * sizeof(int));
+
+  Node* parents_d;
   int *bounds_d;
+  cudaMalloc((void**)&parents_d, M * sizeof(Node));
   cudaMalloc((void**)&bounds_d, (jobs*M) * sizeof(int));
 
   while (1) {
-    int poolSize = pool.size;
-    if (poolSize >= m) {
-      poolSize = MIN(poolSize,M);
-
-      for(int i= 0; i < poolSize; i++) {
-        int hasWork = 0;
-        parents[i] = popBack(&pool,&hasWork);
-        if (!hasWork) break;
-      }
+    int poolSize = popBackBulk(&pool, m, M, parents);
 
+    if (poolSize > 0) {
       /*
         TODO: Optimize 'numBounds' based on the fact that the maximum number of
         generated children for a parent is 'parent.limit2 - parent.limit1 + 1' or
@@ -443,10 +431,7 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
       const int nbBlocks = ceil((double)numBounds / BLOCK_SIZE);
 
       cudaMemcpy(parents_d, parents, poolSize *sizeof(Node), cudaMemcpyHostToDevice);
-
-      // numBounds is the 'size' of the problem
       evaluate_gpu(jobs, lb, numBounds, nbBlocks, best, lbound1_d, lbound2_d, parents_d, bounds_d);
-
       cudaMemcpy(bounds, bounds_d, numBounds * sizeof(int), cudaMemcpyDeviceToHost);
 
       /*
@@ -458,6 +443,7 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
       break;
     }
   }
+
   clock_gettime(CLOCK_MONOTONIC_RAW, &end);
   double t2 = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
 
@@ -469,8 +455,8 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
   /*
     Step 3: We complete the depth-first search on CPU.
   */
-
   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+
   while (1) {
     int hasWork = 0;
     Node parent = popBack(&pool, &hasWork);
@@ -479,14 +465,17 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
     decompose(jobs, lb, best, lbound1, lbound2, parent, exploredTree, exploredSol, &pool);
   }
 
-  // Freeing memory for structs
-  deleteSinglePool(&pool);
-  free_bound_data(lbound1);
-  free_johnson_bd_data(lbound2);
+  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
+  double t3 = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
+  *elapsedTime = t1 + t2 + t3;
+
+  printf("\nSearch on CPU completed\n");
+  printf("Size of the explored tree: %llu\n", *exploredTree);
+  printf("Number of explored solutions: %llu\n", *exploredSol);
+  printf("Elapsed time: %f [s]\n", t3);
+
+  printf("\nExploration terminated.\n");
 
-  // Freeing memory for device
-  cudaFree(parents_d);
-  cudaFree(bounds_d);
   cudaFree(p_times_d);
   cudaFree(min_heads_d);
   cudaFree(min_tails_d);
@@ -495,20 +484,15 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
   cudaFree(machine_pairs_1_d);
   cudaFree(machine_pairs_2_d);
   cudaFree(machine_pair_order_d);
+  cudaFree(parents_d);
+  cudaFree(bounds_d);
 
-  //Freeing memory for host
+  free_bound_data(lbound1);
+  free_johnson_bd_data(lbound2);
   free(parents);
   free(bounds);
 
-  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
-  double t3 = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
-  *elapsedTime = t1 + t2 + t3;
-  printf("\nSearch on CPU completed\n");
-  printf("Size of the explored tree: %llu\n", *exploredTree);
-  printf("Number of explored solutions: %llu\n", *exploredSol);
-  printf("Elapsed time: %f [s]\n", t3);
-
-  printf("\nExploration terminated.\n");
+  deleteSinglePool(&pool);
 }
 
 int main(int argc, char* argv[])
diff --git a/lib/pfsp/Bound_johnson.chpl b/lib/pfsp/Bound_johnson.chpl
index 6f60be9c341e1fe589ff435eb09dbfa046afd672..e8a25c89348f436c0a1819a5ca66b23eb504d093 100644
--- a/lib/pfsp/Bound_johnson.chpl
+++ b/lib/pfsp/Bound_johnson.chpl
@@ -47,18 +47,6 @@ module Bound_johnson
     }
   }
 
-  /* NOTE: This wrapper allows one to store persistent data on the GPU memory. */
-  class WrapperClassLB2 {
-    forwarding var lb2_bound: lb2_bound_data;
-
-    proc init(const jobs: int(32), const machines: int(32))
-    {
-      this.lb2_bound = new lb2_bound_data(jobs, machines);
-    }
-  }
-
-  type WrapperLB2 = owned WrapperClassLB2?;
-
   proc fill_machine_pairs(ref lb2_data: lb2_bound_data/*, enum lb2_variant lb2_type*/): void
   {
     var lb2_type = lb2_variant.LB2_LEARN;
@@ -201,13 +189,18 @@ module Bound_johnson
   {
     const nb_jobs = lb2_data.nb_jobs;
 
+    use CTypes only c_ptrToConst;
+    const lb2_js = c_ptrToConst(lb2_data.johnson_schedules[0]);
+    const lb1_pt = c_ptrToConst(lb1_p_times[0]);
+    const lb2_lags = c_ptrToConst(lb2_data.lags[0]);
+
     for j in 0..#nb_jobs {
-      var job = lb2_data.johnson_schedules[ind*nb_jobs + j];
+      var job = lb2_js[ind*nb_jobs + j];
       // j-loop is on unscheduled jobs... (==0 if jobCour is unscheduled)
       if (flag[job] == 0) {
-        var ptm0 = lb1_p_times[ma0*nb_jobs + job];
-        var ptm1 = lb1_p_times[ma1*nb_jobs + job];
-        var lag = lb2_data.lags[ind*nb_jobs + job];
+        var ptm0 = lb1_pt[ma0*nb_jobs + job];
+        var ptm1 = lb1_pt[ma1*nb_jobs + job];
+        var lag = lb2_lags[ind*nb_jobs + job];
         // add job on ma0 and ma1
         tmp0 += ptm0;
         tmp1 = max(tmp1,tmp0 + lag);
diff --git a/lib/pfsp/Bound_simple.chpl b/lib/pfsp/Bound_simple.chpl
index a9efd3f5d105a0f1d0d1366ffadd6765fa652e09..0ca9004bae6a69569add62f8aab836b54347e545 100644
--- a/lib/pfsp/Bound_simple.chpl
+++ b/lib/pfsp/Bound_simple.chpl
@@ -26,18 +26,6 @@ module Bound_simple
     }
   }
 
-  /* NOTE: This wrapper allows one to store persistent data on the GPU memory. */
-  class WrapperClassLB1 {
-    forwarding var lb1_bound: lb1_bound_data;
-
-    proc init(const jobs: int(32), const machines: int(32))
-    {
-      this.lb1_bound = new lb1_bound_data(jobs, machines);
-    }
-  }
-
-  type WrapperLB1 = owned WrapperClassLB1?;
-
   inline proc add_forward(const job: int(32), const p_times: [] int(32), const nb_jobs: int(32), const nb_machines: int(32), ref front): void
   {
     front[0] += p_times[job];
diff --git a/pfsp_chpl.chpl b/pfsp_chpl.chpl
index 2cbc2ade2f1999435ef04da5ef3ea9910134d8e6..3e7b578e467c3557ddb5f6b1e751cd6a2704c17a 100644
--- a/pfsp_chpl.chpl
+++ b/pfsp_chpl.chpl
@@ -6,7 +6,6 @@ use Time;
 
 use util;
 use Pool;
-
 use PFSP_node;
 use Bound_johnson;
 use Bound_simple;
diff --git a/pfsp_gpu_chpl.chpl b/pfsp_gpu_chpl.chpl
index 617f9ceff04a31b14aec4b4b520982bafc503c47..b6641c70404e15e8ecfdd0dc744c7c8eb55338e0 100644
--- a/pfsp_gpu_chpl.chpl
+++ b/pfsp_gpu_chpl.chpl
@@ -5,11 +5,8 @@
 use Time;
 use GpuDiagnostics;
 
-config const BLOCK_SIZE = 512;
-
 use util;
 use Pool;
-
 use PFSP_node;
 use Bound_johnson;
 use Bound_simple;
@@ -17,6 +14,8 @@ use Taillard;
 
 const allowedLowerBounds = ["lb1", "lb1_d", "lb2"];
 
+config const BLOCK_SIZE = 512;
+
 /*******************************************************************************
 Implementation of the single-GPU PFSP search.
 *******************************************************************************/
@@ -35,15 +34,6 @@ config const ub: int = 1; // initial upper bound
 const jobs = taillard_get_nb_jobs(inst);
 const machines = taillard_get_nb_machines(inst);
 
-var lbound1 = new WrapperLB1(jobs, machines); //lb1_bound_data(jobs, machines);
-taillard_get_processing_times(lbound1!.lb1_bound.p_times, inst);
-fill_min_heads_tails(lbound1!.lb1_bound);
-
-var lbound2 = new WrapperLB2(jobs, machines);
-fill_machine_pairs(lbound2!.lb2_bound/*, LB2_FULL*/);
-fill_lags(lbound1!.lb1_bound.p_times, lbound2!.lb2_bound);
-fill_johnson_schedules(lbound1!.lb1_bound.p_times, lbound2!.lb2_bound);
-
 const initUB = if (ub == 1) then taillard_get_best_ub(inst) else max(int);
 
 proc check_parameters()
@@ -95,8 +85,8 @@ proc help_message(): void
 }
 
 // Evaluate and generate children nodes on CPU.
-proc decompose_lb1(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose_lb1(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
+  ref best: int, ref pool)
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -105,7 +95,7 @@ proc decompose_lb1(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
     child.prmu = parent.prmu;
     child.prmu[parent.depth] <=> child.prmu[i];
 
-    var lowerbound = lb1_bound(lbound1!.lb1_bound, child.prmu, child.limit1, jobs);
+    var lowerbound = lb1_bound(lb1_data, child.prmu, child.limit1, jobs);
 
     if (child.depth == jobs) { // if child leaf
       num_sol += 1;
@@ -122,12 +112,12 @@ proc decompose_lb1(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
   }
 }
 
-proc decompose_lb1_d(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose_lb1_d(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
+  ref best: int, ref pool)
 {
   var lb_begin: MAX_JOBS*int(32);
 
-  lb1_children_bounds(lbound1!.lb1_bound, parent.prmu, parent.limit1, jobs, lb_begin);
+  lb1_children_bounds(lb1_data, parent.prmu, parent.limit1, jobs, lb_begin);
 
   for i in parent.limit1+1..(jobs-1) {
     const job = parent.prmu[i];
@@ -154,8 +144,8 @@ proc decompose_lb1_d(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
   }
 }
 
-proc decompose_lb2(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool)
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -164,7 +154,7 @@ proc decompose_lb2(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
     child.prmu = parent.prmu;
     child.prmu[parent.depth] <=> child.prmu[i];
 
-    var lowerbound = lb2_bound(lbound1!.lb1_bound, lbound2!.lb2_bound, child.prmu, child.limit1, jobs, best);
+    var lowerbound = lb2_bound(lb1_data, lb2_data, child.prmu, child.limit1, jobs, best);
 
     if (child.depth == jobs) { // if child leaf
       num_sol += 1;
@@ -182,18 +172,18 @@ proc decompose_lb2(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
 }
 
 // Evaluate and generate children nodes on CPU.
-proc decompose(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool)
 {
   select lb {
     when "lb1_d" {
-      decompose_lb1_d(parent, tree_loc, num_sol, best, pool);
+      decompose_lb1_d(lb1_data, parent, tree_loc, num_sol, best, pool);
     }
     when "lb1" {
-      decompose_lb1(parent, tree_loc, num_sol, best, pool);
+      decompose_lb1(lb1_data, parent, tree_loc, num_sol, best, pool);
     }
     otherwise { // lb2
-      decompose_lb2(parent, tree_loc, num_sol, best, pool);
+      decompose_lb2(lb1_data, lb2_data, parent, tree_loc, num_sol, best, pool);
     }
   }
 }
@@ -211,7 +201,7 @@ proc evaluate_gpu_lb1(const parents_d: [] Node, const size, const lbound1_d, ref
 
     if (k >= parent.limit1+1) {
       prmu[depth] <=> prmu[k];
-      bounds_d[threadId] = lb1_bound(lbound1_d!.lb1_bound, prmu, parent.limit1+1, jobs);
+      bounds_d[threadId] = lb1_bound(lbound1_d, prmu, parent.limit1+1, jobs);
       prmu[depth] <=> prmu[k];
     }
   }
@@ -228,12 +218,12 @@ proc evaluate_gpu_lb1_d(const parents_d: [] Node, const size, const best, const
   @assertOnGpu
   foreach parentId in 0..#(size/jobs) {
     var parent = parents_d[parentId];
-    const depth = parent.depth;
+    /* const depth = parent.depth; */
     var prmu = parent.prmu;
 
     var lb_begin: MAX_JOBS*int(32);
 
-    lb1_children_bounds(lbound1_d!.lb1_bound, parent.prmu, parent.limit1, jobs, lb_begin);
+    lb1_children_bounds(lbound1_d, parent.prmu, parent.limit1, jobs, lb_begin);
 
     for k in 0..#jobs {
       if (k >= parent.limit1+1) {
@@ -257,7 +247,7 @@ proc evaluate_gpu_lb2(const parents_d: [] Node, const size, const best, const lb
 
     if (k >= parent.limit1+1) {
       prmu[depth] <=> prmu[k];
-      bounds_d[threadId] = lb2_bound(lbound1_d!.lb1_bound, lbound2_d!.lb2_bound, prmu, parent.limit1+1, jobs, best);
+      bounds_d[threadId] = lb2_bound(lbound1_d, lbound2_d, prmu, parent.limit1+1, jobs, best);
       prmu[depth] <=> prmu[k];
     }
   }
@@ -299,10 +289,10 @@ proc generate_children(const ref parents: [] Node, const size: int, const ref bo
       } else { // if not leaf
         if (lowerbound < best) { // if child feasible
           var child = new Node();
-          child.depth = parent.depth + 1;
+          child.depth = depth + 1;
           child.limit1 = parent.limit1 + 1;
           child.prmu = parent.prmu;
-          child.prmu[parent.depth] <=> child.prmu[j];
+          child.prmu[depth] <=> child.prmu[j];
 
           pool.pushBack(child);
           exploredTree += 1;
@@ -312,16 +302,6 @@ proc generate_children(const ref parents: [] Node, const size: int, const ref bo
   }
 }
 
-class WrapperClassArrayParents {
-  forwarding var arr: [0..#M] Node = noinit;
-}
-type WrapperArrayParents = owned WrapperClassArrayParents?;
-
-class WrapperClassArrayBounds {
-  forwarding var arr: [0..#(M*jobs)] int(32) = noinit;
-}
-type WrapperArrayBounds = owned WrapperClassArrayBounds?;
-
 // Single-GPU PFSP search.
 proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint, ref elapsedTime: real)
 {
@@ -342,15 +322,26 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
   */
   timer.start();
 
+  var lbound1 = new lb1_bound_data(jobs, machines);
+  taillard_get_processing_times(lbound1.p_times, inst);
+  fill_min_heads_tails(lbound1);
+
+  var lbound2 = new lb2_bound_data(jobs, machines);
+  fill_machine_pairs(lbound2/*, LB2_FULL*/);
+  fill_lags(lbound1.p_times, lbound2);
+  fill_johnson_schedules(lbound1.p_times, lbound2);
+
   while (pool.size < m) {
     var hasWork = 0;
     var parent = pool.popFront(hasWork);
     if !hasWork then break;
 
-    decompose(parent, exploredTree, exploredSol, best, pool);
+    decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
   }
+
   timer.stop();
   const res1 = (timer.elapsed(), exploredTree, exploredSol);
+
   writeln("\nInitial search on CPU completed");
   writeln("Size of the explored tree: ", res1[1]);
   writeln("Number of explored solutions: ", res1[2]);
@@ -365,38 +356,24 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
   var parents: [0..#M] Node = noinit;
   var bounds: [0..#(M*jobs)] int(32) = noinit;
 
-  var lbound1_d: lbound1.type;
-  var lbound2_d: lbound2.type;
-  var parents_d: WrapperArrayParents;
-  var bounds_d: WrapperArrayBounds;
-
-  on device {
-    lbound1_d = new WrapperLB1(jobs, machines);
-    lbound1_d!.lb1_bound.p_times   = lbound1!.lb1_bound.p_times;
-    lbound1_d!.lb1_bound.min_heads = lbound1!.lb1_bound.min_heads;
-    lbound1_d!.lb1_bound.min_tails = lbound1!.lb1_bound.min_tails;
-
-    lbound2_d = new WrapperLB2(jobs, machines);
-    lbound2_d!.lb2_bound.johnson_schedules  = lbound2!.lb2_bound.johnson_schedules;
-    lbound2_d!.lb2_bound.lags               = lbound2!.lb2_bound.lags;
-    lbound2_d!.lb2_bound.machine_pairs      = lbound2!.lb2_bound.machine_pairs;
-    lbound2_d!.lb2_bound.machine_pair_order = lbound2!.lb2_bound.machine_pair_order;
-
-    parents_d = new WrapperArrayParents();
-    bounds_d = new WrapperArrayBounds();
-  }
+  on device var parents_d: [0..#M] Node;
+  on device var bounds_d: [0..#(M*jobs)] int(32);
 
-  while true {
+  on device var lbound1_d = new lb1_bound_data(jobs, machines);
+  lbound1_d.p_times   = lbound1.p_times;
+  lbound1_d.min_heads = lbound1.min_heads;
+  lbound1_d.min_tails = lbound1.min_tails;
 
-    var poolSize = pool.size;
-    if (poolSize >= m) {
-      poolSize = min(poolSize, M);
-      for i in 0..#poolSize {
-        var hasWork = 0;
-        parents[i] = pool.popBack(hasWork);
-        if !hasWork then break;
-      }
+  on device var lbound2_d = new lb2_bound_data(jobs, machines);
+  lbound2_d.johnson_schedules  = lbound2.johnson_schedules;
+  lbound2_d.lags               = lbound2.lags;
+  lbound2_d.machine_pairs      = lbound2.machine_pairs;
+  lbound2_d.machine_pair_order = lbound2.machine_pair_order;
+
+  while true {
+    var poolSize = pool.popBackBulk(m, M, parents);
 
+    if (poolSize > 0) {
       /*
         TODO: Optimize 'numBounds' based on the fact that the maximum number of
         generated children for a parent is 'parent.limit2 - parent.limit1 + 1' or
@@ -404,11 +381,9 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
       */
       const numBounds = jobs * poolSize;
 
-      on device {
-        parents_d!.arr = parents; // host-to-device
-        evaluate_gpu(parents_d!.arr, numBounds, best, lbound1_d, lbound2_d, bounds_d!.arr);
-        bounds = bounds_d!.arr; // device-to-host
-      }
+      parents_d = parents; // host-to-device
+      on device do evaluate_gpu(parents_d, numBounds, best, lbound1_d, lbound2_d, bounds_d); // GPU kernel
+      bounds = bounds_d; // device-to-host
 
       /*
         Each task generates and inserts its children nodes to the pool.
@@ -419,9 +394,10 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
       break;
     }
   }
-  timer.stop();
 
+  timer.stop();
   const res2 = (timer.elapsed(), exploredTree, exploredSol) - res1;
+
   writeln("Search on GPU completed");
   writeln("Size of the explored tree: ", res2[1]);
   writeln("Number of explored solutions: ", res2[2]);
@@ -431,16 +407,19 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
     Step 3: We complete the depth-first search on CPU.
   */
   timer.start();
+
   while true {
     var hasWork = 0;
     var parent = pool.popBack(hasWork);
     if !hasWork then break;
 
-    decompose(parent, exploredTree, exploredSol, best, pool);
+    decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
   }
+
   timer.stop();
   elapsedTime = timer.elapsed();
   const res3 = (elapsedTime, exploredTree, exploredSol) - res1 - res2;
+
   writeln("Search on CPU completed");
   writeln("Size of the explored tree: ", res3[1]);
   writeln("Number of explored solutions: ", res3[2]);
diff --git a/pfsp_multigpu_chpl.chpl b/pfsp_multigpu_chpl.chpl
index 2bb3ae22c4ceb07c5ddefe3c6fadeb9b76a773ca..4d2db4e6bd45aaf9e49aedc8f4af01094c43f7c4 100644
--- a/pfsp_multigpu_chpl.chpl
+++ b/pfsp_multigpu_chpl.chpl
@@ -6,11 +6,8 @@ use Time;
 use Random;
 use GpuDiagnostics;
 
-config const BLOCK_SIZE = 512;
-
 use util;
 use Pool_par;
-
 use PFSP_node;
 use Bound_johnson;
 use Bound_simple;
@@ -18,6 +15,8 @@ use Taillard;
 
 const allowedLowerBounds = ["lb1", "lb1_d", "lb2"];
 
+config const BLOCK_SIZE = 512;
+
 /*******************************************************************************
 Implementation of the multi-GPU PFSP search.
 *******************************************************************************/
@@ -89,7 +88,7 @@ proc help_message(): void
 
 // Evaluate and generate children nodes on CPU.
 proc decompose_lb1(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool_par(Node))
+  ref best: int, ref pool)
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -108,7 +107,7 @@ proc decompose_lb1(const lb1_data, const parent: Node, ref tree_loc: uint, ref n
       }
     } else { // if not leaf
       if (lowerbound < best) { // if child feasible
-        pool.pushBack(child);
+        pool.pushBackFree(child);
         tree_loc += 1;
       }
     }
@@ -116,7 +115,7 @@ proc decompose_lb1(const lb1_data, const parent: Node, ref tree_loc: uint, ref n
 }
 
 proc decompose_lb1_d(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool_par(Node))
+  ref best: int, ref pool)
 {
   var lb_begin: MAX_JOBS*int(32);
 
@@ -140,15 +139,15 @@ proc decompose_lb1_d(const lb1_data, const parent: Node, ref tree_loc: uint, ref
         child.prmu = parent.prmu;
         child.prmu[parent.depth] <=> child.prmu[i];
 
-        pool.pushBack(child);
+        pool.pushBackFree(child);
         tree_loc += 1;
       }
     }
   }
 }
 
-proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool_par(Node))
+proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool)
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -167,7 +166,7 @@ proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_
       }
     } else { // if not leaf
       if (lowerbound < best) { // if child feasible
-        pool.pushBack(child);
+        pool.pushBackFree(child);
         tree_loc += 1;
       }
     }
@@ -175,8 +174,8 @@ proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_
 }
 
 // Evaluate and generate children nodes on CPU.
-proc decompose(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool_par(Node))
+proc decompose(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool)
 {
   select lb {
     when "lb1_d" {
@@ -221,7 +220,7 @@ proc evaluate_gpu_lb1_d(const parents_d: [] Node, const size, const best, const
   @assertOnGpu
   foreach parentId in 0..#(size/jobs) {
     var parent = parents_d[parentId];
-    const depth = parent.depth;
+    /* const depth = parent.depth; */
     var prmu = parent.prmu;
 
     var lb_begin: MAX_JOBS*int(32);
@@ -261,21 +260,23 @@ proc evaluate_gpu(const parents_d: [] Node, const size, const best, const lbound
 {
   select lb {
     when "lb1_d" {
-      evaluate_gpu_lb1_d(parents_d, size, best, lbound1_d!.lb1_bound, bounds_d);
+      evaluate_gpu_lb1_d(parents_d, size, best, lbound1_d, bounds_d);
     }
     when "lb1" {
-      evaluate_gpu_lb1(parents_d, size, lbound1_d!.lb1_bound, bounds_d);
+      evaluate_gpu_lb1(parents_d, size, lbound1_d, bounds_d);
     }
     otherwise { // lb2
-      evaluate_gpu_lb2(parents_d, size, best, lbound1_d!.lb1_bound, lbound2_d!.lb2_bound, bounds_d);
+      evaluate_gpu_lb2(parents_d, size, best, lbound1_d, lbound2_d, bounds_d);
     }
   }
 }
 
 // Generate children nodes (evaluated by GPU) on CPU.
 proc generate_children(const ref parents: [] Node, const size: int, const ref bounds: [] int(32),
-  ref exploredTree: uint, ref exploredSol: uint, ref best: int, ref pool: SinglePool_par(Node))
+  ref exploredTree: uint, ref exploredSol: uint, ref best: int, ref pool)
 {
+  pool.acquireLock();
+
   for i in 0..#size {
     const parent = parents[i];
     const depth = parent.depth;
@@ -292,28 +293,20 @@ proc generate_children(const ref parents: [] Node, const size: int, const ref bo
       } else { // if not leaf
         if (lowerbound < best) { // if child feasible
           var child = new Node();
-          child.depth = parent.depth + 1;
+          child.depth = depth + 1;
           child.limit1 = parent.limit1 + 1;
           child.prmu = parent.prmu;
-          child.prmu[parent.depth] <=> child.prmu[j];
+          child.prmu[depth] <=> child.prmu[j];
 
-          pool.pushBack(child);
+          pool.pushBackFree(child);
           exploredTree += 1;
         }
       }
     }
   }
-}
 
-class WrapperClassArrayParents {
-  forwarding var arr: [0..#M] Node = noinit;
+  pool.releaseLock();
 }
-type WrapperArrayParents = owned WrapperClassArrayParents?;
-
-class WrapperClassArrayBounds {
-  forwarding var arr: [0..#(M*jobs)] int(32) = noinit;
-}
-type WrapperArrayBounds = owned WrapperClassArrayBounds?;
 
 // Multi-GPU PFSP search.
 proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint, ref elapsedTime: real)
@@ -325,9 +318,6 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
   var pool = new SinglePool_par(Node);
   pool.pushBack(root);
 
-  var allTasksIdleFlag: atomic bool = false;
-  var eachTaskState: [0..#D] atomic bool = BUSY; // one task per GPU
-
   var timer: stopwatch;
 
   /*
@@ -352,8 +342,10 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
 
     decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
   }
+
   timer.stop();
   const res1 = (timer.elapsed(), exploredTree, exploredSol);
+
   writeln("\nInitial search on CPU completed");
   writeln("Size of the explored tree: ", res1[1]);
   writeln("Number of explored solutions: ", res1[2]);
@@ -364,14 +356,16 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
     is not enough work.
   */
   timer.start();
+
   var eachExploredTree, eachExploredSol: [0..#D] uint = noinit;
   var eachBest: [0..#D] int = noinit;
+  var eachTaskState: [0..#D] atomic bool = BUSY; // one task per GPU
+  var allTasksIdleFlag: atomic bool = false;
 
   const poolSize = pool.size;
   const c = poolSize / D;
   const l = poolSize - (D-1)*c;
   const f = pool.front;
-  var lock_p: atomic bool;
 
   pool.front = 0;
   pool.size = 0;
@@ -401,31 +395,21 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
     var parents: [0..#M] Node = noinit;
     var bounds: [0..#(M*jobs)] int(32) = noinit;
 
-    var lbound1_d: WrapperLB1;
-    var lbound2_d: WrapperLB2;
-    var parents_d: WrapperArrayParents;
-    var bounds_d: WrapperArrayBounds;
-
-    on device {
-      lbound1_d = new WrapperLB1(jobs, machines);
-      lbound1_d!.lb1_bound.p_times   = lbound1.p_times;
-      lbound1_d!.lb1_bound.min_heads = lbound1.min_heads;
-      lbound1_d!.lb1_bound.min_tails = lbound1.min_tails;
-
-      lbound2_d = new WrapperLB2(jobs, machines);
-      lbound2_d!.lb2_bound.johnson_schedules  = lbound2.johnson_schedules;
-      lbound2_d!.lb2_bound.lags               = lbound2.lags;
-      lbound2_d!.lb2_bound.machine_pairs      = lbound2.machine_pairs;
-      lbound2_d!.lb2_bound.machine_pair_order = lbound2.machine_pair_order;
-
-      parents_d = new WrapperArrayParents();
-      bounds_d = new WrapperArrayBounds();
-    }
+    on device var parents_d: [0..#M] Node;
+    on device var bounds_d: [0..#(M*jobs)] int(32);
+
+    on device var lbound1_d = new lb1_bound_data(jobs, machines);
+    lbound1_d.p_times   = lbound1.p_times;
+    lbound1_d.min_heads = lbound1.min_heads;
+    lbound1_d.min_tails = lbound1.min_tails;
+
+    on device var lbound2_d = new lb2_bound_data(jobs, machines);
+    lbound2_d.johnson_schedules  = lbound2.johnson_schedules;
+    lbound2_d.lags               = lbound2.lags;
+    lbound2_d.machine_pairs      = lbound2.machine_pairs;
+    lbound2_d.machine_pair_order = lbound2.machine_pair_order;
 
     while true {
-      /*
-        Each task gets its parents nodes from the pool.
-      */
       var poolSize = pool_loc.popBackBulk(m, M, parents);
 
       if (poolSize > 0) {
@@ -434,15 +418,6 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
           eachTaskState[gpuID].write(BUSY);
         }
 
-        /* poolSize = min(poolSize, M);
-        var hasWork = 0;
-        var parents: [0..#poolSize] Node = noinit;
-        pool_loc.popBackBulk(poolSize, parents, hasWork);
-        if (hasWork == 0) {
-          writeln("DEADCODE in get parents");
-          break;
-        } */
-
         /*
           TODO: Optimize 'numBounds' based on the fact that the maximum number of
           generated children for a parent is 'parent.limit2 - parent.limit1 + 1' or
@@ -450,11 +425,9 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
         */
         const numBounds = jobs * poolSize;
 
-        on device {
-          parents_d!.arr = parents; // host-to-device
-          evaluate_gpu(parents_d!.arr, numBounds, best_l, lbound1_d, lbound2_d, bounds_d!.arr);
-          bounds = bounds_d!.arr; // device-to-host
-        }
+        parents_d = parents; // host-to-device
+        on device do evaluate_gpu(parents_d, numBounds, best_l, lbound1_d, lbound2_d, bounds_d); // GPU kernel
+        bounds = bounds_d; // device-to-host
 
         /*
           Each task generates and inserts its children nodes to the pool.
@@ -462,7 +435,7 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
         generate_children(parents, poolSize, bounds, tree, sol, best_l, pool_loc);
       }
       else {
-        // work stealing
+        // work stealing attempts
         var tries = 0;
         var steal = false;
         const victims = permute(0..#D);
@@ -486,9 +459,6 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
                     halt("DEADCODE in work stealing");
                   }
 
-                  /* for i in 0..#(size/2) {
-                    pool_loc.pushBack(p[i]);
-                  } */
                   pool_loc.pushBackBulk(p);
 
                   steal = true;
@@ -525,23 +495,25 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
       }
     }
 
-    if lock_p.compareAndSwap(false, true) {
-      const poolLocSize = pool_loc.size;
-      for p in 0..#poolLocSize {
-        var hasWork = 0;
-        pool.pushBack(pool_loc.popBack(hasWork));
-        if !hasWork then break;
-      }
-      lock_p.write(false);
+    const poolLocSize = pool_loc.size;
+    for p in 0..#poolLocSize {
+      var hasWork = 0;
+      pool.pushBack(pool_loc.popBack(hasWork));
+      if !hasWork then break;
     }
 
     eachExploredTree[gpuID] = tree;
     eachExploredSol[gpuID] = sol;
     eachBest[gpuID] = best_l;
-
-    writeln("work stealing tries on tasks ", gpuID, " : ", nSteal, " and successes : ", nSSteal);
   }
+
   timer.stop();
+  const res2 = (timer.elapsed(), exploredTree, exploredSol) - res1;
+
+  writeln("Search on GPU completed");
+  writeln("Size of the explored tree: ", res2[1]);
+  writeln("Number of explored solutions: ", res2[2]);
+  writeln("Elapsed time: ", res2[0], " [s]\n");
 
   exploredTree += (+ reduce eachExploredTree);
   exploredSol += (+ reduce eachExploredSol);
@@ -549,16 +521,11 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
 
   writeln("workload per GPU: ", 100.0*eachExploredTree/(exploredTree-res1[1]):real);
 
-  const res2 = (timer.elapsed(), exploredTree, exploredSol) - res1;
-  writeln("Search on GPU completed");
-  writeln("Size of the explored tree: ", res2[1]);
-  writeln("Number of explored solutions: ", res2[2]);
-  writeln("Elapsed time: ", res2[0], " [s]\n");
-
   /*
     Step 3: We complete the depth-first search on CPU.
   */
   timer.start();
+
   while true {
     var hasWork = 0;
     var parent = pool.popBackFree(hasWork);
@@ -566,9 +533,11 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
 
     decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
   }
+
   timer.stop();
   elapsedTime = timer.elapsed();
   const res3 = (elapsedTime, exploredTree, exploredSol) - res1 - res2;
+
   writeln("Search on CPU completed");
   writeln("Size of the explored tree: ", res3[1]);
   writeln("Number of explored solutions: ", res3[2]);