diff --git a/pfsp_gpu_chpl.chpl b/pfsp_gpu_chpl.chpl
index 8e27849981dc7084dd0ff1ddc7d07429fc007c12..89b960be84a6cd07655c74eeed023b703e2e2ade 100644
--- a/pfsp_gpu_chpl.chpl
+++ b/pfsp_gpu_chpl.chpl
@@ -34,15 +34,6 @@ config const ub: int = 1; // initial upper bound
 const jobs = taillard_get_nb_jobs(inst);
 const machines = taillard_get_nb_machines(inst);
 
-var lbound1 = new lb1_bound_data(jobs, machines);
-taillard_get_processing_times(lbound1.p_times, inst);
-fill_min_heads_tails(lbound1);
-
-var lbound2 = new lb2_bound_data(jobs, machines);
-fill_machine_pairs(lbound2/*, LB2_FULL*/);
-fill_lags(lbound1.p_times, lbound2);
-fill_johnson_schedules(lbound1.p_times, lbound2);
-
 const initUB = if (ub == 1) then taillard_get_best_ub(inst) else max(int);
 
 proc check_parameters()
@@ -94,8 +85,8 @@ proc help_message(): void
 }
 
 // Evaluate and generate children nodes on CPU.
-proc decompose_lb1(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose_lb1(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
+  ref best: int, ref pool)
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -104,7 +95,7 @@ proc decompose_lb1(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
     child.prmu = parent.prmu;
     child.prmu[parent.depth] <=> child.prmu[i];
 
-    var lowerbound = lb1_bound(lbound1, child.prmu, child.limit1, jobs);
+    var lowerbound = lb1_bound(lb1_data, child.prmu, child.limit1, jobs);
 
     if (child.depth == jobs) { // if child leaf
       num_sol += 1;
@@ -121,12 +112,12 @@ proc decompose_lb1(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
   }
 }
 
-proc decompose_lb1_d(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose_lb1_d(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
+  ref best: int, ref pool)
 {
   var lb_begin: MAX_JOBS*int(32);
 
-  lb1_children_bounds(lbound1, parent.prmu, parent.limit1, jobs, lb_begin);
+  lb1_children_bounds(lb1_data, parent.prmu, parent.limit1, jobs, lb_begin);
 
   for i in parent.limit1+1..(jobs-1) {
     const job = parent.prmu[i];
@@ -153,8 +144,8 @@ proc decompose_lb1_d(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
   }
 }
 
-proc decompose_lb2(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool)
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -163,7 +154,7 @@ proc decompose_lb2(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
     child.prmu = parent.prmu;
     child.prmu[parent.depth] <=> child.prmu[i];
 
-    var lowerbound = lb2_bound(lbound1, lbound2, child.prmu, child.limit1, jobs, best);
+    var lowerbound = lb2_bound(lb1_data, lb2_data, child.prmu, child.limit1, jobs, best);
 
     if (child.depth == jobs) { // if child leaf
       num_sol += 1;
@@ -181,18 +172,18 @@ proc decompose_lb2(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
 }
 
 // Evaluate and generate children nodes on CPU.
-proc decompose(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool)
 {
   select lb {
     when "lb1_d" {
-      decompose_lb1_d(parent, tree_loc, num_sol, best, pool);
+      decompose_lb1_d(lb1_data, parent, tree_loc, num_sol, best, pool);
     }
     when "lb1" {
-      decompose_lb1(parent, tree_loc, num_sol, best, pool);
+      decompose_lb1(lb1_data, parent, tree_loc, num_sol, best, pool);
     }
     otherwise { // lb2
-      decompose_lb2(parent, tree_loc, num_sol, best, pool);
+      decompose_lb2(lb1_data, lb2_data, parent, tree_loc, num_sol, best, pool);
     }
   }
 }
@@ -331,12 +322,21 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
   */
   timer.start();
 
+  var lbound1 = new lb1_bound_data(jobs, machines);
+  taillard_get_processing_times(lbound1.p_times, inst);
+  fill_min_heads_tails(lbound1);
+
+  var lbound2 = new lb2_bound_data(jobs, machines);
+  fill_machine_pairs(lbound2/*, LB2_FULL*/);
+  fill_lags(lbound1.p_times, lbound2);
+  fill_johnson_schedules(lbound1.p_times, lbound2);
+
   while (pool.size < m) {
     var hasWork = 0;
     var parent = pool.popFront(hasWork);
     if !hasWork then break;
 
-    decompose(parent, exploredTree, exploredSol, best, pool);
+    decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
   }
 
   timer.stop();
@@ -371,16 +371,9 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
   lbound2_d.machine_pair_order = lbound2.machine_pair_order;
 
   while true {
+    var poolSize = pool.popBackBulk(m, M, parents);
 
-    var poolSize = pool.size;
-    if (poolSize >= m) {
-      poolSize = min(poolSize, M);
-      for i in 0..#poolSize {
-        var hasWork = 0;
-        parents[i] = pool.popBack(hasWork);
-        if !hasWork then break;
-      }
-
+    if (poolSize > 0) {
       /*
         TODO: Optimize 'numBounds' based on the fact that the maximum number of
         generated children for a parent is 'parent.limit2 - parent.limit1 + 1' or
@@ -420,7 +413,7 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
     var parent = pool.popBack(hasWork);
     if !hasWork then break;
 
-    decompose(parent, exploredTree, exploredSol, best, pool);
+    decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
   }
 
   timer.stop();
diff --git a/pfsp_multigpu_chpl.chpl b/pfsp_multigpu_chpl.chpl
index 647345c1e4dbe21246ac6cd8d87f7ae48d8f195e..3ff5b0650331b585e3c2ae5941b266465434f451 100644
--- a/pfsp_multigpu_chpl.chpl
+++ b/pfsp_multigpu_chpl.chpl
@@ -88,7 +88,7 @@ proc help_message(): void
 
 // Evaluate and generate children nodes on CPU.
 proc decompose_lb1(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool_par(Node))
+  ref best: int, ref pool)
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -146,8 +146,8 @@ proc decompose_lb1_d(const lb1_data, const parent: Node, ref tree_loc: uint, ref
   }
 }
 
-proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool_par(Node))
+proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool: SinglePool_par(Node))
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -174,8 +174,8 @@ proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_
 }
 
 // Evaluate and generate children nodes on CPU.
-proc decompose(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool_par(Node))
+proc decompose(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool)
 {
   select lb {
     when "lb1_d" {