diff --git a/baselines/nqueens/nqueens_gpu_cuda.cu b/baselines/nqueens/nqueens_gpu_cuda.cu
index 275c8c83de70166e64a73c1c7ef9759701740a52..227abc0a8691645697d3ae700b68e9a2ec3c997f 100644
--- a/baselines/nqueens/nqueens_gpu_cuda.cu
+++ b/baselines/nqueens/nqueens_gpu_cuda.cu
@@ -127,14 +127,16 @@ void decompose(const int N, const int G, const Node parent,
   if (depth == N) {
     *num_sol += 1;
   }
-  for (int j = depth; j < N; j++) {
-    if (isSafe(G, parent.board, depth, parent.board[j])) {
-      Node child;
-      memcpy(child.board, parent.board, N * sizeof(uint8_t));
-      swap(&child.board[depth], &child.board[j]);
-      child.depth = depth + 1;
-      pushBack(pool, child);
-      *tree_loc += 1;
+  else {
+    for (int j = depth; j < N; j++) {
+      if (isSafe(G, parent.board, depth, parent.board[j])) {
+        Node child;
+        child.depth = depth + 1;
+        memcpy(child.board, parent.board, N * sizeof(uint8_t));
+        swap(&child.board[depth], &child.board[j]);
+        pushBack(pool, child);
+        *tree_loc += 1;
+      }
     }
   }
 }
@@ -151,18 +153,18 @@ __global__ void evaluate_gpu(const int N, const int G, const Node* parents_d, ui
     const uint8_t depth = parent.depth;
     const uint8_t queen_num = parent.board[k];
 
-    uint8_t isSafe = 1;
+    uint8_t isSafe;
 
     // If child 'k' is not scheduled, we evaluate its safety 'G' times, otherwise 0.
     if (k >= depth) {
+      isSafe = 1;
       // const int G_notScheduled = G * (k >= depth);
       for (int i = 0; i < depth; i++) {
         const uint8_t pbi = parent.board[i];
-        int y;
+
         for (int g = 0; g < G/*G_notScheduled*/; g++) {
           isSafe *= (pbi != queen_num - (depth - i) &&
                      pbi != queen_num + (depth - i));
-          y += g;
         }
       }
       labels_d[threadId] = isSafe;
@@ -181,14 +183,16 @@ void generate_children(const int N, const Node* parents, const int size, const u
     if (depth == N) {
       *exploredSol += 1;
     }
-    for (int j = depth; j < N; j++) {
-      if (labels[j + i * N] == 1) {
-        Node child;
-        memcpy(child.board, parent.board, N * sizeof(uint8_t));
-        swap(&child.board[depth], &child.board[j]);
-        child.depth = depth + 1;
-        pushBack(pool, child);
-        *exploredTree += 1;
+    else {
+      for (int j = depth; j < N; j++) {
+        if (labels[j + i * N] == 1) {
+          Node child;
+          child.depth = depth + 1;
+          memcpy(child.board, parent.board, N * sizeof(uint8_t));
+          swap(&child.board[depth], &child.board[j]);
+          pushBack(pool, child);
+          *exploredTree += 1;
+        }
       }
     }
   }
@@ -207,7 +211,36 @@ void nqueens_search(const int N, const int G, const int m, const int M,
 
   pushBack(&pool, root);
 
-  clock_t startTime = clock();
+  // Timers
+  struct timespec start, end;
+
+  /*
+    Step 1: We perform a partial breadth-first search on CPU in order to create
+    a sufficiently large amount of work for GPU computation.
+  */
+  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+
+  while (pool.size < m) {
+    int hasWork = 0;
+    Node parent = popFront(&pool, &hasWork);
+    if (!hasWork) break;
+
+    decompose(N, G, parent, tree_loc, num_sol, &pool);
+  }
+
+  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
+  double t1 = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
+
+  printf("\nInitial search on CPU completed\n");
+  printf("Size of the explored tree: %llu\n", *exploredTree);
+  printf("Number of explored solutions: %llu\n", *exploredSol);
+  printf("Elapsed time: %f [s]\n", t1);
+
+  /*
+    Step 2: We continue the search on GPU in a depth-first manner, until there
+    is not enough work.
+  */
+  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
   Node* parents = (Node*)malloc(M * sizeof(Node));
   uint8_t* labels = (uint8_t*)malloc(M*N * sizeof(uint8_t));
@@ -218,16 +251,10 @@ void nqueens_search(const int N, const int G, const int m, const int M,
   cudaMalloc(&labels_d, M*N * sizeof(uint8_t));
 
   while (1) {
-    int hasWork = 0;
-    Node parent = popBack(&pool, &hasWork);
-    if (!hasWork) break;
-
-    decompose(N, G, parent, exploredTree, exploredSol, &pool);
+    int poolSize = pool.size;
 
-    int poolSize = MIN(pool.size, M);
-
-    // If 'poolSize' is sufficiently large, we offload the pool on GPU.
     if (poolSize >= m) {
+      poolSize = MIN(poolSize, M);
 
       for (int i = 0; i < poolSize; i++) {
         int hasWork = 0;
@@ -236,30 +263,57 @@ void nqueens_search(const int N, const int G, const int m, const int M,
       }
 
       const int numLabels = N * poolSize;
-
-      cudaMemcpy(parents_d, parents, poolSize * sizeof(Node), cudaMemcpyHostToDevice);
-
       const int nbBlocks = ceil((double)numLabels / BLOCK_SIZE);
 
+      cudaMemcpy(parents_d, parents, poolSize * sizeof(Node), cudaMemcpyHostToDevice);
       evaluate_gpu<<<nbBlocks, BLOCK_SIZE>>>(N, G, parents_d, labels_d, numLabels);
-
       cudaMemcpy(labels, labels_d, numLabels * sizeof(uint8_t), cudaMemcpyDeviceToHost);
 
       generate_children(N, parents, poolSize, labels, exploredTree, exploredSol, &pool);
     }
+    else {
+      break;
+    }
+  }
+
+  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
+  double t2 = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
+
+  printf("\nSearch on GPU completed\n");
+  printf("Size of the explored tree: %llu\n", *exploredTree);
+  printf("Number of explored solutions: %llu\n", *exploredSol);
+  printf("Elapsed time: %f [s]\n", t2);
+
+  /*
+    Step 3: We complete the depth-first search on CPU.
+  */
+  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+
+  while (1) {
+    int hasWork = 0;
+    Node parent = popBack(&pool, &hasWork);
+    if (!hasWork) break;
+
+    decompose(N, G, parent, tree_loc, num_sol, &pool);
   }
 
+  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
+
+  double t3 = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
+  *elapsedTime = t1 + t2 + t3;
+  printf("\nSearch on CPU completed\n");
+  printf("Size of the explored tree: %llu\n", *exploredTree);
+  printf("Number of explored solutions: %llu\n", *exploredSol);
+  printf("Elapsed time: %f [s]\n", t3);
+
+  printf("\nExploration terminated.\n");
+
   cudaFree(parents_d);
   cudaFree(labels_d);
 
   free(parents);
   free(labels);
 
-  clock_t endTime = clock();
-  *elapsedTime = (double)(endTime - startTime) / CLOCKS_PER_SEC;
-
-  printf("\nExploration terminated.\n");
-
   deleteSinglePool(&pool);
 }