diff --git a/baselines/pfsp/lib/Pool.c b/baselines/pfsp/lib/Pool.c
index 326f8c38d792790c214950d7fac7bac69620b7be..c04397b000021f955dd5efa5546738e871464e72 100644
--- a/baselines/pfsp/lib/Pool.c
+++ b/baselines/pfsp/lib/Pool.c
@@ -1,6 +1,8 @@
 #include "Pool.h"
 #include <stdlib.h>
 
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
 void initSinglePool(SinglePool* pool)
 {
   pool->elements = (Node*)malloc(CAPACITY * sizeof(Node));
@@ -33,6 +35,20 @@ Node popBack(SinglePool* pool, int* hasWork)
   return (Node){0};
 }
 
+// Bulk removal from the end of the deque.
+int popBackBulk(SinglePool* pool, const int m, const int M, Node* parents) {
+  if (pool->size >= m) {
+    const int poolSize = MIN(pool->size, M);
+    pool->size -= poolSize;
+    for (int i = 0; i < poolSize; i++) {
+      parents[i] = pool->elements[pool->front + pool->size + i];
+    }
+    return poolSize;
+  }
+
+  return 0;
+}
+
 // Removal from the front of the deque.
 Node popFront(SinglePool* pool, int* hasWork)
 {
diff --git a/baselines/pfsp/lib/Pool.h b/baselines/pfsp/lib/Pool.h
index a93556d4739eded990887469fb8a244fc2d38172..acb647644cf293941405ebf587df2b812fa58a29 100644
--- a/baselines/pfsp/lib/Pool.h
+++ b/baselines/pfsp/lib/Pool.h
@@ -30,6 +30,8 @@ void pushBack(SinglePool* pool, Node node);
 
 Node popBack(SinglePool* pool, int* hasWork);
 
+int popBackBulk(SinglePool* pool, const int m, const int M, Node* parents);
+
 Node popFront(SinglePool* pool, int* hasWork);
 
 void deleteSinglePool(SinglePool* pool);
diff --git a/baselines/pfsp/lib/evaluate.cu b/baselines/pfsp/lib/evaluate.cu
index 822b9631fb4e488d1f3e000f47f7b6a3d5fb24a9..682976e047e8d9cac46c07da77902276661b919a 100644
--- a/baselines/pfsp/lib/evaluate.cu
+++ b/baselines/pfsp/lib/evaluate.cu
@@ -6,113 +6,114 @@ extern "C" {
 #include "evaluate.h"
 #include "c_bounds_gpu.cu"
 
-  __device__ void swap_cuda(int* a, int* b)
-  {
-    int tmp = *b;
-    *b = *a;
-    *a = tmp;
-  }
+__device__ void swap_cuda(int* a, int* b)
+{
+  int tmp = *b;
+  *b = *a;
+  *a = tmp;
+}
 
-  void printDims(dim3 gridDim, dim3 blockDim) {
-    printf("Grid Dimensions : [%d, %d, %d] blocks. \n",
-      gridDim.x, gridDim.y, gridDim.z);
+void printDims(dim3 gridDim, dim3 blockDim) {
+  printf("Grid Dimensions : [%d, %d, %d] blocks. \n",
+    gridDim.x, gridDim.y, gridDim.z);
 
-    printf("Block Dimensions : [%d, %d, %d] threads.\n",
-      blockDim.x, blockDim.y, blockDim.z);
-  }
+  printf("Block Dimensions : [%d, %d, %d] threads.\n",
+    blockDim.x, blockDim.y, blockDim.z);
+}
 
-  // Evaluate a bulk of parent nodes on GPU using lb1
-  __global__ void evaluate_gpu_lb1(const int jobs, const int size, Node* parents_d, const lb1_bound_data lbound1_d, int* bounds)
-  {
-    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (threadId < size) {
-      const int parentId = threadId / jobs;
-      const int k = threadId % jobs;
-      Node parent = parents_d[parentId];
-      int depth = parent.depth;
-      int limit1 = parent.limit1;
-
-      // We evaluate all permutations by varying index k from limit1 forward
-      if (k >= limit1+1) {
-        swap_cuda(&parent.prmu[depth],&parent.prmu[k]);
-        lb1_bound_gpu(lbound1_d, parent.prmu, limit1+1, jobs, &bounds[threadId]);
-        swap_cuda(&parent.prmu[depth],&parent.prmu[k]);
-      }
+// Evaluate a bulk of parent nodes on GPU using lb1
+__global__ void evaluate_gpu_lb1(const int jobs, const int size, Node* parents_d, const lb1_bound_data lbound1_d, int* bounds)
+{
+  int threadId = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (threadId < size) {
+    const int parentId = threadId / jobs;
+    const int k = threadId % jobs;
+    Node parent = parents_d[parentId];
+    int depth = parent.depth;
+    int limit1 = parent.limit1;
+
+    // We evaluate all permutations by varying index k from limit1 forward
+    if (k >= limit1+1) {
+      swap_cuda(&parent.prmu[depth], &parent.prmu[k]);
+      lb1_bound_gpu(lbound1_d, parent.prmu, limit1+1, jobs, &bounds[threadId]);
+      swap_cuda(&parent.prmu[depth], &parent.prmu[k]);
     }
   }
+}
+
+/*
+  NOTE: This lower bound evaluates all the children of a given parent at the same time.
+  Therefore, the GPU loop is on the parent nodes and not on the children ones, in contrast
+  to the other lower bounds.
+*/
+// Evaluate a bulk of parent nodes on GPU using lb1_d.
+__global__ void evaluate_gpu_lb1_d(const int jobs, const int size, Node* parents_d, const lb1_bound_data lbound1_d, int* bounds)
+{
+  int parentId = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (parentId < size) {
+    Node parent = parents_d[parentId];
 
-  /*
-    NOTE: This lower bound evaluates all the children of a given parent at the same time.
-    Therefore, the GPU loop is on the parent nodes and not on the children ones, in contrast
-    to the other lower bounds.
-  */
-  // Evaluate a bulk of parent nodes on GPU using lb1_d.
-  __global__ void evaluate_gpu_lb1_d(const int jobs, const int size, Node* parents_d, const lb1_bound_data lbound1_d, int* bounds)
-  {
-    int parentId = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if(parentId < size){
-      Node parent = parents_d[parentId];
-
-      // Vector of integers of size MAX_JOBS
-      int lb_begin[MAX_JOBS];
-
-      lb1_children_bounds_gpu(lbound1_d, parent.prmu, parent.limit1, jobs, lb_begin);
-
-      for(int k = 0; k < jobs; k++) {
-        if (k >= parent.limit1+1) {
-          const int job = parent.prmu[k];
-          bounds[parentId*jobs+k] = lb_begin[job];
-        }
+    // Vector of integers of size MAX_JOBS
+    int lb_begin[MAX_JOBS];
+
+    lb1_children_bounds_gpu(lbound1_d, parent.prmu, parent.limit1, jobs, lb_begin);
+
+    for (int k = 0; k < jobs; k++) {
+      if (k >= parent.limit1+1) {
+        const int job = parent.prmu[k];
+        bounds[parentId*jobs+k] = lb_begin[job];
       }
     }
   }
+}
 
-  // Evaluate a bulk of parent nodes on GPU using lb2.
-  __global__ void evaluate_gpu_lb2(const int jobs, const int size, int best, Node* parents_d, const lb1_bound_data lbound1_d, const lb2_bound_data lbound2_d, int* bounds)
-  {
-    int threadId = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (threadId < size) {
-      const int parentId = threadId / jobs;
-      const int k = threadId % jobs;
-      Node parent = parents_d[parentId];
-      int depth = parent.depth;
-      int limit1 = parent.limit1;
-
-      // We evaluate all permutations by varying index k from limit1 forward
-      if (k >= limit1+1) {
-        swap_cuda(&parent.prmu[depth],&parent.prmu[k]);
-        lb2_bound_gpu(lbound1_d, lbound2_d, parent.prmu, limit1+1, jobs, best, &bounds[threadId]);
-        swap_cuda(&parent.prmu[depth],&parent.prmu[k]);
-      }
+// Evaluate a bulk of parent nodes on GPU using lb2.
+__global__ void evaluate_gpu_lb2(const int jobs, const int size, int best, Node* parents_d, const lb1_bound_data lbound1_d, const lb2_bound_data lbound2_d, int* bounds)
+{
+  int threadId = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (threadId < size) {
+    const int parentId = threadId / jobs;
+    const int k = threadId % jobs;
+    Node parent = parents_d[parentId];
+    int depth = parent.depth;
+    int limit1 = parent.limit1;
+
+    // We evaluate all permutations by varying index k from limit1 forward
+    if (k >= limit1+1) {
+      swap_cuda(&parent.prmu[depth], &parent.prmu[k]);
+      lb2_bound_gpu(lbound1_d, lbound2_d, parent.prmu, limit1+1, jobs, best, &bounds[threadId]);
+      swap_cuda(&parent.prmu[depth], &parent.prmu[k]);
     }
   }
+}
 
-  void evaluate_gpu(const int jobs, const int lb, const int size, const int nbBlocks,
-    int* best, const lb1_bound_data lbound1, const lb2_bound_data lbound2, Node* parents, int* bounds)
-  {
-    // 1D grid of 1D nbBlocks(_lb1_d) blocks with block size BLOCK_SIZE
-    int nbBlocks_lb1_d;
-    switch (lb) {
-      case 0: // lb1_d
-        nbBlocks_lb1_d = ceil((double)nbBlocks/jobs);
-        evaluate_gpu_lb1_d<<<nbBlocks_lb1_d, BLOCK_SIZE>>>(jobs, size, parents, lbound1, bounds);
-        return;
-        break;
-
-      case 1: // lb1
-        evaluate_gpu_lb1<<<nbBlocks, BLOCK_SIZE>>>(jobs, size, parents, lbound1, bounds);
-        return;
-        break;
-
-      case 2: // lb2
-        evaluate_gpu_lb2<<<nbBlocks, BLOCK_SIZE>>>(jobs, size, *best, parents, lbound1, lbound2, bounds);
-        return;
-        break;
-    }
+void evaluate_gpu(const int jobs, const int lb, const int size, const int nbBlocks,
+  int* best, const lb1_bound_data lbound1, const lb2_bound_data lbound2, Node* parents, int* bounds)
+{
+  // 1D grid of 1D nbBlocks(_lb1_d) blocks with block size BLOCK_SIZE
+  int nbBlocks_lb1_d;
+  switch (lb) {
+    case 0: // lb1_d
+      nbBlocks_lb1_d = ceil((double)nbBlocks/jobs);
+      evaluate_gpu_lb1_d<<<nbBlocks_lb1_d, BLOCK_SIZE>>>(jobs, size, parents, lbound1, bounds);
+      return;
+      break;
+
+    case 1: // lb1
+      evaluate_gpu_lb1<<<nbBlocks, BLOCK_SIZE>>>(jobs, size, parents, lbound1, bounds);
+      return;
+      break;
+
+    case 2: // lb2
+      evaluate_gpu_lb2<<<nbBlocks, BLOCK_SIZE>>>(jobs, size, *best, parents, lbound1, lbound2, bounds);
+      return;
+      break;
   }
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/baselines/pfsp/pfsp_gpu_cuda.c b/baselines/pfsp/pfsp_gpu_cuda.c
index 9f5079071c7a1e3a8f79d45a4936d24fbaf9d45c..b8bea6d49d10ba06d54e3edc6f8cee8d5a522d0a 100644
--- a/baselines/pfsp/pfsp_gpu_cuda.c
+++ b/baselines/pfsp/pfsp_gpu_cuda.c
@@ -162,10 +162,10 @@ void decompose_lb1(const int jobs, const lb1_bound_data* const lbound1, const No
 {
   for (int i = parent.limit1+1; i < jobs; i++) {
     Node child;
-    memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
-    swap(&child.prmu[parent.depth], &child.prmu[i]);
     child.depth = parent.depth + 1;
     child.limit1 = parent.limit1 + 1;
+    memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
+    swap(&child.prmu[parent.depth], &child.prmu[i]);
 
     int lowerbound = lb1_bound(lbound1, child.prmu, child.limit1, jobs);
 
@@ -204,9 +204,9 @@ void decompose_lb1_d(const int jobs, const lb1_bound_data* const lbound1, const
     } else { // if not leaf
       if (lb < *best) { // if child feasible
         Node child;
-        memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
        child.depth = parent.depth + 1;
         child.limit1 = parent.limit1 + 1;
+        memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
         swap(&child.prmu[child.limit1], &child.prmu[i]);
 
         pushBack(pool, child);
@@ -224,10 +224,10 @@ void decompose_lb2(const int jobs, const lb1_bound_data* const lbound1, const lb
 {
   for (int i = parent.limit1+1; i < jobs; i++) {
     Node child;
-    memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
-    swap(&child.prmu[parent.depth], &child.prmu[i]);
     child.depth = parent.depth + 1;
     child.limit1 = parent.limit1 + 1;
+    memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
+    swap(&child.prmu[parent.depth], &child.prmu[i]);
 
     int lowerbound = lb2_bound(lbound1, lbound2, child.prmu, child.limit1, jobs, *best);
 
@@ -277,19 +277,19 @@ void generate_children(Node* parents, const int size, const int jobs, int* bound
       const int lowerbound = bounds[j + i * jobs];
 
       // If child leaf
-      if(depth + 1 == jobs){
+      if (depth + 1 == jobs) {
         *exploredSol += 1;
 
         // If child feasible
-        if(lowerbound < *best) *best = lowerbound;
+        if (lowerbound < *best) *best = lowerbound;
       } else { // If not leaf
-        if(lowerbound < *best) {
+        if (lowerbound < *best) {
           Node child;
-          memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
-          swap(&child.prmu[depth], &child.prmu[j]);
           child.depth = depth + 1;
           child.limit1 = parent.limit1 + 1;
+          memcpy(child.prmu, parent.prmu, jobs * sizeof(int));
+          swap(&child.prmu[depth], &child.prmu[j]);
 
           pushBack(pool, child);
           *exploredTree += 1;
@@ -316,11 +316,14 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
 
   pushBack(&pool, root);
 
-  // Timer
   struct timespec start, end;
+
+  /*
+    Step 1: We perform a partial breadth-first search on CPU in order to create
+    a sufficiently large amount of work for GPU computation.
+  */
   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
-  // Bounding data
   lb1_bound_data* lbound1;
   lbound1 = new_bound_data(jobs, machines);
   taillard_get_processing_times(lbound1->p_times, inst);
@@ -332,19 +335,14 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
   fill_lags(lbound1->p_times, lbound2);
   fill_johnson_schedules(lbound1->p_times, lbound2);
 
-  /*
-    Step 1: We perform a partial breadth-first search on CPU in order to create
-    a sufficiently large amount of work for GPU computation.
-  */
-
-  while(pool.size < m) {
-    // CPU side
+  while (pool.size < m) {
     int hasWork = 0;
     Node parent = popFront(&pool, &hasWork);
     if (!hasWork) break;
 
     decompose(jobs, lb, best, lbound1, lbound2, parent, exploredTree, exploredSol, &pool);
   }
+
   clock_gettime(CLOCK_MONOTONIC_RAW, &end);
   double t1 = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
 
@@ -357,7 +355,6 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
   /*
     Step 2: We continue the search on GPU in a depth-first manner, until there
     is not enough work.
   */
-
   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
 
   // TODO: add function 'copyBoundsDevice' to perform the deep copy of bounding data
@@ -413,27 +410,18 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
   lbound2_d.nb_jobs = lbound2->nb_jobs;
   lbound2_d.nb_machines = lbound2->nb_machines;
 
-  // Allocating parents vector on CPU and GPU
   Node* parents = (Node*)malloc(M * sizeof(Node));
-  Node* parents_d;
-  cudaMalloc((void**)&parents_d, M * sizeof(Node));
-
-  // Allocating bounds vector on CPU and GPU
   int* bounds = (int*)malloc((jobs*M) * sizeof(int));
+
+  Node* parents_d;
   int *bounds_d;
+  cudaMalloc((void**)&parents_d, M * sizeof(Node));
   cudaMalloc((void**)&bounds_d, (jobs*M) * sizeof(int));
 
   while (1) {
-    int poolSize = pool.size;
-    if (poolSize >= m) {
-      poolSize = MIN(poolSize,M);
-
-      for(int i= 0; i < poolSize; i++) {
-        int hasWork = 0;
-        parents[i] = popBack(&pool,&hasWork);
-        if (!hasWork) break;
-      }
+    int poolSize = popBackBulk(&pool, m, M, parents);
 
+    if (poolSize > 0) {
       /*
         TODO: Optimize 'numBounds' based on the fact that the maximum number of
        generated children for a parent is 'parent.limit2 - parent.limit1 + 1' or
@@ -443,10 +431,7 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
       const int nbBlocks = ceil((double)numBounds / BLOCK_SIZE);
 
       cudaMemcpy(parents_d, parents, poolSize *sizeof(Node), cudaMemcpyHostToDevice);
-
-      // numBounds is the 'size' of the problem
       evaluate_gpu(jobs, lb, numBounds, nbBlocks, best, lbound1_d, lbound2_d, parents_d, bounds_d);
-
      cudaMemcpy(bounds, bounds_d, numBounds * sizeof(int), cudaMemcpyDeviceToHost);
 
       /*
@@ -458,6 +443,7 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
       break;
     }
   }
+
   clock_gettime(CLOCK_MONOTONIC_RAW, &end);
   double t2 = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
 
@@ -469,8 +455,8 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
   /*
     Step 3: We complete the depth-first search on CPU.
   */
-
   clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+
   while (1) {
     int hasWork = 0;
     Node parent = popBack(&pool, &hasWork);
@@ -479,14 +465,17 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
     decompose(jobs, lb, best, lbound1, lbound2, parent, exploredTree, exploredSol, &pool);
   }
 
-  // Freeing memory for structs
-  deleteSinglePool(&pool);
-  free_bound_data(lbound1);
-  free_johnson_bd_data(lbound2);
+  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
+  double t3 = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
+  *elapsedTime = t1 + t2 + t3;
+
+  printf("\nSearch on CPU completed\n");
+  printf("Size of the explored tree: %llu\n", *exploredTree);
+  printf("Number of explored solutions: %llu\n", *exploredSol);
+  printf("Elapsed time: %f [s]\n", t3);
+
+  printf("\nExploration terminated.\n");
 
-  // Freeing memory for device
-  cudaFree(parents_d);
-  cudaFree(bounds_d);
   cudaFree(p_times_d);
   cudaFree(min_heads_d);
   cudaFree(min_tails_d);
@@ -495,20 +484,15 @@ void pfsp_search(const int inst, const int lb, const int m, const int M, int* be
   cudaFree(machine_pairs_1_d);
   cudaFree(machine_pairs_2_d);
   cudaFree(machine_pair_order_d);
+  cudaFree(parents_d);
+  cudaFree(bounds_d);
 
-  //Freeing memory for host
+  free_bound_data(lbound1);
+  free_johnson_bd_data(lbound2);
   free(parents);
   free(bounds);
 
-  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
-  double t3 = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
-  *elapsedTime = t1 + t2 + t3;
-
-  printf("\nSearch on CPU completed\n");
-  printf("Size of the explored tree: %llu\n", *exploredTree);
-  printf("Number of explored solutions: %llu\n", *exploredSol);
-  printf("Elapsed time: %f [s]\n", t3);
-
-  printf("\nExploration terminated.\n");
+  deleteSinglePool(&pool);
 }
 
 int main(int argc, char* argv[])
diff --git a/lib/pfsp/Bound_johnson.chpl b/lib/pfsp/Bound_johnson.chpl
index 6f60be9c341e1fe589ff435eb09dbfa046afd672..e8a25c89348f436c0a1819a5ca66b23eb504d093 100644
--- a/lib/pfsp/Bound_johnson.chpl
+++ b/lib/pfsp/Bound_johnson.chpl
@@ -47,18 +47,6 @@ module Bound_johnson
     }
   }
 
-  /* NOTE: This wrapper allows one to store persistent data on the GPU memory. */
-  class WrapperClassLB2 {
-    forwarding var lb2_bound: lb2_bound_data;
-
-    proc init(const jobs: int(32), const machines: int(32))
-    {
-      this.lb2_bound = new lb2_bound_data(jobs, machines);
-    }
-  }
-
-  type WrapperLB2 = owned WrapperClassLB2?;
-
   proc fill_machine_pairs(ref lb2_data: lb2_bound_data/*, enum lb2_variant lb2_type*/): void
   {
     var lb2_type = lb2_variant.LB2_LEARN;
@@ -201,13 +189,18 @@ module Bound_johnson
   {
     const nb_jobs = lb2_data.nb_jobs;
 
+    use CTypes only c_ptrToConst;
+    const lb2_js = c_ptrToConst(lb2_data.johnson_schedules[0]);
+    const lb1_pt = c_ptrToConst(lb1_p_times[0]);
+    const lb2_lags = c_ptrToConst(lb2_data.lags[0]);
+
     for j in 0..#nb_jobs {
-      var job = lb2_data.johnson_schedules[ind*nb_jobs + j];
+      var job = lb2_js[ind*nb_jobs + j];
       // j-loop is on unscheduled jobs... (==0 if jobCour is unscheduled)
       if (flag[job] == 0) {
-        var ptm0 = lb1_p_times[ma0*nb_jobs + job];
-        var ptm1 = lb1_p_times[ma1*nb_jobs + job];
-        var lag = lb2_data.lags[ind*nb_jobs + job];
+        var ptm0 = lb1_pt[ma0*nb_jobs + job];
+        var ptm1 = lb1_pt[ma1*nb_jobs + job];
+        var lag = lb2_lags[ind*nb_jobs + job];
         // add job on ma0 and ma1
         tmp0 += ptm0;
         tmp1 = max(tmp1,tmp0 + lag);
diff --git a/lib/pfsp/Bound_simple.chpl b/lib/pfsp/Bound_simple.chpl
index a9efd3f5d105a0f1d0d1366ffadd6765fa652e09..0ca9004bae6a69569add62f8aab836b54347e545 100644
--- a/lib/pfsp/Bound_simple.chpl
+++ b/lib/pfsp/Bound_simple.chpl
@@ -26,18 +26,6 @@ module Bound_simple
     }
   }
 
-  /* NOTE: This wrapper allows one to store persistent data on the GPU memory. */
-  class WrapperClassLB1 {
-    forwarding var lb1_bound: lb1_bound_data;
-
-    proc init(const jobs: int(32), const machines: int(32))
-    {
-      this.lb1_bound = new lb1_bound_data(jobs, machines);
-    }
-  }
-
-  type WrapperLB1 = owned WrapperClassLB1?;
-
   inline proc add_forward(const job: int(32), const p_times: [] int(32), const nb_jobs: int(32), const nb_machines: int(32), ref front): void
   {
     front[0] += p_times[job];
diff --git a/pfsp_chpl.chpl b/pfsp_chpl.chpl
index 2cbc2ade2f1999435ef04da5ef3ea9910134d8e6..3e7b578e467c3557ddb5f6b1e751cd6a2704c17a 100644
--- a/pfsp_chpl.chpl
+++ b/pfsp_chpl.chpl
@@ -6,7 +6,6 @@ use Time;
 
 use util;
 use Pool;
-
 use PFSP_node;
 use Bound_johnson;
 use Bound_simple;
diff --git a/pfsp_gpu_chpl.chpl b/pfsp_gpu_chpl.chpl
index 617f9ceff04a31b14aec4b4b520982bafc503c47..b6641c70404e15e8ecfdd0dc744c7c8eb55338e0 100644
--- a/pfsp_gpu_chpl.chpl
+++ b/pfsp_gpu_chpl.chpl
@@ -5,11 +5,8 @@ use Time;
 
 use GpuDiagnostics;
 
-config const BLOCK_SIZE = 512;
-
 use util;
 use Pool;
-
 use PFSP_node;
 use Bound_johnson;
 use Bound_simple;
@@ -17,6 +14,8 @@ use Taillard;
 
 const allowedLowerBounds = ["lb1", "lb1_d", "lb2"];
 
+config const BLOCK_SIZE = 512;
+
 /*******************************************************************************
 Implementation of the single-GPU PFSP search.
 *******************************************************************************/
@@ -35,15 +34,6 @@ config const ub: int = 1; // initial upper bound
 const jobs = taillard_get_nb_jobs(inst);
 const machines = taillard_get_nb_machines(inst);
 
-var lbound1 = new WrapperLB1(jobs, machines); //lb1_bound_data(jobs, machines);
-taillard_get_processing_times(lbound1!.lb1_bound.p_times, inst);
-fill_min_heads_tails(lbound1!.lb1_bound);
-
-var lbound2 = new WrapperLB2(jobs, machines);
-fill_machine_pairs(lbound2!.lb2_bound/*, LB2_FULL*/);
-fill_lags(lbound1!.lb1_bound.p_times, lbound2!.lb2_bound);
-fill_johnson_schedules(lbound1!.lb1_bound.p_times, lbound2!.lb2_bound);
-
 const initUB = if (ub == 1) then taillard_get_best_ub(inst) else max(int);
 
 proc check_parameters()
@@ -95,8 +85,8 @@ proc help_message(): void
 }
 
 // Evaluate and generate children nodes on CPU.
-proc decompose_lb1(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose_lb1(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
+  ref best: int, ref pool)
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -105,7 +95,7 @@ proc decompose_lb1(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
     child.prmu = parent.prmu;
     child.prmu[parent.depth] <=> child.prmu[i];
 
-    var lowerbound = lb1_bound(lbound1!.lb1_bound, child.prmu, child.limit1, jobs);
+    var lowerbound = lb1_bound(lb1_data, child.prmu, child.limit1, jobs);
 
     if (child.depth == jobs) { // if child leaf
       num_sol += 1;
@@ -122,12 +112,12 @@ proc decompose_lb1(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
   }
 }
 
-proc decompose_lb1_d(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose_lb1_d(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
+  ref best: int, ref pool)
 {
   var lb_begin: MAX_JOBS*int(32);
 
-  lb1_children_bounds(lbound1!.lb1_bound, parent.prmu, parent.limit1, jobs, lb_begin);
+  lb1_children_bounds(lb1_data, parent.prmu, parent.limit1, jobs, lb_begin);
 
   for i in parent.limit1+1..(jobs-1) {
     const job = parent.prmu[i];
@@ -154,8 +144,8 @@ proc decompose_lb1_d(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
   }
 }
 
-proc decompose_lb2(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool)
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -164,7 +154,7 @@ proc decompose_lb2(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
     child.prmu = parent.prmu;
     child.prmu[parent.depth] <=> child.prmu[i];
 
-    var lowerbound = lb2_bound(lbound1!.lb1_bound, lbound2!.lb2_bound, child.prmu, child.limit1, jobs, best);
+    var lowerbound = lb2_bound(lb1_data, lb2_data, child.prmu, child.limit1, jobs, best);
 
     if (child.depth == jobs) { // if child leaf
       num_sol += 1;
@@ -182,18 +172,18 @@ proc decompose_lb2(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
   }
 }
 
 // Evaluate and generate children nodes on CPU.
-proc decompose(const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool(Node))
+proc decompose(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool)
 {
   select lb {
     when "lb1_d" {
-      decompose_lb1_d(parent, tree_loc, num_sol, best, pool);
+      decompose_lb1_d(lb1_data, parent, tree_loc, num_sol, best, pool);
     }
     when "lb1" {
-      decompose_lb1(parent, tree_loc, num_sol, best, pool);
+      decompose_lb1(lb1_data, parent, tree_loc, num_sol, best, pool);
     }
     otherwise { // lb2
-      decompose_lb2(parent, tree_loc, num_sol, best, pool);
+      decompose_lb2(lb1_data, lb2_data, parent, tree_loc, num_sol, best, pool);
     }
   }
 }
@@ -211,7 +201,7 @@ proc evaluate_gpu_lb1(const parents_d: [] Node, const size, const lbound1_d, ref
 
     if (k >= parent.limit1+1) {
       prmu[depth] <=> prmu[k];
-      bounds_d[threadId] = lb1_bound(lbound1_d!.lb1_bound, prmu, parent.limit1+1, jobs);
+      bounds_d[threadId] = lb1_bound(lbound1_d, prmu, parent.limit1+1, jobs);
       prmu[depth] <=> prmu[k];
     }
   }
@@ -228,12 +218,12 @@ proc evaluate_gpu_lb1_d(const parents_d: [] Node, const size, const best, const
   @assertOnGpu
   foreach parentId in 0..#(size/jobs) {
     var parent = parents_d[parentId];
-    const depth = parent.depth;
+    /* const depth = parent.depth; */
     var prmu = parent.prmu;
 
     var lb_begin: MAX_JOBS*int(32);
 
-    lb1_children_bounds(lbound1_d!.lb1_bound, parent.prmu, parent.limit1, jobs, lb_begin);
+    lb1_children_bounds(lbound1_d, parent.prmu, parent.limit1, jobs, lb_begin);
 
     for k in 0..#jobs {
       if (k >= parent.limit1+1) {
@@ -257,7 +247,7 @@ proc evaluate_gpu_lb2(const parents_d: [] Node, const size, const best, const lb
 
     if (k >= parent.limit1+1) {
       prmu[depth] <=> prmu[k];
-      bounds_d[threadId] = lb2_bound(lbound1_d!.lb1_bound, lbound2_d!.lb2_bound, prmu, parent.limit1+1, jobs, best);
+      bounds_d[threadId] = lb2_bound(lbound1_d, lbound2_d, prmu, parent.limit1+1, jobs, best);
       prmu[depth] <=> prmu[k];
     }
   }
@@ -299,10 +289,10 @@ proc generate_children(const ref parents: [] Node, const size: int, const ref bo
       } else { // if not leaf
         if (lowerbound < best) { // if child feasible
           var child = new Node();
-          child.depth = parent.depth + 1;
+          child.depth = depth + 1;
           child.limit1 = parent.limit1 + 1;
           child.prmu = parent.prmu;
-          child.prmu[parent.depth] <=> child.prmu[j];
+          child.prmu[depth] <=> child.prmu[j];
 
           pool.pushBack(child);
           exploredTree += 1;
@@ -312,16 +302,6 @@ proc generate_children(const ref parents: [] Node, const size: int, const ref bo
   }
 }
 
-class WrapperClassArrayParents {
-  forwarding var arr: [0..#M] Node = noinit;
-}
-type WrapperArrayParents = owned WrapperClassArrayParents?;
-
-class WrapperClassArrayBounds {
-  forwarding var arr: [0..#(M*jobs)] int(32) = noinit;
-}
-type WrapperArrayBounds = owned WrapperClassArrayBounds?;
-
 // Single-GPU PFSP search.
 proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint, ref elapsedTime: real)
 {
@@ -342,15 +322,26 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
   */
   timer.start();
 
+  var lbound1 = new lb1_bound_data(jobs, machines);
+  taillard_get_processing_times(lbound1.p_times, inst);
+  fill_min_heads_tails(lbound1);
+
+  var lbound2 = new lb2_bound_data(jobs, machines);
+  fill_machine_pairs(lbound2/*, LB2_FULL*/);
+  fill_lags(lbound1.p_times, lbound2);
+  fill_johnson_schedules(lbound1.p_times, lbound2);
+
   while (pool.size < m) {
     var hasWork = 0;
     var parent = pool.popFront(hasWork);
     if !hasWork then break;
 
-    decompose(parent, exploredTree, exploredSol, best, pool);
+    decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
   }
+
   timer.stop();
   const res1 = (timer.elapsed(), exploredTree, exploredSol);
+
   writeln("\nInitial search on CPU completed");
   writeln("Size of the explored tree: ", res1[1]);
   writeln("Number of explored solutions: ", res1[2]);
@@ -365,38 +356,24 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
   var parents: [0..#M] Node = noinit;
   var bounds: [0..#(M*jobs)] int(32) = noinit;
 
-  var lbound1_d: lbound1.type;
-  var lbound2_d: lbound2.type;
-  var parents_d: WrapperArrayParents;
-  var bounds_d: WrapperArrayBounds;
-
-  on device {
-    lbound1_d = new WrapperLB1(jobs, machines);
-    lbound1_d!.lb1_bound.p_times = lbound1!.lb1_bound.p_times;
-    lbound1_d!.lb1_bound.min_heads = lbound1!.lb1_bound.min_heads;
-    lbound1_d!.lb1_bound.min_tails = lbound1!.lb1_bound.min_tails;
-
-    lbound2_d = new WrapperLB2(jobs, machines);
-    lbound2_d!.lb2_bound.johnson_schedules = lbound2!.lb2_bound.johnson_schedules;
-    lbound2_d!.lb2_bound.lags = lbound2!.lb2_bound.lags;
-    lbound2_d!.lb2_bound.machine_pairs = lbound2!.lb2_bound.machine_pairs;
-    lbound2_d!.lb2_bound.machine_pair_order = lbound2!.lb2_bound.machine_pair_order;
-
-    parents_d = new WrapperArrayParents();
-    bounds_d = new WrapperArrayBounds();
-  }
+  on device var parents_d: [0..#M] Node;
+  on device var bounds_d: [0..#(M*jobs)] int(32);
 
-  while true {
+  on device var lbound1_d = new lb1_bound_data(jobs, machines);
+  lbound1_d.p_times = lbound1.p_times;
+  lbound1_d.min_heads = lbound1.min_heads;
+  lbound1_d.min_tails = lbound1.min_tails;
 
-    var poolSize = pool.size;
-    if (poolSize >= m) {
-      poolSize = min(poolSize, M);
-      for i in 0..#poolSize {
-        var hasWork = 0;
-        parents[i] = pool.popBack(hasWork);
-        if !hasWork then break;
-      }
+  on device var lbound2_d = new lb2_bound_data(jobs, machines);
+  lbound2_d.johnson_schedules = lbound2.johnson_schedules;
+  lbound2_d.lags = lbound2.lags;
+  lbound2_d.machine_pairs = lbound2.machine_pairs;
+  lbound2_d.machine_pair_order = lbound2.machine_pair_order;
+
+  while true {
+    var poolSize = pool.popBackBulk(m, M, parents);
 
+    if (poolSize > 0) {
       /*
         TODO: Optimize 'numBounds' based on the fact that the maximum number of
        generated children for a parent is 'parent.limit2 - parent.limit1 + 1' or
@@ -404,11 +381,9 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
       */
       const numBounds = jobs * poolSize;
 
-      on device {
-        parents_d!.arr = parents; // host-to-device
-        evaluate_gpu(parents_d!.arr, numBounds, best, lbound1_d, lbound2_d, bounds_d!.arr);
-        bounds = bounds_d!.arr; // device-to-host
-      }
+      parents_d = parents; // host-to-device
+      on device do evaluate_gpu(parents_d, numBounds, best, lbound1_d, lbound2_d, bounds_d); // GPU kernel
+      bounds = bounds_d; // device-to-host
 
       /*
        Each task generates and inserts its children nodes to the pool.
@@ -419,9 +394,10 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
       break;
     }
   }
-  timer.stop();
 
+  timer.stop();
   const res2 = (timer.elapsed(), exploredTree, exploredSol) - res1;
+
   writeln("Search on GPU completed");
   writeln("Size of the explored tree: ", res2[1]);
   writeln("Number of explored solutions: ", res2[2]);
@@ -431,16 +407,19 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
     Step 3: We complete the depth-first search on CPU.
   */
   timer.start();
+
   while true {
     var hasWork = 0;
     var parent = pool.popBack(hasWork);
     if !hasWork then break;
 
-    decompose(parent, exploredTree, exploredSol, best, pool);
+    decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
   }
+
   timer.stop();
   elapsedTime = timer.elapsed();
   const res3 = (elapsedTime, exploredTree, exploredSol) - res1 - res2;
+
   writeln("Search on CPU completed");
   writeln("Size of the explored tree: ", res3[1]);
   writeln("Number of explored solutions: ", res3[2]);
diff --git a/pfsp_multigpu_chpl.chpl b/pfsp_multigpu_chpl.chpl
index 2bb3ae22c4ceb07c5ddefe3c6fadeb9b76a773ca..4d2db4e6bd45aaf9e49aedc8f4af01094c43f7c4 100644
--- a/pfsp_multigpu_chpl.chpl
+++ b/pfsp_multigpu_chpl.chpl
@@ -6,11 +6,8 @@ use Time;
 use Random;
 use GpuDiagnostics;
 
-config const BLOCK_SIZE = 512;
-
 use util;
 use Pool_par;
-
 use PFSP_node;
 use Bound_johnson;
 use Bound_simple;
@@ -18,6 +15,8 @@ use Taillard;
 
 const allowedLowerBounds = ["lb1", "lb1_d", "lb2"];
 
+config const BLOCK_SIZE = 512;
+
 /*******************************************************************************
 Implementation of the multi-GPU PFSP search.
 *******************************************************************************/
@@ -89,7 +88,7 @@ proc help_message(): void
 }
 
 // Evaluate and generate children nodes on CPU.
 proc decompose_lb1(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool_par(Node))
+  ref best: int, ref pool)
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -108,7 +107,7 @@ proc decompose_lb1(const lb1_data, const parent: Node, ref tree_loc: uint, ref n
       }
     } else { // if not leaf
       if (lowerbound < best) { // if child feasible
-        pool.pushBack(child);
+        pool.pushBackFree(child);
         tree_loc += 1;
       }
     }
@@ -116,7 +115,7 @@ proc decompose_lb1(const lb1_data, const parent: Node, ref tree_loc: uint, ref n
 }
 
 proc decompose_lb1_d(const lb1_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool_par(Node))
+  ref best: int, ref pool)
 {
   var lb_begin: MAX_JOBS*int(32);
 
@@ -140,15 +139,15 @@ proc decompose_lb1_d(const lb1_data, const parent: Node, ref tree_loc: uint, ref
       child.prmu = parent.prmu;
       child.prmu[parent.depth] <=> child.prmu[i];
 
-      pool.pushBack(child);
+      pool.pushBackFree(child);
       tree_loc += 1;
     }
   }
 }
 
-proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool_par(Node))
+proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool)
 {
   for i in parent.limit1+1..(jobs-1) {
     var child = new Node();
@@ -167,7 +166,7 @@ proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_
       }
     } else { // if not leaf
       if (lowerbound < best) { // if child feasible
-        pool.pushBack(child);
+        pool.pushBackFree(child);
         tree_loc += 1;
       }
     }
@@ -175,8 +174,8 @@ proc decompose_lb2(const lb1_data, const lb2_data, const parent: Node, ref tree_
 }
 
 // Evaluate and generate children nodes on CPU.
-proc decompose(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint, ref num_sol: uint,
-  ref best: int, ref pool: SinglePool_par(Node))
+proc decompose(const lb1_data, const lb2_data, const parent: Node, ref tree_loc: uint,
+  ref num_sol: uint, ref best: int, ref pool)
 {
   select lb {
     when "lb1_d" {
@@ -221,7 +220,7 @@ proc evaluate_gpu_lb1_d(const parents_d: [] Node, const size, const best, const
   @assertOnGpu
   foreach parentId in 0..#(size/jobs) {
     var parent = parents_d[parentId];
-    const depth = parent.depth;
+    /* const depth = parent.depth; */
     var prmu = parent.prmu;
 
     var lb_begin: MAX_JOBS*int(32);
@@ -261,21 +260,23 @@ proc evaluate_gpu(const parents_d: [] Node, const size, const best, const lbound
 {
   select lb {
     when "lb1_d" {
-      evaluate_gpu_lb1_d(parents_d, size, best, lbound1_d!.lb1_bound, bounds_d);
+      evaluate_gpu_lb1_d(parents_d, size, best, lbound1_d, bounds_d);
     }
     when "lb1" {
-      evaluate_gpu_lb1(parents_d, size, lbound1_d!.lb1_bound, bounds_d);
+      evaluate_gpu_lb1(parents_d, size, lbound1_d, bounds_d);
    }
     otherwise { // lb2
-      evaluate_gpu_lb2(parents_d, size, best, lbound1_d!.lb1_bound, lbound2_d!.lb2_bound, bounds_d);
+      evaluate_gpu_lb2(parents_d, size, best, lbound1_d, lbound2_d, bounds_d);
     }
   }
 }
 
 // Generate children nodes (evaluated by GPU) on CPU.
 proc generate_children(const ref parents: [] Node, const size: int, const ref bounds: [] int(32),
-  ref exploredTree: uint, ref exploredSol: uint, ref best: int, ref pool: SinglePool_par(Node))
+  ref exploredTree: uint, ref exploredSol: uint, ref best: int, ref pool)
 {
+  pool.acquireLock();
+
   for i in 0..#size {
     const parent = parents[i];
     const depth = parent.depth;
@@ -292,28 +293,20 @@ proc generate_children(const ref parents: [] Node, const size: int, const ref bo
       } else { // if not leaf
         if (lowerbound < best) { // if child feasible
           var child = new Node();
-          child.depth = parent.depth + 1;
+          child.depth = depth + 1;
          child.limit1 = parent.limit1 + 1;
           child.prmu = parent.prmu;
-          child.prmu[parent.depth] <=> child.prmu[j];
+          child.prmu[depth] <=> child.prmu[j];
 
-          pool.pushBack(child);
+          pool.pushBackFree(child);
           exploredTree += 1;
         }
       }
     }
   }
-}
 
-class WrapperClassArrayParents {
-  forwarding var arr: [0..#M] Node = noinit;
+  pool.releaseLock();
 }
-type WrapperArrayParents = owned WrapperClassArrayParents?;
-
-class WrapperClassArrayBounds {
-  forwarding var arr: [0..#(M*jobs)] int(32) = noinit;
-}
-type WrapperArrayBounds = owned WrapperClassArrayBounds?;
 
 // Multi-GPU PFSP search.
 proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint, ref elapsedTime: real)
@@ -325,9 +318,6 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
   var pool = new SinglePool_par(Node);
   pool.pushBack(root);
 
-  var allTasksIdleFlag: atomic bool = false;
-  var eachTaskState: [0..#D] atomic bool = BUSY; // one task per GPU
-
   var timer: stopwatch;
 
   /*
@@ -352,8 +342,10 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
     decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
   }
+
   timer.stop();
   const res1 = (timer.elapsed(), exploredTree, exploredSol);
+
   writeln("\nInitial search on CPU completed");
   writeln("Size of the explored tree: ", res1[1]);
   writeln("Number of explored solutions: ", res1[2]);
@@ -364,14 +356,16 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
    is not enough work.
   */
   timer.start();
+
   var eachExploredTree, eachExploredSol: [0..#D] uint = noinit;
   var eachBest: [0..#D] int = noinit;
+  var eachTaskState: [0..#D] atomic bool = BUSY; // one task per GPU
+  var allTasksIdleFlag: atomic bool = false;
 
   const poolSize = pool.size;
   const c = poolSize / D;
   const l = poolSize - (D-1)*c;
   const f = pool.front;
-  var lock_p: atomic bool;
 
   pool.front = 0;
   pool.size = 0;
@@ -401,31 +395,21 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
     var parents: [0..#M] Node = noinit;
     var bounds: [0..#(M*jobs)] int(32) = noinit;
 
-    var lbound1_d: WrapperLB1;
-    var lbound2_d: WrapperLB2;
-    var parents_d: WrapperArrayParents;
-    var bounds_d: WrapperArrayBounds;
-
-    on device {
-      lbound1_d = new WrapperLB1(jobs, machines);
-      lbound1_d!.lb1_bound.p_times = lbound1.p_times;
-      lbound1_d!.lb1_bound.min_heads = lbound1.min_heads;
-      lbound1_d!.lb1_bound.min_tails = lbound1.min_tails;
-
-      lbound2_d = new WrapperLB2(jobs, machines);
-      lbound2_d!.lb2_bound.johnson_schedules = lbound2.johnson_schedules;
-      lbound2_d!.lb2_bound.lags = lbound2.lags;
-      lbound2_d!.lb2_bound.machine_pairs = lbound2.machine_pairs;
-      lbound2_d!.lb2_bound.machine_pair_order = lbound2.machine_pair_order;
-
-      parents_d = new WrapperArrayParents();
-      bounds_d = new WrapperArrayBounds();
-    }
+    on device var parents_d: [0..#M] Node;
+    on device var bounds_d: [0..#(M*jobs)] int(32);
+
+    on device var lbound1_d = new lb1_bound_data(jobs, machines);
+    lbound1_d.p_times = lbound1.p_times;
+    lbound1_d.min_heads = lbound1.min_heads;
+    lbound1_d.min_tails = lbound1.min_tails;
+
+    on device var lbound2_d = new lb2_bound_data(jobs, machines);
+    lbound2_d.johnson_schedules = lbound2.johnson_schedules;
+    lbound2_d.lags = lbound2.lags;
+    lbound2_d.machine_pairs = lbound2.machine_pairs;
+    lbound2_d.machine_pair_order = lbound2.machine_pair_order;
 
     while true {
-      /*
-        Each task gets its parents nodes from the pool.
-      */
       var poolSize = pool_loc.popBackBulk(m, M, parents);
 
       if (poolSize > 0) {
@@ -434,15 +418,6 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
          eachTaskState[gpuID].write(BUSY);
        }
 
-        /* poolSize = min(poolSize, M);
-        var hasWork = 0;
-        var parents: [0..#poolSize] Node = noinit;
-        pool_loc.popBackBulk(poolSize, parents, hasWork);
-        if (hasWork == 0) {
-          writeln("DEADCODE in get parents");
-          break;
-        } */
-
        /*
          TODO: Optimize 'numBounds' based on the fact that the maximum number of
         generated children for a parent is 'parent.limit2 - parent.limit1 + 1' or
@@ -450,11 +425,9 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
        */
        const numBounds = jobs * poolSize;
 
-        on device {
-          parents_d!.arr = parents; // host-to-device
-          evaluate_gpu(parents_d!.arr, numBounds, best_l, lbound1_d, lbound2_d, bounds_d!.arr);
-          bounds = bounds_d!.arr; // device-to-host
-        }
+        parents_d = parents; // host-to-device
+        on device do evaluate_gpu(parents_d, numBounds, best_l, lbound1_d, lbound2_d, bounds_d); // GPU kernel
+        bounds = bounds_d; // device-to-host
 
        /*
          Each task generates and inserts its children nodes to the pool.
@@ -462,7 +435,7 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
        generate_children(parents, poolSize, bounds, tree, sol, best_l, pool_loc);
      }
      else {
-        // work stealing
+        // work stealing attempts
        var tries = 0;
        var steal = false;
        const victims = permute(0..#D);
@@ -486,9 +459,6 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
                halt("DEADCODE in work stealing");
              }
 
-              /* for i in 0..#(size/2) {
-                pool_loc.pushBack(p[i]);
-              } */
              pool_loc.pushBackBulk(p);
 
              steal = true;
@@ -525,23 +495,25 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
      }
    }
 
-    if lock_p.compareAndSwap(false, true) {
-      const poolLocSize = pool_loc.size;
-      for p in 0..#poolLocSize {
-        var hasWork = 0;
-        pool.pushBack(pool_loc.popBack(hasWork));
-        if !hasWork then break;
-      }
-      lock_p.write(false);
+    const poolLocSize = pool_loc.size;
+    for p in 0..#poolLocSize {
+      var hasWork = 0;
+      pool.pushBack(pool_loc.popBack(hasWork));
+      if !hasWork then break;
    }
 
    eachExploredTree[gpuID] = tree;
    eachExploredSol[gpuID] = sol;
    eachBest[gpuID] = best_l;
-
-    writeln("work stealing tries on tasks ", gpuID, " : ", nSteal, " and successes : ", nSSteal);
  }
+
+  timer.stop();
+  const res2 = (timer.elapsed(), exploredTree, exploredSol) - res1;
+
+  writeln("Search on GPU completed");
+  writeln("Size of the explored tree: ", res2[1]);
+  writeln("Number of explored solutions: ", res2[2]);
+  writeln("Elapsed time: ", res2[0], " [s]\n");
 
  exploredTree += (+ reduce eachExploredTree);
  exploredSol += (+ reduce eachExploredSol);
@@ -549,16 +521,11 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
 
  writeln("workload per GPU: ", 100.0*eachExploredTree/(exploredTree-res1[1]):real);
 
-  const res2 = (timer.elapsed(), exploredTree, exploredSol) - res1;
-  writeln("Search on GPU completed");
-  writeln("Size of the explored tree: ", res2[1]);
-  writeln("Number of explored solutions: ", res2[2]);
-  writeln("Elapsed time: ", res2[0], " [s]\n");
-
  /*
    Step 3: We complete the depth-first search on CPU.
  */
  timer.start();
+
  while true {
    var hasWork = 0;
    var parent = pool.popBackFree(hasWork);
@@ -566,9 +533,11 @@ proc pfsp_search(ref optimum: int, ref exploredTree: uint, ref exploredSol: uint
 
    decompose(lbound1, lbound2, parent, exploredTree, exploredSol, best, pool);
  }
+
  timer.stop();
  elapsedTime = timer.elapsed();
  const res3 = (elapsedTime, exploredTree, exploredSol) - res1 - res2;
+
  writeln("Search on CPU completed");
  writeln("Size of the explored tree: ", res3[1]);
  writeln("Number of explored solutions: ", res3[2]);
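
Reviewer notes on the patch, with small C sketches (illustrative, not part of the diff).

Note on popBackBulk (baselines/pfsp/lib/Pool.c): the new function replaces the host-side pop-one-at-a-time loop with a single bulk removal. It returns 0 when fewer than m nodes are available, and otherwise detaches up to M nodes from the tail of the deque with one size update, so the caller's `if (poolSize > 0)` branch subsumes the old `poolSize >= m` check. A minimal sketch of the same contract, using a simplified int pool (IntPool and popBackBulkInt are illustrative stand-ins, not names from the patch):

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Simplified stand-in for SinglePool: an int deque with the same
   front/size bookkeeping as Pool.c (illustration only). */
typedef struct {
  int* elements;
  int front;
  int size;
} IntPool;

/* Same contract as the new popBackBulk: return 0 if fewer than m
   elements are available, otherwise pop up to M elements from the back. */
int popBackBulkInt(IntPool* pool, const int m, const int M, int* out) {
  if (pool->size >= m) {
    const int n = MIN(pool->size, M);
    pool->size -= n;
    for (int i = 0; i < n; i++)
      out[i] = pool->elements[pool->front + pool->size + i];
    return n;
  }
  return 0;
}

int main(void) {
  int storage[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  IntPool pool = {storage, 0, 8};
  int out[4];

  /* Pops min(8, 4) = 4 elements from the tail: 4 5 6 7. */
  int n = popBackBulkInt(&pool, 2, 4, out);
  printf("got %d:", n);
  for (int i = 0; i < n; i++) printf(" %d", out[i]);
  printf("\n");

  /* Only 4 elements remain; with m = 5 the call refuses and returns 0. */
  n = popBackBulkInt(&pool, 5, 4, out);
  printf("got %d\n", n);
  return 0;
}
```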
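Note on the lb1/lb2 kernels (baselines/pfsp/lib/evaluate.cu): evaluate_gpu_lb1 and evaluate_gpu_lb2 launch one thread per (parent, swap position) pair and recover both indices from the flat thread id with one division and one modulo; threads whose position k falls inside the fixed prefix (k <= limit1) compute nothing, which is why evaluate_gpu_lb1_d uses a parent-indexed grid instead. A host-side sketch of that index mapping (the jobs, poolSize, and limit1 values are made up for illustration):

```c
#include <stdio.h>

/* Host-side mock of the threadId -> (parentId, k) mapping used by
   evaluate_gpu_lb1/lb2: one GPU thread per (parent, swap index) pair. */
int main(void) {
  const int jobs = 4;                     /* hypothetical instance size */
  const int poolSize = 2;                 /* parents popped in one bulk */
  const int limit1[2] = {0, 1};           /* hypothetical per-parent limits */
  const int numBounds = jobs * poolSize;  /* the kernel's 'size' argument */

  for (int threadId = 0; threadId < numBounds; threadId++) {
    const int parentId = threadId / jobs;
    const int k = threadId % jobs;
    /* Only positions past the fixed prefix produce a bound. */
    if (k >= limit1[parentId] + 1)
      printf("thread %2d -> parent %d, swap position %d\n", threadId, parentId, k);
    else
      printf("thread %2d -> parent %d, idle\n", threadId, parentId);
  }
  return 0;
}
```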
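Note on evaluate_gpu's grid sizing: lb1 and lb2 need ceil(numBounds / BLOCK_SIZE) blocks, while lb1_d is parent-indexed, so the dispatch divides the block count by jobs again. A sketch of the arithmetic (divCeil is a hypothetical helper; the patch itself goes through ceil((double)...)):

```c
#include <stdio.h>

#define BLOCK_SIZE 512  /* matches the config value used in the Chapel ports */

/* Integer ceiling division; equivalent to the ceil((double)n / d)
   calls in evaluate_gpu and pfsp_search, without the float round trip. */
static int divCeil(int n, int d) { return (n + d - 1) / d; }

int main(void) {
  const int jobs = 20, poolSize = 1024;   /* hypothetical bulk size */
  const int numBounds = jobs * poolSize;

  /* lb1/lb2: one thread per (parent, swap position) pair. */
  const int nbBlocks = divCeil(numBounds, BLOCK_SIZE);

  /* lb1_d: one thread per parent, hence jobs-times fewer blocks,
     mirroring nbBlocks_lb1_d = ceil((double)nbBlocks/jobs). */
  const int nbBlocks_lb1_d = divCeil(nbBlocks, jobs);

  printf("lb1/lb2: %d blocks, lb1_d: %d blocks\n", nbBlocks, nbBlocks_lb1_d);
  return 0;
}
```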
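Note on the reordered child construction (decompose_lb1/decompose_lb2 and generate_children): the patch sets depth and limit1 before copying the parent permutation and swapping, and consistently swaps position parent.depth (the cached depth in generate_children) with the branching index. A self-contained sketch of that construction order, with a simplified Node (makeChild and swapInt are illustrative helpers, not names from the patch):

```c
#include <stdio.h>
#include <string.h>

#define MAX_JOBS 20

/* Simplified Node mirroring the fields manipulated by decompose_lb1. */
typedef struct {
  int depth, limit1;
  int prmu[MAX_JOBS];
} Node;

static void swapInt(int* a, int* b) { int t = *a; *a = *b; *b = t; }

/* Child construction in the order used by the patch: bookkeeping first,
   then copy the parent permutation and swap position 'depth' with 'i'. */
static Node makeChild(const Node* parent, int i, int jobs) {
  Node child;
  child.depth = parent->depth + 1;
  child.limit1 = parent->limit1 + 1;
  memcpy(child.prmu, parent->prmu, jobs * sizeof(int));
  swapInt(&child.prmu[parent->depth], &child.prmu[i]);
  return child;
}

int main(void) {
  /* PFSP root: nothing scheduled yet (limit1 = -1). */
  Node root = { .depth = 0, .limit1 = -1, .prmu = {0, 1, 2, 3} };
  const int jobs = 4;
  /* Each child places one unscheduled job (i in limit1+1..jobs-1) first. */
  for (int i = root.limit1 + 1; i < jobs; i++) {
    Node c = makeChild(&root, i, jobs);
    printf("child %d: %d %d %d %d\n", i, c.prmu[0], c.prmu[1], c.prmu[2], c.prmu[3]);
  }
  return 0;
}
```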
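Note on the timing changes in pfsp_search (baselines/pfsp/pfsp_gpu_cuda.c): each of the three steps is now bracketed by its own clock_gettime(CLOCK_MONOTONIC_RAW) pair, and *elapsedTime reports t1 + t2 + t3 before the frees rather than after them. The elapsed-seconds arithmetic used for t1, t2, and t3, packaged as a tiny helper (elapsedSeconds is a hypothetical name; CLOCK_MONOTONIC_RAW is Linux-specific):

```c
#include <stdio.h>
#include <time.h>

/* Seconds between two CLOCK_MONOTONIC_RAW samples, exactly as computed
   for t1, t2 and t3 in pfsp_search. */
static double elapsedSeconds(struct timespec start, struct timespec end) {
  return (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
}

int main(void) {
  struct timespec start, end;
  clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  /* ... phase under measurement ... */
  clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  printf("Elapsed time: %f [s]\n", elapsedSeconds(start, end));
  return 0;
}
```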
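Note on generate_children in pfsp_multigpu_chpl.chpl: the loop now takes the shared pool's lock once (acquireLock), inserts every child with the lock-free pushBackFree, and releases the lock at the end, instead of paying one lock round-trip per pushBack. A C/pthreads sketch of the same lock-coarsening idea under stated assumptions (ParPool, pushChildren, and the int payload are illustrative; the real pool stores Node values and the Chapel Pool_par locking API differs):

```c
#include <pthread.h>
#include <stdio.h>

/* Sketch of the lock-coarsening pattern: take the pool lock once per
   bulk insertion instead of once per node. Build with -lpthread. */
typedef struct {
  int data[1024];
  int size;
  pthread_mutex_t lock;
} ParPool;

/* Caller must already hold pool->lock (analogue of pushBackFree). */
static void pushBackFree(ParPool* pool, int v) { pool->data[pool->size++] = v; }

/* One lock round-trip for the whole batch, as in the patched code. */
static void pushChildren(ParPool* pool, const int* children, int n) {
  pthread_mutex_lock(&pool->lock);      /* acquireLock() */
  for (int i = 0; i < n; i++)
    pushBackFree(pool, children[i]);
  pthread_mutex_unlock(&pool->lock);    /* releaseLock() */
}

int main(void) {
  ParPool pool = { .size = 0, .lock = PTHREAD_MUTEX_INITIALIZER };
  const int children[3] = {7, 8, 9};
  pushChildren(&pool, children, 3);
  printf("pool size: %d\n", pool.size);
  return 0;
}
```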