Kernels launched by multiple host threads get serialized by cudaStreamSynchronize(0) when compiled with “--default-stream=per-thread”2019 Community Moderator ElectionCUDA - copy to array within array of ObjectsC structures with dynamic data with CUDA kernels?How do CUDA blocks/warps/threads map onto CUDA cores?Multiple host threads launching individual CUDA kernelsCUDA kernels not launching before CudaDeviceSynchronizeRunning several streams (instead of threads/blocks) in parallelCudaMemcpy of a single bool takes way too longMultiple Host threads launch CUDA kernels togetherWill CUDA API affect CPU's Ram access performance?Concurrency of cuFFT streams

Should we release the security issues we found in our product as CVE or we can just update those on weekly release notes?

Make a transparent 448*448 image

Why does Deadpool say "You're welcome, Canada," after shooting Ryan Reynolds in the end credits?

How could a female member of a species produce eggs unto death?

What exactly is the purpose of connection links strapped between the rocket and the launch pad

Is this animal really missing?

Plywood subfloor won't screw down in a trailer home

What has been your most complicated TikZ drawing?

Latest web browser compatible with Windows 98

When were linguistics departments first established

The three point beverage

Why don't MCU characters ever seem to have language issues?

Prove that the total distance is minimised (when travelling across the longest path)

Do I need to leave some extra space available on the disk which my database log files reside, for log backup operations to successfully occur?

US to Europe trip with Canada layover- is 52 minutes enough?

Format picture and text with TikZ and minipage

Am I not good enough for you?

Deleting missing values from a dataset

Can someone explain what is being said here in color publishing in the American Mathematical Monthly?

Time travel short story where dinosaur doesn't taste like chicken

How do anti-virus programs start at Windows boot?

Who is our nearest neighbor

"One can do his homework in the library"

Why doesn't the EU now just force the UK to choose between referendum and no-deal?

Kernels launched by multiple host threads get serialized by cudaStreamSynchronize(0) when compiled with “--default-stream=per-thread”

2019 Community Moderator ElectionCUDA - copy to array within array of ObjectsC structures with dynamic data with CUDA kernels?How do CUDA blocks/warps/threads map onto CUDA cores?Multiple host threads launching individual CUDA kernelsCUDA kernels not launching before CudaDeviceSynchronizeRunning several streams (instead of threads/blocks) in parallelCudaMemcpy of a single bool takes way too longMultiple Host threads launch CUDA kernels togetherWill CUDA API affect CPU's Ram access performance?Concurrency of cuFFT streams

I've created a minimalistic sample program that captures the strange behavior of CUDA runtime API.

When compiled using "--default-stream=per-thread", each host thread should be able to launch its own kernel and wait for result using cudaStreamSynchronize(0), in parallel.

But it turns out to be not true ...

In the example code, when you set PARALLEL=false, you can observe perfect concurrency in GPU using visual profiler, as expected

When you set PARALLEL=true, you can observe complete serialization in GPU; when replacing cudaStreamSynchronize(0) by the commented out cudaMemcpy (and you need to uncomment the cudaMalloc and comment one cudaHostGetDevicePointer), concurrency increased somewhat, but still far from perfect concurrency.

The kernel is producing nothing interesting, so its logic can be ignored :-)

I'm using a single 1080 Ti, CUDA 10.1, Windows 7, Visual Studio Community 2017.

Any thoughts? Thanks for your time!

#include <iostream>
#include <cfloat>
#include <cmath>
#include <thread>
#include <atomic>
#include <memory>
#include <vector>
#include <chrono>
#include <cuda_profiler_api.h>

const bool PARALLEL = true;

const int ArrayLength = 128;
const int LoopLength = 10240000;
const int ThreadsCount = 28;

std::atomic<bool> ready(false);

// Busy-work kernel: it exists only to keep the GPU occupied long enough to be
// visible in a profiler timeline; the numeric result carries no meaning.
//
// Each thread walks the ArrayLength input elements in steps of blockDim.x,
// applies sinf() LoopLength times to its element, and atomically adds the
// final value into *output.
//
// input  - ArrayLength floats readable from the device.
// output - single float accumulator; the caller zeroes it before the launch.
//
// Indexing uses threadIdx.x only, so a single-block launch is expected (with
// several blocks, every block would add the same values again).
__global__ void kernel_inference(const float* __restrict__ input, float* __restrict__ output)
{
    for (int x = threadIdx.x; x < ArrayLength; x += blockDim.x)
    {
        float tmp = input[x];
        for (int i = 0; i < LoopLength; i++)
        {
            tmp = sinf(tmp * tmp + tmp + 1.0f);
        }
        // One atomic per processed element, after its inner loop has finished.
        atomicAdd(output, tmp);
    }
}

// Owns the buffers (and, when PARALLEL is false, the stream) needed to run one
// kernel_inference query.
//
// Both buffers are allocated with cudaHostAlloc(..., cudaHostAllocMapped) and
// the kernel receives the pointers returned by cudaHostGetDevicePointer, so
// the result is read back through outputh without an explicit copy.
struct Portal
{
    cudaStream_t stream = nullptr;  // created and used only when PARALLEL is false
    float* inputh = nullptr;        // host view of the ArrayLength-float input
    float* inputd = nullptr;        // device pointer obtained for inputh
    float* outputh = nullptr;       // host view of the single-float result
    float* outputd = nullptr;       // device pointer obtained for outputh

    Portal()
    {
        // malloc
        check(cudaHostAlloc(&inputh, ArrayLength * sizeof(float), cudaHostAllocMapped), "cudaHostAlloc(input)");
        check(cudaHostAlloc(&outputh, sizeof(float), cudaHostAllocMapped), "cudaHostAlloc(output)");
        //cudaMalloc(&outputd, sizeof(float));
        // get pointer
        check(cudaHostGetDevicePointer(&inputd, inputh, 0), "cudaHostGetDevicePointer(input)");
        check(cudaHostGetDevicePointer(&outputd, outputh, 0), "cudaHostGetDevicePointer(output)");
        // stream
        if (!PARALLEL) check(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), "cudaStreamCreateWithFlags");
    }

    // The destructor releases the buffers, so a copied Portal would release
    // them a second time; copying is therefore disabled.
    Portal(const Portal&) = delete;
    Portal& operator=(const Portal&) = delete;

    ~Portal()
    {
        check(cudaFreeHost(inputh), "cudaFreeHost(input)");
        check(cudaFreeHost(outputh), "cudaFreeHost(output)");
        //cudaFree(outputd);
        if (!PARALLEL) check(cudaStreamDestroy(stream), "cudaStreamDestroy");
    }

    // Fills the input with v, v+1, ..., zeroes the accumulator and launches
    // kernel_inference with one block of 128 threads.
    //
    // PARALLEL == true : launches on stream 0, waits with
    //                    cudaStreamSynchronize(0) and returns *outputh.
    // PARALLEL == false: launches on this object's own stream and returns 0
    //                    without waiting; the caller synchronizes the stream
    //                    and reads outputh afterwards.
    float query(float v)
    {
        for (int i = 0; i < ArrayLength; i++)
            inputh[i] = v + i;
        *outputh = 0;
        kernel_inference<<<1, 128, 0, PARALLEL ? 0 : stream>>>(inputd, outputd);
        // A launch does not return a status; configuration errors surface here.
        check(cudaGetLastError(), "kernel_inference launch");
        //if (PARALLEL) cudaMemcpy(outputh, outputd, sizeof(float), cudaMemcpyDeviceToHost);
        if (PARALLEL) check(cudaStreamSynchronize(0), "cudaStreamSynchronize(0)");
        return PARALLEL ? *outputh : 0;
    }

private:
    // Reports a failed CUDA runtime call on stderr instead of dropping it.
    // Execution continues, matching the original best-effort behavior.
    static void check(cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
            std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    }
};

// Worker entry point for the PARALLEL configuration.
//
// Builds its own Portal, waits until main() sets the global `ready` flag,
// then runs a single query seeded with the thread index `t` and prints the
// returned value.
void thread_main(int t)
{
    Portal portal;

    // Poll in 1 ms steps so that the workers issue their queries only after
    // main() has released them.
    while (!ready)
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }

    std::cout << portal.query(t) << std::endl;
}

// Runs ThreadsCount queries in one of two configurations selected by PARALLEL:
//   true  - one host thread per query; each thread owns a Portal and waits
//           for its own result inside Portal::query.
//   false - a single host thread issues every query on a per-Portal stream,
//           then synchronizes each stream and prints each result.
int main(int argc, char** argv)
{
    std::vector<std::unique_ptr<std::thread>> threads;
    std::vector<std::unique_ptr<Portal>> portals;

    if (PARALLEL)
    {
        for (int t = 0; t < ThreadsCount; t++)
            threads.push_back(std::make_unique<std::thread>(thread_main, t));
    }
    else
    {
        for (int t = 0; t < ThreadsCount; t++)
            portals.push_back(std::make_unique<Portal>());
    }

    cudaDeviceSynchronize();

    // Release the worker threads spinning in thread_main().
    ready = true;

    if (PARALLEL)
    {
        for (auto& worker : threads)
            worker->join();
    }
    else
    {
        // Issue every launch first so the kernels can overlap on the device.
        for (std::size_t t = 0; t < portals.size(); t++)
            portals[t]->query(static_cast<float>(t));

        for (auto& portal : portals)
        {
            cudaStreamSynchronize(portal->stream);
            std::cout << *(portal->outputh) << std::endl;
        }
    }

    return 0;
}

Compilation command

CALL "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
nvcc --gpu-architecture=sm_61 --default-stream=per-thread --optimize=3 -Xcompiler "/wd4819" --x=cu test.cpp --use_fast_math --library=cuda,cudart_static --library-path="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\lib\x64" --output-file test.exe

Visual Profiler Graph Shows Complete Serialization
CUDA Visual Profiler Graph

edited Mar 7 at 17:21

asked Mar 6 at 17:35

Zhifu

112

cudaMemcpy and cudaMalloc are synchronizing operations, just in case.

– Ander Biguri
Mar 7 at 9:35

Try cudaStreamSynchronize(cudaStreamPerThread) instead of cudaStreamSynchronize(0).

– tera
Mar 8 at 9:25

@tera, Yes, I have tried replacing the two references of stream 0 to cudaStreamPerThread, but problem is not solved

– Zhifu
Mar 8 at 10:45

When I run your posted code as-is with const bool PARALLEL = true; and --default-stream=per-thread on linux, I get concurrent kernel launches. No serialization. So this may be a windows WDDM thing. I generally don't recommend trying to achieve complex concurrency scenarios with windows WDDM. Yes, I understand it works differently when you use a user-created stream. However I don't see any difference on linux.

– Robert Crovella
Mar 8 at 18:31

@ Robert Crovella. Thanks! My scenario is a parallel tree search problem so it is not convenient nor efficient to post queries to a central server thread. Hope for a future version of CUDA or WDDM to work around this problem, TCC cards are too expensive for me.

– Zhifu
Mar 8 at 18:38

add a comment |

I've created a minimalistic sample program that captures the strange behavior of CUDA runtime API.

When compiled using "--default-stream=per-thread", each host thread should be able to launch its own kernel and wait for result using cudaStreamSynchronize(0), in parallel.

But it turns out to be not true ...

In the example code, when you set PARALLEL=false, you can observe perfect concurrency in GPU using visual profiler, as expected

The kernel is producing nothing interesting, so its logic can be ignored :-)

I'm using a single 1080 Ti, CUDA 10.1, Windows 7, Visual Studio Community 2017.

Any thoughts? Thanks for your time!

#include <iostream>
#include <cfloat>
#include <cmath>
#include <thread>
#include <atomic>
#include <vector>
#include <chrono>
#include <cuda_profiler_api.h>

const bool PARALLEL = true;

const int ArrayLength = 128;
const int LoopLength = 10240000;
const int ThreadsCount = 28;

std::atomic<bool> ready(false);

// Busy-work kernel: it exists only to keep the GPU occupied long enough to be
// visible in a profiler timeline; the numeric result carries no meaning.
//
// Each thread walks the ArrayLength input elements in steps of blockDim.x,
// applies sinf() LoopLength times to its element, and atomically adds the
// final value into *output.
//
// input  - ArrayLength floats readable from the device.
// output - single float accumulator; the caller zeroes it before the launch.
//
// Indexing uses threadIdx.x only, so a single-block launch is expected (with
// several blocks, every block would add the same values again).
__global__ void kernel_inference(const float* __restrict__ input, float* __restrict__ output)
{
    for (int x = threadIdx.x; x < ArrayLength; x += blockDim.x)
    {
        float tmp = input[x];
        for (int i = 0; i < LoopLength; i++)
        {
            tmp = sinf(tmp * tmp + tmp + 1.0f);
        }
        // One atomic per processed element, after its inner loop has finished.
        atomicAdd(output, tmp);
    }
}

// Owns the buffers (and, when PARALLEL is false, the stream) needed to run one
// kernel_inference query.
//
// Both buffers are allocated with cudaHostAlloc(..., cudaHostAllocMapped) and
// the kernel receives the pointers returned by cudaHostGetDevicePointer, so
// the result is read back through outputh without an explicit copy.
struct Portal
{
    cudaStream_t stream = nullptr;  // created and used only when PARALLEL is false
    float* inputh = nullptr;        // host view of the ArrayLength-float input
    float* inputd = nullptr;        // device pointer obtained for inputh
    float* outputh = nullptr;       // host view of the single-float result
    float* outputd = nullptr;       // device pointer obtained for outputh

    Portal()
    {
        // malloc
        check(cudaHostAlloc(&inputh, ArrayLength * sizeof(float), cudaHostAllocMapped), "cudaHostAlloc(input)");
        check(cudaHostAlloc(&outputh, sizeof(float), cudaHostAllocMapped), "cudaHostAlloc(output)");
        //cudaMalloc(&outputd, sizeof(float));
        // get pointer
        check(cudaHostGetDevicePointer(&inputd, inputh, 0), "cudaHostGetDevicePointer(input)");
        check(cudaHostGetDevicePointer(&outputd, outputh, 0), "cudaHostGetDevicePointer(output)");
        // stream
        if (!PARALLEL) check(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), "cudaStreamCreateWithFlags");
    }

    // The destructor releases the buffers, so a copied Portal would release
    // them a second time; copying is therefore disabled.
    Portal(const Portal&) = delete;
    Portal& operator=(const Portal&) = delete;

    ~Portal()
    {
        check(cudaFreeHost(inputh), "cudaFreeHost(input)");
        check(cudaFreeHost(outputh), "cudaFreeHost(output)");
        //cudaFree(outputd);
        if (!PARALLEL) check(cudaStreamDestroy(stream), "cudaStreamDestroy");
    }

    // Fills the input with v, v+1, ..., zeroes the accumulator and launches
    // kernel_inference with one block of 128 threads.
    //
    // PARALLEL == true : launches on stream 0, waits with
    //                    cudaStreamSynchronize(0) and returns *outputh.
    // PARALLEL == false: launches on this object's own stream and returns 0
    //                    without waiting; the caller synchronizes the stream
    //                    and reads outputh afterwards.
    float query(float v)
    {
        for (int i = 0; i < ArrayLength; i++)
            inputh[i] = v + i;
        *outputh = 0;
        kernel_inference<<<1, 128, 0, PARALLEL ? 0 : stream>>>(inputd, outputd);
        // A launch does not return a status; configuration errors surface here.
        check(cudaGetLastError(), "kernel_inference launch");
        //if (PARALLEL) cudaMemcpy(outputh, outputd, sizeof(float), cudaMemcpyDeviceToHost);
        if (PARALLEL) check(cudaStreamSynchronize(0), "cudaStreamSynchronize(0)");
        return PARALLEL ? *outputh : 0;
    }

private:
    // Reports a failed CUDA runtime call on stderr instead of dropping it.
    // Execution continues, matching the original best-effort behavior.
    static void check(cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
            std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    }
};

// Worker entry point for the PARALLEL configuration.
//
// Builds its own Portal, waits until main() sets the global `ready` flag,
// then runs a single query seeded with the thread index `t` and prints the
// returned value.
void thread_main(int t)
{
    Portal portal;

    // Poll in 1 ms steps so that the workers issue their queries only after
    // main() has released them.
    while (!ready)
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }

    std::cout << portal.query(t) << std::endl;
}

// Runs ThreadsCount queries in one of two configurations selected by PARALLEL:
//   true  - one host thread per query; each thread owns a Portal and waits
//           for its own result inside Portal::query.
//   false - a single host thread issues every query on a per-Portal stream,
//           then synchronizes each stream and prints each result.
int main(int argc, char** argv)
{
    std::vector<std::unique_ptr<std::thread>> threads;
    std::vector<std::unique_ptr<Portal>> portals;

    if (PARALLEL)
    {
        for (int t = 0; t < ThreadsCount; t++)
            threads.push_back(std::make_unique<std::thread>(thread_main, t));
    }
    else
    {
        for (int t = 0; t < ThreadsCount; t++)
            portals.push_back(std::make_unique<Portal>());
    }

    cudaDeviceSynchronize();

    // Release the worker threads spinning in thread_main().
    ready = true;

    if (PARALLEL)
    {
        for (auto& worker : threads)
            worker->join();
    }
    else
    {
        // Issue every launch first so the kernels can overlap on the device.
        for (std::size_t t = 0; t < portals.size(); t++)
            portals[t]->query(static_cast<float>(t));

        for (auto& portal : portals)
        {
            cudaStreamSynchronize(portal->stream);
            std::cout << *(portal->outputh) << std::endl;
        }
    }

    return 0;
}

Compilation command

CALL "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
nvcc --gpu-architecture=sm_61 --default-stream=per-thread --optimize=3 -Xcompiler "/wd4819" --x=cu test.cpp --use_fast_math --library=cuda,cudart_static --library-path="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\lib\x64" --output-file test.exe

Visual Profiler Graph Shows Complete Serialization
CUDA Visual Profiler Graph

edited Mar 7 at 17:21

asked Mar 6 at 17:35

Zhifu

112

cudaMemcpy and cudaMalloc are synchronizing operations, just in case.

– Ander Biguri
Mar 7 at 9:35

Try cudaStreamSynchronize(cudaStreamPerThread) instead of cudaStreamSynchronize(0).

– tera
Mar 8 at 9:25

@tera, Yes, I have tried replacing the two references of stream 0 to cudaStreamPerThread, but problem is not solved

– Zhifu
Mar 8 at 10:45

When I run your posted code as-is with const bool PARALLEL = true; and --default-stream=per-thread on linux, I get concurrent kernel launches. No serialization. So this may be a windows WDDM thing. I generally don't recommend trying to achieve complex concurrency scenarios with windows WDDM. Yes, I understand it works differently when you use a user-created stream. However I don't see any difference on linux.

– Robert Crovella
Mar 8 at 18:31

@ Robert Crovella. Thanks! My scenario is a parallel tree search problem so it is not convenient nor efficient to post queries to a central server thread. Hope for a future version of CUDA or WDDM to work around this problem, TCC cards are too expensive for me.

– Zhifu
Mar 8 at 18:38

add a comment |

I've created a minimalistic sample program that captures the strange behavior of CUDA runtime API.

When compiled using "--default-stream=per-thread", each host thread should be able to launch its own kernel and wait for result using cudaStreamSynchronize(0), in parallel.

But it turns out to be not true ...

In the example code, when you set PARALLEL=false, you can observe perfect concurrency in GPU using visual profiler, as expected

The kernel is producing nothing interesting, so its logic can be ignored :-)

I'm using a single 1080 Ti, CUDA 10.1, Windows 7, Visual Studio Community 2017.

Any thoughts? Thanks for your time!

#include <iostream>
#include <cfloat>
#include <cmath>
#include <thread>
#include <atomic>
#include <vector>
#include <chrono>
#include <cuda_profiler_api.h>

const bool PARALLEL = true;

const int ArrayLength = 128;
const int LoopLength = 10240000;
const int ThreadsCount = 28;

std::atomic<bool> ready(false);

// Busy-work kernel: it exists only to keep the GPU occupied long enough to be
// visible in a profiler timeline; the numeric result carries no meaning.
//
// Each thread walks the ArrayLength input elements in steps of blockDim.x,
// applies sinf() LoopLength times to its element, and atomically adds the
// final value into *output.
//
// input  - ArrayLength floats readable from the device.
// output - single float accumulator; the caller zeroes it before the launch.
//
// Indexing uses threadIdx.x only, so a single-block launch is expected (with
// several blocks, every block would add the same values again).
__global__ void kernel_inference(const float* __restrict__ input, float* __restrict__ output)
{
    for (int x = threadIdx.x; x < ArrayLength; x += blockDim.x)
    {
        float tmp = input[x];
        for (int i = 0; i < LoopLength; i++)
        {
            tmp = sinf(tmp * tmp + tmp + 1.0f);
        }
        // One atomic per processed element, after its inner loop has finished.
        atomicAdd(output, tmp);
    }
}

// Owns the buffers (and, when PARALLEL is false, the stream) needed to run one
// kernel_inference query.
//
// Both buffers are allocated with cudaHostAlloc(..., cudaHostAllocMapped) and
// the kernel receives the pointers returned by cudaHostGetDevicePointer, so
// the result is read back through outputh without an explicit copy.
struct Portal
{
    cudaStream_t stream = nullptr;  // created and used only when PARALLEL is false
    float* inputh = nullptr;        // host view of the ArrayLength-float input
    float* inputd = nullptr;        // device pointer obtained for inputh
    float* outputh = nullptr;       // host view of the single-float result
    float* outputd = nullptr;       // device pointer obtained for outputh

    Portal()
    {
        // malloc
        check(cudaHostAlloc(&inputh, ArrayLength * sizeof(float), cudaHostAllocMapped), "cudaHostAlloc(input)");
        check(cudaHostAlloc(&outputh, sizeof(float), cudaHostAllocMapped), "cudaHostAlloc(output)");
        //cudaMalloc(&outputd, sizeof(float));
        // get pointer
        check(cudaHostGetDevicePointer(&inputd, inputh, 0), "cudaHostGetDevicePointer(input)");
        check(cudaHostGetDevicePointer(&outputd, outputh, 0), "cudaHostGetDevicePointer(output)");
        // stream
        if (!PARALLEL) check(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), "cudaStreamCreateWithFlags");
    }

    // The destructor releases the buffers, so a copied Portal would release
    // them a second time; copying is therefore disabled.
    Portal(const Portal&) = delete;
    Portal& operator=(const Portal&) = delete;

    ~Portal()
    {
        check(cudaFreeHost(inputh), "cudaFreeHost(input)");
        check(cudaFreeHost(outputh), "cudaFreeHost(output)");
        //cudaFree(outputd);
        if (!PARALLEL) check(cudaStreamDestroy(stream), "cudaStreamDestroy");
    }

    // Fills the input with v, v+1, ..., zeroes the accumulator and launches
    // kernel_inference with one block of 128 threads.
    //
    // PARALLEL == true : launches on stream 0, waits with
    //                    cudaStreamSynchronize(0) and returns *outputh.
    // PARALLEL == false: launches on this object's own stream and returns 0
    //                    without waiting; the caller synchronizes the stream
    //                    and reads outputh afterwards.
    float query(float v)
    {
        for (int i = 0; i < ArrayLength; i++)
            inputh[i] = v + i;
        *outputh = 0;
        kernel_inference<<<1, 128, 0, PARALLEL ? 0 : stream>>>(inputd, outputd);
        // A launch does not return a status; configuration errors surface here.
        check(cudaGetLastError(), "kernel_inference launch");
        //if (PARALLEL) cudaMemcpy(outputh, outputd, sizeof(float), cudaMemcpyDeviceToHost);
        if (PARALLEL) check(cudaStreamSynchronize(0), "cudaStreamSynchronize(0)");
        return PARALLEL ? *outputh : 0;
    }

private:
    // Reports a failed CUDA runtime call on stderr instead of dropping it.
    // Execution continues, matching the original best-effort behavior.
    static void check(cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
            std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    }
};

// Worker entry point for the PARALLEL configuration.
//
// Builds its own Portal, waits until main() sets the global `ready` flag,
// then runs a single query seeded with the thread index `t` and prints the
// returned value.
void thread_main(int t)
{
    Portal portal;

    // Poll in 1 ms steps so that the workers issue their queries only after
    // main() has released them.
    while (!ready)
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }

    std::cout << portal.query(t) << std::endl;
}

// Runs ThreadsCount queries in one of two configurations selected by PARALLEL:
//   true  - one host thread per query; each thread owns a Portal and waits
//           for its own result inside Portal::query.
//   false - a single host thread issues every query on a per-Portal stream,
//           then synchronizes each stream and prints each result.
int main(int argc, char** argv)
{
    std::vector<std::unique_ptr<std::thread>> threads;
    std::vector<std::unique_ptr<Portal>> portals;

    if (PARALLEL)
    {
        for (int t = 0; t < ThreadsCount; t++)
            threads.push_back(std::make_unique<std::thread>(thread_main, t));
    }
    else
    {
        for (int t = 0; t < ThreadsCount; t++)
            portals.push_back(std::make_unique<Portal>());
    }

    cudaDeviceSynchronize();

    // Release the worker threads spinning in thread_main().
    ready = true;

    if (PARALLEL)
    {
        for (auto& worker : threads)
            worker->join();
    }
    else
    {
        // Issue every launch first so the kernels can overlap on the device.
        for (std::size_t t = 0; t < portals.size(); t++)
            portals[t]->query(static_cast<float>(t));

        for (auto& portal : portals)
        {
            cudaStreamSynchronize(portal->stream);
            std::cout << *(portal->outputh) << std::endl;
        }
    }

    return 0;
}

Compilation command

CALL "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
nvcc --gpu-architecture=sm_61 --default-stream=per-thread --optimize=3 -Xcompiler "/wd4819" --x=cu test.cpp --use_fast_math --library=cuda,cudart_static --library-path="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\lib\x64" --output-file test.exe

Visual Profiler Graph Shows Complete Serialization
CUDA Visual Profiler Graph

edited Mar 7 at 17:21

asked Mar 6 at 17:35

Zhifu

112

I've created a minimalistic sample program that captures the strange behavior of CUDA runtime API.

When compiled using "--default-stream=per-thread", each host thread should be able to launch its own kernel and wait for result using cudaStreamSynchronize(0), in parallel.

But it turns out to be not true ...

In the example code, when you set PARALLEL=false, you can observe perfect concurrency in GPU using visual profiler, as expected

The kernel is producing nothing interesting, so its logic can be ignored :-)

I'm using a single 1080 Ti, CUDA 10.1, Windows 7, Visual Studio Community 2017.

Any thoughts? Thanks for your time!

#include <iostream>
#include <cfloat>
#include <cmath>
#include <thread>
#include <atomic>
#include <vector>
#include <chrono>
#include <cuda_profiler_api.h>

const bool PARALLEL = true;

const int ArrayLength = 128;
const int LoopLength = 10240000;
const int ThreadsCount = 28;

std::atomic<bool> ready(false);

// Busy-work kernel: it exists only to keep the GPU occupied long enough to be
// visible in a profiler timeline; the numeric result carries no meaning.
//
// Each thread walks the ArrayLength input elements in steps of blockDim.x,
// applies sinf() LoopLength times to its element, and atomically adds the
// final value into *output.
//
// input  - ArrayLength floats readable from the device.
// output - single float accumulator; the caller zeroes it before the launch.
//
// Indexing uses threadIdx.x only, so a single-block launch is expected (with
// several blocks, every block would add the same values again).
__global__ void kernel_inference(const float* __restrict__ input, float* __restrict__ output)
{
    for (int x = threadIdx.x; x < ArrayLength; x += blockDim.x)
    {
        float tmp = input[x];
        for (int i = 0; i < LoopLength; i++)
        {
            tmp = sinf(tmp * tmp + tmp + 1.0f);
        }
        // One atomic per processed element, after its inner loop has finished.
        atomicAdd(output, tmp);
    }
}

// Owns the buffers (and, when PARALLEL is false, the stream) needed to run one
// kernel_inference query.
//
// Both buffers are allocated with cudaHostAlloc(..., cudaHostAllocMapped) and
// the kernel receives the pointers returned by cudaHostGetDevicePointer, so
// the result is read back through outputh without an explicit copy.
struct Portal
{
    cudaStream_t stream = nullptr;  // created and used only when PARALLEL is false
    float* inputh = nullptr;        // host view of the ArrayLength-float input
    float* inputd = nullptr;        // device pointer obtained for inputh
    float* outputh = nullptr;       // host view of the single-float result
    float* outputd = nullptr;       // device pointer obtained for outputh

    Portal()
    {
        // malloc
        check(cudaHostAlloc(&inputh, ArrayLength * sizeof(float), cudaHostAllocMapped), "cudaHostAlloc(input)");
        check(cudaHostAlloc(&outputh, sizeof(float), cudaHostAllocMapped), "cudaHostAlloc(output)");
        //cudaMalloc(&outputd, sizeof(float));
        // get pointer
        check(cudaHostGetDevicePointer(&inputd, inputh, 0), "cudaHostGetDevicePointer(input)");
        check(cudaHostGetDevicePointer(&outputd, outputh, 0), "cudaHostGetDevicePointer(output)");
        // stream
        if (!PARALLEL) check(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), "cudaStreamCreateWithFlags");
    }

    // The destructor releases the buffers, so a copied Portal would release
    // them a second time; copying is therefore disabled.
    Portal(const Portal&) = delete;
    Portal& operator=(const Portal&) = delete;

    ~Portal()
    {
        check(cudaFreeHost(inputh), "cudaFreeHost(input)");
        check(cudaFreeHost(outputh), "cudaFreeHost(output)");
        //cudaFree(outputd);
        if (!PARALLEL) check(cudaStreamDestroy(stream), "cudaStreamDestroy");
    }

    // Fills the input with v, v+1, ..., zeroes the accumulator and launches
    // kernel_inference with one block of 128 threads.
    //
    // PARALLEL == true : launches on stream 0, waits with
    //                    cudaStreamSynchronize(0) and returns *outputh.
    // PARALLEL == false: launches on this object's own stream and returns 0
    //                    without waiting; the caller synchronizes the stream
    //                    and reads outputh afterwards.
    float query(float v)
    {
        for (int i = 0; i < ArrayLength; i++)
            inputh[i] = v + i;
        *outputh = 0;
        kernel_inference<<<1, 128, 0, PARALLEL ? 0 : stream>>>(inputd, outputd);
        // A launch does not return a status; configuration errors surface here.
        check(cudaGetLastError(), "kernel_inference launch");
        //if (PARALLEL) cudaMemcpy(outputh, outputd, sizeof(float), cudaMemcpyDeviceToHost);
        if (PARALLEL) check(cudaStreamSynchronize(0), "cudaStreamSynchronize(0)");
        return PARALLEL ? *outputh : 0;
    }

private:
    // Reports a failed CUDA runtime call on stderr instead of dropping it.
    // Execution continues, matching the original best-effort behavior.
    static void check(cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
            std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    }
};

// Worker entry point for the PARALLEL configuration.
//
// Builds its own Portal, waits until main() sets the global `ready` flag,
// then runs a single query seeded with the thread index `t` and prints the
// returned value.
void thread_main(int t)
{
    Portal portal;

    // Poll in 1 ms steps so that the workers issue their queries only after
    // main() has released them.
    while (!ready)
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }

    std::cout << portal.query(t) << std::endl;
}

// Runs ThreadsCount queries in one of two configurations selected by PARALLEL:
//   true  - one host thread per query; each thread owns a Portal and waits
//           for its own result inside Portal::query.
//   false - a single host thread issues every query on a per-Portal stream,
//           then synchronizes each stream and prints each result.
int main(int argc, char** argv)
{
    std::vector<std::unique_ptr<std::thread>> threads;
    std::vector<std::unique_ptr<Portal>> portals;

    if (PARALLEL)
    {
        for (int t = 0; t < ThreadsCount; t++)
            threads.push_back(std::make_unique<std::thread>(thread_main, t));
    }
    else
    {
        for (int t = 0; t < ThreadsCount; t++)
            portals.push_back(std::make_unique<Portal>());
    }

    cudaDeviceSynchronize();

    // Release the worker threads spinning in thread_main().
    ready = true;

    if (PARALLEL)
    {
        for (auto& worker : threads)
            worker->join();
    }
    else
    {
        // Issue every launch first so the kernels can overlap on the device.
        for (std::size_t t = 0; t < portals.size(); t++)
            portals[t]->query(static_cast<float>(t));

        for (auto& portal : portals)
        {
            cudaStreamSynchronize(portal->stream);
            std::cout << *(portal->outputh) << std::endl;
        }
    }

    return 0;
}

Compilation command

CALL "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat"
nvcc --gpu-architecture=sm_61 --default-stream=per-thread --optimize=3 -Xcompiler "/wd4819" --x=cu test.cpp --use_fast_math --library=cuda,cudart_static --library-path="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\lib\x64" --output-file test.exe

Visual Profiler Graph Shows Complete Serialization
CUDA Visual Profiler Graph

c++ multithreading cuda

edited Mar 7 at 17:21

asked Mar 6 at 17:35

Zhifu

112

edited Mar 7 at 17:21

asked Mar 6 at 17:35

Zhifu

112

edited Mar 7 at 17:21

asked Mar 6 at 17:35

Zhifu

112

asked Mar 6 at 17:35

Zhifu

112

asked Mar 6 at 17:35

Zhifu

112

cudaMemcpy and cudaMalloc are synchronizing operations, just in case.

– Ander Biguri
Mar 7 at 9:35

Try cudaStreamSynchronize(cudaStreamPerThread) instead of cudaStreamSynchronize(0).

– tera
Mar 8 at 9:25

@tera, Yes, I have tried replacing the two references of stream 0 to cudaStreamPerThread, but problem is not solved

– Zhifu
Mar 8 at 10:45

When I run your posted code as-is with const bool PARALLEL = true; and --default-stream=per-thread on linux, I get concurrent kernel launches. No serialization. So this may be a windows WDDM thing. I generally don't recommend trying to achieve complex concurrency scenarios with windows WDDM. Yes, I understand it works differently when you use a user-created stream. However I don't see any difference on linux.

– Robert Crovella
Mar 8 at 18:31

@ Robert Crovella. Thanks! My scenario is a parallel tree search problem so it is not convenient nor efficient to post queries to a central server thread. Hope for a future version of CUDA or WDDM to work around this problem, TCC cards are too expensive for me.

– Zhifu
Mar 8 at 18:38

add a comment |

cudaMemcpy and cudaMalloc are synchronizing operations, just in case.

– Ander Biguri
Mar 7 at 9:35

Try cudaStreamSynchronize(cudaStreamPerThread) instead of cudaStreamSynchronize(0).

– tera
Mar 8 at 9:25

@tera, Yes, I have tried replacing the two references of stream 0 to cudaStreamPerThread, but problem is not solved

– Zhifu
Mar 8 at 10:45

When I run your posted code as-is with const bool PARALLEL = true; and --default-stream=per-thread on linux, I get concurrent kernel launches. No serialization. So this may be a windows WDDM thing. I generally don't recommend trying to achieve complex concurrency scenarios with windows WDDM. Yes, I understand it works differently when you use a user-created stream. However I don't see any difference on linux.

– Robert Crovella
Mar 8 at 18:31

@ Robert Crovella. Thanks! My scenario is a parallel tree search problem so it is not convenient nor efficient to post queries to a central server thread. Hope for a future version of CUDA or WDDM to work around this problem, TCC cards are too expensive for me.

– Zhifu
Mar 8 at 18:38

cudaMemcpy and cudaMalloc are synchronizing operations, just in case.

– Ander Biguri
Mar 7 at 9:35

Try cudaStreamSynchronize(cudaStreamPerThread) instead of cudaStreamSynchronize(0).

– tera
Mar 8 at 9:25

@tera, Yes, I have tried replacing the two references of stream 0 to cudaStreamPerThread, but problem is not solved

– Zhifu
Mar 8 at 10:45

When I run your posted code as-is with const bool PARALLEL = true; and --default-stream=per-thread on linux, I get concurrent kernel launches. No serialization. So this may be a windows WDDM thing. I generally don't recommend trying to achieve complex concurrency scenarios with windows WDDM. Yes, I understand it works differently when you use a user-created stream. However I don't see any difference on linux.

– Robert Crovella
Mar 8 at 18:31

@ Robert Crovella. Thanks! My scenario is a parallel tree search problem so it is not convenient nor efficient to post queries to a central server thread. Hope for a future version of CUDA or WDDM to work around this problem, TCC cards are too expensive for me.

– Zhifu
Mar 8 at 18:38

add a comment |

0

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "1"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader:
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
,
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);

);

draft saved

draft discarded

StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f55029075%2fkernels-launched-by-multiple-host-threads-get-serialized-by-cudastreamsynchroniz%23new-answer', 'question_page');

);

Post as a guest

Name

Required, but never shown

0

active

oldest

votes

0

active

oldest

votes

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Ufdjrw

0

Your Answer

Post as a guest

0

0

Post as a guest

Popular posts from this blog

Алба-Юлія

Захаров Федір Захарович

0

Your Answer

Sign up or log in

Post as a guest

Post as a guest

0

0

Sign up or log in

Post as a guest

Post as a guest

Sign up or log in

Post as a guest

Sign up or log in

Post as a guest

Sign up or log in

Post as a guest

Popular posts from this blog

Алба-Юлія

Захаров Федір Захарович