Let’s have fun with prime numbers? In this post, I would like to share some results I got from using multi-threading with .NET and CUDA to find prime numbers in a range.
My machine:
- Intel Core i7-7700HQ CPU @ 2.80GHz
- 32 GB RAM
- Windows 10 Pro
- NVIDIA GeForce GTX 1070
It is important to say that I am NOT using the best algorithms here. I know there are better approaches to find prime numbers. Also, I am pretty sure there are a lot of improvements that I could implement in my code. So, take it easy. Right?
The book Pro .NET performance inspired the code in this post.
The starting point
Let’s start with a straightforward sequential implementation.
static void Main()
{
var sw = new Stopwatch();
sw.Start();
var result = PrimesInRange(200, 800000);
sw.Stop();
Console.WriteLine($"{result} prime numbers found in {sw.ElapsedMilliseconds / 1000} seconds ({Environment.ProcessorCount} processors).");
}
public static long PrimesInRange(long start, long end)
{
long result = 0;
for (var number = start; number < end; number++)
{
if (IsPrime(number))
{
result++;
}
}
return result;
}
static bool IsPrime(long number)
{
if (number == 2) return true;
if (number % 2 == 0) return false;
for (long divisor = 3; divisor < (number / 2); divisor += 2)
{
if (number % divisor == 0)
{
return false;
}
}
return true;
}
Time to run: ~76 seconds!
Using Threads
public static long PrimesInRange(long start, long end)
{
long result = 0;
var lockObject = new object();
var range = end - start;
var numberOfThreads = (long) Environment.ProcessorCount;
var threads = new Thread[numberOfThreads];
var chunkSize = range / numberOfThreads;
for (long i = 0; i < numberOfThreads; i++)
{
var chunkStart = start + i * chunkSize;
var chunkEnd = (i == (numberOfThreads - 1)) ? end : chunkStart + chunkSize;
threads[i] = new Thread(() =>
{
for (var number = chunkStart; number < chunkEnd; ++number)
{
if (IsPrime(number))
{
lock (lockObject)
{
result++;
}
}
}
});
threads[i].Start();
}
foreach (var thread in threads)
{
thread.Join();
}
return result;
}
This is a naïve implementation. Do you know why? Share your thoughts in the comments.
Time to run: ~23 seconds.
Using Threads (no locks)
public static long PrimesInRange2_1(long start, long end)
{
//var result = new List();
var range = end - start;
var numberOfThreads = (long)Environment.ProcessorCount;
var threads = new Thread[numberOfThreads];
var results = new long[numberOfThreads];
var chunkSize = range / numberOfThreads;
for (long i = 0; i < numberOfThreads; i++)
{
var chunkStart = start + i * chunkSize;
var chunkEnd = i == (numberOfThreads - 1) ? end : chunkStart + chunkSize;
var current = i;
threads[i] = new Thread(() =>
{
results[current] = 0;
for (var number = chunkStart; number < chunkEnd; ++number)
{
if (IsPrime(number))
{
results[current]++;
}
}
});
threads[i].Start();
}
foreach (var thread in threads)
{
thread.Join();
}
return results.Sum();
}
Time to run: ~23 seconds.
Using Threads (Interlocked)
public static long PrimesInRange(long start, long end)
{
long result = 0;
var range = end - start;
var numberOfThreads = (long)Environment.ProcessorCount;
var threads = new Thread[numberOfThreads];
var chunkSize = range / numberOfThreads;
for (long i = 0; i < numberOfThreads; i++)
{
var chunkStart = start + i * chunkSize;
var chunkEnd = i == (numberOfThreads - 1) ? end : chunkStart + chunkSize;
threads[i] = new Thread(() =>
{
for (var number = chunkStart; number < chunkEnd; ++number)
{
if (IsPrime(number))
{
Interlocked.Increment(ref result);
}
}
});
threads[i].Start();
}
foreach (var thread in threads)
{
thread.Join();
}
return result;
}
Time to Run: ~23 seconds.
ThreadPool
public static long PrimesInRange(long start, long end)
{
long result = 0;
const long chunkSize = 100;
var completed = 0;
var allDone = new ManualResetEvent(initialState: false);
var chunks = (end - start) / chunkSize;
for (long i = 0; i < chunks; i++)
{
var chunkStart = (start) + i * chunkSize;
var chunkEnd = i == (chunks - 1) ? end : chunkStart + chunkSize;
ThreadPool.QueueUserWorkItem(_ =>
{
for (var number = chunkStart; number < chunkEnd; number++)
{
if (IsPrime(number))
{
Interlocked.Increment(ref result);
}
}
if (Interlocked.Increment(ref completed) == chunks)
{
allDone.Set();
}
});
}
allDone.WaitOne();
return result;
}
Time to Run: ~16 seconds.
Parallel.For
public static long PrimesInRange4(long start, long end)
{
long result = 0;
Parallel.For(start, end, number =>
{
if (IsPrime(number))
{
Interlocked.Increment(ref result);
}
});
return result;
}
Time to Run: ~16 seconds.
CUDA
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include <ctime>
#include <cstdio>
__global__ void primes_in_range(int *result)
{
const auto number = 200 + (blockIdx.x * blockDim.x) + threadIdx.x;
if (number >= 800000)
{
return;
}
if (number % 2 == 0) return;
for (long divisor = 3; divisor < (number / 2); divisor += 2)
{
if (number % divisor == 0)
{
return;
}
}
atomicAdd(result, 1);
}
int main()
{
auto begin = std::clock();
int *result;
cudaMallocManaged(&result, 4);
*result = 0;
primes_in_range<<<800, 1024>>>(result);
cudaDeviceSynchronize();
auto end = std::clock();
auto duration = double(end - begin) / CLOCKS_PER_SEC * 1000;
printf("%d prime numbers found in %d milliseconds",
*result,
static_cast<int>(duration)
);
getchar();
return 0;
}
Time to Run: Less than 2 seconds.
Time to Action
I strongly recommend you to reproduce this tests on your machine. If you see something that I could do better, please, share your ideas.
I understand that performance is a feature. I will continue to blog about it. Subscribe the contact list, and I will send you an email every week with the new content.