Parallel computing and GPU programming with Julia¶

Part I: Multi-threading¶

Alexis Montoison

In [ ]:
using Base.Threads
using BenchmarkTools
  • A thread is the smallest unit of executable code that performs a particular task.
  • Threads are execution units within a process that can run simultaneously. While processes are separate, the threads of a process run in a shared memory space (heap).
  • An application can be divided into multiple tasks, and each task can be assigned to a thread.
  • Running many threads simultaneously is called multi-threading.


In Julia, all relevant functionality for multi-threading lives in the Base.Threads standard library module. How many threads do we have access to?

In [ ]:
Threads.nthreads()

We will need more than one thread to be able to gain any performance from multi-threading...

Julia can be started with a given number of threads in different ways:

JULIA_NUM_THREADS=4 julia  # we can also set the `JULIA_NUM_THREADS` environment variable in .bashrc.
julia -t 4
julia --threads 4
julia -t auto

The main multithreading approach is to use the Threads.@threads macro which parallelizes a for loop to run with multiple threads. Let us operate on the array a simultaneously using 4 threads. We'll have each thread write its thread ID into each location.

Note: 4 is the number of threads on my computer.

In [ ]:
a = zeros(Int, 10)
Threads.@threads for i = 1:10
    a[i] = Threads.threadid()
end
display(a)

The iteration space is split among the threads. What is the difference between the :static and :dynamic schedulers? With :static, iterations are divided into equal chunks that are assigned to threads up front; with :dynamic (the default since Julia 1.8), chunks are handed to whichever threads become available, which balances the load better when iterations take unequal time. The following experiment shows the difference:

In [ ]:
function busywait(seconds)
    tstart = time_ns()
    while (time_ns() - tstart) / 1e9 < seconds
    end
end
In [ ]:
@time begin
    Threads.@spawn busywait(5)
    Threads.@threads :static for i in 1:Threads.nthreads()
        busywait(1)
    end
end
In [ ]:
@time begin
    Threads.@spawn busywait(5)
    Threads.@threads :dynamic for i in 1:Threads.nthreads()
        busywait(1)
    end
end
In [ ]:
function sqrt_array(A)
    B = similar(A)
    for i in eachindex(A)
        @inbounds B[i] = sqrt(A[i])
    end
    B
end
In [ ]:
function threaded_sqrt_array(A)
    B = similar(A)
    @threads for i in eachindex(A)
        @inbounds B[i] = sqrt(A[i])
    end
    B
end
In [ ]:
n = 1000
A = rand(n, n)
@btime sqrt_array(A);
@btime threaded_sqrt_array(A);

Do we have the correct result?

In [ ]:
sqrt_array(A) ≈ threaded_sqrt_array(A)

With 4 threads, the speedup could be about a factor of 3.

In [ ]:
function sqrt_sum(A)
    s = zero(eltype(A))
    for i in eachindex(A)
        @inbounds s += sqrt(A[i])
    end
    return s
end
In [ ]:
function threaded_sqrt_sum(A)
    s = zero(eltype(A))
    @threads for i in eachindex(A)
        @inbounds s += sqrt(A[i])
    end
    return s
end
In [ ]:
n = 1000
A = rand(n, n)
@btime sqrt_sum(A);
@btime threaded_sqrt_sum(A);
In [ ]:
sqrt_sum(A) ≈ threaded_sqrt_sum(A)
In [ ]:
# Ref{Int} is an object that safely references data of type Int.
# This type is guaranteed to point to valid, Julia-allocated memory of the correct type.
acc = Ref{Int}(0)
@threads for i in 1:1000
    acc[] += 1
end
acc[]

With multi-threading we need to be aware of possible race conditions, i.e. when the order in which threads read from and write to memory can change the result of a computation.

You are entirely responsible for ensuring that your program is data-race free. Be very careful about reading any data if another thread might write to it!
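Atomics, introduced next, are one way to fix a data race. Another common fix, sketched below (this snippet is not from the original notebook), is to serialize access to the shared value with a lock. It is correct, but every thread must wait its turn, so it trades speed for safety:

```julia
using Base.Threads

# A minimal sketch: protect the shared counter with a ReentrantLock
# so only one thread updates it at a time. Correct, but serialized.
acc = Ref(0)
lk = ReentrantLock()
@threads for i in 1:1000
    lock(lk) do
        acc[] += 1
    end
end
acc[]  # 1000
```

Unlike the racy version above, this always yields 1000, regardless of the number of threads.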

Julia supports accessing and modifying values atomically, that is, in a thread-safe way to avoid race conditions. A value (which must be of a primitive type) can be wrapped as Threads.Atomic to indicate it must be accessed in this way. Here we can see an example:

In [ ]:
acc = Atomic{Int}(0)
@threads for i in 1:1000
    atomic_add!(acc, 1)
end
acc[]
In [ ]:
i = Threads.Atomic{Int}(0)
old_i = zeros(4)
Threads.@threads for id in 1:4
    old_i[id] = atomic_add!(i, id) # Threads.atomic_add! returns the old value of i!
end
display(i[])
old_i
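Threads.atomic_add! is one of a family of atomic operations. As an aside, here is a sketch of another one, Threads.atomic_cas! (compare-and-swap), which only writes the new value when the current value matches the expected one:

```julia
using Base.Threads

# Sketch: compare-and-swap succeeds only when the current value matches
# the expected one; either way it returns the old value.
x = Atomic{Int}(3)
old  = atomic_cas!(x, 3, 10)  # matches: x becomes 10, returns 3
old2 = atomic_cas!(x, 3, 99)  # no match: x stays 10, returns 10
(old, old2, x[])  # (3, 10, 10)
```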

Let's solve the race condition in our previous example:

In [ ]:
function threaded_sqrt_sum_atomic(A)
    T = eltype(A)
    s = Atomic{T}(zero(T))
    @threads for i in eachindex(A)
        @inbounds atomic_add!(s, sqrt(A[i]))
    end
    return s[]
end
In [ ]:
@btime threaded_sqrt_sum_atomic(A);
In [ ]:
function threaded_sqrt_sum_optimized(A, partial)
    T = eltype(A)
    @threads for i in eachindex(A)
        @inbounds partial[threadid()] += sqrt(A[i])
    end
    s = zero(T)
    for i in eachindex(partial)
        s += partial[i]
    end
    return s
end
In [ ]:
partial = zeros(Float64, nthreads())
A = rand(5000,2000)
@btime sqrt_sum(A);
@btime threaded_sqrt_sum_optimized(A, partial);

We observe that:

  • The serial version provides the correct value and reference execution time.
  • The race condition version is both slow and wrong.
  • The atomic version is correct but extremely slow.
  • The optimized version is fast and correct, but required refactoring.

Conclusion: multi-threading is as easy as decorating for loops with @threads, but data dependencies (race conditions) must be avoided, which sometimes requires refactoring the code. Atomic operations add significant overhead and thus only make sense when each iteration of the loop takes significant time to compute.
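The per-thread-partials refactoring can also be expressed with tasks instead of indexing by threadid(): each task reduces its own contiguous chunk and the partial sums are combined at the end. A minimal sketch (chunked_sqrt_sum is a hypothetical name, not part of the notebook):

```julia
using Base.Threads

# Sketch: split the index range into nthreads() chunks, spawn one task
# per chunk with its own local accumulator, then combine the partials.
function chunked_sqrt_sum(A)
    n = length(A)
    chunk = cld(n, nthreads())
    ranges = [i:min(i + chunk - 1, n) for i in 1:chunk:n]
    tasks = map(ranges) do r
        @spawn begin
            local s = zero(eltype(A))
            for i in r
                @inbounds s += sqrt(A[i])
            end
            s
        end
    end
    return sum(fetch, tasks)
end
```

Because each task owns its accumulator, there is no shared mutable state in the hot loop and no reliance on which thread executes which chunk.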

Exercise: Multithread the computation of π¶

Consider the following function which estimates π by “throwing darts”, i.e. randomly sampling (x,y) points in the interval [0.0, 1.0] and checking if they fall within the unit circle.

In [ ]:
function estimate_pi(num_points)
    hits = 0
    for _ in 1:num_points
        x, y = rand(), rand()
        if x^2 + y^2 < 1.0
            hits += 1
        end
    end
    fraction = hits / num_points
    return 4 * fraction
end
In [ ]:
num_points = 100_000_000
@btime estimate_pi(num_points)  # 3.14147572...
In [ ]:
function threaded_estimate_pi_v1(num_points)
    hits = Atomic{Int}(0)
    @threads for _ in 1:num_points
        x, y = rand(), rand()
        if x^2 + y^2 < 1.0
            atomic_add!(hits, 1)
        end
    end
    fraction = hits[] / num_points
    return 4 * fraction
end
In [ ]:
num_points = 100_000_000
@btime threaded_estimate_pi_v1(num_points)
In [ ]:
function threaded_estimate_pi_v2(num_points)
    partial_hits = zeros(Int, nthreads())
    @threads for _ in 1:num_points
        x, y = rand(), rand()
        if x^2 + y^2 < 1.0
            partial_hits[threadid()] += 1
        end
    end
    hits = sum(partial_hits)
    fraction = hits / num_points
    return 4 * fraction
end
In [ ]:
num_points = 100_000_000
@btime threaded_estimate_pi_v2(num_points)
Running the same estimator as a standalone script with different thread counts gives, for example:

julia -t 1 threaded_estimate_pi.jl
pi = 3.14176872
time = 950.957122

julia -t 2 threaded_estimate_pi.jl
pi = 3.1412234
time = 732.195929

julia -t 4 threaded_estimate_pi.jl
pi = 3.14180932
time = 663.25783

Parallel scaling is not linear with the number of threads! Comparing to the unthreaded version reveals the overhead from creating and managing threads.
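Part of the overhead in v2 comes from every iteration writing to the shared partial_hits array. A task-based variant that keeps one private hit counter per task, so the hot loop touches no shared state at all, is sketched below (threaded_estimate_pi_v3 is a hypothetical name, not part of the notebook):

```julia
using Base.Threads

# Sketch: one task per chunk of samples, each with a private counter;
# the counters are only combined once, after all tasks finish.
function threaded_estimate_pi_v3(num_points)
    nchunks = nthreads()
    chunk = cld(num_points, nchunks)
    tasks = map(1:nchunks) do t
        lo = (t - 1) * chunk + 1
        hi = min(t * chunk, num_points)
        @spawn begin
            local hits = 0
            for _ in lo:hi
                x, y = rand(), rand()
                if x^2 + y^2 < 1.0
                    hits += 1
                end
            end
            hits
        end
    end
    return 4 * sum(fetch, tasks) / num_points
end
```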

Tools for multi-threading¶

  • OhMyThreads.jl: simple tools for basic multithreading.
  • ThreadsX.jl: parallelized Base functions.
  • Tullio.jl: a very flexible einsum macro (Einstein notation).
  • LoopVectorization.jl: macro(s) for vectorizing loops.
  • FLoops.jl: fast sequential, threaded, and distributed for-loops for Julia.

Homework 🤓¶

  • Implement a multi-threaded version of the dot product between two vectors.
  • Implement a multi-threaded version of the matrix-vector products A * v and Aᵀ * v where A is a SparseMatrixCSC. Explain which product is better suited to multi-threading.

References:¶

  • https://docs.julialang.org/en/v1/base/multi-threading
  • https://enccs.github.io/Julia-for-HPC/multithreading