// Shared sieve bitmap: bit n is set once n is known not to be prime.
U8 *master_bitmap;
// Exclusive upper bound of the numbers being sieved.
I64 prime_range;
// Number of worker jobs the range is split across.
I64 my_mp_count;
// Number of worker jobs that have not finished yet.
I64 pending;

// Worker job: marks composite numbers inside slice i of the sieve.
//
// The range [0, prime_range) is divided into my_mp_count consecutive slices
// and this job owns [lo, hi). For every candidate from 2 up to
// Sqrt(prime_range) whose bit is still clear in master_bitmap, the bit of
// each multiple of that candidate inside the slice (never the candidate
// itself) is set. On completion the shared pending counter is decremented.
U0 PrimesJob(I64 i)
{
    I64 limit = Sqrt(prime_range);
    I64 lo = i * prime_range / my_mp_count;
    I64 hi = (i + 1) * prime_range / my_mp_count;
    // Bits within sys_cache_line_width of either slice edge are set with
    // LBts, the interior with Bts.
    // NOTE(review): LBts is presumably the bus-locked variant, used where a
    // neighbouring job may be writing the same memory -- confirm.
    I64 head_end = lo + sys_cache_line_width;
    I64 tail_start = hi - sys_cache_line_width;
    I64 cand, mult;

    // Slice too narrow to have an interior: collapse it so the unlocked
    // middle loop below never runs.
    if (tail_start < head_end)
        head_end = tail_start;

    cand = 2;
    while (cand <= limit)
    {
        if (!Bt(master_bitmap, cand))
        {
            if (cand < lo)
            {
                // First multiple of cand that is not below lo.
                mult = lo - lo % cand;
                if (mult < lo)
                    mult += cand;
            }
            else
                // cand is at or past the slice start: skip cand itself.
                mult = cand + cand;

            // Leading edge of the slice.
            while (mult < head_end)
            {
                LBts(master_bitmap, mult);
                mult += cand;
            }
            // Interior of the slice.
            while (mult < tail_start)
            {
                Bts(master_bitmap, mult);
                mult += cand;
            }
            // Trailing edge of the slice.
            while (mult < hi)
            {
                LBts(master_bitmap, mult);
                mult += cand;
            }
        }
        cand++;
    }

    // Signal completion to the task waiting in Primes().
    lock pending--;
}

// Counts the primes below range with a sieve split across worker jobs.
//
// range         exclusive upper bound; values below 0 are treated as 0.
// _my_mp_count  requested number of worker jobs; clamped to 1..mp_count.
//
// Prints the primes found among the last 100 numbers of the range, then a
// summary line (worker count, range, prime count, elapsed time up to the end
// of sieving). Returns the prime count.
I64 Primes(I64 range, I64 _my_mp_count)
{
    I64 i, prime_count = 0;
    F64 t0 = tS, tf;

    // A negative range would hand CAlloc a negative size.
    prime_range = MaxI64(range, 0);

    my_mp_count = _my_mp_count;
    if (my_mp_count > mp_count)
        my_mp_count = mp_count;
    // With 0 workers nothing would be sieved and every number >= 2 would be
    // counted as prime; with a negative count, pending would start below
    // zero with no job to change it and the wait loop would never exit.
    if (my_mp_count < 1)
        my_mp_count = 1;

    master_bitmap = CAlloc((prime_range + 7) / 8 + 1);
    // 0 and 1 are not prime.
    Bts(master_bitmap, 0);
    Bts(master_bitmap, 1);

    // pending must be set before any job is queued, since each job
    // decrements it when it finishes.
    pending = my_mp_count;
    // NOTE(review): the first i is the slice index handed to PrimesJob; the
    // second is presumably the target CPU -- confirm against JobQueue.
    for (i = 0; i < my_mp_count; i++)
        JobQueue(&PrimesJob, i, i);
    while (pending)
        Yield;
    tf = tS;

    // A clear bit means the number was never marked composite.
    for (i = 0; i < prime_range; i++)
        if (!Bt(master_bitmap, i))
            prime_count++;

    // Show the primes found in the top 100 numbers of the range.
    for (i = MaxI64(prime_range - 100, 0); i < prime_range; i++)
        if (!Bt(master_bitmap, i))
            "%d ", i;

    "\n$RED$CPUs:%d PrimeRange:%,d PrimeCount:%,d Time:%9.7,f$FG$\n", my_mp_count, prime_range, prime_count, tf - t0;

    Free(master_bitmap);
    return prime_count;
}

// Demo runs: sieve a small range and a large range, each first with a single
// worker requested and then with mp_count workers requested.
Primes(100, 1);
Primes(100, mp_count);
Primes(1000000, 1);
Primes(1000000, mp_count);