/* On an 8-core machine, this takes the top 3-bits
of random numbers and uses them to distribute the
numbers into 8 buckets, one per core. Each core
quicksorts its own bucket.  Because the buckets are
already ordered by their top bits, the sorted buckets
are simply concatenated to get the final result.
*/

#define NUM		1000000

//Number of cores used, rounded down to a power of 2.
I64 my_mp_count = 1 << Bsr(mp_count);

I32 *arg1,				//Unsorted random input.
	*arg2;				//Destination for each sorted result.
I32 *b[my_mp_count],	//One bucket per core.
	 bn[my_mp_count];	//Number of entries in each bucket.
I64  mp_not_done_flags;	//One bit per core; a core clears its bit when done.

I64 Compare(I32 *e1, I32 *e2)
{//Comparison callback handed to the generic QuickSort().
//Returns <0, 0 or >0 as *e1 is less than, equal to or greater than *e2.
	return *e1 - *e2;
}

U0 QuickSortU32(I32 *base, I64 num)
{//By customizing, we dramatically improve it!
//Cut and paste from QuickSortI64().
//
//Sorts the num I32 entries starting at base, in place, ascending.
//Entries are compared as signed I32 in spite of the U32 in the name.
//Recurses on the smaller partition and loops on the larger one.
	I64  i;
	I32 *less, *greater, pivot;

	if (num > 1)
	{
		do
		{
			less	= base;
			greater	= base + num;
			pivot	= base[num / 2];
			//Partition: [base,less) <= pivot, [greater,base+num) > pivot.
			while (less < greater)
			{
				if (*less <= pivot)
					less++;
				else
				{
					greater--;
					SwapU32(less, greater);
				}
			}
			i = less - base;
			if (i == num)
			{//All less or equ to pivot
				//Point greater to first less
				do greater--;
				while (--i && *greater == pivot);

				if (i)
				{
					less = base + num / 2; //Pivot was not moved, point to it
					if (less < greater)
						SwapU32(less, greater);
					num = i;
				}
				else //All equ
					break;
			}
			else if (i < num / 2)
			{//Left part is smaller: recurse on it, loop on the right part.
				QuickSortU32(base, i);
				num -= i;
				base = greater;
			}
			else
			{//Right part is smaller: recurse on it, loop on the left part.
				QuickSortU32(greater, num - i);
				num = i;
			}
		}
		while (num > 1);
	}
}

U0 MPSort(I64 dummy=0)
{//Spawned task: sorts the bucket indexed by Gs->num, then clears
//bit Gs->num in mp_not_done_flags to report completion.
//NOTE(review): Gs->num is presumably the number of the core this
//task is running on -- confirm against the kernel headers.
	no_warn dummy;

	QuickSortU32(b[Gs->num], bn[Gs->num]);
	LBtr(&mp_not_done_flags, Gs->num);
}

U0 MPRadixSortDemo(I64 dummy=0)
{//Times three ways of sorting the same NUM random I32 values:
//generic QuickSort(), QuickSortU32(), and bucketing by top bits
//followed by one QuickSortU32() per core.
//Throws 'MultCore' when fewer than 2 cores are usable.
	no_warn dummy;
	I64 i, j, k1, k2;
	F64 t0;

	arg1 = MAlloc(NUM * sizeof(I32));
	for (i = 0; i < NUM; i++)
		arg1[i] = RandI32;

	arg2 = MAlloc(NUM * sizeof(I32));

	"$GREEN$QuickSort$FG$\n";
	t0 = tS;
	MemCopy(arg2, arg1, sizeof(I32) * NUM);
	QuickSort(arg2, NUM, sizeof(I32), &Compare);
	"Time:%9.6f\n", tS - t0;
	Dump(arg2 + NUM / 4);

	"$GREEN$QuickSortU32$FG$\n";
	t0 = tS;
	MemCopy(arg2, arg1, sizeof(I32) * NUM);
	QuickSortU32(arg2, NUM);
	"Time:%9.6f\n", tS - t0;
	Dump(arg2 + NUM / 4);

	if (my_mp_count < 2)
	{//Release what we hold before bailing out, so nothing is leaked.
		Free(arg1);
		Free(arg2);
		throw('MultCore');
	}

	for (i = 0; i < my_mp_count; i++)
	{
		//We must do full size, just in case.
		//There will be uneven split between cores
		//depending on the distribution of rand numbers.
		b[i]  = MAlloc(NUM * sizeof(I32));
		bn[i] = 0;
	}

	"$GREEN$MP Radix QuickSortU32$FG$\n";
	t0 = tS;
	k1 = 32 - Bsr(my_mp_count);	//Shift that leaves only the top bits.
	k2 = my_mp_count / 2;		//Bias that makes the signed top bits non-negative.
	for (i = 0; i < NUM; i++)
	{
		//This is a preliminary radix sort.
		//HolyC shifts bind tighter than +, the parentheses just make it explicit.
		j = (arg1[i] >> k1) + k2;
		b[j][bn[j]++] = arg1[i];
	}

	//One pending bit per core. HolyC shifts bind tighter than -.
	mp_not_done_flags = (1 << my_mp_count) - 1;
	for (i = 0; i < my_mp_count; i++)
		Spawn(&MPSort, NULL, NULL, i);
	while (mp_not_done_flags)
		Yield;

	//Buckets are ordered by top bits, so concatenating them gives the result.
	j = 0;
	for (i = 0; i < my_mp_count; i++)
	{
		MemCopy(&arg2[j], b[i], bn[i] * sizeof(I32));
		j += bn[i];
	}
	"Time:%9.6f\n", tS - t0;
	Dump(arg2 + NUM / 4);

	Free(arg1);
	Free(arg2);
	for (i = 0; i < my_mp_count; i++)
		Free(b[i]);
}

MPRadixSortDemo;

/*  Results on 8 Cores 3.397GHz Core i7:

QuickSort
Time: 0.759998
QuickSortU32
Time: 0.093684
MP Radix QuickSortU32
Time: 0.045450
*/