[concurrency-interest] help show speed-up on a trivial but manualarray-map operation?

Dan Grossman djg at cs.washington.edu
Thu Mar 8 21:56:45 EST 2012


I tried 100x larger (20 million array elements) and saw similar
behavior.  At 1000x (200 million array elements), I got an
out-of-memory error with the default heap configuration.

--Dan

On Thu, Mar 8, 2012 at 6:49 PM, David Holmes <davidcholmes at aapt.net.au> wrote:
> Dan,
>
> Have you tried making the vector much much larger? On some problems you need
> to switch to 64-bit to get data structures big enough to warrant parallel
> evaluation.
>
> David Holmes
>
>> -----Original Message-----
>> From: concurrency-interest-bounces at cs.oswego.edu
>> [mailto:concurrency-interest-bounces at cs.oswego.edu]On Behalf Of Dan
>> Grossman
>> Sent: Friday, 9 March 2012 12:41 PM
>> To: concurrency-interest at cs.oswego.edu
>> Cc: Kim Bruce
>> Subject: [concurrency-interest] help show speed-up on a trivial but
>> manualarray-map operation?
>>
>>
>> Hi all,
>>
>> Short version:
>>
>> Any help on why the attached program (also pasted at the bottom) runs
>> the sequential version faster than the parallel version?  Apologies if
>> I've missed something obvious.
>>
>> Longer version:
>>
>> As I've mentioned a couple times on this list, I have materials for
>> introducing parallelism and concurrency in undergraduate data
>> structures courses
>> (http://www.cs.washington.edu/homes/djg/teachingMaterials/spac/).  The
>> parallel portion uses the ForkJoin Framework, including how to
>> manually implement maps and reduces over arrays.  (Why manually?  For
>> the same reason students in data structures courses often implement
>> hashtables and priority queues.)  Several schools are using the
>> materials and the most common question I get from instructors is "why
>> didn't this code run faster in parallel" since they understandably
>> like to show the "easy pay-off for an easy problem".  I've
>> remote-debugged a couple -- bad sequential cutoffs or wonky timing
>> code -- without bothering all of you, but this time I'm stuck.
>>
>> The attached code (and pasted below for convenience) is a simple
>> vector addition.  Kim (cc'ed) is getting slower results than the naive
>> code on a 48-processor Linux box.  The attached results are from his
>> machine.  I see similar behavior on a 4-processor Linux box with
>> openJDK7.  We have more detailed machine specs or Java versions if
>> that would help, but I imagine either:
>>
>> (a) We've done something silly with the code that we've both
>> overlooked (but varying the sequential cutoff or the array size
>> doesn't seem to be it)
>>
>> or
>>
>> (b) This is not a "winning" example for some reason and we should
>> tweak the problem.
>>
>> Any thoughts?
>>
>> Much thanks!
>>
>> --Dan
>>
>> =====
>> import java.util.concurrent.*;
>>
>> public class VecAdd extends RecursiveAction {
>>        private static final long NPS = (1000L * 1000 * 1000);
>>        private static final int VECTOR_SIZE = 200000;
>>        private static final ForkJoinPool fjPool = new ForkJoinPool();
>>        private static int SEQUENTIAL_CUTOFF = 100;
>>        int lo;
>>        int hi;
>>        int[] res;
>>        int[] arr1;
>>        int[] arr2;
>>
>>        public VecAdd(int l, int h, int[] r, int[] a1, int[] a2) {
>>                lo = l;
>>                hi = h;
>>                res = r;
>>                arr1 = a1;
>>                arr2 = a2;
>>        }
>>
>>        protected void compute() {
>>                if ((hi - lo) < SEQUENTIAL_CUTOFF) {
>>                        for (int i = lo; i < hi; i++) {
>>                                res[i] = arr1[i] + arr2[i];
>>                        }
>>                } else {
>>                        int mid = (hi + lo) / 2;
>>                        VecAdd left = new VecAdd(lo, mid, res, arr1, arr2);
>>                        VecAdd right = new VecAdd(mid, hi, res,
>> arr1, arr2);
>>                        left.fork();
>>                        right.compute();
>>                        left.join();
>>                }
>>        }
>>
>>        public static int[] add(int[] arr1, int[] arr2) {
>>                assert (arr1.length == arr2.length);
>>                int[] ans = new int[arr1.length];
>>                fjPool.invoke(new VecAdd(0, arr1.length, ans, arr1, arr2));
>>                return ans;
>>        }
>>
>>        public static int[] staticadd(int[] arr1, int[] arr2) {
>>                assert (arr1.length == arr2.length);
>>                int[] ans = new int[arr1.length];
>>                for (int i = 0; i < arr1.length; i++) {
>>                        ans[i] = arr1[i] + arr2[i];
>>                }
>>                return ans;
>>        }
>>
>>        public static void main(String[] args) {
>>                int[] first = new int[VECTOR_SIZE];
>>                int[] second = new int[VECTOR_SIZE];
>>                for (int count = 0; count < VECTOR_SIZE; count++) {
>>                        first[count] = count;
>>                        second[count] = 2 * count;
>>                }
>>                for (int reps = 0; reps < 20; reps++) {
>>                        long last = System.nanoTime();
>>                        int[] answer = add(first, second);
>>                        double elapsed = (double) (System.nanoTime() -
>> last) / NPS;
>>                        System.out.println("Parallel add in time "
>> + elapsed);
>>                }
>>                for (int reps = 0; reps < 20; reps++){
>>                        SEQUENTIAL_CUTOFF = 2 * SEQUENTIAL_CUTOFF;
>>                        long last = System.nanoTime();
>>                        int[] answer = add(first,second);
>>                        double elapsed =
>> (double)(System.nanoTime() - last)/NPS;
>>                        System.out.println("Parallel add in time "+elapsed+
>>                                        " with cutoff "+SEQUENTIAL_CUTOFF);
>>                }
>>                System.out.println(fjPool);
>>                /*
>>                 * for(int val:answer) { System.out.print(val+", "); }
>>                 */
>>                for (int reps = 0; reps < 20; reps++) {
>>                        long last = System.nanoTime();
>>                        int[] answer = staticadd(first, second);
>>                        double elapsed = (double) (System.nanoTime() -
>> last) / NPS;
>>                        System.out.println("Sequential add in time
>> " + elapsed);
>>                }
>>        }
>> }
>>
>



More information about the Concurrency-interest mailing list