<p dir="ltr">Interesting, thanks.  I would've expected it to be treated like the return value from System.currentTimeMillis, where it splits it between eax:edx registers.  Moving into xmm register before storing the value seems wasteful, but I guess it really is avoiding "short" writes.</p>

<p dir="ltr">Sent from my phone</p>
<div class="gmail_quote">On Apr 30, 2013 1:31 PM, "Stanimir Simeonoff" &lt;<a href="mailto:stanimir@riflexo.com">stanimir@riflexo.com</a>&gt; wrote:<br type="attribution"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<span style="font-family:courier new,monospace">Here is some proof.<br><br>Stanimir<br>-----------<br>package t1;<br><br>public class TearLong {<br>    private volatile long x;<br>    long test(){<br>        for (int i=0;i<20000;i++){<br>

            long n=x;<br>            n+=System.currentTimeMillis()&0xff;<br>            x=n;<br>        }<br>        return x;<br>    }<br>    public static void main(String[] args) {<br>        System.out.println(new TearLong().test());<br>

        System.out.println(new TearLong().test());<br>    }<br>}<br><br>Decoding compiled method 0x00938a08:<br>Code:<br>[Disassembling for mach='i386']<br>[Entry Point]<br>[Verified Entry Point]<br>[Constants]<br>

  # {method} 'test' '()J' in 't1/TearLong'<br>  0x00938b00: int3   <br>  0x00938b01: xchg   %ax,%ax<br>  0x00938b04: mov    %eax,0xffffd000(%esp)<br>  0x00938b0b: push   %ebp<br>  0x00938b0c: sub    $0x18,%esp<br>

  0x00938b12: mov    0x8(%ecx),%ebx<br>  0x00938b15: mov    0xc(%ecx),%esi<br>  0x00938b18: mov    %ecx,(%esp)<br>  0x00938b1b: call   0x6dbeed90         ;   {runtime_call}<br>  0x00938b20: mov    0x4(%esi),%ebp     ; implicit exception: dispatches to 0x00938bdd<br>

  0x00938b23: cmp    $0x3b6bd38,%ebp    ;   {oop('t1/TearLong')}<br>  0x00938b29: jne    0x00938bcb         ;*aload_0<br>                                        ; - t1.TearLong::test@5 (line 7)<br>  0x00938b2f: inc    %ebx               ;*iinc<br>

                                        ; - t1.TearLong::test@25 (line 6)<br>  0x00938b30: movsd  0x8(%esi),%xmm0<br>  0x00938b35: movd   %xmm0,%ebp<br>  0x00938b39: psrlq  $0x20,%xmm0<br>  0x00938b3e: movd   %xmm0,%edi         ;*getfield x<br>

                                        ; - t1.TearLong::test@6 (line 7)<br>  0x00938b42: call   0x6dce22f0         ;   {runtime_call}<br>  0x00938b47: and    $0xff,%eax<br>  0x00938b4d: and    $0x0,%edx<br>  0x00938b50: add    %ebp,%eax<br>

  0x00938b52: adc    %edi,%edx<br>  0x00938b54: cmp    0x8(%esi),%eax<br>  0x00938b57: movd   %eax,%xmm1<br>  0x00938b5b: movd   %edx,%xmm0<br>  0x00938b5f: punpckldq %xmm0,%xmm1<br>  0x00938b63: movsd  %xmm1,0x8(%esi)<br>

  0x00938b68: lock addl $0x0,(%esp)     ;*putfield x<br>                                        ; - t1.TearLong::test@22 (line 9)<br>  0x00938b6d: jmp    0x00938b9c<br>  0x00938b6f: nop                       ;*getfield x<br>

                                        ; - t1.TearLong::test@6 (line 7)<br>  0x00938b70: call   0x6dce22f0         ;*putfield x<br>                                        ; - t1.TearLong::test@22 (line 9)<br>                                        ;   {runtime_call}<br>

  0x00938b75: inc    %ebx               ;*iinc<br>                                        ; - t1.TearLong::test@25 (line 6)<br>  0x00938b76: and    $0xff,%eax<br>  0x00938b7c: and    $0x0,%edx<br>  0x00938b7f: add    %ebp,%eax<br>

  0x00938b81: adc    %edi,%edx<br>  0x00938b83: cmp    0x8(%esi),%eax<br>  0x00938b86: movd   %eax,%xmm1<br>  0x00938b8a: movd   %edx,%xmm0<br>  0x00938b8e: punpckldq %xmm0,%xmm1<br>  0x00938b92: movsd  %xmm1,0x8(%esi)<br>

  0x00938b97: lock addl $0x0,(%esp)     ; OopMap{esi=Oop off=156}<br>                                        ;*if_icmplt<br>                                        ; - t1.TearLong::test@32 (line 6)<br>  0x00938b9c: test   %edi,0x8c0000      ;*if_icmplt<br>

                                        ; - t1.TearLong::test@32 (line 6)<br>                                        ;   {poll}<br>  0x00938ba2: movsd  0x8(%esi),%xmm0<br>  0x00938ba7: movd   %xmm0,%ebp<br>  0x00938bab: psrlq  $0x20,%xmm0<br>

  0x00938bb0: movd   %xmm0,%edi<br>  0x00938bb4: cmp    $0x4e20,%ebx<br>  0x00938bba: jl     0x00938b70         ;*getfield x<br>                                        ; - t1.TearLong::test@36 (line 11)<br>  0x00938bbc: mov    %ebp,%eax<br>

  0x00938bbe: mov    %edi,%edx<br>  0x00938bc0: add    $0x18,%esp<br>  0x00938bc3: pop    %ebp<br>  0x00938bc4: test   %eax,0x8c0000      ;   {poll_return}<br>  0x00938bca: ret    <br>  0x00938bcb: mov    $0xffffffad,%ecx<br>

  0x00938bd0: mov    %esi,%ebp<br>  0x00938bd2: mov    %ebx,0x4(%esp)<br>  0x00938bd6: nop    <br>  0x00938bd7: call   0x0091c700         ; OopMap{ebp=Oop off=220}<br>                                        ;*aload_0<br>
                                        ; - t1.TearLong::test@5 (line 7)<br>
                                        ;   {runtime_call}<br>  0x00938bdc: int3                      ;*getfield x<br>                                        ; - t1.TearLong::test@6 (line 7)<br>  0x00938bdd: mov    $0xfffffff6,%ecx<br>

  0x00938be2: nop    <br>  0x00938be3: call   0x0091c700         ; OopMap{off=232}<br>                                        ;*getfield x<br>                                        ; - t1.TearLong::test@6 (line 7)<br>                                        ;   {runtime_call}<br>

  0x00938be8: int3                      ;*getfield x<br>                                        ; - t1.TearLong::test@6 (line 7)<br></span>....<br><br><br><br><div class="gmail_quote">On Tue, Apr 30, 2013 at 8:15 PM, Stanimir Simeonoff <span dir="ltr"><<a href="mailto:stanimir@riflexo.com" target="_blank">stanimir@riflexo.com</a>></span> wrote:<br>

<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">.<div class="gmail_quote"><div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">


<p dir="ltr">As for SSE, yeah it's possible, but is that true? JIT skips integer registers for scalar long operations? I find that hard to believe as it would miss out on large register file/renaming opportunities.</p>




<p dir="ltr"></p></blockquote></div><div>I know that by looking at the assembly. I can still check w/ the current version.<span><font color="#888888"><br><br>Stanimir<br><br> <br></font></span></div><div><div>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<p dir="ltr">Sent from my phone</p><div><div>
<div class="gmail_quote">On Apr 30, 2013 12:58 PM, "Nathan Reynolds" &lt;<a href="mailto:nathan.reynolds@oracle.com" target="_blank">nathan.reynolds@oracle.com</a>&gt; wrote:<br type="attribution"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">




  

    
  
  <div text="#000000" bgcolor="#FFFFFF">
    The processor can do whatever it wants in registers without other
    threads being able to see intermediate values.  Registers are
    private to the hardware thread.  So, we can use multiple
    instructions to load the ecx:ebx registers and then execute the
    cmpxchg8b to do a single write to globally visible cache.<br>
    <div><br>
      <div><a href="http://psr.us.oracle.com/wiki/index.php/User:Nathan_Reynolds" target="_blank">Nathan
          Reynolds</a> | Architect | <a href="tel:602.333.9091" value="+16023339091" target="_blank">602.333.9091</a><br>
        <font color="red">Oracle</font> <a href="http://psr.us.oracle.com/" target="_blank">PSR Engineering</a> | Server
        Technology<br>
      </div>
      On 4/30/2013 9:53 AM, Vitaly Davidovich wrote:<br>
    </div>
    <blockquote type="cite">
      <p dir="ltr">But this requires the src value to be in ecx:ebx so
        how would you load it there without two loads (and possibly
        observe tearing) in the first place?</p>
      <p dir="ltr">Sent from my phone</p>
      <div class="gmail_quote">On Apr 30, 2013 12:45 PM, "Nathan
        Reynolds" &lt;<a href="mailto:nathan.reynolds@oracle.com" target="_blank">nathan.reynolds@oracle.com</a>&gt;
        wrote:<br type="attribution">
        <blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
          <div text="#000000" bgcolor="#FFFFFF"> On 32-bit x86, the
            cmpxchg8b can be used to write a long in 1 instruction. 
            This instruction has been "present on most post-80486
            processors" (Wikipedia).  There might be cheaper ways to
            write a long but there is at least 1 way.<br>
            <div><br>
              <div><a href="http://psr.us.oracle.com/wiki/index.php/User:Nathan_Reynolds" target="_blank">Nathan Reynolds</a> | Architect | <a href="tel:602.333.9091" value="+16023339091" target="_blank">602.333.9091</a><br>




                <font color="red">Oracle</font> <a href="http://psr.us.oracle.com/" target="_blank">PSR
                  Engineering</a> | Server Technology<br>
              </div>
              On 4/30/2013 9:37 AM, Vitaly Davidovich wrote:<br>
            </div>
            <blockquote type="cite">
              <p dir="ltr">Curious how x86 would move a long in 1
                instruction? There's no memory to memory mov so has to
                go through register, and thus needs 2 registers (and
                hence split).  Am I missing something?</p>
              <p dir="ltr">Sent from my phone</p>
              <div class="gmail_quote">On Apr 30, 2013 12:23 PM, "Nathan
                Reynolds" &lt;<a href="mailto:nathan.reynolds@oracle.com" target="_blank">nathan.reynolds@oracle.com</a>&gt;
                wrote:<br type="attribution">
                <blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
                  <div text="#000000" bgcolor="#FFFFFF">
                    <div>You might want to print the assembly using
                      HotSpot (and OpenJDK?).  If the assembly uses 1
                      instruction to do the write, then no splitting can
                      ever happen (because alignment takes care of cache
                      line splits).  If the assembly uses 2
                      instructions to do the write, then it is only a
                      matter of timing.<br>
                      <br>
                      With a single processor system, you are waiting
                      for the thread's quantum to end right after the
                      first instruction but before the second
                      instruction.  This will allow the other thread to
                      see the split write.<br>
                      <br>
                      With a dual processor system, the reader thread
                      simply has to get a copy of the cache line after
                      the first write and before the second write.  This
                      is much easier to do.<br>
                      <br>
                      HotSpot will do a lot of optimizations on single
                      processor systems.  For example, it gets rid of
                      the "lock" prefix in front of atomic instructions
                      since the instruction's execution can't be split. 
                      It also doesn't output memory fences.  Both of
                      these give good performance boosts.  I wonder if
                      with one processor, OpenJDK is using 2
                      instructions to do the write whereas with multiple
                      processors it plays it safe and uses 1
                      instruction.<br>
                      <br>
                      Note: If you disable all of the processors but 1
                      and then start HotSpot, HotSpot will start in
                      single processor mode.  If you then enable those
                      processors while HotSpot is running, a lot of
                      things break and the JVM will crash.  Because
                      single processor systems are rare, the default
                      might be changed to assume multiple processors
                      unless the command line specifies 1 processor.<br>
                      <br>
                      <div><a href="http://psr.us.oracle.com/wiki/index.php/User:Nathan_Reynolds" target="_blank">Nathan Reynolds</a> |
                        Architect | <a href="tel:602.333.9091" value="+16023339091" target="_blank">602.333.9091</a><br>
                        <font color="red">Oracle</font> <a href="http://psr.us.oracle.com/" target="_blank">PSR Engineering</a> | Server
                        Technology<br>
                      </div>
                      On 4/30/2013 8:48 AM, Tim Halloran wrote:<br>
                    </div>
                    <blockquote type="cite">
                      <div dir="ltr">
                        <div>Aleksey, correct -- more trials show what
                          you predicted. Thanks for the nudge.</div>
                        <div><br>
                        </div>
                        Mark,
                        <div><br>
                        </div>
                        <div>Very helpful, in fact, we are seeing quick
                          failures except for the dual-processor case --
                          on a dual processor hardware or VM (Virtual
                          Box) we have yet to get a failure.  The two
                          programs attached are what I'm running.  I
                          stripped out my benchmark framework (so they
                          are easy to run on OpenJDK but not on
                          Android).  The difference is that one uses two
                          threads (one writer one reader) the other
                          three (two writers one reader) -- both seem to
                          produce similar results.</div>
                        <div><br>
                        </div>
                        <div>With one processor, OpenJDK 1.6.0_27 I see
                          the split write almost immediately. Dual we
                          can't get a failure, yet, we get more failures
                          as the processor count goes up -- but after a
                          few failures, we don't get any more (the
                          program tries to get 10 to happen)...we can't
                          get to 10.</div>
                        <div><br>
                        </div>
                        <div>It seems that while this can happen on
                          OpenJDK it is rarer than on Android where ten
                          failures takes less than a second to happen.</div>
                        <div><br>
                        </div>
                        <div>Best, Tim</div>
                        <div> <br>
                        </div>
                      </div>
                      <div class="gmail_extra"><br>
                        <br>
                        <div class="gmail_quote">On Tue, Apr 30, 2013 at
                          11:26 AM, Mark Thornton <span dir="ltr">&lt;<a href="mailto:mthornton@optrak.com" target="_blank">mthornton@optrak.com</a>&gt;</span>
                          wrote:<br>
                          <blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
                            <div bgcolor="#FFFFFF" text="#000000">
                              <div>
                                <div>
                                  <div>On 30/04/13 15:36, Tim Halloran
                                    wrote:<br>
                                  </div>
                                  <blockquote type="cite">
                                    <div dir="ltr">On Mon, Apr 29, 2013
                                      at 4:59 PM, Aleksey Shipilev <span dir="ltr">&lt;<a href="mailto:aleksey.shipilev@oracle.com" target="_blank">aleksey.shipilev@oracle.com</a>&gt;</span>
                                      wrote:<br>
                                      <div class="gmail_extra">
                                        <div class="gmail_quote">
                                          <blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">Yes,



                                            that's exactly what I had in
                                            mind:<br>
                                             a. Declare "long a"<br>
                                             b. Ramp up two threads.<br>
                                             c. Make thread 1 write 0L
                                            and -1L over and over to
                                            field $a<br>
                                             d. Make thread 2 observe
                                            the field a, and count the
                                            observed values<br>
                                             e. ...<br>
                                             f. PROFIT!<br>
                                            <br>
                                            P.S. It is important to do
                                            some action on value read in
                                            thread 2, so<br>
                                            that it does not get hoisted
                                            from the loop, since $a is
                                            not supposed to be<br>
                                            volatile.<br>
                                            <span><font color="#888888"><br>
                                                -Aleksey.<br>
                                              </font></span>
                                            <div><br>
                                            </div>
                                          </blockquote>
                                          <div><br>
                                          </div>
                                          <div>This discussion is
                                            getting a bit far afield, I
                                            guess, but to get back onto
                                            the topic. I followed
                                            Aleksey's advice. And wrote
                                            an implementation that tests
                                            this.  I used two separate
                                            threads to write 0L and -1L
                                            into the long field "a" but
                                            that is the only real change
                                            I made. (I already had some
                                            scaffolding code to run
                                            things on Android or desktop
                                            Java).</div>
                                          <div><br>
                                          </div>
                                          <div><b>Android: splits writes
                                              to longs into two parts.</b></div>
                                          <div><br>
                                          </div>
                                          <div>On a Samsung Galaxy II
                                            with Android 4.0.4 and a Nexus
                                            4 phone with Android 4.2.2 I
                                            saw non-atomic treatment of
                                            long. The value -4294967296
                                            (xFFFFFFFF00000000) showed
                                            up as well as 4294967295
                                            (x00000000FFFFFFFF).</div>
                                          <div><br>
                                          </div>
                                          <div>So looks like Android
                                            does not follow the (albeit
                                            optional) advice in the Java
                                            language specification about
                                            this.</div>
                                          <div><br>
                                          </div>
                                          <div><b>JDK: DOES NOT split
                                              writes to longs into two
                                              parts (even 32-bit
                                              implementations)</b></div>
                                          <div><br>
                                          </div>
                                          <div>Of course we couldn't get
                                            this to happen on any 64-bit
                                            JVM, but we tried it out
                                            under Linux on 32-bit
                                            OpenJDK 1.7.0_21 it does NOT
                                            happen. The 32-bit JVM
                                            implementations follow the
                                            recommendation of the Java
                                            language specification.</div>
                                          <div><br>
                                          </div>
                                          <div>An interesting curio. I
                                            wonder how many crashes in
                                            "working" Java code moved
                                            from desktop Java onto
                                            Android programmers are
                                            going to lose sleep tracking
                                            down this one.</div>
                                          <div> <br>
                                          </div>
                                          <br>
                                        </div>
                                      </div>
                                    </div>
                                  </blockquote>
                                  <br>
                                </div>
                              </div>
                              Last time I tried this sort of test, a
                              split write would be observed in under a
                              second on a true dual processor. However,
                              with only one processor available, it
                              would typically take around 20 minutes. So
                              you might have to run a very long test to
                              have any real confidence in the lack of
                              splitting.<span><font color="#888888"><br>
                                  <br>
                                  Mark Thornton<br>
                                  <br>
                                </font></span></div>
                            <br>
_______________________________________________<br>
                            Concurrency-interest mailing list<br>
                            <a href="mailto:Concurrency-interest@cs.oswego.edu" target="_blank">Concurrency-interest@cs.oswego.edu</a><br>
                            <a href="http://cs.oswego.edu/mailman/listinfo/concurrency-interest" target="_blank">http://cs.oswego.edu/mailman/listinfo/concurrency-interest</a><br>
                            <br>
                          </blockquote>
                        </div>
                        <br>
                      </div>
                      <br>
                      <fieldset></fieldset>
                      <br>
                      <pre>_______________________________________________
Concurrency-interest mailing list
<a href="mailto:Concurrency-interest@cs.oswego.edu" target="_blank">Concurrency-interest@cs.oswego.edu</a>
<a href="http://cs.oswego.edu/mailman/listinfo/concurrency-interest" target="_blank">http://cs.oswego.edu/mailman/listinfo/concurrency-interest</a>
</pre>
                    </blockquote>
                    <br>
                  </div>
                  <br>
                  _______________________________________________<br>
                  Concurrency-interest mailing list<br>
                  <a href="mailto:Concurrency-interest@cs.oswego.edu" target="_blank">Concurrency-interest@cs.oswego.edu</a><br>
                  <a href="http://cs.oswego.edu/mailman/listinfo/concurrency-interest" target="_blank">http://cs.oswego.edu/mailman/listinfo/concurrency-interest</a><br>
                  <br>
                </blockquote>
              </div>
            </blockquote>
            <br>
          </div>
        </blockquote>
      </div>
    </blockquote>
    <br>
  </div>

</blockquote></div>
</div></div><br>_______________________________________________<br>
Concurrency-interest mailing list<br>
<a href="mailto:Concurrency-interest@cs.oswego.edu" target="_blank">Concurrency-interest@cs.oswego.edu</a><br>
<a href="http://cs.oswego.edu/mailman/listinfo/concurrency-interest" target="_blank">http://cs.oswego.edu/mailman/listinfo/concurrency-interest</a><br>
<br></blockquote></div></div></div><br>
</blockquote></div><br>
</blockquote></div>