1 /*
    2  * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
    3  * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
    4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    5  *
    6  * This code is free software; you can redistribute it and/or modify it
    7  * under the terms of the GNU General Public License version 2 only, as
    8  * published by the Free Software Foundation.
    9  *
   10  * This code is distributed in the hope that it will be useful, but WITHOUT
   11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   13  * version 2 for more details (a copy is included in the LICENSE file that
   14  * accompanied this code).
   15  *
   16  * You should have received a copy of the GNU General Public License version
   17  * 2 along with this work; if not, write to the Free Software Foundation,
   18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   19  *
   20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
   21  * or visit www.oracle.com if you need additional information or have any
   22  * questions.
   23  *
   24  */
   25 
   26 #include "asm/macroAssembler.hpp"
   27 #include "asm/macroAssembler.inline.hpp"
   28 #include "asm/register.hpp"
   29 #include "atomic_aarch64.hpp"
   30 #include "compiler/oopMap.hpp"
   31 #include "gc/shared/barrierSet.hpp"
   32 #include "gc/shared/barrierSetAssembler.hpp"
   33 #include "gc/shared/gc_globals.hpp"
   34 #include "gc/shared/tlab_globals.hpp"
   35 #include "interpreter/interpreter.hpp"
   36 #include "memory/universe.hpp"
   37 #include "nativeInst_aarch64.hpp"
   38 #include "oops/instanceOop.hpp"
   39 #include "oops/method.hpp"
   40 #include "oops/objArrayKlass.hpp"
   41 #include "oops/oop.inline.hpp"
   42 #include "prims/methodHandles.hpp"
   43 #include "prims/upcallLinker.hpp"
   44 #include "runtime/arguments.hpp"
   45 #include "runtime/atomic.hpp"
   46 #include "runtime/continuation.hpp"
   47 #include "runtime/continuationEntry.inline.hpp"
   48 #include "runtime/frame.inline.hpp"
   49 #include "runtime/handles.inline.hpp"
   50 #include "runtime/javaThread.hpp"
   51 #include "runtime/sharedRuntime.hpp"
   52 #include "runtime/stubCodeGenerator.hpp"
   53 #include "runtime/stubRoutines.hpp"
   54 #include "utilities/align.hpp"
   55 #include "utilities/checkedCast.hpp"
   56 #include "utilities/debug.hpp"
   57 #include "utilities/globalDefinitions.hpp"
   58 #include "utilities/intpow.hpp"
   59 #include "utilities/powerOfTwo.hpp"
   60 #ifdef COMPILER2
   61 #include "opto/runtime.hpp"
   62 #endif
   63 #if INCLUDE_ZGC
   64 #include "gc/z/zThreadLocalData.hpp"
   65 #endif
   66 
   67 // Declaration and definition of StubGenerator (no .hpp file).
   68 // For a more detailed description of the stub routine structure
   69 // see the comment in stubRoutines.hpp
   70 
   71 #undef __
   72 #define __ _masm->
   73 
   74 #ifdef PRODUCT
   75 #define BLOCK_COMMENT(str) /* nothing */
   76 #else
   77 #define BLOCK_COMMENT(str) __ block_comment(str)
   78 #endif
   79 
   80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
   81 
   82 // Stub Code definitions
   83 
   84 class StubGenerator: public StubCodeGenerator {
   85  private:
   86 
   87 #ifdef PRODUCT
   88 #define inc_counter_np(counter) ((void)0)
   89 #else
   90   void inc_counter_np_(uint& counter) {
   91     __ incrementw(ExternalAddress((address)&counter));
   92   }
   93 #define inc_counter_np(counter) \
   94   BLOCK_COMMENT("inc_counter " #counter); \
   95   inc_counter_np_(counter);
   96 #endif
   97 
   98   // Call stubs are used to call Java from C
   99   //
  100   // Arguments:
  101   //    c_rarg0:   call wrapper address                   address
  102   //    c_rarg1:   result                                 address
  103   //    c_rarg2:   result type                            BasicType
  104   //    c_rarg3:   method                                 Method*
  105   //    c_rarg4:   (interpreter) entry point              address
  106   //    c_rarg5:   parameters                             intptr_t*
  107   //    c_rarg6:   parameter size (in words)              int
  108   //    c_rarg7:   thread                                 Thread*
  109   //
  110   // There is no return from the stub itself as any Java result
  111   // is written to result
  112   //
  113   // we save r30 (lr) as the return PC at the base of the frame and
  114   // link r29 (fp) below it as the frame pointer installing sp (r31)
  115   // into fp.
  116   //
  117   // we save r0-r7, which accounts for all the c arguments.
  118   //
  119   // TODO: strictly do we need to save them all? they are treated as
  120   // volatile by C so could we omit saving the ones we are going to
  121   // place in global registers (thread? method?) or those we only use
  122   // during setup of the Java call?
  123   //
  124   // we don't need to save r8 which C uses as an indirect result location
  125   // return register.
  126   //
  127   // we don't need to save r9-r15 which both C and Java treat as
  128   // volatile
  129   //
  130   // we don't need to save r16-18 because Java does not use them
  131   //
  132   // we save r19-r28 which Java uses as scratch registers and C
  133   // expects to be callee-save
  134   //
  135   // we save the bottom 64 bits of each value stored in v8-v15; it is
  136   // the responsibility of the caller to preserve larger values.
  137   //
  138   // so the stub frame looks like this when we enter Java code
  139   //
  140   //     [ return_from_Java     ] <--- sp
  141   //     [ argument word n      ]
  142   //      ...
  143   // -29 [ argument word 1      ]
   144   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
   145   // -26 [ saved v15            ]
  146   // -25 [ saved v14            ]
  147   // -24 [ saved v13            ]
  148   // -23 [ saved v12            ]
  149   // -22 [ saved v11            ]
  150   // -21 [ saved v10            ]
  151   // -20 [ saved v9             ]
  152   // -19 [ saved v8             ]
  153   // -18 [ saved r28            ]
  154   // -17 [ saved r27            ]
  155   // -16 [ saved r26            ]
  156   // -15 [ saved r25            ]
  157   // -14 [ saved r24            ]
  158   // -13 [ saved r23            ]
  159   // -12 [ saved r22            ]
  160   // -11 [ saved r21            ]
  161   // -10 [ saved r20            ]
  162   //  -9 [ saved r19            ]
  163   //  -8 [ call wrapper    (r0) ]
  164   //  -7 [ result          (r1) ]
  165   //  -6 [ result type     (r2) ]
  166   //  -5 [ method          (r3) ]
  167   //  -4 [ entry point     (r4) ]
  168   //  -3 [ parameters      (r5) ]
  169   //  -2 [ parameter size  (r6) ]
  170   //  -1 [ thread (r7)          ]
  171   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  172   //   1 [ saved lr       (r30) ]
  173 
  174   // Call stub stack layout word offsets from fp
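         // n.b. the saved GP and FP registers are stored in pairs (stp/stpd),
         // so only one offset per pair appears below.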
  175   enum call_stub_layout {
  176     sp_after_call_off  = -28,
  177 
  178     fpcr_off           = sp_after_call_off,
  179     d15_off            = -26,
  180     d13_off            = -24,
  181     d11_off            = -22,
  182     d9_off             = -20,
  183 
  184     r28_off            = -18,
  185     r26_off            = -16,
  186     r24_off            = -14,
  187     r22_off            = -12,
  188     r20_off            = -10,
  189     call_wrapper_off   =  -8,
  190     result_off         =  -7,
  191     result_type_off    =  -6,
  192     method_off         =  -5,
  193     entry_point_off    =  -4,
  194     parameter_size_off =  -2,
  195     thread_off         =  -1,
  196     fp_f               =   0,
  197     retaddr_off        =   1,
  198   };
  199 
  200   address generate_call_stub(address& return_address) {
  201     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
  202            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
  203            "adjust this code");
  204 
  205     StubGenStubId stub_id = StubGenStubId::call_stub_id;
  206     StubCodeMark mark(this, stub_id);
  207     address start = __ pc();
  208 
  209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
  210 
  211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  213     const Address result        (rfp, result_off         * wordSize);
  214     const Address result_type   (rfp, result_type_off    * wordSize);
  215     const Address method        (rfp, method_off         * wordSize);
  216     const Address entry_point   (rfp, entry_point_off    * wordSize);
  217     const Address parameter_size(rfp, parameter_size_off * wordSize);
  218 
  219     const Address thread        (rfp, thread_off         * wordSize);
  220 
  221     const Address d15_save      (rfp, d15_off * wordSize);
  222     const Address d13_save      (rfp, d13_off * wordSize);
  223     const Address d11_save      (rfp, d11_off * wordSize);
  224     const Address d9_save       (rfp, d9_off * wordSize);
  225 
  226     const Address r28_save      (rfp, r28_off * wordSize);
  227     const Address r26_save      (rfp, r26_off * wordSize);
  228     const Address r24_save      (rfp, r24_off * wordSize);
  229     const Address r22_save      (rfp, r22_off * wordSize);
  230     const Address r20_save      (rfp, r20_off * wordSize);
  231 
  232     // stub code
  233 
  234     address aarch64_entry = __ pc();
  235 
  236     // set up frame and move sp to end of save area
  237     __ enter();
  238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
  239 
  240     // save register parameters and Java scratch/global registers
  241     // n.b. we save thread even though it gets installed in
  242     // rthread because we want to sanity check rthread later
  243     __ str(c_rarg7,  thread);
  244     __ strw(c_rarg6, parameter_size);
  245     __ stp(c_rarg4, c_rarg5,  entry_point);
  246     __ stp(c_rarg2, c_rarg3,  result_type);
  247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
  248 
  249     __ stp(r20, r19,   r20_save);
  250     __ stp(r22, r21,   r22_save);
  251     __ stp(r24, r23,   r24_save);
  252     __ stp(r26, r25,   r26_save);
  253     __ stp(r28, r27,   r28_save);
  254 
  255     __ stpd(v9,  v8,   d9_save);
  256     __ stpd(v11, v10,  d11_save);
  257     __ stpd(v13, v12,  d13_save);
  258     __ stpd(v15, v14,  d15_save);
  259 
  260     __ get_fpcr(rscratch1);
  261     __ str(rscratch1, fpcr_save);
  262     // Set FPCR to the state we need. We do want Round to Nearest. We
  263     // don't want non-IEEE rounding modes or floating-point traps.
  264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  266     __ set_fpcr(rscratch1);
  267 
  268     // install Java thread in global register now we have saved
  269     // whatever value it held
  270     __ mov(rthread, c_rarg7);
  271     // And method
  272     __ mov(rmethod, c_rarg3);
  273 
  274     // set up the heapbase register
  275     __ reinit_heapbase();
  276 
  277 #ifdef ASSERT
  278     // make sure we have no pending exceptions
  279     {
  280       Label L;
  281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  282       __ cmp(rscratch1, (u1)NULL_WORD);
  283       __ br(Assembler::EQ, L);
  284       __ stop("StubRoutines::call_stub: entered with pending exception");
  285       __ BIND(L);
  286     }
  287 #endif
  288     // pass parameters if any
  289     __ mov(esp, sp);
  290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
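           // keep sp 16-byte aligned after making room for the parameters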
  291     __ andr(sp, rscratch1, -2 * wordSize);
  292 
  293     BLOCK_COMMENT("pass parameters if any");
  294     Label parameters_done;
  295     // parameter count is still in c_rarg6
  296     // and parameter pointer identifying param 1 is in c_rarg5
  297     __ cbzw(c_rarg6, parameters_done);
  298 
  299     address loop = __ pc();
  300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  301     __ subsw(c_rarg6, c_rarg6, 1);
  302     __ push(rscratch1);
  303     __ br(Assembler::GT, loop);
  304 
  305     __ BIND(parameters_done);
  306 
   307     // call Java entry -- passing Method* and current sp
  308     //      rmethod: Method*
  309     //      r19_sender_sp: sender sp
  310     BLOCK_COMMENT("call Java function");
  311     __ mov(r19_sender_sp, sp);
  312     __ blr(c_rarg4);
  313 
  314     // we do this here because the notify will already have been done
  315     // if we get to the next instruction via an exception
  316     //
  317     // n.b. adding this instruction here affects the calculation of
  318     // whether or not a routine returns to the call stub (used when
  319     // doing stack walks) since the normal test is to check the return
  320     // pc against the address saved below. so we may need to allow for
  321     // this extra instruction in the check.
  322 
  323     // save current address for use by exception handling code
  324 
  325     return_address = __ pc();
  326 
  327     // store result depending on type (everything that is not
  328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  329     // n.b. this assumes Java returns an integral result in r0
  330     // and a floating result in j_farg0
  331     // All of j_rargN may be used to return inline type fields so be careful
  332     // not to clobber those.
  333     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
  334     // assignment of Rresult below.
  335     Register Rresult = r14, Rresult_type = r15;
  336     __ ldr(Rresult, result);
  337     Label is_long, is_float, is_double, check_prim, exit;
  338     __ ldr(Rresult_type, result_type);
  339     __ cmp(Rresult_type, (u1)T_OBJECT);
  340     __ br(Assembler::EQ, check_prim);
  341     __ cmp(Rresult_type, (u1)T_LONG);
  342     __ br(Assembler::EQ, is_long);
  343     __ cmp(Rresult_type, (u1)T_FLOAT);
  344     __ br(Assembler::EQ, is_float);
  345     __ cmp(Rresult_type, (u1)T_DOUBLE);
  346     __ br(Assembler::EQ, is_double);
  347 
  348     // handle T_INT case
  349     __ strw(r0, Address(Rresult));
  350 
  351     __ BIND(exit);
  352 
  353     // pop parameters
  354     __ sub(esp, rfp, -sp_after_call_off * wordSize);
  355 
  356 #ifdef ASSERT
  357     // verify that threads correspond
  358     {
  359       Label L, S;
  360       __ ldr(rscratch1, thread);
  361       __ cmp(rthread, rscratch1);
  362       __ br(Assembler::NE, S);
  363       __ get_thread(rscratch1);
  364       __ cmp(rthread, rscratch1);
  365       __ br(Assembler::EQ, L);
  366       __ BIND(S);
  367       __ stop("StubRoutines::call_stub: threads must correspond");
  368       __ BIND(L);
  369     }
  370 #endif
  371 
  372     __ pop_cont_fastpath(rthread);
  373 
  374     // restore callee-save registers
  375     __ ldpd(v15, v14,  d15_save);
  376     __ ldpd(v13, v12,  d13_save);
  377     __ ldpd(v11, v10,  d11_save);
  378     __ ldpd(v9,  v8,   d9_save);
  379 
  380     __ ldp(r28, r27,   r28_save);
  381     __ ldp(r26, r25,   r26_save);
  382     __ ldp(r24, r23,   r24_save);
  383     __ ldp(r22, r21,   r22_save);
  384     __ ldp(r20, r19,   r20_save);
  385 
  386     // restore fpcr
  387     __ ldr(rscratch1,  fpcr_save);
  388     __ set_fpcr(rscratch1);
  389 
  390     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  391     __ ldrw(c_rarg2, result_type);
  392     __ ldr(c_rarg3,  method);
  393     __ ldp(c_rarg4, c_rarg5,  entry_point);
  394     __ ldp(c_rarg6, c_rarg7,  parameter_size);
  395 
  396     // leave frame and return to caller
  397     __ leave();
  398     __ ret(lr);
  399 
  400     // handle return types different from T_INT
  401     __ BIND(check_prim);
  402     if (InlineTypeReturnedAsFields) {
  403       // Check for scalarized return value
  404       __ tbz(r0, 0, is_long);
  405       // Load pack handler address
  406       __ andr(rscratch1, r0, -2);
  407       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
  408       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
  409       __ blr(rscratch1);
  410       __ b(exit);
  411     }
  412 
  413     __ BIND(is_long);
  414     __ str(r0, Address(Rresult, 0));
  415     __ br(Assembler::AL, exit);
  416 
  417     __ BIND(is_float);
  418     __ strs(j_farg0, Address(Rresult, 0));
  419     __ br(Assembler::AL, exit);
  420 
  421     __ BIND(is_double);
  422     __ strd(j_farg0, Address(Rresult, 0));
  423     __ br(Assembler::AL, exit);
  424 
  425     return start;
  426   }
  427 
  428   // Return point for a Java call if there's an exception thrown in
  429   // Java code.  The exception is caught and transformed into a
  430   // pending exception stored in JavaThread that can be tested from
  431   // within the VM.
  432   //
  433   // Note: Usually the parameters are removed by the callee. In case
  434   // of an exception crossing an activation frame boundary, that is
  435   // not the case if the callee is compiled code => need to setup the
  436   // rsp.
  437   //
  438   // r0: exception oop
  439 
  440   address generate_catch_exception() {
  441     StubGenStubId stub_id = StubGenStubId::catch_exception_id;
  442     StubCodeMark mark(this, stub_id);
  443     address start = __ pc();
  444 
  445     // same as in generate_call_stub():
  446     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  447     const Address thread        (rfp, thread_off         * wordSize);
  448 
  449 #ifdef ASSERT
  450     // verify that threads correspond
  451     {
  452       Label L, S;
  453       __ ldr(rscratch1, thread);
  454       __ cmp(rthread, rscratch1);
  455       __ br(Assembler::NE, S);
  456       __ get_thread(rscratch1);
  457       __ cmp(rthread, rscratch1);
  458       __ br(Assembler::EQ, L);
  459       __ bind(S);
  460       __ stop("StubRoutines::catch_exception: threads must correspond");
  461       __ bind(L);
  462     }
  463 #endif
  464 
  465     // set pending exception
  466     __ verify_oop(r0);
  467 
  468     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  469     __ mov(rscratch1, (address)__FILE__);
  470     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  471     __ movw(rscratch1, (int)__LINE__);
  472     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
  473 
  474     // complete return to VM
  475     assert(StubRoutines::_call_stub_return_address != nullptr,
  476            "_call_stub_return_address must have been generated before");
  477     __ b(StubRoutines::_call_stub_return_address);
  478 
  479     return start;
  480   }
  481 
  482   // Continuation point for runtime calls returning with a pending
  483   // exception.  The pending exception check happened in the runtime
  484   // or native call stub.  The pending exception in Thread is
  485   // converted into a Java-level exception.
  486   //
  487   // Contract with Java-level exception handlers:
  488   // r0: exception
  489   // r3: throwing pc
  490   //
  491   // NOTE: At entry of this stub, exception-pc must be in LR !!
  492 
  493   // NOTE: this is always used as a jump target within generated code
   494   // so it just needs to be generated code with no prolog
  495 
  496   address generate_forward_exception() {
  497     StubGenStubId stub_id = StubGenStubId::forward_exception_id;
  498     StubCodeMark mark(this, stub_id);
  499     address start = __ pc();
  500 
  501     // Upon entry, LR points to the return address returning into
  502     // Java (interpreted or compiled) code; i.e., the return address
  503     // becomes the throwing pc.
  504     //
  505     // Arguments pushed before the runtime call are still on the stack
  506     // but the exception handler will reset the stack pointer ->
  507     // ignore them.  A potential result in registers can be ignored as
  508     // well.
  509 
  510 #ifdef ASSERT
  511     // make sure this code is only executed if there is a pending exception
  512     {
  513       Label L;
  514       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  515       __ cbnz(rscratch1, L);
  516       __ stop("StubRoutines::forward exception: no pending exception (1)");
  517       __ bind(L);
  518     }
  519 #endif
  520 
  521     // compute exception handler into r19
  522 
  523     // call the VM to find the handler address associated with the
  524     // caller address. pass thread in r0 and caller pc (ret address)
  525     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  526     // the stack.
  527     __ mov(c_rarg1, lr);
  528     // lr will be trashed by the VM call so we move it to R19
  529     // (callee-saved) because we also need to pass it to the handler
  530     // returned by this call.
  531     __ mov(r19, lr);
  532     BLOCK_COMMENT("call exception_handler_for_return_address");
  533     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
  534                          SharedRuntime::exception_handler_for_return_address),
  535                     rthread, c_rarg1);
  536     // Reinitialize the ptrue predicate register, in case the external runtime
  537     // call clobbers ptrue reg, as we may return to SVE compiled code.
  538     __ reinitialize_ptrue();
  539 
  540     // we should not really care that lr is no longer the callee
  541     // address. we saved the value the handler needs in r19 so we can
  542     // just copy it to r3. however, the C2 handler will push its own
   543     // frame and then call into the VM, and the VM code asserts that
  544     // the PC for the frame above the handler belongs to a compiled
  545     // Java method. So, we restore lr here to satisfy that assert.
  546     __ mov(lr, r19);
  547     // setup r0 & r3 & clear pending exception
  548     __ mov(r3, r19);
  549     __ mov(r19, r0);
  550     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  551     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
  552 
  553 #ifdef ASSERT
  554     // make sure exception is set
  555     {
  556       Label L;
  557       __ cbnz(r0, L);
  558       __ stop("StubRoutines::forward exception: no pending exception (2)");
  559       __ bind(L);
  560     }
  561 #endif
  562 
  563     // continue at exception handler
  564     // r0: exception
  565     // r3: throwing pc
  566     // r19: exception handler
  567     __ verify_oop(r0);
  568     __ br(r19);
  569 
  570     return start;
  571   }
  572 
  573   // Non-destructive plausibility checks for oops
  574   //
  575   // Arguments:
  576   //    r0: oop to verify
  577   //    rscratch1: error message
  578   //
  579   // Stack after saving c_rarg3:
  580   //    [tos + 0]: saved c_rarg3
  581   //    [tos + 1]: saved c_rarg2
  582   //    [tos + 2]: saved lr
  583   //    [tos + 3]: saved rscratch2
  584   //    [tos + 4]: saved r0
  585   //    [tos + 5]: saved rscratch1
  586   address generate_verify_oop() {
  587     StubGenStubId stub_id = StubGenStubId::verify_oop_id;
  588     StubCodeMark mark(this, stub_id);
  589     address start = __ pc();
  590 
  591     Label exit, error;
  592 
  593     // save c_rarg2 and c_rarg3
  594     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
  595 
  596     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  597     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  598     __ ldr(c_rarg3, Address(c_rarg2));
  599     __ add(c_rarg3, c_rarg3, 1);
  600     __ str(c_rarg3, Address(c_rarg2));
  601 
  602     // object is in r0
  603     // make sure object is 'reasonable'
  604     __ cbz(r0, exit); // if obj is null it is OK
  605 
  606     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  607     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
  608 
  609     // return if everything seems ok
  610     __ bind(exit);
  611 
  612     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  613     __ ret(lr);
  614 
  615     // handle errors
  616     __ bind(error);
  617     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  618 
  619     __ push(RegSet::range(r0, r29), sp);
  620     // debug(char* msg, int64_t pc, int64_t regs[])
  621     __ mov(c_rarg0, rscratch1);      // pass address of error message
  622     __ mov(c_rarg1, lr);             // pass return address
  623     __ mov(c_rarg2, sp);             // pass address of regs on stack
  624 #ifndef PRODUCT
  625     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
  626 #endif
  627     BLOCK_COMMENT("call MacroAssembler::debug");
  628     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  629     __ blr(rscratch1);
  630     __ hlt(0);
  631 
  632     return start;
  633   }
  634 
  635   // Generate indices for iota vector.
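         // Each 16-byte row holds the per-lane indices {0, 1, 2, ...} for one
         // element size (B/H/S/D); the final two rows hold the same indices as
         // float and double constants.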
  636   address generate_iota_indices(StubGenStubId stub_id) {
  637     __ align(CodeEntryAlignment);
  638     StubCodeMark mark(this, stub_id);
  639     address start = __ pc();
  640     // B
  641     __ emit_data64(0x0706050403020100, relocInfo::none);
  642     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
  643     // H
  644     __ emit_data64(0x0003000200010000, relocInfo::none);
  645     __ emit_data64(0x0007000600050004, relocInfo::none);
  646     // S
  647     __ emit_data64(0x0000000100000000, relocInfo::none);
  648     __ emit_data64(0x0000000300000002, relocInfo::none);
  649     // D
  650     __ emit_data64(0x0000000000000000, relocInfo::none);
  651     __ emit_data64(0x0000000000000001, relocInfo::none);
  652     // S - FP
  653     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
  654     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
  655     // D - FP
  656     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
  657     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
  658     return start;
  659   }
  660 
  661   // The inner part of zero_words().  This is the bulk operation,
  662   // zeroing words in blocks, possibly using DC ZVA to do it.  The
  663   // caller is responsible for zeroing the last few words.
  664   //
  665   // Inputs:
  666   // r10: the HeapWord-aligned base address of an array to zero.
  667   // r11: the count in HeapWords, r11 > 0.
  668   //
  669   // Returns r10 and r11, adjusted for the caller to clear.
  670   // r10: the base address of the tail of words left to clear.
  671   // r11: the number of words in the tail.
  672   //      r11 < MacroAssembler::zero_words_block_size.
  673 
  674   address generate_zero_blocks() {
  675     Label done;
  676     Label base_aligned;
  677 
  678     Register base = r10, cnt = r11;
  679 
  680     __ align(CodeEntryAlignment);
  681     StubGenStubId stub_id = StubGenStubId::zero_blocks_id;
  682     StubCodeMark mark(this, stub_id);
  683     address start = __ pc();
  684 
  685     if (UseBlockZeroing) {
  686       int zva_length = VM_Version::zva_length();
  687 
  688       // Ensure ZVA length can be divided by 16. This is required by
  689       // the subsequent operations.
  690       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
  691 
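             // If base is only 8-byte aligned, zero one word first so that base
             // is 16-byte aligned for the block zeroing below.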
  692       __ tbz(base, 3, base_aligned);
  693       __ str(zr, Address(__ post(base, 8)));
  694       __ sub(cnt, cnt, 1);
  695       __ bind(base_aligned);
  696 
  697       // Ensure count >= zva_length * 2 so that it still deserves a zva after
  698       // alignment.
  699       Label small;
  700       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
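             // low_limit is in bytes but cnt is in (8-byte) words, hence the
             // shift by 3 below.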
  701       __ subs(rscratch1, cnt, low_limit >> 3);
  702       __ br(Assembler::LT, small);
  703       __ zero_dcache_blocks(base, cnt);
  704       __ bind(small);
  705     }
  706 
  707     {
  708       // Number of stp instructions we'll unroll
  709       const int unroll =
  710         MacroAssembler::zero_words_block_size / 2;
  711       // Clear the remaining blocks.
  712       Label loop;
  713       __ subs(cnt, cnt, unroll * 2);
  714       __ br(Assembler::LT, done);
  715       __ bind(loop);
  716       for (int i = 0; i < unroll; i++)
  717         __ stp(zr, zr, __ post(base, 16));
  718       __ subs(cnt, cnt, unroll * 2);
  719       __ br(Assembler::GE, loop);
  720       __ bind(done);
  721       __ add(cnt, cnt, unroll * 2);
  722     }
  723 
  724     __ ret(lr);
  725 
  726     return start;
  727   }
  728 
  729 
  730   typedef enum {
  731     copy_forwards = 1,
  732     copy_backwards = -1
  733   } copy_direction;
  734 
  735   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  736   // for arraycopy stubs.
  737   class ArrayCopyBarrierSetHelper : StackObj {
  738     BarrierSetAssembler* _bs_asm;
  739     MacroAssembler* _masm;
  740     DecoratorSet _decorators;
  741     BasicType _type;
  742     Register _gct1;
  743     Register _gct2;
  744     Register _gct3;
  745     FloatRegister _gcvt1;
  746     FloatRegister _gcvt2;
  747     FloatRegister _gcvt3;
  748 
  749   public:
  750     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
  751                               DecoratorSet decorators,
  752                               BasicType type,
  753                               Register gct1,
  754                               Register gct2,
  755                               Register gct3,
  756                               FloatRegister gcvt1,
  757                               FloatRegister gcvt2,
  758                               FloatRegister gcvt3)
  759       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
  760         _masm(masm),
  761         _decorators(decorators),
  762         _type(type),
  763         _gct1(gct1),
  764         _gct2(gct2),
  765         _gct3(gct3),
  766         _gcvt1(gcvt1),
  767         _gcvt2(gcvt2),
  768         _gcvt3(gcvt3) {
  769     }
  770 
  771     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
  772       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
  773                             dst1, dst2, src,
  774                             _gct1, _gct2, _gcvt1);
  775     }
  776 
  777     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
  778       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
  779                              dst, src1, src2,
  780                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
  781     }
  782 
  783     void copy_load_at_16(Register dst1, Register dst2, Address src) {
  784       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
  785                             dst1, dst2, src,
  786                             _gct1);
  787     }
  788 
  789     void copy_store_at_16(Address dst, Register src1, Register src2) {
  790       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
  791                              dst, src1, src2,
  792                              _gct1, _gct2, _gct3);
  793     }
  794 
  795     void copy_load_at_8(Register dst, Address src) {
  796       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
  797                             dst, noreg, src,
  798                             _gct1);
  799     }
  800 
  801     void copy_store_at_8(Address dst, Register src) {
  802       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
  803                              dst, src, noreg,
  804                              _gct1, _gct2, _gct3);
  805     }
  806   };
  807 
  808   // Bulk copy of blocks of 8 words.
  809   //
  810   // count is a count of words.
  811   //
  812   // Precondition: count >= 8
  813   //
  814   // Postconditions:
  815   //
  816   // The least significant bit of count contains the remaining count
  817   // of words to copy.  The rest of count is trash.
  818   //
  819   // s and d are adjusted to point to the remaining words to copy
  820   //
  821   void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) {
  822     BasicType type;
  823     copy_direction direction;
  824 
  825     switch (stub_id) {
  826     case copy_byte_f_id:
  827       direction = copy_forwards;
  828       type = T_BYTE;
  829       break;
  830     case copy_byte_b_id:
  831       direction = copy_backwards;
  832       type = T_BYTE;
  833       break;
  834     case copy_oop_f_id:
  835       direction = copy_forwards;
  836       type = T_OBJECT;
  837       break;
  838     case copy_oop_b_id:
  839       direction = copy_backwards;
  840       type = T_OBJECT;
  841       break;
  842     case copy_oop_uninit_f_id:
  843       direction = copy_forwards;
  844       type = T_OBJECT;
  845       break;
  846     case copy_oop_uninit_b_id:
  847       direction = copy_backwards;
  848       type = T_OBJECT;
  849       break;
  850     default:
  851       ShouldNotReachHere();
  852     }
  853 
  854     int unit = wordSize * direction;
  855     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
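           // unit is +8 or -8 bytes depending on the copy direction; for a
           // forward copy, s and d are biased downwards below so that both
           // directions can address each block with the same positive
           // multiples of unit.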
  856 
  857     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
  858       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
  859     const Register stride = r14;
  860     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
  861     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
  862     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
  863 
  864     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
  865     assert_different_registers(s, d, count, rscratch1, rscratch2);
  866 
  867     Label again, drain;
  868 
  869     __ align(CodeEntryAlignment);
  870 
  871     StubCodeMark mark(this, stub_id);
  872 
  873     __ bind(start);
  874 
  875     Label unaligned_copy_long;
  876     if (AvoidUnalignedAccesses) {
  877       __ tbnz(d, 3, unaligned_copy_long);
  878     }
  879 
  880     if (direction == copy_forwards) {
  881       __ sub(s, s, bias);
  882       __ sub(d, d, bias);
  883     }
  884 
  885 #ifdef ASSERT
  886     // Make sure we are never given < 8 words
  887     {
  888       Label L;
  889       __ cmp(count, (u1)8);
  890       __ br(Assembler::GE, L);
   891       __ stop("generate_copy_longs called with < 8 words");
  892       __ bind(L);
  893     }
  894 #endif
  895 
  896     // Fill 8 registers
  897     if (UseSIMDForMemoryOps) {
  898       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  899       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  900     } else {
  901       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  902       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  903       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  904       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  905     }
  906 
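           // 8 words are already loaded; take the main loop only if at least 8
           // more words remain (hence the subtraction of 16), otherwise just
           // drain what we loaded.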
  907     __ subs(count, count, 16);
  908     __ br(Assembler::LO, drain);
  909 
  910     int prefetch = PrefetchCopyIntervalInBytes;
  911     bool use_stride = false;
  912     if (direction == copy_backwards) {
  913        use_stride = prefetch > 256;
  914        prefetch = -prefetch;
  915        if (use_stride) __ mov(stride, prefetch);
  916     }
  917 
  918     __ bind(again);
  919 
  920     if (PrefetchCopyIntervalInBytes > 0)
  921       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
  922 
  923     if (UseSIMDForMemoryOps) {
  924       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  925       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
  926       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  927       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
  928     } else {
  929       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  930       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  931       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  932       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
  933       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  934       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
  935       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  936       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
  937     }
  938 
  939     __ subs(count, count, 8);
  940     __ br(Assembler::HS, again);
  941 
  942     // Drain
  943     __ bind(drain);
  944     if (UseSIMDForMemoryOps) {
  945       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
  946       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
  947     } else {
  948       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  949       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
  950       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
  951       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
  952     }
  953 
  954     {
  955       Label L1, L2;
  956       __ tbz(count, exact_log2(4), L1);
  957       if (UseSIMDForMemoryOps) {
  958         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
  959         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
  960       } else {
  961         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  962         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
  963         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  964         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
  965       }
  966       __ bind(L1);
  967 
  968       if (direction == copy_forwards) {
  969         __ add(s, s, bias);
  970         __ add(d, d, bias);
  971       }
  972 
  973       __ tbz(count, 1, L2);
  974       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
  975       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
  976       __ bind(L2);
  977     }
  978 
  979     __ ret(lr);
  980 
  981     if (AvoidUnalignedAccesses) {
  982       Label drain, again;
  983       // Register order for storing. Order is different for backward copy.
  984 
  985       __ bind(unaligned_copy_long);
  986 
   987       // source address is even word (16-byte) aligned, target is odd word (8-byte) aligned
  988       //
  989       // when forward copying word pairs we read long pairs at offsets
  990       // {0, 2, 4, 6} (in long words). when backwards copying we read
  991       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
  992       // address by -2 in the forwards case so we can compute the
  993       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
  994       // or -1.
  995       //
  996       // when forward copying we need to store 1 word, 3 pairs and
  997       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
   998       // zero offset, we adjust the destination by -1, which means we
   999       // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 1000       //
  1001       // When backwards copying we need to store 1 word, 3 pairs and
 1002       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 1003       // offsets {1, 3, 5, 7, 8} * unit.
 1004 
 1005       if (direction == copy_forwards) {
 1006         __ sub(s, s, 16);
 1007         __ sub(d, d, 8);
 1008       }
 1009 
 1010       // Fill 8 registers
 1011       //
 1012       // for forwards copy s was offset by -16 from the original input
 1013       // value of s so the register contents are at these offsets
 1014       // relative to the 64 bit block addressed by that original input
 1015       // and so on for each successive 64 byte block when s is updated
 1016       //
 1017       // t0 at offset 0,  t1 at offset 8
 1018       // t2 at offset 16, t3 at offset 24
 1019       // t4 at offset 32, t5 at offset 40
 1020       // t6 at offset 48, t7 at offset 56
 1021 
 1022       // for backwards copy s was not offset so the register contents
 1023       // are at these offsets into the preceding 64 byte block
 1024       // relative to that original input and so on for each successive
 1025       // preceding 64 byte block when s is updated. this explains the
 1026       // slightly counter-intuitive looking pattern of register usage
 1027       // in the stp instructions for backwards copy.
 1028       //
 1029       // t0 at offset -16, t1 at offset -8
 1030       // t2 at offset -32, t3 at offset -24
 1031       // t4 at offset -48, t5 at offset -40
 1032       // t6 at offset -64, t7 at offset -56
 1033 
 1034       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1035       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1036       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1037       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1038 
 1039       __ subs(count, count, 16);
 1040       __ br(Assembler::LO, drain);
 1041 
 1042       int prefetch = PrefetchCopyIntervalInBytes;
 1043       bool use_stride = false;
 1044       if (direction == copy_backwards) {
 1045          use_stride = prefetch > 256;
 1046          prefetch = -prefetch;
 1047          if (use_stride) __ mov(stride, prefetch);
 1048       }
 1049 
 1050       __ bind(again);
 1051 
 1052       if (PrefetchCopyIntervalInBytes > 0)
 1053         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 1054 
 1055       if (direction == copy_forwards) {
 1056        // allowing for the offset of -8 the store instructions place
 1057        // registers into the target 64 bit block at the following
 1058        // offsets
 1059        //
 1060        // t0 at offset 0
 1061        // t1 at offset 8,  t2 at offset 16
 1062        // t3 at offset 24, t4 at offset 32
 1063        // t5 at offset 40, t6 at offset 48
 1064        // t7 at offset 56
 1065 
 1066         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1067         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1068         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1069         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1070         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1071         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1072         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1073         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1074         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1075       } else {
 1076        // d was not offset when we started so the registers are
 1077        // written into the 64 bit block preceding d with the following
 1078        // offsets
 1079        //
 1080        // t1 at offset -8
 1081        // t3 at offset -24, t0 at offset -16
  1082        // t5 at offset -40, t2 at offset -32
 1083        // t7 at offset -56, t4 at offset -48
 1084        //                   t6 at offset -64
 1085        //
 1086        // note that this matches the offsets previously noted for the
 1087        // loads
 1088 
 1089         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1090         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1091         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1092         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1093         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 1094         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1095         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 1096         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1097         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 1098       }
 1099 
 1100       __ subs(count, count, 8);
 1101       __ br(Assembler::HS, again);
 1102 
 1103       // Drain
 1104       //
 1105       // this uses the same pattern of offsets and register arguments
 1106       // as above
 1107       __ bind(drain);
 1108       if (direction == copy_forwards) {
 1109         bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1110         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1111         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
 1112         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
 1113         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
 1114       } else {
 1115         bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1116         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1117         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
 1118         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
 1119         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
 1120       }
 1121       // now we need to copy any remaining part block which may
  1122       // include a 4 word subblock and/or a 2 word subblock.
 1123       // bits 2 and 1 in the count are the tell-tale for whether we
 1124       // have each such subblock
 1125       {
 1126         Label L1, L2;
 1127         __ tbz(count, exact_log2(4), L1);
 1128        // this is the same as above but copying only 4 longs hence
 1129        // with only one intervening stp between the str instructions
 1130        // but note that the offsets and registers still follow the
 1131        // same pattern
 1132         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 1133         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 1134         if (direction == copy_forwards) {
 1135           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1136           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
 1137           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
 1138         } else {
 1139           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1140           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
 1141           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
 1142         }
 1143         __ bind(L1);
 1144 
 1145         __ tbz(count, 1, L2);
 1146        // this is the same as above but copying only 2 longs hence
 1147        // there is no intervening stp between the str instructions
 1148        // but note that the offset and register patterns are still
 1149        // the same
 1150         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
 1151         if (direction == copy_forwards) {
 1152           bs.copy_store_at_8(Address(d, 1 * unit), t0);
 1153           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
 1154         } else {
 1155           bs.copy_store_at_8(Address(d, 1 * unit), t1);
 1156           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
 1157         }
 1158         __ bind(L2);
 1159 
 1160        // for forwards copy we need to re-adjust the offsets we
  1161        // applied so that s and d follow the last words written
 1162 
 1163        if (direction == copy_forwards) {
 1164          __ add(s, s, 16);
 1165          __ add(d, d, 8);
 1166        }
 1167 
 1168       }
 1169 
 1170       __ ret(lr);
 1171       }
 1172   }
 1173 
 1174   // Small copy: less than 16 bytes.
 1175   //
 1176   // NB: Ignores all of the bits of count which represent more than 15
 1177   // bytes, so a caller doesn't have to mask them.
 1178 
 1179   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
 1180     bool is_backwards = step < 0;
 1181     size_t granularity = uabs(step);
 1182     int direction = is_backwards ? -1 : 1;
 1183 
 1184     Label Lword, Lint, Lshort, Lbyte;
 1185 
 1186     assert(granularity
 1187            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 1188 
 1189     const Register t0 = r3;
 1190     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
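           // No vector temps are needed here: copy_memory_small only issues
           // accesses of at most 8 bytes.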
 1191     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
 1192 
 1193     // ??? I don't know if this bit-test-and-branch is the right thing
 1194     // to do.  It does a lot of jumping, resulting in several
 1195     // mispredicted branches.  It might make more sense to do this
 1196     // with something like Duff's device with a single computed branch.
 1197 
 1198     __ tbz(count, 3 - exact_log2(granularity), Lword);
 1199     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1200     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1201     __ bind(Lword);
 1202 
 1203     if (granularity <= sizeof (jint)) {
 1204       __ tbz(count, 2 - exact_log2(granularity), Lint);
 1205       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 1206       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 1207       __ bind(Lint);
 1208     }
 1209 
 1210     if (granularity <= sizeof (jshort)) {
 1211       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 1212       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 1213       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 1214       __ bind(Lshort);
 1215     }
 1216 
 1217     if (granularity <= sizeof (jbyte)) {
 1218       __ tbz(count, 0, Lbyte);
 1219       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 1220       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 1221       __ bind(Lbyte);
 1222     }
 1223   }
 1224 
 1225   Label copy_f, copy_b;
 1226   Label copy_obj_f, copy_obj_b;
 1227   Label copy_obj_uninit_f, copy_obj_uninit_b;
 1228 
 1229   // All-singing all-dancing memory copy.
 1230   //
 1231   // Copy count units of memory from s to d.  The size of a unit is
 1232   // step, which can be positive or negative depending on the direction
 1233   // of copy.  If is_aligned is false, we align the source address.
 1234   //
 1235 
 1236   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
 1237                    Register s, Register d, Register count, int step) {
 1238     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 1239     bool is_backwards = step < 0;
 1240     unsigned int granularity = uabs(step);
 1241     const Register t0 = r3, t1 = r4;
 1242 
  1243     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't
  1244     // matter because we always load all the data before writing anything.
 1245     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
 1246     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
 1247     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
 1248     const Register send = r17, dend = r16;
 1249     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 1250     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 1251     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 1252 
 1253     if (PrefetchCopyIntervalInBytes > 0)
 1254       __ prfm(Address(s, 0), PLDL1KEEP);
 1255     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
 1256     __ br(Assembler::HI, copy_big);
 1257 
 1258     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 1259     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 1260 
 1261     __ cmp(count, u1(16/granularity));
 1262     __ br(Assembler::LS, copy16);
 1263 
 1264     __ cmp(count, u1(64/granularity));
 1265     __ br(Assembler::HI, copy80);
 1266 
 1267     __ cmp(count, u1(32/granularity));
 1268     __ br(Assembler::LS, copy32);
 1269 
 1270     // 33..64 bytes
 1271     if (UseSIMDForMemoryOps) {
 1272       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1273       bs.copy_load_at_32(v2, v3, Address(send, -32));
 1274       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1275       bs.copy_store_at_32(Address(dend, -32), v2, v3);
 1276     } else {
 1277       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1278       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1279       bs.copy_load_at_16(t4, t5, Address(send, -32));
 1280       bs.copy_load_at_16(t6, t7, Address(send, -16));
 1281 
 1282       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1283       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1284       bs.copy_store_at_16(Address(dend, -32), t4, t5);
 1285       bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1286     }
 1287     __ b(finish);
 1288 
 1289     // 17..32 bytes
 1290     __ bind(copy32);
 1291     bs.copy_load_at_16(t0, t1, Address(s, 0));
 1292     bs.copy_load_at_16(t6, t7, Address(send, -16));
 1293 
 1294     bs.copy_store_at_16(Address(d, 0), t0, t1);
 1295     bs.copy_store_at_16(Address(dend, -16), t6, t7);
 1296     __ b(finish);
 1297 
 1298     // 65..80/96 bytes
  1299     // (96 bytes if SIMD because we do 32 bytes per instruction)
 1300     __ bind(copy80);
 1301     if (UseSIMDForMemoryOps) {
 1302       bs.copy_load_at_32(v0, v1, Address(s, 0));
 1303       bs.copy_load_at_32(v2, v3, Address(s, 32));
 1304       // Unaligned pointers can be an issue for copying.
  1305       // The issue is more likely to occur when the granularity of the data is
  1306       // less than 4 (sizeof(jint)): pointers to jint arrays are at least
  1307       // 4 byte aligned and pointers to jlong arrays are 8 byte aligned.
  1308       // The biggest performance drop has been seen for the range 65-80 bytes.
  1309       // For such cases, using a pair of ldp/stp instead of the third pair of
 1310       // ldpq/stpq fixes the performance issue.
 1311       if (granularity < sizeof (jint)) {
 1312         Label copy96;
 1313         __ cmp(count, u1(80/granularity));
 1314         __ br(Assembler::HI, copy96);
 1315         bs.copy_load_at_16(t0, t1, Address(send, -16));
 1316 
 1317         bs.copy_store_at_32(Address(d, 0), v0, v1);
 1318         bs.copy_store_at_32(Address(d, 32), v2, v3);
 1319 
 1320         bs.copy_store_at_16(Address(dend, -16), t0, t1);
 1321         __ b(finish);
 1322 
 1323         __ bind(copy96);
 1324       }
 1325       bs.copy_load_at_32(v4, v5, Address(send, -32));
 1326 
 1327       bs.copy_store_at_32(Address(d, 0), v0, v1);
 1328       bs.copy_store_at_32(Address(d, 32), v2, v3);
 1329 
 1330       bs.copy_store_at_32(Address(dend, -32), v4, v5);
 1331     } else {
 1332       bs.copy_load_at_16(t0, t1, Address(s, 0));
 1333       bs.copy_load_at_16(t2, t3, Address(s, 16));
 1334       bs.copy_load_at_16(t4, t5, Address(s, 32));
 1335       bs.copy_load_at_16(t6, t7, Address(s, 48));
 1336       bs.copy_load_at_16(t8, t9, Address(send, -16));
 1337 
 1338       bs.copy_store_at_16(Address(d, 0), t0, t1);
 1339       bs.copy_store_at_16(Address(d, 16), t2, t3);
 1340       bs.copy_store_at_16(Address(d, 32), t4, t5);
 1341       bs.copy_store_at_16(Address(d, 48), t6, t7);
 1342       bs.copy_store_at_16(Address(dend, -16), t8, t9);
 1343     }
 1344     __ b(finish);
 1345 
 1346     // 0..16 bytes
 1347     __ bind(copy16);
 1348     __ cmp(count, u1(8/granularity));
 1349     __ br(Assembler::LO, copy8);
 1350 
 1351     // 8..16 bytes
 1352     bs.copy_load_at_8(t0, Address(s, 0));
 1353     bs.copy_load_at_8(t1, Address(send, -8));
 1354     bs.copy_store_at_8(Address(d, 0), t0);
 1355     bs.copy_store_at_8(Address(dend, -8), t1);
 1356     __ b(finish);
 1357 
 1358     if (granularity < 8) {
 1359       // 4..7 bytes
 1360       __ bind(copy8);
 1361       __ tbz(count, 2 - exact_log2(granularity), copy4);
 1362       __ ldrw(t0, Address(s, 0));
 1363       __ ldrw(t1, Address(send, -4));
 1364       __ strw(t0, Address(d, 0));
 1365       __ strw(t1, Address(dend, -4));
 1366       __ b(finish);
 1367       if (granularity < 4) {
 1368         // 0..3 bytes
 1369         __ bind(copy4);
 1370         __ cbz(count, finish); // get rid of 0 case
 1371         if (granularity == 2) {
 1372           __ ldrh(t0, Address(s, 0));
 1373           __ strh(t0, Address(d, 0));
 1374         } else { // granularity == 1
 1375           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
 1376           // the first and last byte.
 1377           // Handle the 3 byte case by loading and storing base + count/2
 1378           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
  1379           // This means that in the 1 byte case we load/store the same
 1380           // byte 3 times.
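                // Concretely, the three byte loads hit offsets (first, last, middle):
                // count == 1 -> 0, 0, 0; count == 2 -> 0, 1, 1; count == 3 -> 0, 2, 1.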
 1381           __ lsr(count, count, 1);
 1382           __ ldrb(t0, Address(s, 0));
 1383           __ ldrb(t1, Address(send, -1));
 1384           __ ldrb(t2, Address(s, count));
 1385           __ strb(t0, Address(d, 0));
 1386           __ strb(t1, Address(dend, -1));
 1387           __ strb(t2, Address(d, count));
 1388         }
 1389         __ b(finish);
 1390       }
 1391     }
 1392 
 1393     __ bind(copy_big);
 1394     if (is_backwards) {
 1395       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 1396       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 1397     }
 1398 
 1399     // Now that we've got the small cases out of the way, we can align the
 1400     // source address on a 2-word boundary.
 1401 
 1402     // Here we materialize a count in r15, which is used by copy_memory_small
 1403     // and the various generate_copy_longs stubs that we use for 2-word-aligned bytes.
 1404     // Up until here we have used t9, which aliases r15, but from here on that
 1405     // register cannot be used as a temp register, as it contains the count.
 1406 
 1407     Label aligned;
 1408 
 1409     if (is_aligned) {
 1410       // We may have to adjust by 1 word to get s 2-word-aligned.
 1411       __ tbz(s, exact_log2(wordSize), aligned);
 1412       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
 1413       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
 1414       __ sub(count, count, wordSize/granularity);
 1415     } else {
 1416       if (is_backwards) {
 1417         __ andr(r15, s, 2 * wordSize - 1);
 1418       } else {
 1419         __ neg(r15, s);
 1420         __ andr(r15, r15, 2 * wordSize - 1);
 1421       }
 1422       // r15 is the byte adjustment needed to align s.
 1423       __ cbz(r15, aligned);
 1424       int shift = exact_log2(granularity);
 1425       if (shift > 0) {
 1426         __ lsr(r15, r15, shift);
 1427       }
 1428       __ sub(count, count, r15);
 1429 
 1430 #if 0
 1431       // ?? This code is only correct for a disjoint copy.  It may or
 1432       // may not make sense to use it in that case.
 1433 
 1434       // Copy the first pair; s and d may not be aligned.
 1435       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 1436       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
 1437 
 1438       // Align s and d, adjust count
 1439       if (is_backwards) {
 1440         __ sub(s, s, r15);
 1441         __ sub(d, d, r15);
 1442       } else {
 1443         __ add(s, s, r15);
 1444         __ add(d, d, r15);
 1445       }
 1446 #else
 1447       copy_memory_small(decorators, type, s, d, r15, step);
 1448 #endif
 1449     }
 1450 
 1451     __ bind(aligned);
 1452 
 1453     // s is now 2-word-aligned.
 1454 
 1455     // We have a count of units and some trailing bytes. Adjust the
 1456     // count and do a bulk copy of words. If the shift is zero
 1457     // perform a move instead to benefit from zero latency moves.
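          // For example, for a jshort copy granularity == 2, so shift == 2 and
          // r15 = count >> 2 is the number of 8-byte words handed to the bulk
          // copy; for a jlong or (uncompressed) oop copy granularity == 8,
          // shift == 0 and count is simply moved into r15.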
 1458     int shift = exact_log2(wordSize/granularity);
 1459     if (shift > 0) {
 1460       __ lsr(r15, count, shift);
 1461     } else {
 1462       __ mov(r15, count);
 1463     }
 1464     if (direction == copy_forwards) {
 1465       if (type != T_OBJECT) {
 1466         __ bl(copy_f);
 1467       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1468         __ bl(copy_obj_uninit_f);
 1469       } else {
 1470         __ bl(copy_obj_f);
 1471       }
 1472     } else {
 1473       if (type != T_OBJECT) {
 1474         __ bl(copy_b);
 1475       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
 1476         __ bl(copy_obj_uninit_b);
 1477       } else {
 1478         __ bl(copy_obj_b);
 1479       }
 1480     }
 1481 
 1482     // And the tail.
 1483     copy_memory_small(decorators, type, s, d, count, step);
 1484 
 1485     if (granularity >= 8) __ bind(copy8);
 1486     if (granularity >= 4) __ bind(copy4);
 1487     __ bind(finish);
 1488   }
 1489 
 1490 
 1491   void clobber_registers() {
 1492 #ifdef ASSERT
 1493     RegSet clobbered
 1494       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
 1495     __ mov(rscratch1, (uint64_t)0xdeadbeef);
 1496     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
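          // rscratch1 now holds the pattern 0xdeadbeefdeadbeef; splat it into
          // every call-clobbered general-purpose register.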
 1497     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
 1498       __ mov(*it, rscratch1);
 1499     }
 1500 #endif
 1501 
 1502   }
 1503 
 1504   // Scan over the array at 'a' for 'count' oops, verifying each one.
 1505   // Preserves 'a' and 'count'; clobbers 'temp', rscratch1 and rscratch2.
 1506   void verify_oop_array (int size, Register a, Register count, Register temp) {
 1507     Label loop, end;
 1508     __ mov(rscratch1, a);
 1509     __ mov(rscratch2, zr);
 1510     __ bind(loop);
 1511     __ cmp(rscratch2, count);
 1512     __ br(Assembler::HS, end);
 1513     if (size == wordSize) {
 1514       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1515       __ verify_oop(temp);
 1516     } else {
 1517       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
 1518       __ decode_heap_oop(temp); // calls verify_oop
 1519     }
 1520     __ add(rscratch2, rscratch2, 1);
 1521     __ b(loop);
 1522     __ bind(end);
 1523   }
 1524 
 1525   // Arguments:
 1526   //   stub_id - is used to name the stub and identify all details of
 1527   //             how to perform the copy.
 1528   //
 1529   //   entry - is assigned to the stub's post push entry point unless
 1530   //           it is null
 1531   //
 1532   // Inputs:
 1533   //   c_rarg0   - source array address
 1534   //   c_rarg1   - destination array address
 1535   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1536   //
 1537   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1538   // the hardware handle it.  The two dwords within qwords that span
 1539   // cache line boundaries will still be loaded and stored atomically.
 1540   //
 1541   // Side Effects: entry is set to the (post push) entry point so it
 1542   //               can be used by the corresponding conjoint copy
 1543   //               method
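        //               (for instance, generate_arraycopy_stubs() below
        //               captures the disjoint stub's post-push entry point
        //               via 'entry' and passes it as the nooverlap_target of
        //               the matching conjoint stub, which branches to it when
        //               the ranges do not overlap)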
 1544   //
 1545   address generate_disjoint_copy(StubGenStubId stub_id, address *entry) {
 1546     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1547     RegSet saved_reg = RegSet::of(s, d, count);
 1548     int size;
 1549     bool aligned;
 1550     bool is_oop;
 1551     bool dest_uninitialized;
 1552     switch (stub_id) {
 1553     case jbyte_disjoint_arraycopy_id:
 1554       size = sizeof(jbyte);
 1555       aligned = false;
 1556       is_oop = false;
 1557       dest_uninitialized = false;
 1558       break;
 1559     case arrayof_jbyte_disjoint_arraycopy_id:
 1560       size = sizeof(jbyte);
 1561       aligned = true;
 1562       is_oop = false;
 1563       dest_uninitialized = false;
 1564       break;
 1565     case jshort_disjoint_arraycopy_id:
 1566       size = sizeof(jshort);
 1567       aligned = false;
 1568       is_oop = false;
 1569       dest_uninitialized = false;
 1570       break;
 1571     case arrayof_jshort_disjoint_arraycopy_id:
 1572       size = sizeof(jshort);
 1573       aligned = true;
 1574       is_oop = false;
 1575       dest_uninitialized = false;
 1576       break;
 1577     case jint_disjoint_arraycopy_id:
 1578       size = sizeof(jint);
 1579       aligned = false;
 1580       is_oop = false;
 1581       dest_uninitialized = false;
 1582       break;
 1583     case arrayof_jint_disjoint_arraycopy_id:
 1584       size = sizeof(jint);
 1585       aligned = true;
 1586       is_oop = false;
 1587       dest_uninitialized = false;
 1588       break;
 1589     case jlong_disjoint_arraycopy_id:
 1590       // since this is always aligned we can (should!) use the same
 1591       // stub as for case arrayof_jlong_disjoint_arraycopy
 1592       ShouldNotReachHere();
 1593       break;
 1594     case arrayof_jlong_disjoint_arraycopy_id:
 1595       size = sizeof(jlong);
 1596       aligned = true;
 1597       is_oop = false;
 1598       dest_uninitialized = false;
 1599       break;
 1600     case oop_disjoint_arraycopy_id:
 1601       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1602       aligned = !UseCompressedOops;
 1603       is_oop = true;
 1604       dest_uninitialized = false;
 1605       break;
 1606     case arrayof_oop_disjoint_arraycopy_id:
 1607       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1608       aligned = !UseCompressedOops;
 1609       is_oop = true;
 1610       dest_uninitialized = false;
 1611       break;
 1612     case oop_disjoint_arraycopy_uninit_id:
 1613       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1614       aligned = !UseCompressedOops;
 1615       is_oop = true;
 1616       dest_uninitialized = true;
 1617       break;
 1618     case arrayof_oop_disjoint_arraycopy_uninit_id:
 1619       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1620       aligned = !UseCompressedOops;
 1621       is_oop = true;
 1622       dest_uninitialized = true;
 1623       break;
 1624     default:
 1625       ShouldNotReachHere();
 1626       break;
 1627     }
 1628 
 1629     __ align(CodeEntryAlignment);
 1630     StubCodeMark mark(this, stub_id);
 1631     address start = __ pc();
 1632     __ enter();
 1633 
 1634     if (entry != nullptr) {
 1635       *entry = __ pc();
 1636       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1637       BLOCK_COMMENT("Entry:");
 1638     }
 1639 
 1640     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
 1641     if (dest_uninitialized) {
 1642       decorators |= IS_DEST_UNINITIALIZED;
 1643     }
 1644     if (aligned) {
 1645       decorators |= ARRAYCOPY_ALIGNED;
 1646     }
 1647 
 1648     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1649     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
 1650 
 1651     if (is_oop) {
 1652       // save regs before copy_memory
 1653       __ push(RegSet::of(d, count), sp);
 1654     }
 1655     {
 1656       // UnsafeMemoryAccess page error: continue after unsafe access
 1657       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1658       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1659       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
 1660     }
 1661 
 1662     if (is_oop) {
 1663       __ pop(RegSet::of(d, count), sp);
 1664       if (VerifyOops)
 1665         verify_oop_array(size, d, count, r16);
 1666     }
 1667 
 1668     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1669 
 1670     __ leave();
 1671     __ mov(r0, zr); // return 0
 1672     __ ret(lr);
 1673     return start;
 1674   }
 1675 
 1676   // Arguments:
 1677   //   stub_id - is used to name the stub and identify all details of
 1678   //             how to perform the copy.
 1679   //
 1680   //   nooverlap_target - identifies the (post-push) entry for the
 1681   //             corresponding disjoint copy routine, which can be
 1682   //             branched to if the ranges do not actually overlap
 1683   //
 1684   //   entry - is assigned to the stub's post push entry point unless
 1685   //           it is null
 1686   //
 1687   //
 1688   // Inputs:
 1689   //   c_rarg0   - source array address
 1690   //   c_rarg1   - destination array address
 1691   //   c_rarg2   - element count, treated as ssize_t, can be zero
 1692   //
 1693   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
 1694   // the hardware handle it.  The two dwords within qwords that span
 1695   // cache line boundaries will still be loaded and stored atomically.
 1696   //
 1697   // Side Effects:
 1698   //   entry is set to the no-overlap entry point so it can be used by
 1699   //   some other conjoint copy method
 1700   //
 1701   address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) {
 1702     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 1703     RegSet saved_regs = RegSet::of(s, d, count);
 1704     int size;
 1705     bool aligned;
 1706     bool is_oop;
 1707     bool dest_uninitialized;
 1708     switch (stub_id) {
 1709     case jbyte_arraycopy_id:
 1710       size = sizeof(jbyte);
 1711       aligned = false;
 1712       is_oop = false;
 1713       dest_uninitialized = false;
 1714       break;
 1715     case arrayof_jbyte_arraycopy_id:
 1716       size = sizeof(jbyte);
 1717       aligned = true;
 1718       is_oop = false;
 1719       dest_uninitialized = false;
 1720       break;
 1721     case jshort_arraycopy_id:
 1722       size = sizeof(jshort);
 1723       aligned = false;
 1724       is_oop = false;
 1725       dest_uninitialized = false;
 1726       break;
 1727     case arrayof_jshort_arraycopy_id:
 1728       size = sizeof(jshort);
 1729       aligned = true;
 1730       is_oop = false;
 1731       dest_uninitialized = false;
 1732       break;
 1733     case jint_arraycopy_id:
 1734       size = sizeof(jint);
 1735       aligned = false;
 1736       is_oop = false;
 1737       dest_uninitialized = false;
 1738       break;
 1739     case arrayof_jint_arraycopy_id:
 1740       size = sizeof(jint);
 1741       aligned = true;
 1742       is_oop = false;
 1743       dest_uninitialized = false;
 1744       break;
 1745     case jlong_arraycopy_id:
 1746       // since this is always aligned we can (should!) use the same
 1747       // stub as for case arrayof_jlong_arraycopy
 1748       ShouldNotReachHere();
 1749       break;
 1750     case arrayof_jlong_arraycopy_id:
 1751       size = sizeof(jlong);
 1752       aligned = true;
 1753       is_oop = false;
 1754       dest_uninitialized = false;
 1755       break;
 1756     case oop_arraycopy_id:
 1757       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1758       aligned = !UseCompressedOops;
 1759       is_oop = true;
 1760       dest_uninitialized = false;
 1761       break;
 1762     case arrayof_oop_arraycopy_id:
 1763       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1764       aligned = !UseCompressedOops;
 1765       is_oop = true;
 1766       dest_uninitialized = false;
 1767       break;
 1768     case oop_arraycopy_uninit_id:
 1769       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1770       aligned = !UseCompressedOops;
 1771       is_oop = true;
 1772       dest_uninitialized = true;
 1773       break;
 1774     case arrayof_oop_arraycopy_uninit_id:
 1775       size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
 1776       aligned = !UseCompressedOops;
 1777       is_oop = true;
 1778       dest_uninitialized = true;
 1779       break;
 1780     default:
 1781       ShouldNotReachHere();
 1782     }
 1783 
 1784     StubCodeMark mark(this, stub_id);
 1785     address start = __ pc();
 1786     __ enter();
 1787 
 1788     if (entry != nullptr) {
 1789       *entry = __ pc();
 1790       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 1791       BLOCK_COMMENT("Entry:");
 1792     }
 1793 
 1794     // use fwd copy when (d-s) above_equal (count*size)
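          // (The compare is unsigned, so when d < s the subtraction wraps to a
          // large value and the forward path is taken as well, which is safe
          // because the destination starts below the source.)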
 1795     __ sub(rscratch1, d, s);
 1796     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
 1797     __ br(Assembler::HS, nooverlap_target);
 1798 
 1799     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
 1800     if (dest_uninitialized) {
 1801       decorators |= IS_DEST_UNINITIALIZED;
 1802     }
 1803     if (aligned) {
 1804       decorators |= ARRAYCOPY_ALIGNED;
 1805     }
 1806 
 1807     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1808     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
 1809 
 1810     if (is_oop) {
 1811       // save regs before copy_memory
 1812       __ push(RegSet::of(d, count), sp);
 1813     }
 1814     {
 1815       // UnsafeMemoryAccess page error: continue after unsafe access
 1816       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
 1817       UnsafeMemoryAccessMark umam(this, add_entry, true);
 1818       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
 1819     }
 1820     if (is_oop) {
 1821       __ pop(RegSet::of(d, count), sp);
 1822       if (VerifyOops)
 1823         verify_oop_array(size, d, count, r16);
 1824     }
 1825     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
 1826     __ leave();
 1827     __ mov(r0, zr); // return 0
 1828     __ ret(lr);
 1829     return start;
 1830   }
 1831 
 1832   // Helper for generating a dynamic type check.
 1833   // Smashes rscratch1, rscratch2.
 1834   void generate_type_check(Register sub_klass,
 1835                            Register super_check_offset,
 1836                            Register super_klass,
 1837                            Register temp1,
 1838                            Register temp2,
 1839                            Register result,
 1840                            Label& L_success) {
 1841     assert_different_registers(sub_klass, super_check_offset, super_klass);
 1842 
 1843     BLOCK_COMMENT("type_check:");
 1844 
 1845     Label L_miss;
 1846 
 1847     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
 1848                                      super_check_offset);
 1849     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
 1850 
 1851     // Fall through on failure!
 1852     __ BIND(L_miss);
 1853   }
 1854 
 1855   //
 1856   //  Generate checkcasting array copy stub
 1857   //
 1858   //  Input:
 1859   //    c_rarg0   - source array address
 1860   //    c_rarg1   - destination array address
 1861   //    c_rarg2   - element count, treated as ssize_t, can be zero
 1862   //    c_rarg3   - size_t ckoff (super_check_offset)
 1863   //    c_rarg4   - oop ckval (super_klass)
 1864   //
 1865   //  Output:
 1866   //    r0 ==  0  -  success
 1867   //    r0 == -1^K - failure, where K is partial transfer count
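        //    (e.g. if the type check fails after 3 of 10 elements have been
        //    stored, K == 3 and r0 == ~3 == -4; the caller recovers K by
        //    complementing r0)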
 1868   //
 1869   address generate_checkcast_copy(StubGenStubId stub_id, address *entry) {
 1870     bool dest_uninitialized;
 1871     switch (stub_id) {
 1872     case checkcast_arraycopy_id:
 1873       dest_uninitialized = false;
 1874       break;
 1875     case checkcast_arraycopy_uninit_id:
 1876       dest_uninitialized = true;
 1877       break;
 1878     default:
 1879       ShouldNotReachHere();
 1880     }
 1881 
 1882     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
 1883 
 1884     // Input registers (after setup_arg_regs)
 1885     const Register from        = c_rarg0;   // source array address
 1886     const Register to          = c_rarg1;   // destination array address
 1887     const Register count       = c_rarg2;   // elements count
 1888     const Register ckoff       = c_rarg3;   // super_check_offset
 1889     const Register ckval       = c_rarg4;   // super_klass
 1890 
 1891     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
 1892     RegSet wb_post_saved_regs = RegSet::of(count);
 1893 
 1894     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
 1895     const Register copied_oop  = r22;       // actual oop copied
 1896     const Register count_save  = r21;       // orig elements count
 1897     const Register start_to    = r20;       // destination array start address
 1898     const Register r19_klass   = r19;       // oop._klass
 1899 
 1900     // Registers used as gc temps (r5, r6, r7 are save-on-call)
 1901     const Register gct1 = r5, gct2 = r6, gct3 = r7;
 1902 
 1903     //---------------------------------------------------------------
 1904     // Assembler stub will be used for this call to arraycopy
 1905     // if the two arrays are subtypes of Object[] but the
 1906     // destination array type is not equal to or a supertype
 1907     // of the source type.  Each element must be separately
 1908     // checked.
 1909 
 1910     assert_different_registers(from, to, count, ckoff, ckval, start_to,
 1911                                copied_oop, r19_klass, count_save);
 1912 
 1913     __ align(CodeEntryAlignment);
 1914     StubCodeMark mark(this, stub_id);
 1915     address start = __ pc();
 1916 
 1917     __ enter(); // required for proper stackwalking of RuntimeStub frame
 1918 
 1919 #ifdef ASSERT
 1920     // caller guarantees that the arrays really are different
 1921     // otherwise, we would have to make conjoint checks
 1922     { Label L;
 1923       __ b(L);                  // conjoint check not yet implemented
 1924       __ stop("checkcast_copy within a single array");
 1925       __ bind(L);
 1926     }
 1927 #endif //ASSERT
 1928 
 1929     // Caller of this entry point must set up the argument registers.
 1930     if (entry != nullptr) {
 1931       *entry = __ pc();
 1932       BLOCK_COMMENT("Entry:");
 1933     }
 1934 
 1935      // Empty array:  Nothing to do.
 1936     __ cbz(count, L_done);
 1937     __ push(RegSet::of(r19, r20, r21, r22), sp);
 1938 
 1939 #ifdef ASSERT
 1940     BLOCK_COMMENT("assert consistent ckoff/ckval");
 1941     // The ckoff and ckval must be mutually consistent,
 1942     // even though caller generates both.
 1943     { Label L;
 1944       int sco_offset = in_bytes(Klass::super_check_offset_offset());
 1945       __ ldrw(start_to, Address(ckval, sco_offset));
 1946       __ cmpw(ckoff, start_to);
 1947       __ br(Assembler::EQ, L);
 1948       __ stop("super_check_offset inconsistent");
 1949       __ bind(L);
 1950     }
 1951 #endif //ASSERT
 1952 
 1953     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
 1954     bool is_oop = true;
 1955     int element_size = UseCompressedOops ? 4 : 8;
 1956     if (dest_uninitialized) {
 1957       decorators |= IS_DEST_UNINITIALIZED;
 1958     }
 1959 
 1960     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
 1961     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
 1962 
 1963     // save the original count
 1964     __ mov(count_save, count);
 1965 
 1966     // Copy from low to high addresses
 1967     __ mov(start_to, to);              // Save destination array start address
 1968     __ b(L_load_element);
 1969 
 1970     // ======== begin loop ========
 1971     // (Loop is rotated; its entry is L_load_element.)
 1972     // Loop control:
 1973     //   for (; count != 0; count--) {
 1974     //     copied_oop = load_heap_oop(from++);
 1975     //     ... generate_type_check ...;
 1976     //     store_heap_oop(to++, copied_oop);
 1977     //   }
 1978     __ align(OptoLoopAlignment);
 1979 
 1980     __ BIND(L_store_element);
 1981     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
 1982                       __ post(to, element_size), copied_oop, noreg,
 1983                       gct1, gct2, gct3);
 1984     __ sub(count, count, 1);
 1985     __ cbz(count, L_do_card_marks);
 1986 
 1987     // ======== loop entry is here ========
 1988     __ BIND(L_load_element);
 1989     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
 1990                      copied_oop, noreg, __ post(from, element_size),
 1991                      gct1);
 1992     __ cbz(copied_oop, L_store_element);
 1993 
 1994     __ load_klass(r19_klass, copied_oop);// query the object klass
 1995 
 1996     BLOCK_COMMENT("type_check:");
 1997     generate_type_check(/*sub_klass*/r19_klass,
 1998                         /*super_check_offset*/ckoff,
 1999                         /*super_klass*/ckval,
 2000                         /*r_array_base*/gct1,
 2001                         /*temp2*/gct2,
 2002                         /*result*/r10, L_store_element);
 2003 
 2004     // Fall through on failure!
 2005 
 2006     // ======== end loop ========
 2007 
 2008     // It was a real error; we must depend on the caller to finish the job.
 2009     // Register count = remaining oops, count_orig = total oops.
 2010     // Emit GC store barriers for the oops we have copied and report
 2011     // their number to the caller.
 2012 
 2013     __ subs(count, count_save, count);     // K = partially copied oop count
 2014     __ eon(count, count, zr);              // report (-1^K) to caller
 2015     __ br(Assembler::EQ, L_done_pop);
 2016 
 2017     __ BIND(L_do_card_marks);
 2018     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
 2019 
 2020     __ bind(L_done_pop);
 2021     __ pop(RegSet::of(r19, r20, r21, r22), sp);
 2022     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
 2023 
 2024     __ bind(L_done);
 2025     __ mov(r0, count);
 2026     __ leave();
 2027     __ ret(lr);
 2028 
 2029     return start;
 2030   }
 2031 
 2032   // Perform range checks on the proposed arraycopy.
 2033   // Kills temp, but nothing else.
 2034   // Also, clean the sign bits of src_pos and dst_pos.
 2035   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
 2036                               Register src_pos, // source position (c_rarg1)
 2037                               Register dst,     // destination array oop (c_rarg2)
 2038                               Register dst_pos, // destination position (c_rarg3)
 2039                               Register length,
 2040                               Register temp,
 2041                               Label& L_failed) {
 2042     BLOCK_COMMENT("arraycopy_range_checks:");
 2043 
 2044     assert_different_registers(rscratch1, temp);
 2045 
 2046     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
 2047     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
 2048     __ addw(temp, length, src_pos);
 2049     __ cmpw(temp, rscratch1);
 2050     __ br(Assembler::HI, L_failed);
 2051 
 2052     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
 2053     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
 2054     __ addw(temp, length, dst_pos);
 2055     __ cmpw(temp, rscratch1);
 2056     __ br(Assembler::HI, L_failed);
 2057 
 2058     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
 2059     __ movw(src_pos, src_pos);
 2060     __ movw(dst_pos, dst_pos);
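          // (a 32-bit register write zero-extends, clearing bits 63:32)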
 2061 
 2062     BLOCK_COMMENT("arraycopy_range_checks done");
 2063   }
 2064 
 2065   // These stubs get called from some dumb test routine.
 2066   // I'll write them properly when they're called from
 2067   // something that's actually doing something.
 2068   static void fake_arraycopy_stub(address src, address dst, int count) {
 2069     assert(count == 0, "huh?");
 2070   }
 2071 
 2072 
 2073   //
 2074   //  Generate 'unsafe' array copy stub
 2075   //  Though just as safe as the other stubs, it takes an unscaled
 2076   //  size_t argument instead of an element count.
 2077   //
 2078   //  Input:
 2079   //    c_rarg0   - source array address
 2080   //    c_rarg1   - destination array address
 2081   //    c_rarg2   - byte count, treated as ssize_t, can be zero
 2082   //
 2083   // Examines the alignment of the operands and dispatches
 2084   // to a long, int, short, or byte copy loop.
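        // For instance, when src, dst and the byte count are all multiples of 8,
        // the combined 'orr' of the three values has its low three bits clear,
        // so the count is shifted down by LogBytesPerLong and control branches
        // to the long copy entry.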
 2085   //
 2086   address generate_unsafe_copy(address byte_copy_entry,
 2087                                address short_copy_entry,
 2088                                address int_copy_entry,
 2089                                address long_copy_entry) {
 2090     StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id;
 2091 
 2092     Label L_long_aligned, L_int_aligned, L_short_aligned;
 2093     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
 2094 
 2095     __ align(CodeEntryAlignment);
 2096     StubCodeMark mark(this, stub_id);
 2097     address start = __ pc();
 2098     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2099 
 2100     // bump this on entry, not on exit:
 2101     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 2102 
 2103     __ orr(rscratch1, s, d);
 2104     __ orr(rscratch1, rscratch1, count);
 2105 
 2106     __ andr(rscratch1, rscratch1, BytesPerLong-1);
 2107     __ cbz(rscratch1, L_long_aligned);
 2108     __ andr(rscratch1, rscratch1, BytesPerInt-1);
 2109     __ cbz(rscratch1, L_int_aligned);
 2110     __ tbz(rscratch1, 0, L_short_aligned);
 2111     __ b(RuntimeAddress(byte_copy_entry));
 2112 
 2113     __ BIND(L_short_aligned);
 2114     __ lsr(count, count, LogBytesPerShort);  // size => short_count
 2115     __ b(RuntimeAddress(short_copy_entry));
 2116     __ BIND(L_int_aligned);
 2117     __ lsr(count, count, LogBytesPerInt);    // size => int_count
 2118     __ b(RuntimeAddress(int_copy_entry));
 2119     __ BIND(L_long_aligned);
 2120     __ lsr(count, count, LogBytesPerLong);   // size => long_count
 2121     __ b(RuntimeAddress(long_copy_entry));
 2122 
 2123     return start;
 2124   }
 2125 
 2126   //
 2127   //  Generate generic array copy stubs
 2128   //
 2129   //  Input:
 2130   //    c_rarg0    -  src oop
 2131   //    c_rarg1    -  src_pos (32-bits)
 2132   //    c_rarg2    -  dst oop
 2133   //    c_rarg3    -  dst_pos (32-bits)
 2134   //    c_rarg4    -  element count (32-bits)
 2135   //
 2136   //  Output:
 2137   //    r0 ==  0  -  success
 2138   //    r0 == -1^K - failure, where K is partial transfer count
 2139   //
 2140   address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
 2141                                 address int_copy_entry, address oop_copy_entry,
 2142                                 address long_copy_entry, address checkcast_copy_entry) {
 2143     StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id;
 2144 
 2145     Label L_failed, L_objArray;
 2146     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
 2147 
 2148     // Input registers
 2149     const Register src        = c_rarg0;  // source array oop
 2150     const Register src_pos    = c_rarg1;  // source position
 2151     const Register dst        = c_rarg2;  // destination array oop
 2152     const Register dst_pos    = c_rarg3;  // destination position
 2153     const Register length     = c_rarg4;
 2154 
 2155 
 2156     // Registers used as temps
 2157     const Register dst_klass  = c_rarg5;
 2158 
 2159     __ align(CodeEntryAlignment);
 2160 
 2161     StubCodeMark mark(this, stub_id);
 2162 
 2163     address start = __ pc();
 2164 
 2165     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2166 
 2167     // bump this on entry, not on exit:
 2168     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
 2169 
 2170     //-----------------------------------------------------------------------
 2171     // Assembler stub will be used for this call to arraycopy
 2172     // if the following conditions are met:
 2173     //
 2174     // (1) src and dst must not be null.
 2175     // (2) src_pos must not be negative.
 2176     // (3) dst_pos must not be negative.
 2177     // (4) length  must not be negative.
 2178     // (5) src klass and dst klass should be the same and not null.
 2179     // (6) src and dst should be arrays.
 2180     // (7) src_pos + length must not exceed length of src.
 2181     // (8) dst_pos + length must not exceed length of dst.
 2182     //
 2183 
 2184     //  if (src == nullptr) return -1;
 2185     __ cbz(src, L_failed);
 2186 
 2187     //  if (src_pos < 0) return -1;
 2188     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
 2189 
 2190     //  if (dst == nullptr) return -1;
 2191     __ cbz(dst, L_failed);
 2192 
 2193     //  if (dst_pos < 0) return -1;
 2194     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
 2195 
 2196     // registers used as temp
 2197     const Register scratch_length    = r16; // elements count to copy
 2198     const Register scratch_src_klass = r17; // array klass
 2199     const Register lh                = r15; // layout helper
 2200 
 2201     //  if (length < 0) return -1;
 2202     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
 2203     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
 2204 
 2205     __ load_klass(scratch_src_klass, src);
 2206 #ifdef ASSERT
 2207     //  assert(src->klass() != nullptr);
 2208     {
 2209       BLOCK_COMMENT("assert klasses not null {");
 2210       Label L1, L2;
 2211       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
 2212       __ bind(L1);
 2213       __ stop("broken null klass");
 2214       __ bind(L2);
 2215       __ load_klass(rscratch1, dst);
 2216       __ cbz(rscratch1, L1);     // this would be broken also
 2217       BLOCK_COMMENT("} assert klasses not null done");
 2218     }
 2219 #endif
 2220 
 2221     // Load layout helper (32-bits)
 2222     //
 2223     //  |array_tag|     | header_size | element_type |     |log2_element_size|
 2224     // 32        30    24            16              8     2                 0
 2225     //
 2226     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
 2227     //
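          // Because the array tags occupy the top bits, every array layout
          // helper is negative; the 'tbz(lh, 31, L_failed)' test below relies
          // on this to reject non-array klasses cheaply.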
 2228 
 2229     const int lh_offset = in_bytes(Klass::layout_helper_offset());
 2230 
 2231     // Handle objArrays completely differently...
 2232     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
 2233     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
 2234     __ movw(rscratch1, objArray_lh);
 2235     __ eorw(rscratch2, lh, rscratch1);
 2236     __ cbzw(rscratch2, L_objArray);
 2237 
 2238     //  if (src->klass() != dst->klass()) return -1;
 2239     __ load_klass(rscratch2, dst);
 2240     __ eor(rscratch2, rscratch2, scratch_src_klass);
 2241     __ cbnz(rscratch2, L_failed);
 2242 
 2243     // Check for flat inline type array -> return -1
 2244     __ test_flat_array_oop(src, rscratch2, L_failed);
 2245 
 2246     // Check for null-free (non-flat) inline type array -> handle as object array
 2247     __ test_null_free_array_oop(src, rscratch2, L_objArray);
 2248 
 2249     //  if (!src->is_Array()) return -1;
 2250     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
 2251 
 2252     // At this point, it is known to be a typeArray (array_tag 0x3).
 2253 #ifdef ASSERT
 2254     {
 2255       BLOCK_COMMENT("assert primitive array {");
 2256       Label L;
 2257       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
 2258       __ cmpw(lh, rscratch2);
 2259       __ br(Assembler::GE, L);
 2260       __ stop("must be a primitive array");
 2261       __ bind(L);
 2262       BLOCK_COMMENT("} assert primitive array done");
 2263     }
 2264 #endif
 2265 
 2266     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2267                            rscratch2, L_failed);
 2268 
 2269     // TypeArrayKlass
 2270     //
 2271     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
 2272     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
 2273     //
 2274 
 2275     const Register rscratch1_offset = rscratch1;    // array offset
 2276     const Register r15_elsize = lh; // element size
 2277 
 2278     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
 2279            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
 2280     __ add(src, src, rscratch1_offset);           // src array offset
 2281     __ add(dst, dst, rscratch1_offset);           // dst array offset
 2282     BLOCK_COMMENT("choose copy loop based on element size");
 2283 
 2284     // next registers should be set before the jump to corresponding stub
 2285     const Register from     = c_rarg0;  // source array address
 2286     const Register to       = c_rarg1;  // destination array address
 2287     const Register count    = c_rarg2;  // elements count
 2288 
 2289     // The 'from', 'to' and 'count' registers must be set in this order
 2290     // since they are the same registers as 'src', 'src_pos' and 'dst'.
 2291 
 2292     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
 2293 
 2294     // The possible values of elsize are 0-3, i.e. exact_log2(element
 2295     // size in bytes).  We do a simple bitwise binary search.
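          // The encoding is: 00 -> byte, 01 -> short, 10 -> int, 11 -> long.
          // Bit 1 splits {byte,short} from {int,long}; bit 0 then selects
          // within each pair.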
 2296   __ BIND(L_copy_bytes);
 2297     __ tbnz(r15_elsize, 1, L_copy_ints);
 2298     __ tbnz(r15_elsize, 0, L_copy_shorts);
 2299     __ lea(from, Address(src, src_pos));// src_addr
 2300     __ lea(to,   Address(dst, dst_pos));// dst_addr
 2301     __ movw(count, scratch_length); // length
 2302     __ b(RuntimeAddress(byte_copy_entry));
 2303 
 2304   __ BIND(L_copy_shorts);
 2305     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
 2306     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
 2307     __ movw(count, scratch_length); // length
 2308     __ b(RuntimeAddress(short_copy_entry));
 2309 
 2310   __ BIND(L_copy_ints);
 2311     __ tbnz(r15_elsize, 0, L_copy_longs);
 2312     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
 2313     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
 2314     __ movw(count, scratch_length); // length
 2315     __ b(RuntimeAddress(int_copy_entry));
 2316 
 2317   __ BIND(L_copy_longs);
 2318 #ifdef ASSERT
 2319     {
 2320       BLOCK_COMMENT("assert long copy {");
 2321       Label L;
 2322       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
 2323       __ cmpw(r15_elsize, LogBytesPerLong);
 2324       __ br(Assembler::EQ, L);
 2325       __ stop("must be long copy, but elsize is wrong");
 2326       __ bind(L);
 2327       BLOCK_COMMENT("} assert long copy done");
 2328     }
 2329 #endif
 2330     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
 2331     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
 2332     __ movw(count, scratch_length); // length
 2333     __ b(RuntimeAddress(long_copy_entry));
 2334 
 2335     // ObjArrayKlass
 2336   __ BIND(L_objArray);
 2337     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
 2338 
 2339     Label L_plain_copy, L_checkcast_copy;
 2340     //  test array classes for subtyping
 2341     __ load_klass(r15, dst);
 2342     __ cmp(scratch_src_klass, r15); // usual case is exact equality
 2343     __ br(Assembler::NE, L_checkcast_copy);
 2344 
 2345     // Identically typed arrays can be copied without element-wise checks.
 2346     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2347                            rscratch2, L_failed);
 2348 
 2349     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2350     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2351     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2352     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2353     __ movw(count, scratch_length); // length
 2354   __ BIND(L_plain_copy);
 2355     __ b(RuntimeAddress(oop_copy_entry));
 2356 
 2357   __ BIND(L_checkcast_copy);
 2358     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
 2359     {
 2360       // Before looking at dst.length, make sure dst is also an objArray.
 2361       __ ldrw(rscratch1, Address(r15, lh_offset));
 2362       __ movw(rscratch2, objArray_lh);
 2363       __ eorw(rscratch1, rscratch1, rscratch2);
 2364       __ cbnzw(rscratch1, L_failed);
 2365 
 2366       // It is safe to examine both src.length and dst.length.
 2367       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
 2368                              r15, L_failed);
 2369 
 2370       __ load_klass(dst_klass, dst); // reload
 2371 
 2372       // Marshal the base address arguments now, freeing registers.
 2373       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
 2374       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2375       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
 2376       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
 2377       __ movw(count, length);           // length (reloaded)
 2378       Register sco_temp = c_rarg3;      // this register is free now
 2379       assert_different_registers(from, to, count, sco_temp,
 2380                                  dst_klass, scratch_src_klass);
 2381       // assert_clean_int(count, sco_temp);
 2382 
 2383       // Generate the type check.
 2384       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
 2385       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2386 
 2387       // Smashes rscratch1, rscratch2
 2388       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
 2389                           L_plain_copy);
 2390 
 2391       // Fetch destination element klass from the ObjArrayKlass header.
 2392       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
 2393       __ ldr(dst_klass, Address(dst_klass, ek_offset));
 2394       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
 2395 
 2396       // the checkcast_copy loop needs two extra arguments:
 2397       assert(c_rarg3 == sco_temp, "#3 already in place");
 2398       // Set up arguments for checkcast_copy_entry.
 2399       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
 2400       __ b(RuntimeAddress(checkcast_copy_entry));
 2401     }
 2402 
 2403   __ BIND(L_failed);
 2404     __ mov(r0, -1);
 2405     __ leave();   // required for proper stackwalking of RuntimeStub frame
 2406     __ ret(lr);
 2407 
 2408     return start;
 2409   }
 2410 
 2411   //
 2412   // Generate stub for array fill. If "aligned" is true, the
 2413   // "to" address is assumed to be heapword aligned.
 2414   //
 2415   // Arguments for generated stub:
 2416   //   to:    c_rarg0
 2417   //   value: c_rarg1
 2418   //   count: c_rarg2 treated as signed
 2419   //
 2420   address generate_fill(StubGenStubId stub_id) {
 2421     BasicType t;
 2422     bool aligned;
 2423 
 2424     switch (stub_id) {
 2425     case jbyte_fill_id:
 2426       t = T_BYTE;
 2427       aligned = false;
 2428       break;
 2429     case jshort_fill_id:
 2430       t = T_SHORT;
 2431       aligned = false;
 2432       break;
 2433     case jint_fill_id:
 2434       t = T_INT;
 2435       aligned = false;
 2436       break;
 2437     case arrayof_jbyte_fill_id:
 2438       t = T_BYTE;
 2439       aligned = true;
 2440       break;
 2441     case arrayof_jshort_fill_id:
 2442       t = T_SHORT;
 2443       aligned = true;
 2444       break;
 2445     case arrayof_jint_fill_id:
 2446       t = T_INT;
 2447       aligned = true;
 2448       break;
 2449     default:
 2450       ShouldNotReachHere();
 2451     };
 2452 
 2453     __ align(CodeEntryAlignment);
 2454     StubCodeMark mark(this, stub_id);
 2455     address start = __ pc();
 2456 
 2457     BLOCK_COMMENT("Entry:");
 2458 
 2459     const Register to        = c_rarg0;  // destination array address
 2460     const Register value     = c_rarg1;  // value
 2461     const Register count     = c_rarg2;  // elements count
 2462 
 2463     const Register bz_base = r10;        // base for block_zero routine
 2464     const Register cnt_words = r11;      // temp register
 2465 
 2466     __ enter();
 2467 
 2468     Label L_fill_elements, L_exit1;
 2469 
 2470     int shift = -1;
 2471     switch (t) {
 2472       case T_BYTE:
 2473         shift = 0;
 2474         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2475         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
 2476         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2477         __ br(Assembler::LO, L_fill_elements);
 2478         break;
 2479       case T_SHORT:
 2480         shift = 1;
 2481         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2482         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
 2483         __ br(Assembler::LO, L_fill_elements);
 2484         break;
 2485       case T_INT:
 2486         shift = 2;
 2487         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
 2488         __ br(Assembler::LO, L_fill_elements);
 2489         break;
 2490       default: ShouldNotReachHere();
 2491     }
 2492 
 2493     // Align source address at 8 bytes address boundary.
 2494     Label L_skip_align1, L_skip_align2, L_skip_align4;
 2495     if (!aligned) {
 2496       switch (t) {
 2497         case T_BYTE:
 2498           // One byte misalignment happens only for byte arrays.
 2499           __ tbz(to, 0, L_skip_align1);
 2500           __ strb(value, Address(__ post(to, 1)));
 2501           __ subw(count, count, 1);
 2502           __ bind(L_skip_align1);
 2503           // Fallthrough
 2504         case T_SHORT:
 2505           // Two bytes misalignment happens only for byte and short (char) arrays.
 2506           __ tbz(to, 1, L_skip_align2);
 2507           __ strh(value, Address(__ post(to, 2)));
 2508           __ subw(count, count, 2 >> shift);
 2509           __ bind(L_skip_align2);
 2510           // Fallthrough
 2511         case T_INT:
 2512           // Align to 8 bytes, we know we are 4 byte aligned to start.
 2513           __ tbz(to, 2, L_skip_align4);
 2514           __ strw(value, Address(__ post(to, 4)));
 2515           __ subw(count, count, 4 >> shift);
 2516           __ bind(L_skip_align4);
 2517           break;
 2518         default: ShouldNotReachHere();
 2519       }
 2520     }
 2521 
 2522     //
 2523     //  Fill large chunks
 2524     //
 2525     __ lsrw(cnt_words, count, 3 - shift); // number of words
 2526     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
 2527     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
 2528     if (UseBlockZeroing) {
 2529       Label non_block_zeroing, rest;
 2530       // If the fill value is zero we can use the fast zero_words().
 2531       __ cbnz(value, non_block_zeroing);
 2532       __ mov(bz_base, to);
 2533       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
 2534       address tpc = __ zero_words(bz_base, cnt_words);
 2535       if (tpc == nullptr) {
 2536         fatal("CodeCache is full at generate_fill");
 2537       }
 2538       __ b(rest);
 2539       __ bind(non_block_zeroing);
 2540       __ fill_words(to, cnt_words, value);
 2541       __ bind(rest);
 2542     } else {
 2543       __ fill_words(to, cnt_words, value);
 2544     }
 2545 
 2546     // The remaining count is less than 8 bytes. Fill it with a single store.
 2547     // Note that the total length is no less than 8 bytes.
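          // For instance, a byte fill with 3 bytes left over advances 'to' to
          // the end of the array and stores 8 bytes at (end - 8), rewriting 5
          // bytes that the bulk loop has already filled with the same value.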
 2548     if (t == T_BYTE || t == T_SHORT) {
 2549       Label L_exit1;
 2550       __ cbzw(count, L_exit1);
 2551       __ add(to, to, count, Assembler::LSL, shift); // points to the end
 2552       __ str(value, Address(to, -8));    // overwrite some elements
 2553       __ bind(L_exit1);
 2554       __ leave();
 2555       __ ret(lr);
 2556     }
 2557 
 2558     // Handle fills of less than 8 bytes.
 2559     Label L_fill_2, L_fill_4, L_exit2;
 2560     __ bind(L_fill_elements);
 2561     switch (t) {
 2562       case T_BYTE:
 2563         __ tbz(count, 0, L_fill_2);
 2564         __ strb(value, Address(__ post(to, 1)));
 2565         __ bind(L_fill_2);
 2566         __ tbz(count, 1, L_fill_4);
 2567         __ strh(value, Address(__ post(to, 2)));
 2568         __ bind(L_fill_4);
 2569         __ tbz(count, 2, L_exit2);
 2570         __ strw(value, Address(to));
 2571         break;
 2572       case T_SHORT:
 2573         __ tbz(count, 0, L_fill_4);
 2574         __ strh(value, Address(__ post(to, 2)));
 2575         __ bind(L_fill_4);
 2576         __ tbz(count, 1, L_exit2);
 2577         __ strw(value, Address(to));
 2578         break;
 2579       case T_INT:
 2580         __ cbzw(count, L_exit2);
 2581         __ strw(value, Address(to));
 2582         break;
 2583       default: ShouldNotReachHere();
 2584     }
 2585     __ bind(L_exit2);
 2586     __ leave();
 2587     __ ret(lr);
 2588     return start;
 2589   }
 2590 
 2591   address generate_data_cache_writeback() {
 2592     const Register line        = c_rarg0;  // address of line to write back
 2593 
 2594     __ align(CodeEntryAlignment);
 2595 
 2596     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id;
 2597     StubCodeMark mark(this, stub_id);
 2598 
 2599     address start = __ pc();
 2600     __ enter();
 2601     __ cache_wb(Address(line, 0));
 2602     __ leave();
 2603     __ ret(lr);
 2604 
 2605     return start;
 2606   }
 2607 
 2608   address generate_data_cache_writeback_sync() {
 2609     const Register is_pre     = c_rarg0;  // pre or post sync
 2610 
 2611     __ align(CodeEntryAlignment);
 2612 
 2613     StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id;
 2614     StubCodeMark mark(this, stub_id);
 2615 
 2616     // pre wbsync is a no-op
 2617     // post wbsync translates to a memory barrier
 2618 
 2619     Label skip;
 2620     address start = __ pc();
 2621     __ enter();
 2622     __ cbnz(is_pre, skip);
 2623     __ cache_wbsync(false);
 2624     __ bind(skip);
 2625     __ leave();
 2626     __ ret(lr);
 2627 
 2628     return start;
 2629   }
 2630 
 2631   void generate_arraycopy_stubs() {
 2632     address entry;
 2633     address entry_jbyte_arraycopy;
 2634     address entry_jshort_arraycopy;
 2635     address entry_jint_arraycopy;
 2636     address entry_oop_arraycopy;
 2637     address entry_jlong_arraycopy;
 2638     address entry_checkcast_arraycopy;
 2639 
 2640     generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15);
 2641     generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15);
 2642 
 2643     generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15);
 2644     generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15);
 2645 
 2646     generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15);
 2647     generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15);
 2648 
 2649     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
 2650 
 2651     //*** jbyte
 2652     // Always need aligned and unaligned versions
 2653     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry);
 2654     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy);
 2655     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry);
 2656     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr);
 2657 
 2658     //*** jshort
 2659     // Always need aligned and unaligned versions
 2660     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry);
 2661     StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy);
 2662     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry);
 2663     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr);
 2664 
 2665     //*** jint
 2666     // Aligned versions
 2667     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry);
 2668     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2669     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
 2670     // entry_jint_arraycopy always points to the unaligned version
 2671     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry);
 2672     StubRoutines::_jint_arraycopy                  = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy);
 2673 
 2674     //*** jlong
 2675     // It is always aligned
 2676     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry);
 2677     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy);
 2678     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
 2679     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
 2680 
 2681     //*** oops
 2682     {
 2683       // With compressed oops we need unaligned versions; notice that
 2684       // we overwrite entry_oop_arraycopy.
 2685       bool aligned = !UseCompressedOops;
 2686 
 2687       StubRoutines::_arrayof_oop_disjoint_arraycopy
 2688         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry);
 2689       StubRoutines::_arrayof_oop_arraycopy
 2690         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy);
 2691       // Aligned versions without pre-barriers
 2692       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
 2693         = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry);
 2694       StubRoutines::_arrayof_oop_arraycopy_uninit
 2695         = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr);
 2696     }
 2697 
 2698     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
 2699     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
 2700     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
 2701     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
 2702 
 2703     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy);
 2704     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr);
 2705 
 2706     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy(entry_jbyte_arraycopy,
 2707                                                               entry_jshort_arraycopy,
 2708                                                               entry_jint_arraycopy,
 2709                                                               entry_jlong_arraycopy);
 2710 
 2711     StubRoutines::_generic_arraycopy   = generate_generic_copy(entry_jbyte_arraycopy,
 2712                                                                entry_jshort_arraycopy,
 2713                                                                entry_jint_arraycopy,
 2714                                                                entry_oop_arraycopy,
 2715                                                                entry_jlong_arraycopy,
 2716                                                                entry_checkcast_arraycopy);
 2717 
 2718     StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id);
 2719     StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id);
 2720     StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id);
 2721     StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id);
 2722     StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id);
 2723     StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id);
 2724   }
 2725 
 2726   void generate_math_stubs() { Unimplemented(); }
 2727 
 2728   // Arguments:
 2729   //
 2730   // Inputs:
 2731   //   c_rarg0   - source byte array address
 2732   //   c_rarg1   - destination byte array address
 2733   //   c_rarg2   - K (key) in little endian int array
 2734   //
 2735   address generate_aescrypt_encryptBlock() {
 2736     __ align(CodeEntryAlignment);
 2737     StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id;
 2738     StubCodeMark mark(this, stub_id);
 2739 
 2740     const Register from        = c_rarg0;  // source array address
 2741     const Register to          = c_rarg1;  // destination array address
 2742     const Register key         = c_rarg2;  // key array address
 2743     const Register keylen      = rscratch1;
 2744 
 2745     address start = __ pc();
 2746     __ enter();
 2747 
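          // The expanded key length (in ints) encodes the AES variant:
          // 44, 52 or 60 ints for AES-128/192/256, i.e. 10, 12 or 14 rounds.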
 2748     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2749 
 2750     __ aesenc_loadkeys(key, keylen);
 2751     __ aesecb_encrypt(from, to, keylen);
 2752 
 2753     __ mov(r0, 0);
 2754 
 2755     __ leave();
 2756     __ ret(lr);
 2757 
 2758     return start;
 2759   }
 2760 
 2761   // Arguments:
 2762   //
 2763   // Inputs:
 2764   //   c_rarg0   - source byte array address
 2765   //   c_rarg1   - destination byte array address
 2766   //   c_rarg2   - K (key) in little endian int array
 2767   //
 2768   address generate_aescrypt_decryptBlock() {
 2769     assert(UseAES, "need AES cryptographic extension support");
 2770     __ align(CodeEntryAlignment);
 2771     StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id;
 2772     StubCodeMark mark(this, stub_id);
 2773     Label L_doLast;
 2774 
 2775     const Register from        = c_rarg0;  // source array address
 2776     const Register to          = c_rarg1;  // destination array address
 2777     const Register key         = c_rarg2;  // key array address
 2778     const Register keylen      = rscratch1;
 2779 
 2780     address start = __ pc();
 2781     __ enter(); // required for proper stackwalking of RuntimeStub frame
 2782 
 2783     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2784 
 2785     __ aesecb_decrypt(from, to, key, keylen);
 2786 
 2787     __ mov(r0, 0);
 2788 
 2789     __ leave();
 2790     __ ret(lr);
 2791 
 2792     return start;
 2793   }
 2794 
 2795   // Arguments:
 2796   //
 2797   // Inputs:
 2798   //   c_rarg0   - source byte array address
 2799   //   c_rarg1   - destination byte array address
 2800   //   c_rarg2   - K (key) in little endian int array
 2801   //   c_rarg3   - r vector byte array address
 2802   //   c_rarg4   - input length
 2803   //
 2804   // Output:
 2805   //   x0        - input length
 2806   //
 2807   address generate_cipherBlockChaining_encryptAESCrypt() {
 2808     assert(UseAES, "need AES cryptographic extension support");
 2809     __ align(CodeEntryAlignment);
 2810     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id;
 2811     StubCodeMark mark(this, stub_id);
 2812 
 2813     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2814 
 2815     const Register from        = c_rarg0;  // source array address
 2816     const Register to          = c_rarg1;  // destination array address
 2817     const Register key         = c_rarg2;  // key array address
 2818     const Register rvec        = c_rarg3;  // r byte array initialized from the init vector and left
 2819                                            // holding the last encrypted block (the IV for a subsequent call)
 2820     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2821     const Register keylen      = rscratch1;
 2822 
 2823     address start = __ pc();
 2824 
 2825       __ enter();
 2826 
 2827       __ movw(rscratch2, len_reg);
 2828 
 2829       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2830 
 2831       __ ld1(v0, __ T16B, rvec);
 2832 
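            // Dispatch on key size: keylen < 52 means AES-128 (11 round keys, jump
            // to L_loadkeys_44), keylen == 52 means AES-192 (13 round keys, jump to
            // L_loadkeys_52); otherwise fall through and load all 15 round keys for
            // AES-256.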
 2833       __ cmpw(keylen, 52);
 2834       __ br(Assembler::CC, L_loadkeys_44);
 2835       __ br(Assembler::EQ, L_loadkeys_52);
 2836 
 2837       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2838       __ rev32(v17, __ T16B, v17);
 2839       __ rev32(v18, __ T16B, v18);
 2840     __ BIND(L_loadkeys_52);
 2841       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2842       __ rev32(v19, __ T16B, v19);
 2843       __ rev32(v20, __ T16B, v20);
 2844     __ BIND(L_loadkeys_44);
 2845       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2846       __ rev32(v21, __ T16B, v21);
 2847       __ rev32(v22, __ T16B, v22);
 2848       __ rev32(v23, __ T16B, v23);
 2849       __ rev32(v24, __ T16B, v24);
 2850       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2851       __ rev32(v25, __ T16B, v25);
 2852       __ rev32(v26, __ T16B, v26);
 2853       __ rev32(v27, __ T16B, v27);
 2854       __ rev32(v28, __ T16B, v28);
 2855       __ ld1(v29, v30, v31, __ T16B, key);
 2856       __ rev32(v29, __ T16B, v29);
 2857       __ rev32(v30, __ T16B, v30);
 2858       __ rev32(v31, __ T16B, v31);
 2859 
 2860     __ BIND(L_aes_loop);
 2861       __ ld1(v1, __ T16B, __ post(from, 16));
 2862       __ eor(v0, __ T16B, v0, v1);
 2863 
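            // The NZCV flags still hold the result of cmpw(keylen, 52) above:
            // nothing executed since then (vector loads/stores, AES rounds, subw,
            // cbnzw) modifies the flags, so we can re-dispatch on the key size here.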
 2864       __ br(Assembler::CC, L_rounds_44);
 2865       __ br(Assembler::EQ, L_rounds_52);
 2866 
 2867       __ aese(v0, v17); __ aesmc(v0, v0);
 2868       __ aese(v0, v18); __ aesmc(v0, v0);
 2869     __ BIND(L_rounds_52);
 2870       __ aese(v0, v19); __ aesmc(v0, v0);
 2871       __ aese(v0, v20); __ aesmc(v0, v0);
 2872     __ BIND(L_rounds_44);
 2873       __ aese(v0, v21); __ aesmc(v0, v0);
 2874       __ aese(v0, v22); __ aesmc(v0, v0);
 2875       __ aese(v0, v23); __ aesmc(v0, v0);
 2876       __ aese(v0, v24); __ aesmc(v0, v0);
 2877       __ aese(v0, v25); __ aesmc(v0, v0);
 2878       __ aese(v0, v26); __ aesmc(v0, v0);
 2879       __ aese(v0, v27); __ aesmc(v0, v0);
 2880       __ aese(v0, v28); __ aesmc(v0, v0);
 2881       __ aese(v0, v29); __ aesmc(v0, v0);
 2882       __ aese(v0, v30);
 2883       __ eor(v0, __ T16B, v0, v31);
 2884 
 2885       __ st1(v0, __ T16B, __ post(to, 16));
 2886 
 2887       __ subw(len_reg, len_reg, 16);
 2888       __ cbnzw(len_reg, L_aes_loop);
 2889 
 2890       __ st1(v0, __ T16B, rvec);
 2891 
 2892       __ mov(r0, rscratch2);
 2893 
 2894       __ leave();
 2895       __ ret(lr);
 2896 
 2897       return start;
 2898   }
 2899 
 2900   // Arguments:
 2901   //
 2902   // Inputs:
 2903   //   c_rarg0   - source byte array address
 2904   //   c_rarg1   - destination byte array address
 2905   //   c_rarg2   - K (key) in little endian int array
 2906   //   c_rarg3   - r vector byte array address
 2907   //   c_rarg4   - input length
 2908   //
 2909   // Output:
 2910   //   r0        - input length
 2911   //
 2912   address generate_cipherBlockChaining_decryptAESCrypt() {
 2913     assert(UseAES, "need AES cryptographic extension support");
 2914     __ align(CodeEntryAlignment);
 2915     StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id;
 2916     StubCodeMark mark(this, stub_id);
 2917 
 2918     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
 2919 
 2920     const Register from        = c_rarg0;  // source array address
 2921     const Register to          = c_rarg1;  // destination array address
 2922     const Register key         = c_rarg2;  // key array address
 2923     const Register rvec        = c_rarg3;  // r byte array initialized from the init vector and left
 2924                                            // holding the last ciphertext block (the IV for a subsequent call)
 2925     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
 2926     const Register keylen      = rscratch1;
 2927 
 2928     address start = __ pc();
 2929 
 2930       __ enter();
 2931 
 2932       __ movw(rscratch2, len_reg);
 2933 
 2934       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 2935 
 2936       __ ld1(v2, __ T16B, rvec);
 2937 
 2938       __ ld1(v31, __ T16B, __ post(key, 16));
 2939       __ rev32(v31, __ T16B, v31);
 2940 
 2941       __ cmpw(keylen, 52);
 2942       __ br(Assembler::CC, L_loadkeys_44);
 2943       __ br(Assembler::EQ, L_loadkeys_52);
 2944 
 2945       __ ld1(v17, v18, __ T16B, __ post(key, 32));
 2946       __ rev32(v17, __ T16B, v17);
 2947       __ rev32(v18, __ T16B, v18);
 2948     __ BIND(L_loadkeys_52);
 2949       __ ld1(v19, v20, __ T16B, __ post(key, 32));
 2950       __ rev32(v19, __ T16B, v19);
 2951       __ rev32(v20, __ T16B, v20);
 2952     __ BIND(L_loadkeys_44);
 2953       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
 2954       __ rev32(v21, __ T16B, v21);
 2955       __ rev32(v22, __ T16B, v22);
 2956       __ rev32(v23, __ T16B, v23);
 2957       __ rev32(v24, __ T16B, v24);
 2958       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
 2959       __ rev32(v25, __ T16B, v25);
 2960       __ rev32(v26, __ T16B, v26);
 2961       __ rev32(v27, __ T16B, v27);
 2962       __ rev32(v28, __ T16B, v28);
 2963       __ ld1(v29, v30, __ T16B, key);
 2964       __ rev32(v29, __ T16B, v29);
 2965       __ rev32(v30, __ T16B, v30);
 2966 
 2967     __ BIND(L_aes_loop);
 2968       __ ld1(v0, __ T16B, __ post(from, 16));
 2969       __ orr(v1, __ T16B, v0, v0);
 2970 
 2971       __ br(Assembler::CC, L_rounds_44);
 2972       __ br(Assembler::EQ, L_rounds_52);
 2973 
 2974       __ aesd(v0, v17); __ aesimc(v0, v0);
 2975       __ aesd(v0, v18); __ aesimc(v0, v0);
 2976     __ BIND(L_rounds_52);
 2977       __ aesd(v0, v19); __ aesimc(v0, v0);
 2978       __ aesd(v0, v20); __ aesimc(v0, v0);
 2979     __ BIND(L_rounds_44);
 2980       __ aesd(v0, v21); __ aesimc(v0, v0);
 2981       __ aesd(v0, v22); __ aesimc(v0, v0);
 2982       __ aesd(v0, v23); __ aesimc(v0, v0);
 2983       __ aesd(v0, v24); __ aesimc(v0, v0);
 2984       __ aesd(v0, v25); __ aesimc(v0, v0);
 2985       __ aesd(v0, v26); __ aesimc(v0, v0);
 2986       __ aesd(v0, v27); __ aesimc(v0, v0);
 2987       __ aesd(v0, v28); __ aesimc(v0, v0);
 2988       __ aesd(v0, v29); __ aesimc(v0, v0);
 2989       __ aesd(v0, v30);
 2990       __ eor(v0, __ T16B, v0, v31);
 2991       __ eor(v0, __ T16B, v0, v2);
 2992 
 2993       __ st1(v0, __ T16B, __ post(to, 16));
 2994       __ orr(v2, __ T16B, v1, v1);
 2995 
 2996       __ subw(len_reg, len_reg, 16);
 2997       __ cbnzw(len_reg, L_aes_loop);
 2998 
 2999       __ st1(v2, __ T16B, rvec);
 3000 
 3001       __ mov(r0, rscratch2);
 3002 
 3003       __ leave();
 3004       __ ret(lr);
 3005 
 3006     return start;
 3007   }
 3008 
 3009   // Big-endian 128-bit + 64-bit -> 128-bit addition.
 3010   // Inputs: a 128-bit value in `in` (preserved) and a 64-bit increment in `inc`
 3011   // (preserved; its lower dword must be zero). The least-significant 64-bit
 3012   // word of each value sits in the upper dword of its vector register.
 3013   // Output: result = in + inc.
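        // Reference semantics, as a scalar sketch of what the vector sequence below
        // computes: with the 128-bit input split into 64-bit halves {hi, lo} and the
        // 64-bit increment inc,
        //   lo' = lo + inc;                  // addv on the T2D lanes
        //   hi' = hi + (lo' < inc ? 1 : 0);  // carry recovered by the cm/ext/subv sequence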
 3014   void be_add_128_64(FloatRegister result, FloatRegister in,
 3015                      FloatRegister inc, FloatRegister tmp) {
 3016     assert_different_registers(result, tmp, inc);
 3017 
 3018     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
 3019                                            // input
 3020     __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
 3021     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
 3022                                            // MSD == 0 (must be!) to LSD
 3023     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
 3024   }
 3025 
 3026   // CTR AES crypt.
 3027   // Arguments:
 3028   //
 3029   // Inputs:
 3030   //   c_rarg0   - source byte array address
 3031   //   c_rarg1   - destination byte array address
 3032   //   c_rarg2   - K (key) in little endian int array
 3033   //   c_rarg3   - counter vector byte array address
 3034   //   c_rarg4   - input length
 3035   //   c_rarg5   - saved encryptedCounter start
 3036   //   c_rarg6   - saved used length
 3037   //
 3038   // Output:
 3039   //   r0       - input length
 3040   //
 3041   address generate_counterMode_AESCrypt() {
 3042     const Register in = c_rarg0;
 3043     const Register out = c_rarg1;
 3044     const Register key = c_rarg2;
 3045     const Register counter = c_rarg3;
 3046     const Register saved_len = c_rarg4, len = r10;
 3047     const Register saved_encrypted_ctr = c_rarg5;
 3048     const Register used_ptr = c_rarg6, used = r12;
 3049 
 3050     const Register offset = r7;
 3051     const Register keylen = r11;
 3052 
 3053     const unsigned char block_size = 16;
 3054     const int bulk_width = 4;
 3055     // NB: bulk_width can be 4 or 8. 8 gives slightly better
 3056     // performance with larger data sizes, but it also means that the
 3057     // fast path isn't used until there are at least 8 blocks, and up
 3058     // to 127 bytes of data will be processed on the slow path. For
 3059     // that reason, and also so as not to blow away too much icache, 4
 3060     // blocks seems like a sensible compromise.
 3061 
 3062     // Algorithm:
 3063     //
 3064     //    if (len == 0) {
 3065     //        goto DONE;
 3066     //    }
 3067     //    int result = len;
 3068     //    do {
 3069     //        if (used >= blockSize) {
 3070     //            if (len >= bulk_width * blockSize) {
 3071     //                CTR_large_block();
 3072     //                if (len == 0)
 3073     //                    goto DONE;
 3074     //            }
 3075     //            for (;;) {
 3076     //                16ByteVector v0 = counter;
 3077     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
 3078     //                used = 0;
 3079     //                if (len < blockSize)
 3080     //                    break;    /* goto NEXT */
 3081     //                16ByteVector v1 = load16Bytes(in, offset);
 3082     //                v1 = v1 ^ encryptedCounter;
 3083     //                store16Bytes(v1, out, offset);
 3084     //                used = blockSize;
 3085     //                offset += blockSize;
 3086     //                len -= blockSize;
 3087     //                if (len == 0)
 3088     //                    goto DONE;
 3089     //            }
 3090     //        }
 3091     //      NEXT:
 3092     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
 3093     //        len--;
 3094     //    } while (len != 0);
 3095     //  DONE:
 3096     //    return result;
 3097     //
 3098     // CTR_large_block()
 3099     //    Wide bulk encryption of whole blocks.
 3100 
 3101     __ align(CodeEntryAlignment);
 3102     StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id;
 3103     StubCodeMark mark(this, stub_id);
 3104     const address start = __ pc();
 3105     __ enter();
 3106 
 3107     Label DONE, CTR_large_block, large_block_return;
 3108     __ ldrw(used, Address(used_ptr));
 3109     __ cbzw(saved_len, DONE);
 3110 
 3111     __ mov(len, saved_len);
 3112     __ mov(offset, 0);
 3113 
 3114     // Compute #rounds for AES based on the length of the key array
 3115     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3116 
 3117     __ aesenc_loadkeys(key, keylen);
 3118 
 3119     {
 3120       Label L_CTR_loop, NEXT;
 3121 
 3122       __ bind(L_CTR_loop);
 3123 
 3124       __ cmp(used, block_size);
 3125       __ br(__ LO, NEXT);
 3126 
 3127       // Maybe we have a lot of data
 3128       __ subsw(rscratch1, len, bulk_width * block_size);
 3129       __ br(__ HS, CTR_large_block);
 3130       __ BIND(large_block_return);
 3131       __ cbzw(len, DONE);
 3132 
 3133       // Setup the counter
 3134       __ movi(v4, __ T4S, 0);
 3135       __ movi(v5, __ T4S, 1);
 3136       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
 3137 
 3138       // 128-bit big-endian increment
 3139       __ ld1(v0, __ T16B, counter);
 3140       __ rev64(v16, __ T16B, v0);
 3141       be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3142       __ rev64(v16, __ T16B, v16);
 3143       __ st1(v16, __ T16B, counter);
 3144       // Previous counter value is in v0
 3145       // v4 contains { 0, 1 }
 3146 
 3147       {
 3148         // We have fewer than bulk_width blocks of data left. Encrypt
 3149         // them one by one until there is less than a full block
 3150         // remaining, being careful to save both the encrypted counter
 3151         // and the counter.
 3152 
 3153         Label inner_loop;
 3154         __ bind(inner_loop);
 3155         // Counter to encrypt is in v0
 3156         __ aesecb_encrypt(noreg, noreg, keylen);
 3157         __ st1(v0, __ T16B, saved_encrypted_ctr);
 3158 
 3159         // Do we have a remaining full block?
 3160 
 3161         __ mov(used, 0);
 3162         __ cmp(len, block_size);
 3163         __ br(__ LO, NEXT);
 3164 
 3165         // Yes, we have a full block
 3166         __ ldrq(v1, Address(in, offset));
 3167         __ eor(v1, __ T16B, v1, v0);
 3168         __ strq(v1, Address(out, offset));
 3169         __ mov(used, block_size);
 3170         __ add(offset, offset, block_size);
 3171 
 3172         __ subw(len, len, block_size);
 3173         __ cbzw(len, DONE);
 3174 
 3175         // Increment the counter, store it back
 3176         __ orr(v0, __ T16B, v16, v16);
 3177         __ rev64(v16, __ T16B, v16);
 3178         be_add_128_64(v16, v16, v4, /*tmp*/v5);
 3179         __ rev64(v16, __ T16B, v16);
 3180         __ st1(v16, __ T16B, counter); // Save the incremented counter back
 3181 
 3182         __ b(inner_loop);
 3183       }
 3184 
 3185       __ BIND(NEXT);
 3186 
 3187       // Encrypt a single byte, and loop.
 3188       // We expect this to be a rare event.
 3189       __ ldrb(rscratch1, Address(in, offset));
 3190       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
 3191       __ eor(rscratch1, rscratch1, rscratch2);
 3192       __ strb(rscratch1, Address(out, offset));
 3193       __ add(offset, offset, 1);
 3194       __ add(used, used, 1);
 3195       __ subw(len, len, 1);
 3196       __ cbnzw(len, L_CTR_loop);
 3197     }
 3198 
 3199     __ bind(DONE);
 3200     __ strw(used, Address(used_ptr));
 3201     __ mov(r0, saved_len);
 3202 
 3203     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3204     __ ret(lr);
 3205 
 3206     // Bulk encryption
 3207 
 3208     __ BIND (CTR_large_block);
 3209     assert(bulk_width == 4 || bulk_width == 8, "must be");
 3210 
 3211     if (bulk_width == 8) {
 3212       __ sub(sp, sp, 4 * 16);
 3213       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3214     }
 3215     __ sub(sp, sp, 4 * 16);
 3216     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3217     RegSet saved_regs = (RegSet::of(in, out, offset)
 3218                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
 3219     __ push(saved_regs, sp);
 3220     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
 3221     __ add(in, in, offset);
 3222     __ add(out, out, offset);
 3223 
 3224     // Keys should already be loaded into the correct registers
 3225 
 3226     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3227     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
 3228 
 3229     // AES/CTR loop
 3230     {
 3231       Label L_CTR_loop;
 3232       __ BIND(L_CTR_loop);
 3233 
 3234       // Setup the counters
 3235       __ movi(v8, __ T4S, 0);
 3236       __ movi(v9, __ T4S, 1);
 3237       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
 3238 
 3239       for (int i = 0; i < bulk_width; i++) {
 3240         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3241         __ rev64(v0_ofs, __ T16B, v16);
 3242         be_add_128_64(v16, v16, v8, /*tmp*/v9);
 3243       }
 3244 
 3245       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3246 
 3247       // Encrypt the counters
 3248       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
 3249 
 3250       if (bulk_width == 8) {
 3251         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3252       }
 3253 
 3254       // XOR the encrypted counters with the inputs
 3255       for (int i = 0; i < bulk_width; i++) {
 3256         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3257         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3258         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3259       }
 3260 
 3261       // Write the encrypted data
 3262       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3263       if (bulk_width == 8) {
 3264         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3265       }
 3266 
 3267       __ subw(len, len, 16 * bulk_width);
 3268       __ cbnzw(len, L_CTR_loop);
 3269     }
 3270 
 3271     // Save the counter back where it goes
 3272     __ rev64(v16, __ T16B, v16);
 3273     __ st1(v16, __ T16B, counter);
 3274 
 3275     __ pop(saved_regs, sp);
 3276 
 3277     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3278     if (bulk_width == 8) {
 3279       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3280     }
 3281 
 3282     __ andr(rscratch1, len, -16 * bulk_width);
 3283     __ sub(len, len, rscratch1);
 3284     __ add(offset, offset, rscratch1);
 3285     __ mov(used, 16);
 3286     __ strw(used, Address(used_ptr));
 3287     __ b(large_block_return);
 3288 
 3289     return start;
 3290   }
 3291 
 3292   // Vector AES Galois Counter Mode implementation. Parameters:
 3293   //
 3294   // in = c_rarg0
 3295   // len = c_rarg1
 3296   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
 3297   // out = c_rarg3
 3298   // key = c_rarg4
 3299   // state = c_rarg5 - GHASH.state
 3300   // subkeyHtbl = c_rarg6 - powers of H
 3301   // counter = c_rarg7 - 16 bytes of CTR
 3302   // return - number of processed bytes
 3303   address generate_galoisCounterMode_AESCrypt() {
 3304     address ghash_polynomial = __ pc();
 3305     __ emit_int64(0x87);  // The low-order bits of the field
 3306                           // polynomial (i.e. p = z^7+z^2+z+1)
 3307                           // repeated in the low and high parts of a
 3308                           // 128-bit vector
 3309     __ emit_int64(0x87);
 3310 
 3311     __ align(CodeEntryAlignment);
 3312     StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id;
 3313     StubCodeMark mark(this, stub_id);
 3314     address start = __ pc();
 3315     __ enter();
 3316 
 3317     const Register in = c_rarg0;
 3318     const Register len = c_rarg1;
 3319     const Register ct = c_rarg2;
 3320     const Register out = c_rarg3;
 3322 
 3323     const Register key = c_rarg4;
 3324     const Register state = c_rarg5;
 3325 
 3326     const Register subkeyHtbl = c_rarg6;
 3327 
 3328     const Register counter = c_rarg7;  // 16 bytes of CTR; read on entry and updated with the incremented counter on exit
 3329 
 3330     const Register keylen = r10;
 3331     // Save state before entering routine
 3332     __ sub(sp, sp, 4 * 16);
 3333     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 3334     __ sub(sp, sp, 4 * 16);
 3335     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 3336 
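          // This stub only processes whole 8-block (128-byte) chunks; the number of
          // bytes actually handled is returned, and the caller deals with any tail.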
 3338     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
 3339     __ str(len, __ pre(sp, -2 * wordSize));
 3340 
 3341     Label DONE;
 3342     __ cbz(len, DONE);
 3343 
 3344     // Compute #rounds for AES based on the length of the key array
 3345     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 3346 
 3347     __ aesenc_loadkeys(key, keylen);
 3348     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
 3349     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
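          // GCM increments only the low-order 32 bits of the counter block, so the
          // counter is byte-reversed per 32-bit word (rev32) and bumped below with a
          // lane-wise T4S add of { 0, 0, 0, 1 }.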
 3350 
 3351     // AES/CTR loop
 3352     {
 3353       Label L_CTR_loop;
 3354       __ BIND(L_CTR_loop);
 3355 
 3356       // Setup the counters
 3357       __ movi(v8, __ T4S, 0);
 3358       __ movi(v9, __ T4S, 1);
 3359       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
 3360 
 3361       assert(v0->encoding() < v8->encoding(), "");
 3362       for (int i = v0->encoding(); i < v8->encoding(); i++) {
 3363         FloatRegister f = as_FloatRegister(i);
 3364         __ rev32(f, __ T16B, v16);
 3365         __ addv(v16, __ T4S, v16, v8);
 3366       }
 3367 
 3368       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
 3369 
 3370       // Encrypt the counters
 3371       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
 3372 
 3373       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
 3374 
 3375       // XOR the encrypted counters with the inputs
 3376       for (int i = 0; i < 8; i++) {
 3377         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
 3378         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
 3379         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
 3380       }
 3381       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
 3382       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
 3383 
 3384       __ subw(len, len, 16 * 8);
 3385       __ cbnzw(len, L_CTR_loop);
 3386     }
 3387 
 3388     __ rev32(v16, __ T16B, v16);
 3389     __ st1(v16, __ T16B, counter);
 3390 
 3391     __ ldr(len, Address(sp));
 3392     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
 3393 
 3394     // GHASH/CTR loop
 3395     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
 3396                                 len, /*unrolls*/4);
 3397 
 3398 #ifdef ASSERT
 3399     { Label L;
 3400       __ cmp(len, (unsigned char)0);
 3401       __ br(Assembler::EQ, L);
 3402       __ stop("stubGenerator: abort");
 3403       __ bind(L);
 3404     }
 3405 #endif
 3406 
 3407     __ bind(DONE);
 3408     // Return the number of bytes processed
 3409     __ ldr(r0, __ post(sp, 2 * wordSize));
 3410 
 3411     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 3412     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 3413 
 3414     __ leave(); // required for proper stackwalking of RuntimeStub frame
 3415     __ ret(lr);
 3416     return start;
 3417   }
 3418 
 3419   class Cached64Bytes {
 3420   private:
 3421     MacroAssembler *_masm;
 3422     Register _regs[8];
 3423 
 3424   public:
 3425     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
 3426       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
 3427       auto it = rs.begin();
 3428       for (auto &r: _regs) {
 3429         r = *it;
 3430         ++it;
 3431       }
 3432     }
 3433 
 3434     void gen_loads(Register base) {
 3435       for (int i = 0; i < 8; i += 2) {
 3436         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
 3437       }
 3438     }
 3439 
 3440     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
 3441     void extract_u32(Register dest, int i) {
 3442       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
 3443     }
 3444   };
 3445 
 3446   // Utility routines for md5.
 3447   // Clobbers r10 and r11.
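        // Each helper computes one MD5 round operation of the form
        //   a = b + rol(a + f(b, c, d) + X[k] + T[i], s)
        // where, per RFC 1321,
        //   FF: f(x, y, z) = (x & y) | (~x & z)   (computed here as ((y ^ z) & x) ^ z)
        //   GG: f(x, y, z) = (x & z) | (y & ~z)
        //   HH: f(x, y, z) = x ^ y ^ z
        //   II: f(x, y, z) = y ^ (x | ~z)
        // The rotate-left by s is expressed as rorw by (32 - s).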
 3448   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3449               int k, int s, int t) {
 3450     Register rscratch3 = r10;
 3451     Register rscratch4 = r11;
 3452 
 3453     __ eorw(rscratch3, r3, r4);
 3454     __ movw(rscratch2, t);
 3455     __ andw(rscratch3, rscratch3, r2);
 3456     __ addw(rscratch4, r1, rscratch2);
 3457     reg_cache.extract_u32(rscratch1, k);
 3458     __ eorw(rscratch3, rscratch3, r4);
 3459     __ addw(rscratch4, rscratch4, rscratch1);
 3460     __ addw(rscratch3, rscratch3, rscratch4);
 3461     __ rorw(rscratch2, rscratch3, 32 - s);
 3462     __ addw(r1, rscratch2, r2);
 3463   }
 3464 
 3465   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3466               int k, int s, int t) {
 3467     Register rscratch3 = r10;
 3468     Register rscratch4 = r11;
 3469 
 3470     reg_cache.extract_u32(rscratch1, k);
 3471     __ movw(rscratch2, t);
 3472     __ addw(rscratch4, r1, rscratch2);
 3473     __ addw(rscratch4, rscratch4, rscratch1);
 3474     __ bicw(rscratch2, r3, r4);
 3475     __ andw(rscratch3, r2, r4);
 3476     __ addw(rscratch2, rscratch2, rscratch4);
 3477     __ addw(rscratch2, rscratch2, rscratch3);
 3478     __ rorw(rscratch2, rscratch2, 32 - s);
 3479     __ addw(r1, rscratch2, r2);
 3480   }
 3481 
 3482   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3483               int k, int s, int t) {
 3484     Register rscratch3 = r10;
 3485     Register rscratch4 = r11;
 3486 
 3487     __ eorw(rscratch3, r3, r4);
 3488     __ movw(rscratch2, t);
 3489     __ addw(rscratch4, r1, rscratch2);
 3490     reg_cache.extract_u32(rscratch1, k);
 3491     __ eorw(rscratch3, rscratch3, r2);
 3492     __ addw(rscratch4, rscratch4, rscratch1);
 3493     __ addw(rscratch3, rscratch3, rscratch4);
 3494     __ rorw(rscratch2, rscratch3, 32 - s);
 3495     __ addw(r1, rscratch2, r2);
 3496   }
 3497 
 3498   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
 3499               int k, int s, int t) {
 3500     Register rscratch3 = r10;
 3501     Register rscratch4 = r11;
 3502 
 3503     __ movw(rscratch3, t);
 3504     __ ornw(rscratch2, r2, r4);
 3505     __ addw(rscratch4, r1, rscratch3);
 3506     reg_cache.extract_u32(rscratch1, k);
 3507     __ eorw(rscratch3, rscratch2, r3);
 3508     __ addw(rscratch4, rscratch4, rscratch1);
 3509     __ addw(rscratch3, rscratch3, rscratch4);
 3510     __ rorw(rscratch2, rscratch3, 32 - s);
 3511     __ addw(r1, rscratch2, r2);
 3512   }
 3513 
 3514   // Arguments:
 3515   //
 3516   // Inputs:
 3517   //   c_rarg0   - byte[]  source+offset
 3518   //   c_rarg1   - int[]   SHA.state
 3519   //   c_rarg2   - int     offset
 3520   //   c_rarg3   - int     limit
 3521   //
 3522   address generate_md5_implCompress(StubGenStubId stub_id) {
 3523     bool multi_block;
 3524     switch (stub_id) {
 3525     case md5_implCompress_id:
 3526       multi_block = false;
 3527       break;
 3528     case md5_implCompressMB_id:
 3529       multi_block = true;
 3530       break;
 3531     default:
 3532       ShouldNotReachHere();
 3533     }
 3534     __ align(CodeEntryAlignment);
 3535 
 3536     StubCodeMark mark(this, stub_id);
 3537     address start = __ pc();
 3538 
 3539     Register buf       = c_rarg0;
 3540     Register state     = c_rarg1;
 3541     Register ofs       = c_rarg2;
 3542     Register limit     = c_rarg3;
 3543     Register a         = r4;
 3544     Register b         = r5;
 3545     Register c         = r6;
 3546     Register d         = r7;
 3547     Register rscratch3 = r10;
 3548     Register rscratch4 = r11;
 3549 
 3550     Register state_regs[2] = { r12, r13 };
 3551     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
 3552     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
 3553 
 3554     __ push(saved_regs, sp);
 3555 
 3556     __ ldp(state_regs[0], state_regs[1], Address(state));
 3557     __ ubfx(a, state_regs[0],  0, 32);
 3558     __ ubfx(b, state_regs[0], 32, 32);
 3559     __ ubfx(c, state_regs[1],  0, 32);
 3560     __ ubfx(d, state_regs[1], 32, 32);
 3561 
 3562     Label md5_loop;
 3563     __ BIND(md5_loop);
 3564 
 3565     reg_cache.gen_loads(buf);
 3566 
 3567     // Round 1
 3568     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
 3569     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
 3570     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
 3571     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
 3572     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
 3573     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
 3574     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
 3575     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
 3576     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
 3577     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
 3578     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
 3579     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
 3580     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
 3581     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
 3582     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
 3583     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
 3584 
 3585     // Round 2
 3586     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
 3587     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
 3588     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
 3589     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
 3590     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
 3591     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
 3592     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
 3593     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
 3594     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
 3595     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
 3596     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
 3597     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
 3598     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
 3599     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
 3600     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
 3601     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
 3602 
 3603     // Round 3
 3604     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
 3605     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
 3606     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
 3607     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
 3608     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
 3609     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
 3610     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
 3611     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
 3612     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
 3613     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
 3614     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
 3615     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
 3616     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
 3617     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
 3618     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
 3619     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
 3620 
 3621     // Round 4
 3622     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
 3623     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
 3624     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
 3625     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
 3626     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
 3627     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
 3628     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
 3629     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
 3630     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
 3631     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
 3632     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
 3633     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
 3634     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
 3635     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
 3636     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
 3637     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
 3638 
 3639     __ addw(a, state_regs[0], a);
 3640     __ ubfx(rscratch2, state_regs[0], 32, 32);
 3641     __ addw(b, rscratch2, b);
 3642     __ addw(c, state_regs[1], c);
 3643     __ ubfx(rscratch4, state_regs[1], 32, 32);
 3644     __ addw(d, rscratch4, d);
 3645 
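          // Pack the updated 32-bit words back into the two 64-bit state registers:
          // state_regs[0] = (b << 32) | a and state_regs[1] = (d << 32) | c.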
 3646     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
 3647     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
 3648 
 3649     if (multi_block) {
 3650       __ add(buf, buf, 64);
 3651       __ add(ofs, ofs, 64);
 3652       __ cmp(ofs, limit);
 3653       __ br(Assembler::LE, md5_loop);
 3654       __ mov(c_rarg0, ofs); // return ofs
 3655     }
 3656 
 3657     // write hash values back in the correct order
 3658     __ stp(state_regs[0], state_regs[1], Address(state));
 3659 
 3660     __ pop(saved_regs, sp);
 3661 
 3662     __ ret(lr);
 3663 
 3664     return start;
 3665   }
 3666 
 3667   // Arguments:
 3668   //
 3669   // Inputs:
 3670   //   c_rarg0   - byte[]  source+offset
 3671   //   c_rarg1   - int[]   SHA.state
 3672   //   c_rarg2   - int     offset
 3673   //   c_rarg3   - int     limit
 3674   //
 3675   address generate_sha1_implCompress(StubGenStubId stub_id) {
 3676     bool multi_block;
 3677     switch (stub_id) {
 3678     case sha1_implCompress_id:
 3679       multi_block = false;
 3680       break;
 3681     case sha1_implCompressMB_id:
 3682       multi_block = true;
 3683       break;
 3684     default:
 3685       ShouldNotReachHere();
 3686     }
 3687 
 3688     __ align(CodeEntryAlignment);
 3689 
 3690     StubCodeMark mark(this, stub_id);
 3691     address start = __ pc();
 3692 
 3693     Register buf   = c_rarg0;
 3694     Register state = c_rarg1;
 3695     Register ofs   = c_rarg2;
 3696     Register limit = c_rarg3;
 3697 
 3698     Label keys;
 3699     Label sha1_loop;
 3700 
 3701     // load the keys into v0..v3
 3702     __ adr(rscratch1, keys);
 3703     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
 3704     // load 5 words state into v6, v7
 3705     __ ldrq(v6, Address(state, 0));
 3706     __ ldrs(v7, Address(state, 16));
 3707 
 3708 
 3709     __ BIND(sha1_loop);
 3710     // load 64 bytes of data into v16..v19
 3711     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3712     __ rev32(v16, __ T16B, v16);
 3713     __ rev32(v17, __ T16B, v17);
 3714     __ rev32(v18, __ T16B, v18);
 3715     __ rev32(v19, __ T16B, v19);
 3716 
 3717     // do the sha1
 3718     __ addv(v4, __ T4S, v16, v0);
 3719     __ orr(v20, __ T16B, v6, v6);
 3720 
 3721     FloatRegister d0 = v16;
 3722     FloatRegister d1 = v17;
 3723     FloatRegister d2 = v18;
 3724     FloatRegister d3 = v19;
 3725 
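          // 20 iterations; each use of sha1c/sha1p/sha1m performs 4 SHA-1 rounds,
          // covering all 80 rounds. Rounds 0-19 use Ch (sha1c), rounds 20-39 and
          // 60-79 use parity (sha1p), and rounds 40-59 use Maj (sha1m), which is
          // what the round < 5 / round < 10 / round >= 15 tests select below.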
 3726     for (int round = 0; round < 20; round++) {
 3727       FloatRegister tmp1 = (round & 1) ? v4 : v5;
 3728       FloatRegister tmp2 = (round & 1) ? v21 : v22;
 3729       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
 3730       FloatRegister tmp4 = (round & 1) ? v5 : v4;
 3731       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
 3732 
 3733       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
 3734       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
 3735       __ sha1h(tmp2, __ T4S, v20);
 3736       if (round < 5)
 3737         __ sha1c(v20, __ T4S, tmp3, tmp4);
 3738       else if (round < 10 || round >= 15)
 3739         __ sha1p(v20, __ T4S, tmp3, tmp4);
 3740       else
 3741         __ sha1m(v20, __ T4S, tmp3, tmp4);
 3742       if (round < 16) __ sha1su1(d0, __ T4S, d3);
 3743 
 3744       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3745     }
 3746 
 3747     __ addv(v7, __ T2S, v7, v21);
 3748     __ addv(v6, __ T4S, v6, v20);
 3749 
 3750     if (multi_block) {
 3751       __ add(ofs, ofs, 64);
 3752       __ cmp(ofs, limit);
 3753       __ br(Assembler::LE, sha1_loop);
 3754       __ mov(c_rarg0, ofs); // return ofs
 3755     }
 3756 
 3757     __ strq(v6, Address(state, 0));
 3758     __ strs(v7, Address(state, 16));
 3759 
 3760     __ ret(lr);
 3761 
 3762     __ bind(keys);
 3763     __ emit_int32(0x5a827999);
 3764     __ emit_int32(0x6ed9eba1);
 3765     __ emit_int32(0x8f1bbcdc);
 3766     __ emit_int32(0xca62c1d6);
 3767 
 3768     return start;
 3769   }
 3770 
 3771 
 3772   // Arguments:
 3773   //
 3774   // Inputs:
 3775   //   c_rarg0   - byte[]  source+offset
 3776   //   c_rarg1   - int[]   SHA.state
 3777   //   c_rarg2   - int     offset
 3778   //   c_rarg3   - int     limit
 3779   //
 3780   address generate_sha256_implCompress(StubGenStubId stub_id) {
 3781     bool multi_block;
 3782     switch (stub_id) {
 3783     case sha256_implCompress_id:
 3784       multi_block = false;
 3785       break;
 3786     case sha256_implCompressMB_id:
 3787       multi_block = true;
 3788       break;
 3789     default:
 3790       ShouldNotReachHere();
 3791     }
 3792 
 3793     static const uint32_t round_consts[64] = {
 3794       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
 3795       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 3796       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 3797       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 3798       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
 3799       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 3800       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
 3801       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 3802       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 3803       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 3804       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
 3805       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 3806       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
 3807       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 3808       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 3809       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 3810     };
 3811 
 3812     __ align(CodeEntryAlignment);
 3813 
 3814     StubCodeMark mark(this, stub_id);
 3815     address start = __ pc();
 3816 
 3817     Register buf   = c_rarg0;
 3818     Register state = c_rarg1;
 3819     Register ofs   = c_rarg2;
 3820     Register limit = c_rarg3;
 3821 
 3822     Label sha256_loop;
 3823 
 3824     __ stpd(v8, v9, __ pre(sp, -32));
 3825     __ stpd(v10, v11, Address(sp, 16));
 3826 
 3827 // dga == v0
 3828 // dgb == v1
 3829 // dg0 == v2
 3830 // dg1 == v3
 3831 // dg2 == v4
 3832 // t0 == v6
 3833 // t1 == v7
 3834 
 3835     // load the 64 round constants into v16..v31 (four per register)
 3836     __ lea(rscratch1, ExternalAddress((address)round_consts));
 3837     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
 3838     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
 3839     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
 3840     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
 3841 
 3842     // load 8 words (256 bits) state
 3843     __ ldpq(v0, v1, state);
 3844 
 3845     __ BIND(sha256_loop);
 3846     // load 64 bytes of data into v8..v11
 3847     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
 3848     __ rev32(v8, __ T16B, v8);
 3849     __ rev32(v9, __ T16B, v9);
 3850     __ rev32(v10, __ T16B, v10);
 3851     __ rev32(v11, __ T16B, v11);
 3852 
 3853     __ addv(v6, __ T4S, v8, v16);
 3854     __ orr(v2, __ T16B, v0, v0);
 3855     __ orr(v3, __ T16B, v1, v1);
 3856 
 3857     FloatRegister d0 = v8;
 3858     FloatRegister d1 = v9;
 3859     FloatRegister d2 = v10;
 3860     FloatRegister d3 = v11;
 3861 
 3862 
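          // 16 iterations; each sha256h/sha256h2 pair performs 4 rounds (one for
          // each half of the state), covering all 64 SHA-256 rounds. The message
          // schedule (sha256su0/su1) stops once all 64 words have been produced,
          // hence the round < 12 tests.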
 3863     for (int round = 0; round < 16; round++) {
 3864       FloatRegister tmp1 = (round & 1) ? v6 : v7;
 3865       FloatRegister tmp2 = (round & 1) ? v7 : v6;
 3866       FloatRegister tmp3 = (round & 1) ? v2 : v4;
 3867       FloatRegister tmp4 = (round & 1) ? v4 : v2;
 3868 
 3869       if (round < 12) __ sha256su0(d0, __ T4S, d1);
 3870       __ orr(v4, __ T16B, v2, v2);
 3871       if (round < 15)
 3872         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
 3873       __ sha256h(v2, __ T4S, v3, tmp2);
 3874       __ sha256h2(v3, __ T4S, v4, tmp2);
 3875       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
 3876 
 3877       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
 3878     }
 3879 
 3880     __ addv(v0, __ T4S, v0, v2);
 3881     __ addv(v1, __ T4S, v1, v3);
 3882 
 3883     if (multi_block) {
 3884       __ add(ofs, ofs, 64);
 3885       __ cmp(ofs, limit);
 3886       __ br(Assembler::LE, sha256_loop);
 3887       __ mov(c_rarg0, ofs); // return ofs
 3888     }
 3889 
 3890     __ ldpd(v10, v11, Address(sp, 16));
 3891     __ ldpd(v8, v9, __ post(sp, 32));
 3892 
 3893     __ stpq(v0, v1, state);
 3894 
 3895     __ ret(lr);
 3896 
 3897     return start;
 3898   }
 3899 
 3900   // Double rounds for sha512.
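        // Each call performs two rounds via SHA512H/SHA512H2, so the 40 calls below
        // cover all 80 SHA-512 rounds. For dr < 32 the message schedule is also
        // advanced (sha512su0/su1), and for dr < 36 the next round-constant pair is
        // loaded ahead of time into vrc1.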
 3901   void sha512_dround(int dr,
 3902                      FloatRegister vi0, FloatRegister vi1,
 3903                      FloatRegister vi2, FloatRegister vi3,
 3904                      FloatRegister vi4, FloatRegister vrc0,
 3905                      FloatRegister vrc1, FloatRegister vin0,
 3906                      FloatRegister vin1, FloatRegister vin2,
 3907                      FloatRegister vin3, FloatRegister vin4) {
 3908       if (dr < 36) {
 3909         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
 3910       }
 3911       __ addv(v5, __ T2D, vrc0, vin0);
 3912       __ ext(v6, __ T16B, vi2, vi3, 8);
 3913       __ ext(v5, __ T16B, v5, v5, 8);
 3914       __ ext(v7, __ T16B, vi1, vi2, 8);
 3915       __ addv(vi3, __ T2D, vi3, v5);
 3916       if (dr < 32) {
 3917         __ ext(v5, __ T16B, vin3, vin4, 8);
 3918         __ sha512su0(vin0, __ T2D, vin1);
 3919       }
 3920       __ sha512h(vi3, __ T2D, v6, v7);
 3921       if (dr < 32) {
 3922         __ sha512su1(vin0, __ T2D, vin2, v5);
 3923       }
 3924       __ addv(vi4, __ T2D, vi1, vi3);
 3925       __ sha512h2(vi3, __ T2D, vi1, vi0);
 3926   }
 3927 
 3928   // Arguments:
 3929   //
 3930   // Inputs:
 3931   //   c_rarg0   - byte[]  source+offset
 3932   //   c_rarg1   - int[]   SHA.state
 3933   //   c_rarg2   - int     offset
 3934   //   c_rarg3   - int     limit
 3935   //
 3936   address generate_sha512_implCompress(StubGenStubId stub_id) {
 3937     bool multi_block;
 3938     switch (stub_id) {
 3939     case sha512_implCompress_id:
 3940       multi_block = false;
 3941       break;
 3942     case sha512_implCompressMB_id:
 3943       multi_block = true;
 3944       break;
 3945     default:
 3946       ShouldNotReachHere();
 3947     }
 3948 
 3949     static const uint64_t round_consts[80] = {
 3950       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
 3951       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
 3952       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
 3953       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
 3954       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
 3955       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
 3956       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
 3957       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
 3958       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
 3959       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
 3960       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
 3961       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
 3962       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
 3963       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
 3964       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
 3965       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
 3966       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
 3967       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
 3968       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
 3969       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
 3970       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
 3971       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
 3972       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
 3973       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
 3974       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
 3975       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
 3976       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
 3977     };
 3978 
 3979     __ align(CodeEntryAlignment);
 3980 
 3981     StubCodeMark mark(this, stub_id);
 3982     address start = __ pc();
 3983 
 3984     Register buf   = c_rarg0;
 3985     Register state = c_rarg1;
 3986     Register ofs   = c_rarg2;
 3987     Register limit = c_rarg3;
 3988 
 3989     __ stpd(v8, v9, __ pre(sp, -64));
 3990     __ stpd(v10, v11, Address(sp, 16));
 3991     __ stpd(v12, v13, Address(sp, 32));
 3992     __ stpd(v14, v15, Address(sp, 48));
 3993 
 3994     Label sha512_loop;
 3995 
 3996     // load state
 3997     __ ld1(v8, v9, v10, v11, __ T2D, state);
 3998 
 3999     // load first 4 round constants
 4000     __ lea(rscratch1, ExternalAddress((address)round_consts));
 4001     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
 4002 
 4003     __ BIND(sha512_loop);
 4004     // load 128B of data into v12..v19
 4005     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
 4006     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
 4007     __ rev64(v12, __ T16B, v12);
 4008     __ rev64(v13, __ T16B, v13);
 4009     __ rev64(v14, __ T16B, v14);
 4010     __ rev64(v15, __ T16B, v15);
 4011     __ rev64(v16, __ T16B, v16);
 4012     __ rev64(v17, __ T16B, v17);
 4013     __ rev64(v18, __ T16B, v18);
 4014     __ rev64(v19, __ T16B, v19);
 4015 
 4016     __ mov(rscratch2, rscratch1);
 4017 
 4018     __ mov(v0, __ T16B, v8);
 4019     __ mov(v1, __ T16B, v9);
 4020     __ mov(v2, __ T16B, v10);
 4021     __ mov(v3, __ T16B, v11);
 4022 
 4023     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
 4024     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
 4025     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
 4026     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
 4027     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
 4028     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
 4029     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
 4030     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
 4031     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
 4032     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
 4033     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
 4034     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
 4035     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
 4036     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
 4037     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
 4038     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
 4039     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
 4040     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
 4041     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
 4042     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
 4043     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
 4044     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
 4045     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
 4046     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
 4047     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
 4048     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
 4049     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
 4050     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
 4051     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
 4052     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
 4053     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
 4054     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
 4055     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
 4056     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
 4057     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
 4058     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
 4059     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
 4060     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
 4061     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
 4062     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
 4063 
 4064     __ addv(v8, __ T2D, v8, v0);
 4065     __ addv(v9, __ T2D, v9, v1);
 4066     __ addv(v10, __ T2D, v10, v2);
 4067     __ addv(v11, __ T2D, v11, v3);
 4068 
 4069     if (multi_block) {
 4070       __ add(ofs, ofs, 128);
 4071       __ cmp(ofs, limit);
 4072       __ br(Assembler::LE, sha512_loop);
 4073       __ mov(c_rarg0, ofs); // return ofs
 4074     }
 4075 
 4076     __ st1(v8, v9, v10, v11, __ T2D, state);
 4077 
 4078     __ ldpd(v14, v15, Address(sp, 48));
 4079     __ ldpd(v12, v13, Address(sp, 32));
 4080     __ ldpd(v10, v11, Address(sp, 16));
 4081     __ ldpd(v8, v9, __ post(sp, 64));
 4082 
 4083     __ ret(lr);
 4084 
 4085     return start;
 4086   }
 4087 
 4088   // Execute one round of keccak of two computations in parallel.
 4089   // One of the states should be loaded into the lower halves of
 4090   // the vector registers v0-v24, the other should be loaded into
 4091   // the upper halves of those registers. The ld1r instruction loads
 4092   // the round constant into both halves of register v31.
 4093   // Intermediate results c0...c5 and d0...d5 are computed
 4094   // in registers v25...v30.
 4095   // All vector instructions that are used operate on both register
 4096   // halves in parallel.
 4097   // If only a single computation is needed, it suffices to load just the lower halves.
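        // The ARMv8.2-SHA3 instructions map onto the Keccak-f[1600] steps as follows:
        // eor3 (three-way XOR) builds the theta column parities, rax1 (rotate-and-XOR)
        // forms the theta d-values, xar (XOR-and-rotate) applies theta together with
        // the rho rotations, and bcax (bit-clear-and-XOR) implements chi. The final
        // eor with the round constant in v31 is iota.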
 4098   void keccak_round(Register rscratch1) {
 4099   __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
 4100   __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
 4101   __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
 4102   __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
 4103   __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
 4104   __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
 4105   __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
 4106   __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
 4107   __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
 4108   __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22
 4109 
 4110   __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
 4111   __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
 4112   __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
 4113   __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
 4114   __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)
 4115 
 4116   __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
 4117   __ xar(v29, __ T2D, v1,  v25, (64 - 1));  // a10' = rol((a1^d1), 1)
 4118   __ xar(v1,  __ T2D, v6,  v25, (64 - 44)); // a1 = rol((a6^d1), 44)
 4119   __ xar(v6,  __ T2D, v9,  v28, (64 - 20)); // a6 = rol((a9^d4), 20)
 4120   __ xar(v9,  __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
 4121   __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
 4122   __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
 4123   __ xar(v31, __ T2D, v2,  v26, (64 - 62)); // a20' = rol((a2^d2), 62)
 4124   __ xar(v2,  __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
 4125   __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
 4126   __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
 4127   __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
 4128   __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
 4129   __ xar(v15, __ T2D, v4,  v28, (64 - 27)); // a15 = rol((a4^d4), 27)
 4130   __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
 4131   __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
 4132   __ xar(v8,  __ T2D, v8,  v27, (64 - 55)); // a21' = rol((a8^d3), 55)
 4133   __ xar(v4,  __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
 4134   __ xar(v16, __ T2D, v5,  v30, (64 - 36)); // a16 = rol((a5^d0), 36)
 4135   __ xar(v5,  __ T2D, v3,  v27, (64 - 28)); // a5 = rol((a3^d3), 28)
 4136   __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
 4137   __ xar(v3,  __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
 4138   __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
 4139   __ xar(v26, __ T2D, v7,  v26, (64 - 6));  // a11' = rol((a7^d2), 6)
 4140   __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)
 4141 
 4142   __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
 4143   __ bcax(v21, __ T16B, v8,  v23, v22);     // a21 = a21' ^ (~a22 & a23)
 4144   __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
 4145   __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
 4146   __ bcax(v24, __ T16B, v24, v8,  v31);     // a24 = a24 ^ (~a20' & a21')
 4147 
 4148   __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
 4149 
 4150   __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
 4151   __ bcax(v18, __ T16B, v3,  v15, v19);     // a18 = a18' ^ (~a19 & a15')
 4152   __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
 4153   __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
 4154   __ bcax(v16, __ T16B, v16, v3,  v25);     // a16 = a16 ^ (~a17' & a18')
 4155 
 4156   __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
 4157   __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
 4158   __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
 4159   __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
 4160   __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')
 4161 
 4162   __ bcax(v7, __ T16B, v30, v9,  v4);       // a7 = a7' ^ (~a8' & a9)
 4163   __ bcax(v8, __ T16B, v4,  v5,  v9);       // a8 = a8' ^ (~a9 & a5)
 4164   __ bcax(v9, __ T16B, v9,  v6,  v5);       // a9 = a9 ^ (~a5 & a6)
 4165   __ bcax(v5, __ T16B, v5,  v30, v6);       // a5 = a5 ^ (~a6 & a7)
 4166   __ bcax(v6, __ T16B, v6,  v4,  v30);      // a6 = a6 ^ (~a7 & a8')
 4167 
 4168   __ bcax(v3, __ T16B, v27, v0,  v28);      // a3 = a3' ^ (~a4' & a0)
 4169   __ bcax(v4, __ T16B, v28, v1,  v0);       // a4 = a4' ^ (~a0 & a1)
 4170   __ bcax(v0, __ T16B, v0,  v2,  v1);       // a0 = a0 ^ (~a1 & a2)
 4171   __ bcax(v1, __ T16B, v1,  v27, v2);       // a1 = a1 ^ (~a2 & a3)
 4172   __ bcax(v2, __ T16B, v2,  v28, v27);      // a2 = a2 ^ (~a3 & a4')
 4173 
 4174   __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
 4175   }
 4176 
 4177   // Arguments:
 4178   //
 4179   // Inputs:
 4180   //   c_rarg0   - byte[]  source+offset
 4181   //   c_rarg1   - byte[]  SHA.state
 4182   //   c_rarg2   - int     block_size
 4183   //   c_rarg3   - int     offset
 4184   //   c_rarg4   - int     limit
 4185   //
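        // block_size is the SHA3/SHAKE sponge rate in bytes and identifies the
        // variant being processed (see the dispatch on block_size below):
        //    72 - SHA3-512
        //   104 - SHA3-384
        //   136 - SHA3-256 or SHAKE256
        //   144 - SHA3-224
        //   168 - SHAKE128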
 4186   address generate_sha3_implCompress(StubGenStubId stub_id) {
 4187     bool multi_block;
 4188     switch (stub_id) {
 4189     case sha3_implCompress_id:
 4190       multi_block = false;
 4191       break;
 4192     case sha3_implCompressMB_id:
 4193       multi_block = true;
 4194       break;
 4195     default:
 4196       ShouldNotReachHere();
 4197     }
 4198 
 4199     static const uint64_t round_consts[24] = {
 4200       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4201       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4202       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4203       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4204       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4205       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4206       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4207       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4208     };
 4209 
 4210     __ align(CodeEntryAlignment);
 4211 
 4212     StubCodeMark mark(this, stub_id);
 4213     address start = __ pc();
 4214 
 4215     Register buf           = c_rarg0;
 4216     Register state         = c_rarg1;
 4217     Register block_size    = c_rarg2;
 4218     Register ofs           = c_rarg3;
 4219     Register limit         = c_rarg4;
 4220 
 4221     Label sha3_loop, rounds24_loop;
 4222     Label sha3_512_or_sha3_384, shake128;
 4223 
 4224     __ stpd(v8, v9, __ pre(sp, -64));
 4225     __ stpd(v10, v11, Address(sp, 16));
 4226     __ stpd(v12, v13, Address(sp, 32));
 4227     __ stpd(v14, v15, Address(sp, 48));
 4228 
 4229     // load state
 4230     __ add(rscratch1, state, 32);
 4231     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
 4232     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
 4233     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
 4234     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
 4235     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
 4236     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
 4237     __ ld1(v24, __ T1D, rscratch1);
 4238 
 4239     __ BIND(sha3_loop);
 4240 
 4241     // 24 keccak rounds
 4242     __ movw(rscratch2, 24);
 4243 
 4244     // load round_constants base
 4245     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4246 
 4247     // load input
 4248     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4249     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4250     __ eor(v0, __ T8B, v0, v25);
 4251     __ eor(v1, __ T8B, v1, v26);
 4252     __ eor(v2, __ T8B, v2, v27);
 4253     __ eor(v3, __ T8B, v3, v28);
 4254     __ eor(v4, __ T8B, v4, v29);
 4255     __ eor(v5, __ T8B, v5, v30);
 4256     __ eor(v6, __ T8B, v6, v31);
 4257 
 4258     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
 4259     __ tbz(block_size, 7, sha3_512_or_sha3_384);
 4260 
 4261     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
 4262     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
 4263     __ eor(v7, __ T8B, v7, v25);
 4264     __ eor(v8, __ T8B, v8, v26);
 4265     __ eor(v9, __ T8B, v9, v27);
 4266     __ eor(v10, __ T8B, v10, v28);
 4267     __ eor(v11, __ T8B, v11, v29);
 4268     __ eor(v12, __ T8B, v12, v30);
 4269     __ eor(v13, __ T8B, v13, v31);
 4270 
 4271     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
 4272     __ eor(v14, __ T8B, v14, v25);
 4273     __ eor(v15, __ T8B, v15, v26);
 4274     __ eor(v16, __ T8B, v16, v27);
 4275 
 4276     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
 4277     __ andw(c_rarg5, block_size, 48);
 4278     __ cbzw(c_rarg5, rounds24_loop);
 4279 
 4280     __ tbnz(block_size, 5, shake128);
 4281     // block_size == 144, bit5 == 0, SHA3-224
 4282     __ ldrd(v28, __ post(buf, 8));
 4283     __ eor(v17, __ T8B, v17, v28);
 4284     __ b(rounds24_loop);
 4285 
 4286     __ BIND(shake128);
 4287     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
 4288     __ eor(v17, __ T8B, v17, v28);
 4289     __ eor(v18, __ T8B, v18, v29);
 4290     __ eor(v19, __ T8B, v19, v30);
 4291     __ eor(v20, __ T8B, v20, v31);
 4292     __ b(rounds24_loop); // block_size == 168, SHAKE128
 4293 
 4294     __ BIND(sha3_512_or_sha3_384);
 4295     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
 4296     __ eor(v7, __ T8B, v7, v25);
 4297     __ eor(v8, __ T8B, v8, v26);
 4298     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
 4299 
 4300     // SHA3-384
 4301     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
 4302     __ eor(v9,  __ T8B, v9,  v27);
 4303     __ eor(v10, __ T8B, v10, v28);
 4304     __ eor(v11, __ T8B, v11, v29);
 4305     __ eor(v12, __ T8B, v12, v30);
 4306 
 4307     __ BIND(rounds24_loop);
 4308     __ subw(rscratch2, rscratch2, 1);
 4309 
 4310     keccak_round(rscratch1);
 4311 
 4312     __ cbnzw(rscratch2, rounds24_loop);
 4313 
 4314     if (multi_block) {
 4315       __ add(ofs, ofs, block_size);
 4316       __ cmp(ofs, limit);
 4317       __ br(Assembler::LE, sha3_loop);
 4318       __ mov(c_rarg0, ofs); // return ofs
 4319     }
 4320 
 4321     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
 4322     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
 4323     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
 4324     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
 4325     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
 4326     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
 4327     __ st1(v24, __ T1D, state);
 4328 
 4329     // restore callee-saved registers
 4330     __ ldpd(v14, v15, Address(sp, 48));
 4331     __ ldpd(v12, v13, Address(sp, 32));
 4332     __ ldpd(v10, v11, Address(sp, 16));
 4333     __ ldpd(v8, v9, __ post(sp, 64));
 4334 
 4335     __ ret(lr);
 4336 
 4337     return start;
 4338   }
 4339 
 4340   // Inputs:
 4341   //   c_rarg0   - long[]  state0
 4342   //   c_rarg1   - long[]  state1
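        //
        // As described above keccak_round, state0 is kept in the lower (index 0)
        // 64-bit lanes of v0-v24 and state1 in the upper (index 1) lanes, so the
        // two permutations proceed in parallel.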
 4343   address generate_double_keccak() {
 4344     static const uint64_t round_consts[24] = {
 4345       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
 4346       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
 4347       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
 4348       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
 4349       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
 4350       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
 4351       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
 4352       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
 4353     };
 4354 
 4355     // Implements the double_keccak() method of the
 4356     // sun.security.provider.SHA3Parallel class
 4357     __ align(CodeEntryAlignment);
 4358     StubCodeMark mark(this, "StubRoutines", "double_keccak");
 4359     address start = __ pc();
 4360     __ enter();
 4361 
 4362     Register state0        = c_rarg0;
 4363     Register state1        = c_rarg1;
 4364 
 4365     Label rounds24_loop;
 4366 
 4367     // save callee-saved registers
 4368     __ stpd(v8, v9, __ pre(sp, -64));
 4369     __ stpd(v10, v11, Address(sp, 16));
 4370     __ stpd(v12, v13, Address(sp, 32));
 4371     __ stpd(v14, v15, Address(sp, 48));
 4372 
 4373     // load states
 4374     __ add(rscratch1, state0, 32);
 4375     __ ld4(v0, v1, v2,  v3, __ D, 0,  state0);
 4376     __ ld4(v4, v5, v6,  v7, __ D, 0, __ post(rscratch1, 32));
 4377     __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
 4378     __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
 4379     __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
 4380     __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
 4381     __ ld1(v24, __ D, 0, rscratch1);
 4382     __ add(rscratch1, state1, 32);
 4383     __ ld4(v0, v1, v2,  v3,  __ D, 1, state1);
 4384     __ ld4(v4, v5, v6,  v7, __ D, 1, __ post(rscratch1, 32));
 4385     __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
 4386     __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
 4387     __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
 4388     __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
 4389     __ ld1(v24, __ D, 1, rscratch1);
 4390 
 4391     // 24 keccak rounds
 4392     __ movw(rscratch2, 24);
 4393 
 4394     // load round_constants base
 4395     __ lea(rscratch1, ExternalAddress((address) round_consts));
 4396 
 4397     __ BIND(rounds24_loop);
 4398     __ subw(rscratch2, rscratch2, 1);
 4399     keccak_round(rscratch1);
 4400     __ cbnzw(rscratch2, rounds24_loop);
 4401 
 4402     __ st4(v0, v1, v2,  v3,  __ D, 0, __ post(state0, 32));
 4403     __ st4(v4, v5, v6,  v7,  __ D, 0, __ post(state0, 32));
 4404     __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
 4405     __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
 4406     __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
 4407     __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
 4408     __ st1(v24, __ D, 0, state0);
 4409     __ st4(v0, v1, v2,  v3,  __ D, 1, __ post(state1, 32));
 4410     __ st4(v4, v5, v6,  v7, __ D, 1, __ post(state1, 32));
 4411     __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
 4412     __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
 4413     __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
 4414     __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
 4415     __ st1(v24, __ D, 1, state1);
 4416 
 4417     // restore callee-saved vector registers
 4418     __ ldpd(v14, v15, Address(sp, 48));
 4419     __ ldpd(v12, v13, Address(sp, 32));
 4420     __ ldpd(v10, v11, Address(sp, 16));
 4421     __ ldpd(v8, v9, __ post(sp, 64));
 4422 
 4423     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4424     __ mov(r0, zr); // return 0
 4425     __ ret(lr);
 4426 
 4427     return start;
 4428   }
 4429 
 4430   /**
 4431    *  Arguments:
 4432    *
 4433    * Inputs:
 4434    *   c_rarg0   - int crc
 4435    *   c_rarg1   - byte* buf
 4436    *   c_rarg2   - int length
 4437    *
 4438    * Output:
 4439    *       r0    - int crc result
 4440    */
 4441   address generate_updateBytesCRC32() {
 4442     assert(UseCRC32Intrinsics, "what are we doing here?");
 4443 
 4444     __ align(CodeEntryAlignment);
 4445     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id;
 4446     StubCodeMark mark(this, stub_id);
 4447 
 4448     address start = __ pc();
 4449 
 4450     const Register crc   = c_rarg0;  // crc
 4451     const Register buf   = c_rarg1;  // source java byte array address
 4452     const Register len   = c_rarg2;  // length
 4453     const Register table0 = c_rarg3; // crc_table address
 4454     const Register table1 = c_rarg4;
 4455     const Register table2 = c_rarg5;
 4456     const Register table3 = c_rarg6;
 4457     const Register tmp3 = c_rarg7;
 4458 
 4459     BLOCK_COMMENT("Entry:");
 4460     __ enter(); // required for proper stackwalking of RuntimeStub frame
 4461 
 4462     __ kernel_crc32(crc, buf, len,
 4463               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 4464 
 4465     __ leave(); // required for proper stackwalking of RuntimeStub frame
 4466     __ ret(lr);
 4467 
 4468     return start;
 4469   }
 4470 
 4471   // ChaCha20 block function.  This version parallelizes 4 quarter
 4472   // round operations at a time.  It uses 16 SIMD registers to
 4473   // produce 4 blocks of key stream.
 4474   //
 4475   // state (int[16]) = c_rarg0
 4476   // keystream (byte[256]) = c_rarg1
 4477   // return - number of bytes of keystream (always 256)
 4478   //
 4479   // In this approach, we load the 512-bit start state sequentially into
 4480   // 4 128-bit vectors.  We then make 4 4-vector copies of that starting
 4481   // state, with each successive set of 4 vectors having a +1 added into
 4482   // the first 32-bit lane of the 4th vector in that group (the counter).
 4483   // By doing this, we can perform the block function on 4 512-bit blocks
 4484   // within one run of this intrinsic.
 4485   // The alignment of the data across the 4-vector group is such that at
 4486   // the start it is already aligned for the first round of each two-round
 4487   // loop iteration.  In other words, the corresponding lanes of each vector
 4488   // will contain the values needed for that quarter round operation (e.g.
 4489   // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.).
 4490   // In between each full round, a lane shift must occur.  Within a loop
 4491   // iteration, between the first and second rounds, the 2nd, 3rd, and 4th
 4492   // vectors are rotated left 32, 64 and 96 bits, respectively.  The result
 4493   // is effectively a diagonal orientation in columnar form.  After the
 4494   // second full round, those registers are left-rotated again, this time
 4495   // 96, 64, and 32 bits - returning the vectors to their columnar organization.
 4496   // After all 10 iterations, the original state is added to each 4-vector
 4497   // working state along with the add mask, and the 4 vector groups are
 4498   // sequentially written to the memory dedicated for the output key stream.
 4499   //
 4500   // For a more detailed explanation, see Goll and Gueron, "Vectorization of
 4501   // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology:
 4502   // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33
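        //
        // For reference, the scalar ChaCha20 quarter round that each
        // cc20_quarter_round call below applies lane-wise to whole 4S vectors
        // (RFC 8439, section 2.1):
        //   a += b; d ^= a; d = rol32(d, 16);
        //   c += d; b ^= c; b = rol32(b, 12);
        //   a += b; d ^= a; d = rol32(d, 8);
        //   c += d; b ^= c; b = rol32(b, 7);
        // (the 8-bit rotation is the one performed via the lrot8Tbl constant)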
 4503   address generate_chacha20Block_qrpar() {
 4504     Label L_Q_twoRounds, L_Q_cc20_const;
 4505     // The constant data is broken into two 128-bit segments to be loaded
 4506     // onto SIMD registers.  The first 128 bits are a counter add overlay
 4507     // that adds +1/+0/+0/+0 to the vectors holding replicated state[12].
 4508     // The second 128 bits are a table constant used for 8-bit left rotations
 4509     // on 32-bit lanes within a SIMD register.
 4510     __ BIND(L_Q_cc20_const);
 4511     __ emit_int64(0x0000000000000001UL);
 4512     __ emit_int64(0x0000000000000000UL);
 4513     __ emit_int64(0x0605040702010003UL);
 4514     __ emit_int64(0x0E0D0C0F0A09080BUL);
 4515 
 4516     __ align(CodeEntryAlignment);
 4517     StubGenStubId stub_id = StubGenStubId::chacha20Block_id;
 4518     StubCodeMark mark(this, stub_id);
 4519     address start = __ pc();
 4520     __ enter();
 4521 
 4522     const Register state = c_rarg0;
 4523     const Register keystream = c_rarg1;
 4524     const Register loopCtr = r10;
 4525     const Register tmpAddr = r11;
 4526 
 4527     const FloatRegister aState = v0;
 4528     const FloatRegister bState = v1;
 4529     const FloatRegister cState = v2;
 4530     const FloatRegister dState = v3;
 4531     const FloatRegister a1Vec = v4;
 4532     const FloatRegister b1Vec = v5;
 4533     const FloatRegister c1Vec = v6;
 4534     const FloatRegister d1Vec = v7;
 4535     // Skip the callee-saved registers v8 - v15
 4536     const FloatRegister a2Vec = v16;
 4537     const FloatRegister b2Vec = v17;
 4538     const FloatRegister c2Vec = v18;
 4539     const FloatRegister d2Vec = v19;
 4540     const FloatRegister a3Vec = v20;
 4541     const FloatRegister b3Vec = v21;
 4542     const FloatRegister c3Vec = v22;
 4543     const FloatRegister d3Vec = v23;
 4544     const FloatRegister a4Vec = v24;
 4545     const FloatRegister b4Vec = v25;
 4546     const FloatRegister c4Vec = v26;
 4547     const FloatRegister d4Vec = v27;
 4548     const FloatRegister scratch = v28;
 4549     const FloatRegister addMask = v29;
 4550     const FloatRegister lrot8Tbl = v30;
 4551 
 4552     // Load the initial state in the first 4 quadword registers,
 4553     // then copy the initial state into the next 4 quadword registers
 4554     // that will be used for the working state.
 4555     __ ld1(aState, bState, cState, dState, __ T16B, Address(state));
 4556 
 4557     // Load the index register for 2 constant 128-bit data fields.
 4558     // The first represents the +1/+0/+0/+0 add mask.  The second is
 4559     // the 8-bit left rotation.
 4560     __ adr(tmpAddr, L_Q_cc20_const);
 4561     __ ldpq(addMask, lrot8Tbl, Address(tmpAddr));
 4562 
 4563     __ mov(a1Vec, __ T16B, aState);
 4564     __ mov(b1Vec, __ T16B, bState);
 4565     __ mov(c1Vec, __ T16B, cState);
 4566     __ mov(d1Vec, __ T16B, dState);
 4567 
 4568     __ mov(a2Vec, __ T16B, aState);
 4569     __ mov(b2Vec, __ T16B, bState);
 4570     __ mov(c2Vec, __ T16B, cState);
 4571     __ addv(d2Vec, __ T4S, d1Vec, addMask);
 4572 
 4573     __ mov(a3Vec, __ T16B, aState);
 4574     __ mov(b3Vec, __ T16B, bState);
 4575     __ mov(c3Vec, __ T16B, cState);
 4576     __ addv(d3Vec, __ T4S, d2Vec, addMask);
 4577 
 4578     __ mov(a4Vec, __ T16B, aState);
 4579     __ mov(b4Vec, __ T16B, bState);
 4580     __ mov(c4Vec, __ T16B, cState);
 4581     __ addv(d4Vec, __ T4S, d3Vec, addMask);
 4582 
 4583     // Set up the 10 iteration loop
 4584     __ mov(loopCtr, 10);
 4585     __ BIND(L_Q_twoRounds);
 4586 
 4587     // The first set of operations on the vectors covers the first 4 quarter
 4588     // round operations:
 4589     //  Qround(state, 0, 4, 8,12)
 4590     //  Qround(state, 1, 5, 9,13)
 4591     //  Qround(state, 2, 6,10,14)
 4592     //  Qround(state, 3, 7,11,15)
 4593     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4594     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4595     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4596     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4597 
 4598     // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to
 4599     // diagonals. The a1Vec does not need to change orientation.
 4600     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true);
 4601     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true);
 4602     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true);
 4603     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true);
 4604 
 4605     // The second set of operations on the vectors covers the second 4 quarter
 4606     // round operations, now acting on the diagonals:
 4607     //  Qround(state, 0, 5,10,15)
 4608     //  Qround(state, 1, 6,11,12)
 4609     //  Qround(state, 2, 7, 8,13)
 4610     //  Qround(state, 3, 4, 9,14)
 4611     __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl);
 4612     __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl);
 4613     __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl);
 4614     __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl);
 4615 
 4616     // Before we start the next iteration, we need to perform shuffles
 4617     // on the b/c/d vectors to move them back to columnar organizations
 4618     // from their current diagonal orientation.
 4619     __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false);
 4620     __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false);
 4621     __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false);
 4622     __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false);
 4623 
 4624     // Decrement and iterate
 4625     __ sub(loopCtr, loopCtr, 1);
 4626     __ cbnz(loopCtr, L_Q_twoRounds);
 4627 
 4628     // Once the counter reaches zero, we fall out of the loop
 4629     // and need to add the initial state back into the working state
 4630     // represented by the a/b/c/d1Vec registers.  This is destructive
 4631     // on the dState register but we will no longer need it.
 4632     __ addv(a1Vec, __ T4S, a1Vec, aState);
 4633     __ addv(b1Vec, __ T4S, b1Vec, bState);
 4634     __ addv(c1Vec, __ T4S, c1Vec, cState);
 4635     __ addv(d1Vec, __ T4S, d1Vec, dState);
 4636 
 4637     __ addv(a2Vec, __ T4S, a2Vec, aState);
 4638     __ addv(b2Vec, __ T4S, b2Vec, bState);
 4639     __ addv(c2Vec, __ T4S, c2Vec, cState);
 4640     __ addv(dState, __ T4S, dState, addMask);
 4641     __ addv(d2Vec, __ T4S, d2Vec, dState);
 4642 
 4643     __ addv(a3Vec, __ T4S, a3Vec, aState);
 4644     __ addv(b3Vec, __ T4S, b3Vec, bState);
 4645     __ addv(c3Vec, __ T4S, c3Vec, cState);
 4646     __ addv(dState, __ T4S, dState, addMask);
 4647     __ addv(d3Vec, __ T4S, d3Vec, dState);
 4648 
 4649     __ addv(a4Vec, __ T4S, a4Vec, aState);
 4650     __ addv(b4Vec, __ T4S, b4Vec, bState);
 4651     __ addv(c4Vec, __ T4S, c4Vec, cState);
 4652     __ addv(dState, __ T4S, dState, addMask);
 4653     __ addv(d4Vec, __ T4S, d4Vec, dState);
 4654 
 4655     // Write the final state back to the result buffer
 4656     __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64));
 4657     __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64));
 4658     __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64));
 4659     __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64));
 4660 
 4661     __ mov(r0, 256);             // Return length of output keystream
 4662     __ leave();
 4663     __ ret(lr);
 4664 
 4665     return start;
 4666   }
 4667 
 4668   // Helpers to schedule parallel operation bundles across vector
 4669   // register sequences of size 2, 4 or 8.
 4670 
 4671   // Implement various primitive computations across vector sequences
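        // For example (illustrative only), given VSeq<4> va(0), vb(4), vc(8),
        //   vs_addv(va, __ T4S, vb, vc);
        // expands to the four independent instructions
        //   addv(v0, T4S, v4, v8 ); addv(v1, T4S, v5, v9 );
        //   addv(v2, T4S, v6, v10); addv(v3, T4S, v7, v11);
        // giving the hardware four data-parallel streams to schedule.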
 4672 
 4673   template<int N>
 4674   void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4675                const VSeq<N>& v1, const VSeq<N>& v2) {
 4676     for (int i = 0; i < N; i++) {
 4677       __ addv(v[i], T, v1[i], v2[i]);
 4678     }
 4679   }
 4680 
 4681   template<int N>
 4682   void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4683                const VSeq<N>& v1, const VSeq<N>& v2) {
 4684     for (int i = 0; i < N; i++) {
 4685       __ subv(v[i], T, v1[i], v2[i]);
 4686     }
 4687   }
 4688 
 4689   template<int N>
 4690   void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4691                const VSeq<N>& v1, const VSeq<N>& v2) {
 4692     for (int i = 0; i < N; i++) {
 4693       __ mulv(v[i], T, v1[i], v2[i]);
 4694     }
 4695   }
 4696 
 4697   template<int N>
 4698   void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
 4699     for (int i = 0; i < N; i++) {
 4700       __ negr(v[i], T, v1[i]);
 4701     }
 4702   }
 4703 
 4704   template<int N>
 4705   void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
 4706                const VSeq<N>& v1, int shift) {
 4707     for (int i = 0; i < N; i++) {
 4708       __ sshr(v[i], T, v1[i], shift);
 4709     }
 4710   }
 4711 
 4712   template<int N>
 4713   void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4714     for (int i = 0; i < N; i++) {
 4715       __ andr(v[i], __ T16B, v1[i], v2[i]);
 4716     }
 4717   }
 4718 
 4719   template<int N>
 4720   void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
 4721     for (int i = 0; i < N; i++) {
 4722       __ orr(v[i], __ T16B, v1[i], v2[i]);
 4723     }
 4724   }
 4725 
 4726   template<int N>
 4727   void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
 4728     for (int i = 0; i < N; i++) {
 4729       __ notr(v[i], __ T16B, v1[i]);
 4730     }
 4731   }
 4732 
 4733   // load N/2 successive pairs of quadword values from memory in order
 4734   // into N successive vector registers of the sequence via the
 4735   // address supplied in base.
 4736   template<int N>
 4737   void vs_ldpq(const VSeq<N>& v, Register base) {
 4738     for (int i = 0; i < N; i += 2) {
 4739       __ ldpq(v[i], v[i+1], Address(base, 32 * i));
 4740     }
 4741   }
 4742 
 4743   // load N/2 successive pairs of quadword values from memory in order
 4744   // into N vector registers of the sequence via the address supplied
 4745   // in base using post-increment addressing
 4746   template<int N>
 4747   void vs_ldpq_post(const VSeq<N>& v, Register base) {
 4748     for (int i = 0; i < N; i += 2) {
 4749       __ ldpq(v[i], v[i+1], __ post(base, 32));
 4750     }
 4751   }
 4752 
 4753   // store N successive vector registers of the sequence into N/2
 4754   // successive pairs of quadword memory locations via the address
 4755   // supplied in base using post-increment addressing
 4756   template<int N>
 4757   void vs_stpq_post(const VSeq<N>& v, Register base) {
 4758     for (int i = 0; i < N; i += 2) {
 4759       __ stpq(v[i], v[i+1], __ post(base, 32));
 4760     }
 4761   }
 4762 
 4763   // load N/2 pairs of quadword values from memory into N vector
 4764   // registers via the address supplied in base with each pair indexed
 4765   // using the start offset plus the corresponding entry in the
 4766   // offsets array
 4767   template<int N>
 4768   void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
 4769     for (int i = 0; i < N/2; i++) {
 4770       __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4771     }
 4772   }
 4773 
 4774   // store N vector registers into N/2 pairs of quadword memory
 4775   // locations via the address supplied in base with each pair indexed
 4776   // using the start offset plus the corresponding entry in the
 4777   // offsets array
 4778   template<int N>
 4779   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
 4780     for (int i = 0; i < N/2; i++) {
 4781       __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
 4782     }
 4783   }
 4784 
 4785   // load N single quadword values from memory into N vector registers
 4786   // via the address supplied in base with each value indexed using
 4787   // the start offset plus the corresponding entry in the offsets
 4788   // array
 4789   template<int N>
 4790   void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4791                       int start, int (&offsets)[N]) {
 4792     for (int i = 0; i < N; i++) {
 4793       __ ldr(v[i], T, Address(base, start + offsets[i]));
 4794     }
 4795   }
 4796 
 4797   // store N vector registers into N single quadword memory locations
 4798   // via the address supplied in base with each value indexed using
 4799   // the start offset plus the corresponding entry in the offsets
 4800   // array
 4801   template<int N>
 4802   void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
 4803                       int start, int (&offsets)[N]) {
 4804     for (int i = 0; i < N; i++) {
 4805       __ str(v[i], T, Address(base, start + offsets[i]));
 4806     }
 4807   }
 4808 
 4809   // load N/2 pairs of quadword values from memory de-interleaved into
 4810   // N vector registers 2 at a time via the address supplied in base
 4811   // with each pair indexed using the start offset plus the
 4812   // corresponding entry in the offsets array
 4813   template<int N>
 4814   void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4815                       Register tmp, int start, int (&offsets)[N/2]) {
 4816     for (int i = 0; i < N/2; i++) {
 4817       __ add(tmp, base, start + offsets[i]);
 4818       __ ld2(v[2*i], v[2*i+1], T, tmp);
 4819     }
 4820   }
 4821 
 4822   // store N vector registers 2 at a time interleaved into N/2 pairs
 4823   // of quadword memory locations via the address supplied in base
 4824   // with each pair indexed using the start offset plus the
 4825   // corresponding entry in the offsets array
 4826   template<int N>
 4827   void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
 4828                       Register tmp, int start, int (&offsets)[N/2]) {
 4829     for (int i = 0; i < N/2; i++) {
 4830       __ add(tmp, base, start + offsets[i]);
 4831       __ st2(v[2*i], v[2*i+1], T, tmp);
 4832     }
 4833   }
 4834 
 4835   // Helper routines for various flavours of dilithium montgomery
 4836   // multiply
 4837 
 4838   // Perform 16 32-bit Montgomery multiplications in parallel
 4839   // See the montMul() method of the sun.security.provider.ML_DSA class.
 4840   //
 4841   // Computes 4x4S results
 4842   //    a = b * c * 2^-32 mod MONT_Q
 4843   // Inputs:  vb, vc - 4x4S vector register sequences
 4844   //          vq - 2x4S constants <MONT_Q, MONT_Q_INV_MOD_R>
 4845   // Temps:   vtmp - 4x4S vector sequence trashed after call
 4846   // Outputs: va - 4x4S vector register sequences
 4847   // vb, vc, vtmp and vq must all be disjoint
 4848   // va must be disjoint from all other inputs/temps or must equal vc
 4849   // n.b. MONT_R_BITS is 32, so the right shift by it is implicit.
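        //
        // For reference, the per-lane computation implied by the instruction
        // comments below (illustrative scalar sketch only):
        //   int32_t montmul(int32_t b, int32_t c) {
        //     int32_t aHigh = (int32_t)(((int64_t)b * c) >> 31);       // hi32(2 * b * c)
        //     int32_t aLow  = (int32_t)((int64_t)b * c);               // lo32(b * c)
        //     int32_t m     = (int32_t)((int64_t)aLow * MONT_Q_INV_MOD_R); // lo32(aLow * qinv)
        //     int32_t n     = (int32_t)(((int64_t)m * MONT_Q) >> 31);  // hi32(2 * m * q)
        //     return (aHigh - n) >> 1;                                 // (aHigh - n) / 2
        //   }
        // which yields b * c * 2^-32 mod MONT_Q in a signed representative range.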
 4850   void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
 4851                     const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4852     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 4853     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 4854     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 4855 
 4856     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 4857     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 4858 
 4859     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 4860 
 4861     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 4862     assert(vs_disjoint(va, vb), "va and vb overlap");
 4863     assert(vs_disjoint(va, vq), "va and vq overlap");
 4864     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 4865 
 4866     // schedule 4 streams of instructions across the vector sequences
 4867     for (int i = 0; i < 4; i++) {
 4868       __ sqdmulh(vtmp[i], __ T4S, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
 4869       __ mulv(va[i], __ T4S, vb[i], vc[i]);    // aLow = lo32(b * c)
 4870     }
 4871 
 4872     for (int i = 0; i < 4; i++) {
 4873       __ mulv(va[i], __ T4S, va[i], vq[0]);     // m = aLow * qinv
 4874     }
 4875 
 4876     for (int i = 0; i < 4; i++) {
 4877       __ sqdmulh(va[i], __ T4S, va[i], vq[1]);  // n = hi32(2 * m * q)
 4878     }
 4879 
 4880     for (int i = 0; i < 4; i++) {
 4881       __ shsubv(va[i], __ T4S, vtmp[i], va[i]);   // a = (aHigh - n) / 2
 4882     }
 4883   }
 4884 
 4885   // Perform 2x16 32-bit Montgomery multiplications in parallel
 4886   // See the montMul() method of the sun.security.provider.ML_DSA class.
 4887   //
 4888   // Computes 8x4S results
 4889   //    a = b * c * 2^-32 mod MONT_Q
 4890   // Inputs:  vb, vc - 8x4S vector register sequences
 4891   //          vq - 2x4S constants <MONT_Q, MONT_Q_INV_MOD_R>
 4892   // Temps:   vtmp - 4x4S vector sequence trashed after call
 4893   // Outputs: va - 8x4S vector register sequences
 4894   // vb, vc, vtmp and vq must all be disjoint
 4895   // va must be disjoint from all other inputs/temps or must equal vc
 4896   // n.b. MONT_R_BITS is 32, so the right shift by it is implicit.
 4897   void vs_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
 4898                     const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4899     // vb, vc, vtmp and vq must be disjoint. va must either be
 4900     // disjoint from all other registers or equal vc
 4901 
 4902     assert(vs_disjoint(vb, vc), "vb and vc overlap");
 4903     assert(vs_disjoint(vb, vq), "vb and vq overlap");
 4904     assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
 4905 
 4906     assert(vs_disjoint(vc, vq), "vc and vq overlap");
 4907     assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
 4908 
 4909     assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
 4910 
 4911     assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
 4912     assert(vs_disjoint(va, vb), "va and vb overlap");
 4913     assert(vs_disjoint(va, vq), "va and vq overlap");
 4914     assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
 4915 
 4916     // we need to multiply the front and back halves of each sequence
 4917     // 4x4S at a time because
 4918     //
 4919     // 1) we are currently only able to get 4-way instruction
 4920     // parallelism at best
 4921     //
 4922     // 2) we need registers for the constants in vq and temporary
 4923     // scratch registers to hold intermediate results so vtmp can only
 4924     // be a VSeq<4> which means we only have 4 scratch slots
 4925 
 4926     dilithium_montmul16(vs_front(va), vs_front(vb), vs_front(vc), vtmp, vq);
 4927     dilithium_montmul16(vs_back(va), vs_back(vb), vs_back(vc), vtmp, vq);
 4928   }
 4929 
 4930   // perform combined montmul then add/sub on 4x4S vectors
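        // i.e. in scalar terms (a0, a1) <- (a0 + montmul(a1, c), a0 - montmul(a1, c)),
        // with vc overwritten by the montmul result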
 4931 
 4932   void dilithium_montmul16_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
 4933                                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
 4934     // compute a = montmul(a1, c)
 4935     dilithium_montmul16(vc, va1, vc, vtmp, vq);
 4936     // output a1 = a0 - a
 4937     vs_subv(va1, __ T4S, va0, vc);
 4938     //    and a0 = a0 + a
 4939     vs_addv(va0, __ T4S, va0, vc);
 4940   }
 4941 
 4942   // perform combined add/sub then montmul on 4x4S vectors
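        // i.e. in scalar terms (a0, a1) <- (a0 + a1, montmul(a0 - a1, b))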
 4943 
 4944   void dilithium_sub_add_montmul16(const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
 4945                                    const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
 4946     // compute c = a0 - a1
 4947     vs_subv(vtmp1, __ T4S, va0, va1);
 4948     // output a0 = a0 + a1
 4949     vs_addv(va0, __ T4S, va0, va1);
 4950     // output a1 = b montmul c
 4951     dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
 4952   }
 4953 
 4954   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 4955   // in the Java implementation come in sequences of at least 8, so we
 4956   // can use ldpq to collect the corresponding data into pairs of vector
 4957   // registers.
 4958   // We collect the coefficients corresponding to the 'j+l' indexes into
 4959   // the vector registers v0-v7, the zetas into the vector registers v16-v23
 4960   // then we do the (Montgomery) multiplications by the zetas in parallel
 4961   // into v16-v23, load the coeffs corresponding to the 'j' indexes into
 4962   // v0-v7, then do the additions into v24-v31 and the subtractions into
 4963   // v0-v7 and finally save the results back to the coeffs array.
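        //
        // In scalar terms (following the description above; names are
        // illustrative) each butterfly performed at these levels is
        //   int t = montmul(coeffs[j + l], zeta);
        //   coeffs[j + l] = coeffs[j] - t;
        //   coeffs[j]     = coeffs[j] + t;
        // applied over runs of at least 8 adjacent j values, which is what
        // makes the ldpq/stpq bulk loads and stores possible.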
 4964   void dilithiumNttLevel0_4(const Register dilithiumConsts,
 4965     const Register coeffs, const Register zetas) {
 4966     int c1 = 0;
 4967     int c2 = 512;
 4968     int startIncr;
 4969     // don't use callee save registers v8 - v15
 4970     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 4971     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 4972     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 4973     int offsets[4] = { 0, 32, 64, 96 };
 4974 
 4975     for (int level = 0; level < 5; level++) {
 4976       int c1Start = c1;
 4977       int c2Start = c2;
 4978       if (level == 3) {
 4979         offsets[1] = 32;
 4980         offsets[2] = 128;
 4981         offsets[3] = 160;
 4982       } else if (level == 4) {
 4983         offsets[1] = 64;
 4984         offsets[2] = 128;
 4985         offsets[3] = 192;
 4986       }
 4987 
 4988       // for levels 0 - 4 we simply load 2 x 4 adjacent values at a
 4989       // time at 4 different offsets and multiply them in order by the
 4990       // next set of input values. So we employ indexed load and store
 4991       // pair instructions with arrangement 4S
 4992       for (int i = 0; i < 4; i++) {
 4993         // reload q and qinv
 4994         vs_ldpq(vq, dilithiumConsts); // qInv, q
 4995         // load 8x4S coefficients via second start pos == c2
 4996         vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
 4997         // load next 8x4S inputs == b
 4998         vs_ldpq_post(vs2, zetas);
 4999         // compute a == c2 * b mod MONT_Q
 5000         vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5001         // load 8x4s coefficients via first start pos == c1
 5002         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 5003         // compute a1 =  c1 + a
 5004         vs_addv(vs3, __ T4S, vs1, vs2);
 5005         // compute a2 =  c1 - a
 5006         vs_subv(vs1, __ T4S, vs1, vs2);
 5007         // output a1 and a2
 5008         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 5009         vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
 5010 
 5011         int k = 4 * level + i;
 5012 
 5013         if (k > 7) {
 5014           startIncr = 256;
 5015         } else if (k == 5) {
 5016           startIncr = 384;
 5017         } else {
 5018           startIncr = 128;
 5019         }
 5020 
 5021         c1Start += startIncr;
 5022         c2Start += startIncr;
 5023       }
 5024 
 5025       c2 /= 2;
 5026     }
 5027   }
 5028 
 5029   // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
 5030   // Implements the method
 5031   // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
 5032   // of the Java class sun.security.provider.ML_DSA
 5033   //
 5034   // coeffs (int[256]) = c_rarg0
 5035   // zetas (int[256]) = c_rarg1
 5036   address generate_dilithiumAlmostNtt() {
 5037 
 5038     __ align(CodeEntryAlignment);
 5039     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
 5040     StubCodeMark mark(this, stub_id);
 5041     address start = __ pc();
 5042     __ enter();
 5043 
 5044     const Register coeffs = c_rarg0;
 5045     const Register zetas = c_rarg1;
 5046 
 5047     const Register tmpAddr = r9;
 5048     const Register dilithiumConsts = r10;
 5049     const Register result = r11;
 5050     // don't use callee save registers v8 - v15
 5051     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5052     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 5053     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5054     int offsets[4] = {0, 32, 64, 96};
 5055     int offsets1[8] = {16, 48, 80, 112, 144, 176, 208, 240 };
 5056     int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5057     __ add(result, coeffs, 0);
 5058     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5059 
 5060     // Each level represents one iteration of the outer for loop of the Java version
 5061 
 5062     // level 0-4
 5063     dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
 5064 
 5065     // level 5
 5066 
 5067     // at level 5 the coefficients we need to combine with the zetas
 5068     // are grouped in memory in blocks of size 4. So, for both sets of
 5069     // coefficients we load 4 adjacent values at 8 different offsets
 5070     // using an indexed ldr with register variant Q and multiply them
 5071     // in sequence order by the next set of inputs. Likewise we store
 5072     // the results using an indexed str with register variant Q.
 5073     for (int i = 0; i < 1024; i += 256) {
 5074       // reload constants q, qinv each iteration as they get clobbered later
 5075       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5076       // load 32 (8x4S) coefficients via first offsets = c1
 5077       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 5078       // load next 32 (8x4S) inputs = b
 5079       vs_ldpq_post(vs2, zetas);
 5080       // a = b montmul c1
 5081       vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5082       // load 32 (8x4S) coefficients via second offsets = c2
 5083       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
 5084       // add/sub with result of multiply
 5085       vs_addv(vs3, __ T4S, vs1, vs2);     // a1 = c2 + a
 5086       vs_subv(vs1, __ T4S, vs1, vs2);     // a2 = c2 - a
 5087       // write back new coefficients using same offsets
 5088       vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
 5089       vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
 5090     }
 5091 
 5092     // level 6
 5093     // at level 6 the coefficients we need to combine with the zetas
 5094     // are grouped in memory in pairs, the first two being montmul
 5095     // inputs and the second add/sub inputs. We can still implement
 5096     // the montmul+sub+add using 4-way parallelism but only if we
 5097     // combine the coefficients with the zetas 16 at a time. We load 8
 5098     // adjacent values at 4 different offsets using an ld2 load with
 5099     // arrangement 2D. That interleaves the lower and upper halves of
 5100     // each pair of quadwords into successive vector registers. We
 5101     // then need to montmul the 4 even elements of the coefficients
 5102     // register sequence by the zetas in order and then add/sub the 4
 5103     // odd elements of the coefficients register sequence. We use an
 5104     // equivalent st2 operation to store the results back into memory
 5105     // de-interleaved.
 5106     for (int i = 0; i < 1024; i += 128) {
 5107       // reload constants q, qinv each iteration as they get clobbered later
 5108       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5109       // load interleaved 32 (8x2D) coefficients via offsets
 5110       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5111       // load next 16 (4x4S) inputs
 5112       vs_ldpq_post(vs_front(vs2), zetas);
 5113       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 5114       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 5115                                   vs_front(vs2), vtmp, vq);
 5116       // store interleaved 32 (8x2D) coefficients via offsets
 5117       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5118     }
 5119 
 5120     // level 7
 5121     // at level 7 the coefficients we need to combine with the zetas
 5122     // occur singly with montmul inputs alternating with add/sub
 5123     // inputs. Once again we can use 4-way parallelism to combine 16
 5124     // zetas at a time. However, we have to load 8 adjacent values at
 5125     // 4 different offsets using an ld2 load with arrangement 4S. That
 5126     // interleaves the odd words of each pair into one
 5127     // coefficients vector register and the even words of the pair
 5128     // into the next register. We then need to montmul the 4 even
 5129     // elements of the coefficients register sequence by the zetas in
 5130     // order and then add/sub the 4 odd elements of the coefficients
 5131     // register sequence. We use an equivalent st2 operation to store
 5132     // the results back into memory de-interleaved.
 5133 
 5134     for (int i = 0; i < 1024; i += 128) {
 5135       // reload constants q, qinv each iteration as they get clobbered later
 5136       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5137       // load interleaved 32 (8x4S) coefficients via offsets
 5138       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5139       // load next 16 (4x4S) inputs
 5140       vs_ldpq_post(vs_front(vs2), zetas);
 5141       // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
 5142       dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
 5143                                   vs_front(vs2), vtmp, vq);
 5144       // store interleaved 32 (8x4S) coefficients via offsets
 5145       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5146     }
 5147     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5148     __ mov(r0, zr); // return 0
 5149     __ ret(lr);
 5150 
 5151     return start;
 5152   }
 5153 
 5154   // At these levels, the indices that correspond to the 'j's (and 'j+l's)
 5155   // in the Java implementation come in sequences of at least 8, so we
 5156   // can use ldpq to collect the corresponding data into pairs of vector
 5157   // registers
 5158   // We collect the coefficients that correspond to the 'j's into vs1
 5159   // the coefficients that correspond to the 'j+l's into vs2 then
 5160   // do the additions into vs3 and the subtractions into vs1 then
 5161   // save the result of the additions, load the zetas into vs2
 5162   // do the (Montgomery) multiplications by zeta in parallel into vs2
 5163   // finally save the results back to the coeffs array
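        //
        // In scalar terms (following the description above; names are
        // illustrative) each inverse butterfly performed at these levels is
        //   int t = coeffs[j];
        //   coeffs[j]     = t + coeffs[j + l];
        //   coeffs[j + l] = montmul(t - coeffs[j + l], zeta);
        // again applied over runs of at least 8 adjacent j values.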
 5164   void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
 5165     const Register coeffs, const Register zetas) {
 5166     int c1 = 0;
 5167     int c2 = 32;
 5168     int startIncr;
 5169     int offsets[4];
 5170     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5171     VSeq<4> vtmp = vs_front(vs3);      // n.b. tmp registers overlap vs3
 5172     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5173 
 5174     offsets[0] = 0;
 5175 
 5176     for (int level = 3; level < 8; level++) {
 5177       int c1Start = c1;
 5178       int c2Start = c2;
 5179       if (level == 3) {
 5180         offsets[1] = 64;
 5181         offsets[2] = 128;
 5182         offsets[3] = 192;
 5183       } else if (level == 4) {
 5184         offsets[1] = 32;
 5185         offsets[2] = 128;
 5186         offsets[3] = 160;
 5187       } else {
 5188         offsets[1] = 32;
 5189         offsets[2] = 64;
 5190         offsets[3] = 96;
 5191       }
 5192 
 5193       // for levels 3 - 7 we simply load 2 x 4 adjacent values at a
 5194       // time at 4 different offsets and multiply them in order by the
 5195       // next set of input values. So we employ indexed load and store
 5196       // pair instructions with arrangement 4S
 5197       for (int i = 0; i < 4; i++) {
 5198         // load v1 32 (8x4S) coefficients relative to first start index
 5199         vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
 5200         // load v2 32 (8x4S) coefficients relative to second start index
 5201         vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
 5202         // a0 = v1 + v2 -- n.b. clobbers vq
 5203         vs_addv(vs3, __ T4S, vs1, vs2);
 5204         // a1 = v1 - v2
 5205         vs_subv(vs1, __ T4S, vs1, vs2);
 5206         // save a0 relative to first start index
 5207         vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
 5208         // load constants q, qinv each iteration as they get clobbered above
 5209         vs_ldpq(vq, dilithiumConsts); // qInv, q
 5210         // load b next 32 (8x4S) inputs
 5211         vs_ldpq_post(vs2, zetas);
 5212         // a = a1 montmul b
 5213         vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5214         // save a relative to second start index
 5215         vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
 5216 
 5217         int k = 4 * level + i;
 5218 
 5219         if (k < 24) {
 5220           startIncr = 256;
 5221         } else if (k == 25) {
 5222           startIncr = 384;
 5223         } else {
 5224           startIncr = 128;
 5225         }
 5226 
 5227         c1Start += startIncr;
 5228         c2Start += startIncr;
 5229       }
 5230 
 5231       c2 *= 2;
 5232     }
 5233   }
 5234 
 5235   // Dilithium Inverse NTT function except for the final mod Q division by 2^256.
 5236   // Implements the method
 5237   // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
 5238   // the sun.security.provider.ML_DSA class.
 5239   //
 5240   // coeffs (int[256]) = c_rarg0
 5241   // zetas (int[256]) = c_rarg1
 5242   address generate_dilithiumAlmostInverseNtt() {
 5243 
 5244     __ align(CodeEntryAlignment);
 5245     StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
 5246     StubCodeMark mark(this, stub_id);
 5247     address start = __ pc();
 5248     __ enter();
 5249 
 5250     const Register coeffs = c_rarg0;
 5251     const Register zetas = c_rarg1;
 5252 
 5253     const Register tmpAddr = r9;
 5254     const Register dilithiumConsts = r10;
 5255     const Register result = r11;
 5256     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5257     VSeq<4> vtmp = vs_front(vs3);     // n.b. tmp registers overlap vs3
 5258     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5259     int offsets[4] = { 0, 32, 64, 96 };
 5260     int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
 5261     int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
 5262 
 5263     __ add(result, coeffs, 0);
 5264     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5265 
 5266     // Each level represents one iteration of the outer for loop of the Java version
 5268 
 5269     // level 0
 5270     // At level 0 we need to interleave adjacent quartets of
 5271     // coefficients before we multiply and add/sub by the next 16
 5272     // zetas just as we did for level 7 in the multiply code. So we
 5273     // load and store the values using an ld2/st2 with arrangement 4S
 5274     for (int i = 0; i < 1024; i += 128) {
 5275       // load constants q, qinv
 5276       // n.b. this can be moved out of the loop as they do not get
 5277       // clobbered by first two loops
 5278       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5279       // a0/a1 load interleaved 32 (8x4S) coefficients
 5280       vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5281       // b load next 16 (4x4S) inputs
 5282       vs_ldpq_post(vs_front(vs2), zetas);
 5283       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 5284       // n.b. second half of vs2 provides temporary register storage
 5285       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 5286                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 5287       // a0/a1 store interleaved 32 (8x4S) coefficients
 5288       vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
 5289     }
 5290 
 5291     // level 1
 5292     // At level 1 we need to interleave pairs of adjacent pairs of
 5293     // coefficients before we multiply by the next 16 zetas just as we
 5294     // did for level 6 in the multiply code. So we load and store the
 5295     // values using an ld2/st2 with arrangement 2D
 5296     for (int i = 0; i < 1024; i += 128) {
 5297       // a0/a1 load interleaved 32 (8x2D) coefficients
 5298       vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5299       // b load next 16 (4x4S) inputs
 5300       vs_ldpq_post(vs_front(vs2), zetas);
 5301       // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
 5302       // n.b. second half of vs2 provides temporary register storage
 5303       dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
 5304                                   vs_front(vs2), vs_back(vs2), vtmp, vq);
 5305       // a0/a1 store interleaved 32 (8x2D) coefficients
 5306       vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
 5307     }
 5308 
 5309     // level 2
 5310     // At level 2 coefficients come in blocks of 4. So, we load 4
 5311     // adjacent coefficients at 8 distinct offsets for both the first
 5312     // and second coefficient sequences, using an ldr with register
 5313     // variant Q and then combine them with the next set of 32 zetas. Likewise
 5314     // we store the results using an str with register variant Q.
 5315     for (int i = 0; i < 1024; i += 256) {
 5316       // c0 load 32 (8x4S) coefficients via first offsets
 5317       vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
 5318       // c1 load 32 (8x4S) coefficients via second offsets
 5319       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
 5320       // a0 = c0 + c1  n.b. clobbers vq which overlaps vs3
 5321       vs_addv(vs3, __ T4S, vs1, vs2);
 5322       // c = c0 - c1
 5323       vs_subv(vs1, __ T4S, vs1, vs2);
 5324       // store a0 32 (8x4S) coefficients via first offsets
 5325       vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
 5326       // b load 32 (8x4S) next inputs
 5327       vs_ldpq_post(vs2, zetas);
 5328       // reload constants q, qinv -- they were clobbered earlier
 5329       vs_ldpq(vq, dilithiumConsts); // qInv, q
 5330       // compute a1 = b montmul c
 5331       vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5332       // store a1 32 (8x4S) coefficients via second offsets
 5333       vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
 5334     }
 5335 
 5336     // level 3-7
 5337     dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
 5338 
 5339     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5340     __ mov(r0, zr); // return 0
 5341     __ ret(lr);
 5342 
 5343     return start;
 5344 
 5345   }
 5346 
 5347   // Dilithium multiply polynomials in the NTT domain.
 5348   // Straightforward implementation of the method
 5349   // static int implDilithiumNttMult(
 5350   //              int[] result, int[] ntta, int[] nttb) {} of
 5351   // the sun.security.provider.ML_DSA class.
 5352   //
 5353   // result (int[256]) = c_rarg0
 5354   // poly1 (int[256]) = c_rarg1
 5355   // poly2 (int[256]) = c_rarg2
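        //
        // n.b. each montmul introduces a factor of 2^-32 mod MONT_Q, so the
        // second multiply below by rSquare (presumably 2^64 mod MONT_Q, loaded
        // from the constants table) cancels both factors and leaves the plain
        // product ntta[i] * nttb[i] mod MONT_Q in the result.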
 5356   address generate_dilithiumNttMult() {
 5357 
 5358     __ align(CodeEntryAlignment);
 5359     StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
 5360     StubCodeMark mark(this, stub_id);
 5361     address start = __ pc();
 5362     __ enter();
 5363 
 5364     Label L_loop;
 5365 
 5366     const Register result = c_rarg0;
 5367     const Register poly1 = c_rarg1;
 5368     const Register poly2 = c_rarg2;
 5369 
 5370     const Register dilithiumConsts = r10;
 5371     const Register len = r11;
 5372 
 5373     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5374     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 5375     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5376     VSeq<8> vrsquare(29, 0);           // for montmul by constant RSQUARE
 5377 
 5378     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5379 
 5380     // load constants q, qinv
 5381     vs_ldpq(vq, dilithiumConsts); // qInv, q
 5382     // load constant rSquare into v29
 5383     __ ldr(v29, __ Q, Address(dilithiumConsts, 48));  // rSquare
 5384 
 5385     __ mov(len, zr);
 5386     __ add(len, len, 1024);
 5387 
 5388     __ BIND(L_loop);
 5389 
 5390     // b load 32 (8x4S) next inputs from poly1
 5391     vs_ldpq_post(vs1, poly1);
 5392     // c load 32 (8x4S) next inputs from poly2
 5393     vs_ldpq_post(vs2, poly2);
 5394     // compute a = b montmul c
 5395     vs_montmul32(vs2, vs1, vs2, vtmp, vq);
 5396     // compute a = rsquare montmul a
 5397     vs_montmul32(vs2, vrsquare, vs2, vtmp, vq);
 5398     // save a 32 (8x4S) results
 5399     vs_stpq_post(vs2, result);
 5400 
 5401     __ sub(len, len, 128);
 5402     __ cmp(len, (u1)128);
 5403     __ br(Assembler::GE, L_loop);
 5404 
 5405     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5406     __ mov(r0, zr); // return 0
 5407     __ ret(lr);
 5408 
 5409     return start;
 5410 
 5411   }
 5412 
 5413   // Dilithium Montgomery multiply an array by a constant.
 5414   // A straightforward implementation of the method
 5415   // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
 5416   // of the sun.security.provider.ML_DSA class
 5417   //
 5418   // coeffs (int[256]) = c_rarg0
 5419   // constant (int) = c_rarg1
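        // Roughly, per coefficient (montMul again being illustrative
        // shorthand for the Montgomery product mod q):
        //   coeffs[i] = montMul(constant, coeffs[i]);
        // with the loop below processing 32 coefficients (8x4S) at a time.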
 5420   address generate_dilithiumMontMulByConstant() {
 5421 
 5422     __ align(CodeEntryAlignment);
 5423     StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
 5424     StubCodeMark mark(this, stub_id);
 5425     address start = __ pc();
 5426     __ enter();
 5427 
 5428     Label L_loop;
 5429 
 5430     const Register coeffs = c_rarg0;
 5431     const Register constant = c_rarg1;
 5432 
 5433     const Register dilithiumConsts = r10;
 5434     const Register result = r11;
 5435     const Register len = r12;
 5436 
 5437     VSeq<8> vs1(0), vs2(16), vs3(24);  // 3 sets of 8x4s inputs/outputs
 5438     VSeq<4> vtmp = vs_front(vs3);         // n.b. tmp registers overlap vs3
 5439     VSeq<2> vq(30);                    // n.b. constants overlap vs3
 5440     VSeq<8> vconst(29, 0);             // for montmul by constant
 5441 
 5442     // results track inputs
 5443     __ add(result, coeffs, 0);
 5444     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5445 
 5446     // load constants q, qinv -- they do not get clobbered by first two loops
 5447     vs_ldpq(vq, dilithiumConsts); // qInv, q
 5448     // copy caller supplied constant across vconst
 5449     __ dup(vconst[0], __ T4S, constant);
 5450     __ mov(len, zr);
 5451     __ add(len, len, 1024);
 5452 
 5453     __ BIND(L_loop);
 5454 
 5455     // load next 32 inputs
 5456     vs_ldpq_post(vs2, coeffs);
 5457     // mont mul by constant
 5458     vs_montmul32(vs2, vconst, vs2, vtmp, vq);
 5459     // write next 32 results
 5460     vs_stpq_post(vs2, result);
 5461 
 5462     __ sub(len, len, 128);
 5463     __ cmp(len, (u1)128);
 5464     __ br(Assembler::GE, L_loop);
 5465 
 5466     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5467     __ mov(r0, zr); // return 0
 5468     __ ret(lr);
 5469 
 5470     return start;
 5471 
 5472   }
 5473 
 5474   // Dilithium decompose poly.
 5475   // Implements the method
 5476   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
        //              int[] highPart, int twoGamma2, int multiplier) {}
 5477   // of the sun.security.provider.ML_DSA class
 5478   //
 5479   // input (int[256]) = c_rarg0
 5480   // lowPart (int[256]) = c_rarg1
 5481   // highPart (int[256]) = c_rarg2
 5482   // twoGamma2  (int) = c_rarg3
 5483   // multiplier (int) = c_rarg4
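        // In rough terms (see the per-step scalar comments in the loop
        // below), each coefficient rplus is first reduced mod q and then
        // split into a low part r0 and a high part r1 so that, apart from
        // the (q - 1) corner case handled below, rplus = r1 * twoGamma2 + r0
        // with r0 in a balanced range around zero; r0 is written to lowPart
        // and r1 to highPart.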
 5484   address generate_dilithiumDecomposePoly() {
 5485 
 5486     __ align(CodeEntryAlignment);
 5487     StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id;
 5488     StubCodeMark mark(this, stub_id);
 5489     address start = __ pc();
 5490     Label L_loop;
 5491 
 5492     const Register input = c_rarg0;
 5493     const Register lowPart = c_rarg1;
 5494     const Register highPart = c_rarg2;
 5495     const Register twoGamma2 = c_rarg3;
 5496     const Register multiplier = c_rarg4;
 5497 
 5498     const Register len = r9;
 5499     const Register dilithiumConsts = r10;
 5500     const Register tmp = r11;
 5501 
 5502     VSeq<4> vs1(0), vs2(4), vs3(8); // 6 independent sets of 4x4s values
 5503     VSeq<4> vs4(12), vs5(16), vtmp(20);
 5504     VSeq<4> one(25, 0);            // 7 constants for cross-multiplying
 5505     VSeq<4> qminus1(26, 0);
 5506     VSeq<4> g2(27, 0);
 5507     VSeq<4> twog2(28, 0);
 5508     VSeq<4> mult(29, 0);
 5509     VSeq<4> q(30, 0);
 5510     VSeq<4> qadd(31, 0);
 5511 
 5512     __ enter();
 5513 
 5514     __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
 5515 
 5516     // save callee-saved registers
 5517     __ stpd(v8, v9, __ pre(sp, -64));
 5518     __ stpd(v10, v11, Address(sp, 16));
 5519     __ stpd(v12, v13, Address(sp, 32));
 5520     __ stpd(v14, v15, Address(sp, 48));
 5521 
 5522     // populate constant registers
 5523     __ mov(tmp, zr);
 5524     __ add(tmp, tmp, 1);
 5525     __ dup(one[0], __ T4S, tmp); // 1
 5526     __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
 5527     __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
 5528     __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
 5529     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
 5530     __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
 5531     __ sshr(g2[0], __ T4S, v28, 1); // gamma2
 5532 
 5533     __ mov(len, zr);
 5534     __ add(len, len, 1024);
 5535 
 5536     __ BIND(L_loop);
 5537 
 5538     // load next 4x4S inputs interleaved: rplus --> vs1
 5539     __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
 5540 
 5541     //  rplus = rplus - ((rplus + qadd) >> 23) * q
 5542     vs_addv(vtmp, __ T4S, vs1, qadd);
 5543     vs_sshr(vtmp, __ T4S, vtmp, 23);
 5544     vs_mulv(vtmp, __ T4S, vtmp, q);
 5545     vs_subv(vs1, __ T4S, vs1, vtmp);
 5546 
 5547     // rplus = rplus + ((rplus >> 31) & dilithium_q);
 5548     vs_sshr(vtmp, __ T4S, vs1, 31);
 5549     vs_andr(vtmp, vtmp, q);
 5550     vs_addv(vs1, __ T4S, vs1, vtmp);
 5551 
 5552     // quotient --> vs2
 5553     // int quotient = (rplus * multiplier) >> 22;
 5554     vs_mulv(vtmp, __ T4S, vs1, mult);
 5555     vs_sshr(vs2, __ T4S, vtmp, 22);
 5556 
 5557     // r0 --> vs3
 5558     // int r0 = rplus - quotient * twoGamma2;
 5559     vs_mulv(vtmp, __ T4S, vs2, twog2);
 5560     vs_subv(vs3, __ T4S, vs1, vtmp);
 5561 
 5562     // mask --> vs4
 5563     // int mask = (twoGamma2 - r0) >> 22;
 5564     vs_subv(vtmp, __ T4S, twog2, vs3);
 5565     vs_sshr(vs4, __ T4S, vtmp, 22);
 5566 
 5567     // r0 -= (mask & twoGamma2);
 5568     vs_andr(vtmp, vs4, twog2);
 5569     vs_subv(vs3, __ T4S, vs3, vtmp);
 5570 
 5571     //  quotient += (mask & 1);
 5572     vs_andr(vtmp, vs4, one);
 5573     vs_addv(vs2, __ T4S, vs2, vtmp);
 5574 
 5575     // mask = (twoGamma2 / 2 - r0) >> 31;
 5576     vs_subv(vtmp, __ T4S, g2, vs3);
 5577     vs_sshr(vs4, __ T4S, vtmp, 31);
 5578 
 5579     // r0 -= (mask & twoGamma2);
 5580     vs_andr(vtmp, vs4, twog2);
 5581     vs_subv(vs3, __ T4S, vs3, vtmp);
 5582 
 5583     // quotient += (mask & 1);
 5584     vs_andr(vtmp, vs4, one);
 5585     vs_addv(vs2, __ T4S, vs2, vtmp);
 5586 
 5587     // r1 --> vs5
 5588     // int r1 = rplus - r0 - (dilithium_q - 1);
 5589     vs_subv(vtmp, __ T4S, vs1, vs3);
 5590     vs_subv(vs5, __ T4S, vtmp, qminus1);
 5591 
 5592     // r1 --> vs1 (overwriting rplus)
 5593     // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
 5594     vs_negr(vtmp, __ T4S, vs5);
 5595     vs_orr(vtmp, vs5, vtmp);
 5596     vs_sshr(vs1, __ T4S, vtmp, 31);
 5597 
 5598     // r0 += ~r1;
 5599     vs_notr(vtmp, vs1);
 5600     vs_addv(vs3, __ T4S, vs3, vtmp);
 5601 
 5602     // r1 = r1 & quotient;
 5603     vs_andr(vs1, vs2, vs1);
 5604 
 5605     // store results interleaved
 5606     // lowPart[m] = r0;
 5607     // highPart[m] = r1;
 5608     __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
 5609     __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
 5610 
 5611 
 5612     __ sub(len, len, 64);
 5613     __ cmp(len, (u1)64);
 5614     __ br(Assembler::GE, L_loop);
 5615 
 5616     // restore callee-saved vector registers
 5617     __ ldpd(v14, v15, Address(sp, 48));
 5618     __ ldpd(v12, v13, Address(sp, 32));
 5619     __ ldpd(v10, v11, Address(sp, 16));
 5620     __ ldpd(v8, v9, __ post(sp, 64));
 5621 
 5622     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5623     __ mov(r0, zr); // return 0
 5624     __ ret(lr);
 5625 
 5626     return start;
 5627 
 5628   }
 5629 
 5630   /**
 5631    *  Arguments:
 5632    *
 5633    * Inputs:
 5634    *   c_rarg0   - int crc
 5635    *   c_rarg1   - byte* buf
 5636    *   c_rarg2   - int length
 5637    *   c_rarg3   - int* table
 5638    *
 5639    * Output:
 5640    *       r0   - int crc result
 5641    */
 5642   address generate_updateBytesCRC32C() {
 5643     assert(UseCRC32CIntrinsics, "what are we doing here?");
 5644 
 5645     __ align(CodeEntryAlignment);
 5646     StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id;
 5647     StubCodeMark mark(this, stub_id);
 5648 
 5649     address start = __ pc();
 5650 
 5651     const Register crc   = c_rarg0;  // crc
 5652     const Register buf   = c_rarg1;  // source java byte array address
 5653     const Register len   = c_rarg2;  // length
 5654     const Register table0 = c_rarg3; // crc_table address
 5655     const Register table1 = c_rarg4;
 5656     const Register table2 = c_rarg5;
 5657     const Register table3 = c_rarg6;
 5658     const Register tmp3 = c_rarg7;
 5659 
 5660     BLOCK_COMMENT("Entry:");
 5661     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5662 
 5663     __ kernel_crc32c(crc, buf, len,
 5664               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
 5665 
 5666     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5667     __ ret(lr);
 5668 
 5669     return start;
 5670   }
 5671 
 5672   /***
 5673    *  Arguments:
 5674    *
 5675    *  Inputs:
 5676    *   c_rarg0   - int   adler
 5677    *   c_rarg1   - byte* buff
 5678    *   c_rarg2   - int   len
 5679    *
 5680    * Output:
 5681    *   c_rarg0   - int adler result
 5682    */
 5683   address generate_updateBytesAdler32() {
 5684     __ align(CodeEntryAlignment);
 5685     StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id;
 5686     StubCodeMark mark(this, stub_id);
 5687     address start = __ pc();
 5688 
 5689     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
 5690 
 5691     // Aliases
 5692     Register adler  = c_rarg0;
 5693     Register s1     = c_rarg0;
 5694     Register s2     = c_rarg3;
 5695     Register buff   = c_rarg1;
 5696     Register len    = c_rarg2;
 5697     Register nmax  = r4;
 5698     Register base  = r5;
 5699     Register count = r6;
 5700     Register temp0 = rscratch1;
 5701     Register temp1 = rscratch2;
 5702     FloatRegister vbytes = v0;
 5703     FloatRegister vs1acc = v1;
 5704     FloatRegister vs2acc = v2;
 5705     FloatRegister vtable = v3;
 5706 
 5707     // Max number of bytes we can process before having to take the mod
 5708     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
 5709     uint64_t BASE = 0xfff1;
 5710     uint64_t NMAX = 0x15B0;
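          // Quick sanity check of that bound for n = 5552:
          //   255*5552*5553/2 = 3930857640 and 5553*(BASE-1) = 363832560,
          //   which sum to 4294690200 <= 4294967295, while n = 5553 already
          //   overflows 2^32-1.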
 5711 
 5712     __ mov(base, BASE);
 5713     __ mov(nmax, NMAX);
 5714 
 5715     // Load accumulation coefficients for the upper 16 bits
 5716     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
 5717     __ ld1(vtable, __ T16B, Address(temp0));
 5718 
 5719     // s1 is initialized to the lower 16 bits of adler
 5720     // s2 is initialized to the upper 16 bits of adler
 5721     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
 5722     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
 5723 
 5724     // The pipelined loop needs at least 16 elements for 1 iteration
 5725     // It does check this, but it is more effective to skip to the cleanup loop
 5726     __ cmp(len, (u1)16);
 5727     __ br(Assembler::HS, L_nmax);
 5728     __ cbz(len, L_combine);
 5729 
 5730     __ bind(L_simple_by1_loop);
 5731     __ ldrb(temp0, Address(__ post(buff, 1)));
 5732     __ add(s1, s1, temp0);
 5733     __ add(s2, s2, s1);
 5734     __ subs(len, len, 1);
 5735     __ br(Assembler::HI, L_simple_by1_loop);
 5736 
 5737     // s1 = s1 % BASE
 5738     __ subs(temp0, s1, base);
 5739     __ csel(s1, temp0, s1, Assembler::HS);
 5740 
 5741     // s2 = s2 % BASE
 5742     __ lsr(temp0, s2, 16);
 5743     __ lsl(temp1, temp0, 4);
 5744     __ sub(temp1, temp1, temp0);
 5745     __ add(s2, temp1, s2, ext::uxth);
 5746 
 5747     __ subs(temp0, s2, base);
 5748     __ csel(s2, temp0, s2, Assembler::HS);
 5749 
 5750     __ b(L_combine);
 5751 
 5752     __ bind(L_nmax);
 5753     __ subs(len, len, nmax);
 5754     __ sub(count, nmax, 16);
 5755     __ br(Assembler::LO, L_by16);
 5756 
 5757     __ bind(L_nmax_loop);
 5758 
 5759     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5760                                       vbytes, vs1acc, vs2acc, vtable);
 5761 
 5762     __ subs(count, count, 16);
 5763     __ br(Assembler::HS, L_nmax_loop);
 5764 
 5765     // s1 = s1 % BASE
 5766     __ lsr(temp0, s1, 16);
 5767     __ lsl(temp1, temp0, 4);
 5768     __ sub(temp1, temp1, temp0);
 5769     __ add(temp1, temp1, s1, ext::uxth);
 5770 
 5771     __ lsr(temp0, temp1, 16);
 5772     __ lsl(s1, temp0, 4);
 5773     __ sub(s1, s1, temp0);
 5774     __ add(s1, s1, temp1, ext::uxth);
 5775 
 5776     __ subs(temp0, s1, base);
 5777     __ csel(s1, temp0, s1, Assembler::HS);
 5778 
 5779     // s2 = s2 % BASE
 5780     __ lsr(temp0, s2, 16);
 5781     __ lsl(temp1, temp0, 4);
 5782     __ sub(temp1, temp1, temp0);
 5783     __ add(temp1, temp1, s2, ext::uxth);
 5784 
 5785     __ lsr(temp0, temp1, 16);
 5786     __ lsl(s2, temp0, 4);
 5787     __ sub(s2, s2, temp0);
 5788     __ add(s2, s2, temp1, ext::uxth);
 5789 
 5790     __ subs(temp0, s2, base);
 5791     __ csel(s2, temp0, s2, Assembler::HS);
 5792 
 5793     __ subs(len, len, nmax);
 5794     __ sub(count, nmax, 16);
 5795     __ br(Assembler::HS, L_nmax_loop);
 5796 
 5797     __ bind(L_by16);
 5798     __ adds(len, len, count);
 5799     __ br(Assembler::LO, L_by1);
 5800 
 5801     __ bind(L_by16_loop);
 5802 
 5803     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
 5804                                       vbytes, vs1acc, vs2acc, vtable);
 5805 
 5806     __ subs(len, len, 16);
 5807     __ br(Assembler::HS, L_by16_loop);
 5808 
 5809     __ bind(L_by1);
 5810     __ adds(len, len, 15);
 5811     __ br(Assembler::LO, L_do_mod);
 5812 
 5813     __ bind(L_by1_loop);
 5814     __ ldrb(temp0, Address(__ post(buff, 1)));
 5815     __ add(s1, temp0, s1);
 5816     __ add(s2, s2, s1);
 5817     __ subs(len, len, 1);
 5818     __ br(Assembler::HS, L_by1_loop);
 5819 
 5820     __ bind(L_do_mod);
 5821     // s1 = s1 % BASE
 5822     __ lsr(temp0, s1, 16);
 5823     __ lsl(temp1, temp0, 4);
 5824     __ sub(temp1, temp1, temp0);
 5825     __ add(temp1, temp1, s1, ext::uxth);
 5826 
 5827     __ lsr(temp0, temp1, 16);
 5828     __ lsl(s1, temp0, 4);
 5829     __ sub(s1, s1, temp0);
 5830     __ add(s1, s1, temp1, ext::uxth);
 5831 
 5832     __ subs(temp0, s1, base);
 5833     __ csel(s1, temp0, s1, Assembler::HS);
 5834 
 5835     // s2 = s2 % BASE
 5836     __ lsr(temp0, s2, 16);
 5837     __ lsl(temp1, temp0, 4);
 5838     __ sub(temp1, temp1, temp0);
 5839     __ add(temp1, temp1, s2, ext::uxth);
 5840 
 5841     __ lsr(temp0, temp1, 16);
 5842     __ lsl(s2, temp0, 4);
 5843     __ sub(s2, s2, temp0);
 5844     __ add(s2, s2, temp1, ext::uxth);
 5845 
 5846     __ subs(temp0, s2, base);
 5847     __ csel(s2, temp0, s2, Assembler::HS);
 5848 
 5849     // Combine lower bits and higher bits
 5850     __ bind(L_combine);
 5851     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
 5852 
 5853     __ ret(lr);
 5854 
 5855     return start;
 5856   }
 5857 
 5858   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
 5859           Register temp0, Register temp1, FloatRegister vbytes,
 5860           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
 5861     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
 5862     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
 5863     // In non-vectorized code, we update s1 and s2 as:
 5864     //   s1 <- s1 + b1
 5865     //   s2 <- s2 + s1
 5866     //   s1 <- s1 + b2
 5867     //   s2 <- s2 + s1
 5868     //   ...
 5869     //   s1 <- s1 + b16
 5870     //   s2 <- s2 + s1
 5871     // Putting above assignments together, we have:
 5872     //   s1_new = s1 + b1 + b2 + ... + b16
 5873     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
 5874     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
 5875     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
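          // A scalar sketch of this 16-byte step (the vtable lanes act as
          // the weights <16, 15, ..., 1> in the dot product above):
          //   for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += s1; }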
 5876     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
 5877 
 5878     // s2 = s2 + s1 * 16
 5879     __ add(s2, s2, s1, Assembler::LSL, 4);
 5880 
 5881     // vs1acc = b1 + b2 + b3 + ... + b16
 5882     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
 5883     __ umullv(vs2acc, __ T8B, vtable, vbytes);
 5884     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
 5885     __ uaddlv(vs1acc, __ T16B, vbytes);
 5886     __ uaddlv(vs2acc, __ T8H, vs2acc);
 5887 
 5888     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
 5889     __ fmovd(temp0, vs1acc);
 5890     __ fmovd(temp1, vs2acc);
 5891     __ add(s1, s1, temp0);
 5892     __ add(s2, s2, temp1);
 5893   }
 5894 
 5895   /**
 5896    *  Arguments:
 5897    *
 5898    *  Input:
 5899    *    c_rarg0   - x address
 5900    *    c_rarg1   - x length
 5901    *    c_rarg2   - y address
 5902    *    c_rarg3   - y length
 5903    *    c_rarg4   - z address
 5904    */
 5905   address generate_multiplyToLen() {
 5906     __ align(CodeEntryAlignment);
 5907     StubGenStubId stub_id = StubGenStubId::multiplyToLen_id;
 5908     StubCodeMark mark(this, stub_id);
 5909 
 5910     address start = __ pc();
 5911     const Register x     = r0;
 5912     const Register xlen  = r1;
 5913     const Register y     = r2;
 5914     const Register ylen  = r3;
 5915     const Register z     = r4;
 5916 
 5917     const Register tmp0  = r5;
 5918     const Register tmp1  = r10;
 5919     const Register tmp2  = r11;
 5920     const Register tmp3  = r12;
 5921     const Register tmp4  = r13;
 5922     const Register tmp5  = r14;
 5923     const Register tmp6  = r15;
 5924     const Register tmp7  = r16;
 5925 
 5926     BLOCK_COMMENT("Entry:");
 5927     __ enter(); // required for proper stackwalking of RuntimeStub frame
 5928     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5929     __ leave(); // required for proper stackwalking of RuntimeStub frame
 5930     __ ret(lr);
 5931 
 5932     return start;
 5933   }
 5934 
 5935   address generate_squareToLen() {
 5936     // squareToLen algorithm for sizes 1..127 described in java code works
 5937     // faster than multiply_to_len on some CPUs and slower on others, but
 5938     // multiply_to_len shows a bit better overall results
 5939     __ align(CodeEntryAlignment);
 5940     StubGenStubId stub_id = StubGenStubId::squareToLen_id;
 5941     StubCodeMark mark(this, stub_id);
 5942     address start = __ pc();
 5943 
 5944     const Register x     = r0;
 5945     const Register xlen  = r1;
 5946     const Register z     = r2;
 5947     const Register y     = r4; // == x
 5948     const Register ylen  = r5; // == xlen
 5949 
 5950     const Register tmp0  = r3;
 5951     const Register tmp1  = r10;
 5952     const Register tmp2  = r11;
 5953     const Register tmp3  = r12;
 5954     const Register tmp4  = r13;
 5955     const Register tmp5  = r14;
 5956     const Register tmp6  = r15;
 5957     const Register tmp7  = r16;
 5958 
 5959     RegSet spilled_regs = RegSet::of(y, ylen);
 5960     BLOCK_COMMENT("Entry:");
 5961     __ enter();
 5962     __ push(spilled_regs, sp);
 5963     __ mov(y, x);
 5964     __ mov(ylen, xlen);
 5965     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
 5966     __ pop(spilled_regs, sp);
 5967     __ leave();
 5968     __ ret(lr);
 5969     return start;
 5970   }
 5971 
 5972   address generate_mulAdd() {
 5973     __ align(CodeEntryAlignment);
 5974     StubGenStubId stub_id = StubGenStubId::mulAdd_id;
 5975     StubCodeMark mark(this, stub_id);
 5976 
 5977     address start = __ pc();
 5978 
 5979     const Register out     = r0;
 5980     const Register in      = r1;
 5981     const Register offset  = r2;
 5982     const Register len     = r3;
 5983     const Register k       = r4;
 5984 
 5985     BLOCK_COMMENT("Entry:");
 5986     __ enter();
 5987     __ mul_add(out, in, offset, len, k);
 5988     __ leave();
 5989     __ ret(lr);
 5990 
 5991     return start;
 5992   }
 5993 
 5994   // Arguments:
 5995   //
 5996   // Input:
 5997   //   c_rarg0   - newArr address
 5998   //   c_rarg1   - oldArr address
 5999   //   c_rarg2   - newIdx
 6000   //   c_rarg3   - shiftCount
 6001   //   c_rarg4   - numIter
 6002   //
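        // Per word, the shift below roughly computes
        //   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
        //                      | (oldArr[i] << (32 - shiftCount))
        // handling 4 words per SIMD iteration where possible.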
 6003   address generate_bigIntegerRightShift() {
 6004     __ align(CodeEntryAlignment);
 6005     StubGenStubId stub_id = StubGenStubId::bigIntegerRightShiftWorker_id;
 6006     StubCodeMark mark(this, stub_id);
 6007     address start = __ pc();
 6008 
 6009     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 6010 
 6011     Register newArr        = c_rarg0;
 6012     Register oldArr        = c_rarg1;
 6013     Register newIdx        = c_rarg2;
 6014     Register shiftCount    = c_rarg3;
 6015     Register numIter       = c_rarg4;
 6016     Register idx           = numIter;
 6017 
 6018     Register newArrCur     = rscratch1;
 6019     Register shiftRevCount = rscratch2;
 6020     Register oldArrCur     = r13;
 6021     Register oldArrNext    = r14;
 6022 
 6023     FloatRegister oldElem0        = v0;
 6024     FloatRegister oldElem1        = v1;
 6025     FloatRegister newElem         = v2;
 6026     FloatRegister shiftVCount     = v3;
 6027     FloatRegister shiftVRevCount  = v4;
 6028 
 6029     __ cbz(idx, Exit);
 6030 
 6031     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6032 
 6033     // left shift count
 6034     __ movw(shiftRevCount, 32);
 6035     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6036 
 6037     // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
 6038     __ cmp(numIter, (u1)4);
 6039     __ br(Assembler::LT, ShiftThree);
 6040 
 6041     __ dup(shiftVCount,    __ T4S, shiftCount);
 6042     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
 6043     __ negr(shiftVCount,   __ T4S, shiftVCount);
 6044 
 6045     __ BIND(ShiftSIMDLoop);
 6046 
 6047     // Calculate the load addresses
 6048     __ sub(idx, idx, 4);
 6049     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6050     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6051     __ add(oldArrCur,  oldArrNext, 4);
 6052 
 6053     // Load 4 words and process
 6054     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
 6055     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
 6056     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6057     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6058     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6059     __ st1(newElem,   __ T4S,  Address(newArrCur));
 6060 
 6061     __ cmp(idx, (u1)4);
 6062     __ br(Assembler::LT, ShiftTwoLoop);
 6063     __ b(ShiftSIMDLoop);
 6064 
 6065     __ BIND(ShiftTwoLoop);
 6066     __ cbz(idx, Exit);
 6067     __ cmp(idx, (u1)1);
 6068     __ br(Assembler::EQ, ShiftOne);
 6069 
 6070     // Calculate the load addresses
 6071     __ sub(idx, idx, 2);
 6072     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
 6073     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
 6074     __ add(oldArrCur,  oldArrNext, 4);
 6075 
 6076     // Load 2 words and process
 6077     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
 6078     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
 6079     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
 6080     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
 6081     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
 6082     __ st1(newElem,   __ T2S, Address(newArrCur));
 6083     __ b(ShiftTwoLoop);
 6084 
 6085     __ BIND(ShiftThree);
 6086     __ tbz(idx, 1, ShiftOne);
 6087     __ tbz(idx, 0, ShiftTwo);
 6088     __ ldrw(r10,  Address(oldArr, 12));
 6089     __ ldrw(r11,  Address(oldArr, 8));
 6090     __ lsrvw(r10, r10, shiftCount);
 6091     __ lslvw(r11, r11, shiftRevCount);
 6092     __ orrw(r12,  r10, r11);
 6093     __ strw(r12,  Address(newArr, 8));
 6094 
 6095     __ BIND(ShiftTwo);
 6096     __ ldrw(r10,  Address(oldArr, 8));
 6097     __ ldrw(r11,  Address(oldArr, 4));
 6098     __ lsrvw(r10, r10, shiftCount);
 6099     __ lslvw(r11, r11, shiftRevCount);
 6100     __ orrw(r12,  r10, r11);
 6101     __ strw(r12,  Address(newArr, 4));
 6102 
 6103     __ BIND(ShiftOne);
 6104     __ ldrw(r10,  Address(oldArr, 4));
 6105     __ ldrw(r11,  Address(oldArr));
 6106     __ lsrvw(r10, r10, shiftCount);
 6107     __ lslvw(r11, r11, shiftRevCount);
 6108     __ orrw(r12,  r10, r11);
 6109     __ strw(r12,  Address(newArr));
 6110 
 6111     __ BIND(Exit);
 6112     __ ret(lr);
 6113 
 6114     return start;
 6115   }
 6116 
 6117   // Arguments:
 6118   //
 6119   // Input:
 6120   //   c_rarg0   - newArr address
 6121   //   c_rarg1   - oldArr address
 6122   //   c_rarg2   - newIdx
 6123   //   c_rarg3   - shiftCount
 6124   //   c_rarg4   - numIter
 6125   //
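        // Per word, the shift below roughly computes
        //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
        //                      | (oldArr[i + 1] >>> (32 - shiftCount))
        // handling 4 words per SIMD iteration where possible.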
 6126   address generate_bigIntegerLeftShift() {
 6127     __ align(CodeEntryAlignment);
 6128     StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id;
 6129     StubCodeMark mark(this, stub_id);
 6130     address start = __ pc();
 6131 
 6132     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
 6133 
 6134     Register newArr        = c_rarg0;
 6135     Register oldArr        = c_rarg1;
 6136     Register newIdx        = c_rarg2;
 6137     Register shiftCount    = c_rarg3;
 6138     Register numIter       = c_rarg4;
 6139 
 6140     Register shiftRevCount = rscratch1;
 6141     Register oldArrNext    = rscratch2;
 6142 
 6143     FloatRegister oldElem0        = v0;
 6144     FloatRegister oldElem1        = v1;
 6145     FloatRegister newElem         = v2;
 6146     FloatRegister shiftVCount     = v3;
 6147     FloatRegister shiftVRevCount  = v4;
 6148 
 6149     __ cbz(numIter, Exit);
 6150 
 6151     __ add(oldArrNext, oldArr, 4);
 6152     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
 6153 
 6154     // right shift count
 6155     __ movw(shiftRevCount, 32);
 6156     __ subw(shiftRevCount, shiftRevCount, shiftCount);
 6157 
 6158     // numIter too small to allow a 4-word SIMD loop; fall back to scalar code
 6159     __ cmp(numIter, (u1)4);
 6160     __ br(Assembler::LT, ShiftThree);
 6161 
 6162     __ dup(shiftVCount,     __ T4S, shiftCount);
 6163     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
 6164     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
 6165 
 6166     __ BIND(ShiftSIMDLoop);
 6167 
 6168     // load 4 words and process
 6169     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
 6170     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
 6171     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
 6172     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
 6173     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
 6174     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
 6175     __ sub(numIter,   numIter, 4);
 6176 
 6177     __ cmp(numIter, (u1)4);
 6178     __ br(Assembler::LT, ShiftTwoLoop);
 6179     __ b(ShiftSIMDLoop);
 6180 
 6181     __ BIND(ShiftTwoLoop);
 6182     __ cbz(numIter, Exit);
 6183     __ cmp(numIter, (u1)1);
 6184     __ br(Assembler::EQ, ShiftOne);
 6185 
 6186     // load 2 words and process
 6187     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
 6188     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
 6189     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
 6190     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
 6191     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
 6192     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
 6193     __ sub(numIter,   numIter, 2);
 6194     __ b(ShiftTwoLoop);
 6195 
 6196     __ BIND(ShiftThree);
 6197     __ ldrw(r10,  __ post(oldArr, 4));
 6198     __ ldrw(r11,  __ post(oldArrNext, 4));
 6199     __ lslvw(r10, r10, shiftCount);
 6200     __ lsrvw(r11, r11, shiftRevCount);
 6201     __ orrw(r12,  r10, r11);
 6202     __ strw(r12,  __ post(newArr, 4));
 6203     __ tbz(numIter, 1, Exit);
 6204     __ tbz(numIter, 0, ShiftOne);
 6205 
 6206     __ BIND(ShiftTwo);
 6207     __ ldrw(r10,  __ post(oldArr, 4));
 6208     __ ldrw(r11,  __ post(oldArrNext, 4));
 6209     __ lslvw(r10, r10, shiftCount);
 6210     __ lsrvw(r11, r11, shiftRevCount);
 6211     __ orrw(r12,  r10, r11);
 6212     __ strw(r12,  __ post(newArr, 4));
 6213 
 6214     __ BIND(ShiftOne);
 6215     __ ldrw(r10,  Address(oldArr));
 6216     __ ldrw(r11,  Address(oldArrNext));
 6217     __ lslvw(r10, r10, shiftCount);
 6218     __ lsrvw(r11, r11, shiftRevCount);
 6219     __ orrw(r12,  r10, r11);
 6220     __ strw(r12,  Address(newArr));
 6221 
 6222     __ BIND(Exit);
 6223     __ ret(lr);
 6224 
 6225     return start;
 6226   }
 6227 
 6228   address generate_count_positives(address &count_positives_long) {
 6229     const u1 large_loop_size = 64;
 6230     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
 6231     int dcache_line = VM_Version::dcache_line_size();
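          // A byte is negative iff its top bit is set, so OR-ing loaded words
          // together and testing against UPPER_BIT_MASK detects any negative
          // byte in the group with a single tst.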
 6232 
 6233     Register ary1 = r1, len = r2, result = r0;
 6234 
 6235     __ align(CodeEntryAlignment);
 6236 
 6237     StubGenStubId stub_id = StubGenStubId::count_positives_id;
 6238     StubCodeMark mark(this, stub_id);
 6239 
 6240     address entry = __ pc();
 6241 
 6242     __ enter();
 6243     // precondition: a copy of len is already in result
 6244     // __ mov(result, len);
 6245 
 6246   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
 6247         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 6248 
 6249   __ cmp(len, (u1)15);
 6250   __ br(Assembler::GT, LEN_OVER_15);
 6251   // The only case when execution falls into this code is when the pointer is
 6252   // near the end of a memory page and we have to avoid reading the next page
 6253   __ add(ary1, ary1, len);
 6254   __ subs(len, len, 8);
 6255   __ br(Assembler::GT, LEN_OVER_8);
 6256   __ ldr(rscratch2, Address(ary1, -8));
 6257   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
 6258   __ lsrv(rscratch2, rscratch2, rscratch1);
 6259   __ tst(rscratch2, UPPER_BIT_MASK);
 6260   __ csel(result, zr, result, Assembler::NE);
 6261   __ leave();
 6262   __ ret(lr);
 6263   __ bind(LEN_OVER_8);
 6264   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
 6265   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
 6266   __ tst(rscratch2, UPPER_BIT_MASK);
 6267   __ br(Assembler::NE, RET_NO_POP);
 6268   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
 6269   __ lsrv(rscratch1, rscratch1, rscratch2);
 6270   __ tst(rscratch1, UPPER_BIT_MASK);
 6271   __ bind(RET_NO_POP);
 6272   __ csel(result, zr, result, Assembler::NE);
 6273   __ leave();
 6274   __ ret(lr);
 6275 
 6276   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
 6277   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
 6278 
 6279   count_positives_long = __ pc(); // 2nd entry point
 6280 
 6281   __ enter();
 6282 
 6283   __ bind(LEN_OVER_15);
 6284     __ push(spilled_regs, sp);
 6285     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
 6286     __ cbz(rscratch2, ALIGNED);
 6287     __ ldp(tmp6, tmp1, Address(ary1));
 6288     __ mov(tmp5, 16);
 6289     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
 6290     __ add(ary1, ary1, rscratch1);
 6291     __ orr(tmp6, tmp6, tmp1);
 6292     __ tst(tmp6, UPPER_BIT_MASK);
 6293     __ br(Assembler::NE, RET_ADJUST);
 6294     __ sub(len, len, rscratch1);
 6295 
 6296   __ bind(ALIGNED);
 6297     __ cmp(len, large_loop_size);
 6298     __ br(Assembler::LT, CHECK_16);
 6299     // Perform a 16-byte load as an early return in the pre-loop to handle the
 6300     // situation when an initially aligned large array has negative values in its
 6301     // starting bytes, in which case LARGE_LOOP would do 4 reads instead of 1
 6302     // (in the worst case), which is slower. Cases with negative bytes further
 6303     // ahead won't be affected much. In fact, they'll be faster due to early
 6304     // loads, fewer instructions and fewer branches in LARGE_LOOP.
 6305     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
 6306     __ sub(len, len, 16);
 6307     __ orr(tmp6, tmp6, tmp1);
 6308     __ tst(tmp6, UPPER_BIT_MASK);
 6309     __ br(Assembler::NE, RET_ADJUST_16);
 6310     __ cmp(len, large_loop_size);
 6311     __ br(Assembler::LT, CHECK_16);
 6312 
 6313     if (SoftwarePrefetchHintDistance >= 0
 6314         && SoftwarePrefetchHintDistance >= dcache_line) {
 6315       // initial prefetch
 6316       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
 6317     }
 6318   __ bind(LARGE_LOOP);
 6319     if (SoftwarePrefetchHintDistance >= 0) {
 6320       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
 6321     }
 6322     // Issue the load instructions first, since this can save a few CPU/MEM cycles.
 6323     // Also, instead of 4 "orr(...); andr(...); cbnz(...)" triples (one per ldp), it
 6324     // is better to generate 7 orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
 6325     // instructions per iteration and has fewer branches; the downside is that early
 6326     // return is disabled, so all 64 bytes are loaded and checked every time.
 6327     __ ldp(tmp2, tmp3, Address(ary1));
 6328     __ ldp(tmp4, tmp5, Address(ary1, 16));
 6329     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
 6330     __ ldp(tmp6, tmp1, Address(ary1, 48));
 6331     __ add(ary1, ary1, large_loop_size);
 6332     __ sub(len, len, large_loop_size);
 6333     __ orr(tmp2, tmp2, tmp3);
 6334     __ orr(tmp4, tmp4, tmp5);
 6335     __ orr(rscratch1, rscratch1, rscratch2);
 6336     __ orr(tmp6, tmp6, tmp1);
 6337     __ orr(tmp2, tmp2, tmp4);
 6338     __ orr(rscratch1, rscratch1, tmp6);
 6339     __ orr(tmp2, tmp2, rscratch1);
 6340     __ tst(tmp2, UPPER_BIT_MASK);
 6341     __ br(Assembler::NE, RET_ADJUST_LONG);
 6342     __ cmp(len, large_loop_size);
 6343     __ br(Assembler::GE, LARGE_LOOP);
 6344 
 6345   __ bind(CHECK_16); // small 16-byte load pre-loop
 6346     __ cmp(len, (u1)16);
 6347     __ br(Assembler::LT, POST_LOOP16);
 6348 
 6349   __ bind(LOOP16); // small 16-byte load loop
 6350     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
 6351     __ sub(len, len, 16);
 6352     __ orr(tmp2, tmp2, tmp3);
 6353     __ tst(tmp2, UPPER_BIT_MASK);
 6354     __ br(Assembler::NE, RET_ADJUST_16);
 6355     __ cmp(len, (u1)16);
 6356     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 6357 
 6358   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
 6359     __ cmp(len, (u1)8);
 6360     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
 6361     __ ldr(tmp3, Address(__ post(ary1, 8)));
 6362     __ tst(tmp3, UPPER_BIT_MASK);
 6363     __ br(Assembler::NE, RET_ADJUST);
 6364     __ sub(len, len, 8);
 6365 
 6366   __ bind(POST_LOOP16_LOAD_TAIL);
 6367     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
 6368     __ ldr(tmp1, Address(ary1));
 6369     __ mov(tmp2, 64);
 6370     __ sub(tmp4, tmp2, len, __ LSL, 3);
 6371     __ lslv(tmp1, tmp1, tmp4);
 6372     __ tst(tmp1, UPPER_BIT_MASK);
 6373     __ br(Assembler::NE, RET_ADJUST);
 6374     // Fallthrough
 6375 
 6376   __ bind(RET_LEN);
 6377     __ pop(spilled_regs, sp);
 6378     __ leave();
 6379     __ ret(lr);
 6380 
 6381     // the difference (result - len) is the count of bytes that are
 6382     // guaranteed to be positive
 6383 
 6384   __ bind(RET_ADJUST_LONG);
 6385     __ add(len, len, (u1)(large_loop_size - 16));
 6386   __ bind(RET_ADJUST_16);
 6387     __ add(len, len, 16);
 6388   __ bind(RET_ADJUST);
 6389     __ pop(spilled_regs, sp);
 6390     __ leave();
 6391     __ sub(result, result, len);
 6392     __ ret(lr);
 6393 
 6394     return entry;
 6395   }
 6396 
 6397   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
 6398         bool usePrefetch, Label &NOT_EQUAL) {
 6399     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6400         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6401         tmp7 = r12, tmp8 = r13;
 6402     Label LOOP;
 6403 
 6404     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6405     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6406     __ bind(LOOP);
 6407     if (usePrefetch) {
 6408       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6409       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6410     }
 6411     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6412     __ eor(tmp1, tmp1, tmp2);
 6413     __ eor(tmp3, tmp3, tmp4);
 6414     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6415     __ orr(tmp1, tmp1, tmp3);
 6416     __ cbnz(tmp1, NOT_EQUAL);
 6417     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6418     __ eor(tmp5, tmp5, tmp6);
 6419     __ eor(tmp7, tmp7, tmp8);
 6420     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6421     __ orr(tmp5, tmp5, tmp7);
 6422     __ cbnz(tmp5, NOT_EQUAL);
 6423     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
 6424     __ eor(tmp1, tmp1, tmp2);
 6425     __ eor(tmp3, tmp3, tmp4);
 6426     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
 6427     __ orr(tmp1, tmp1, tmp3);
 6428     __ cbnz(tmp1, NOT_EQUAL);
 6429     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
 6430     __ eor(tmp5, tmp5, tmp6);
 6431     __ sub(cnt1, cnt1, 8 * wordSize);
 6432     __ eor(tmp7, tmp7, tmp8);
 6433     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
 6434     // tmp6 is not used. MacroAssembler::subs is used here (rather than
 6435     // cmp) because subs allows an unlimited range of immediate operands.
 6436     __ subs(tmp6, cnt1, loopThreshold);
 6437     __ orr(tmp5, tmp5, tmp7);
 6438     __ cbnz(tmp5, NOT_EQUAL);
 6439     __ br(__ GE, LOOP);
 6440     // post-loop
 6441     __ eor(tmp1, tmp1, tmp2);
 6442     __ eor(tmp3, tmp3, tmp4);
 6443     __ orr(tmp1, tmp1, tmp3);
 6444     __ sub(cnt1, cnt1, 2 * wordSize);
 6445     __ cbnz(tmp1, NOT_EQUAL);
 6446   }
 6447 
 6448   void generate_large_array_equals_loop_simd(int loopThreshold,
 6449         bool usePrefetch, Label &NOT_EQUAL) {
 6450     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6451         tmp2 = rscratch2;
 6452     Label LOOP;
 6453 
 6454     __ bind(LOOP);
 6455     if (usePrefetch) {
 6456       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
 6457       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
 6458     }
 6459     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
 6460     __ sub(cnt1, cnt1, 8 * wordSize);
 6461     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
 6462     __ subs(tmp1, cnt1, loopThreshold);
 6463     __ eor(v0, __ T16B, v0, v4);
 6464     __ eor(v1, __ T16B, v1, v5);
 6465     __ eor(v2, __ T16B, v2, v6);
 6466     __ eor(v3, __ T16B, v3, v7);
 6467     __ orr(v0, __ T16B, v0, v1);
 6468     __ orr(v1, __ T16B, v2, v3);
 6469     __ orr(v0, __ T16B, v0, v1);
 6470     __ umov(tmp1, v0, __ D, 0);
 6471     __ umov(tmp2, v0, __ D, 1);
 6472     __ orr(tmp1, tmp1, tmp2);
 6473     __ cbnz(tmp1, NOT_EQUAL);
 6474     __ br(__ GE, LOOP);
 6475   }
 6476 
 6477   // a1 = r1 - array1 address
 6478   // a2 = r2 - array2 address
 6479   // result = r0 - return value. Already contains "false"
 6480   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
 6481   // r3-r5 are reserved temporary registers
 6482   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
 6483   address generate_large_array_equals() {
 6484     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
 6485         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
 6486         tmp7 = r12, tmp8 = r13;
 6487     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
 6488         SMALL_LOOP, POST_LOOP;
 6489     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
 6490     // calculate if at least 32 prefetched bytes are used
 6491     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
 6492     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
 6493     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
 6494     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
 6495         tmp5, tmp6, tmp7, tmp8);
 6496 
 6497     __ align(CodeEntryAlignment);
 6498 
 6499     StubGenStubId stub_id = StubGenStubId::large_array_equals_id;
 6500     StubCodeMark mark(this, stub_id);
 6501 
 6502     address entry = __ pc();
 6503     __ enter();
 6504     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
 6505     // also advance pointers to use post-increment instead of pre-increment
 6506     __ add(a1, a1, wordSize);
 6507     __ add(a2, a2, wordSize);
 6508     if (AvoidUnalignedAccesses) {
 6509       // Both implementations (SIMD/non-SIMD) use relatively large load
 6510       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
 6511       // time) on some CPUs when the address is not at least 16-byte aligned.
 6512       // Arrays are currently 8-byte aligned, so, if needed, we do an additional
 6513       // 8-byte load for the 1st address to make it 16-byte aligned.
 6514       Label ALIGNED16;
 6515       __ tbz(a1, 3, ALIGNED16);
 6516       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6517       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6518       __ sub(cnt1, cnt1, wordSize);
 6519       __ eor(tmp1, tmp1, tmp2);
 6520       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
 6521       __ bind(ALIGNED16);
 6522     }
 6523     if (UseSIMDForArrayEquals) {
 6524       if (SoftwarePrefetchHintDistance >= 0) {
 6525         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6526         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6527         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
 6528             /* prfm = */ true, NOT_EQUAL);
 6529         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6530         __ br(__ LT, TAIL);
 6531       }
 6532       __ bind(NO_PREFETCH_LARGE_LOOP);
 6533       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
 6534           /* prfm = */ false, NOT_EQUAL);
 6535     } else {
 6536       __ push(spilled_regs, sp);
 6537       if (SoftwarePrefetchHintDistance >= 0) {
 6538         __ subs(tmp1, cnt1, prefetchLoopThreshold);
 6539         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
 6540         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
 6541             /* prfm = */ true, NOT_EQUAL);
 6542         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
 6543         __ br(__ LT, TAIL);
 6544       }
 6545       __ bind(NO_PREFETCH_LARGE_LOOP);
 6546       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
 6547           /* prfm = */ false, NOT_EQUAL);
 6548     }
 6549     __ bind(TAIL);
 6550       __ cbz(cnt1, EQUAL);
 6551       __ subs(cnt1, cnt1, wordSize);
 6552       __ br(__ LE, POST_LOOP);
 6553     __ bind(SMALL_LOOP);
 6554       __ ldr(tmp1, Address(__ post(a1, wordSize)));
 6555       __ ldr(tmp2, Address(__ post(a2, wordSize)));
 6556       __ subs(cnt1, cnt1, wordSize);
 6557       __ eor(tmp1, tmp1, tmp2);
 6558       __ cbnz(tmp1, NOT_EQUAL);
 6559       __ br(__ GT, SMALL_LOOP);
 6560     __ bind(POST_LOOP);
 6561       __ ldr(tmp1, Address(a1, cnt1));
 6562       __ ldr(tmp2, Address(a2, cnt1));
 6563       __ eor(tmp1, tmp1, tmp2);
 6564       __ cbnz(tmp1, NOT_EQUAL);
 6565     __ bind(EQUAL);
 6566       __ mov(result, true);
 6567     __ bind(NOT_EQUAL);
 6568       if (!UseSIMDForArrayEquals) {
 6569         __ pop(spilled_regs, sp);
 6570       }
 6571     __ bind(NOT_EQUAL_NO_POP);
 6572     __ leave();
 6573     __ ret(lr);
 6574     return entry;
 6575   }
 6576 
 6577   // result = r0 - return value. Contains initial hashcode value on entry.
 6578   // ary = r1 - array address
 6579   // cnt = r2 - elements count
 6580   // Clobbers: v0-v13, rscratch1, rscratch2
 6581   address generate_large_arrays_hashcode(BasicType eltype) {
 6582     const Register result = r0, ary = r1, cnt = r2;
 6583     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
 6584     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
 6585     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
 6586     const FloatRegister vpowm = v13;
 6587 
 6588     ARRAYS_HASHCODE_REGISTERS;
 6589 
 6590     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
 6591 
 6592     unsigned int vf; // vectorization factor
 6593     bool multiply_by_halves;
 6594     Assembler::SIMD_Arrangement load_arrangement;
 6595     switch (eltype) {
 6596     case T_BOOLEAN:
 6597     case T_BYTE:
 6598       load_arrangement = Assembler::T8B;
 6599       multiply_by_halves = true;
 6600       vf = 8;
 6601       break;
 6602     case T_CHAR:
 6603     case T_SHORT:
 6604       load_arrangement = Assembler::T8H;
 6605       multiply_by_halves = true;
 6606       vf = 8;
 6607       break;
 6608     case T_INT:
 6609       load_arrangement = Assembler::T4S;
 6610       multiply_by_halves = false;
 6611       vf = 4;
 6612       break;
 6613     default:
 6614       ShouldNotReachHere();
 6615     }
 6616 
 6617     // Unroll factor
 6618     const unsigned uf = 4;
 6619 
 6620     // Effective vectorization factor
 6621     const unsigned evf = vf * uf;
 6622 
 6623     __ align(CodeEntryAlignment);
 6624 
 6625     StubGenStubId stub_id;
 6626     switch (eltype) {
 6627     case T_BOOLEAN:
 6628       stub_id = StubGenStubId::large_arrays_hashcode_boolean_id;
 6629       break;
 6630     case T_BYTE:
 6631       stub_id = StubGenStubId::large_arrays_hashcode_byte_id;
 6632       break;
 6633     case T_CHAR:
 6634       stub_id = StubGenStubId::large_arrays_hashcode_char_id;
 6635       break;
 6636     case T_SHORT:
 6637       stub_id = StubGenStubId::large_arrays_hashcode_short_id;
 6638       break;
 6639     case T_INT:
 6640       stub_id = StubGenStubId::large_arrays_hashcode_int_id;
 6641       break;
 6642     default:
 6643       stub_id = StubGenStubId::NO_STUBID;
 6644       ShouldNotReachHere();
 6645     };
 6646 
 6647     StubCodeMark mark(this, stub_id);
 6648 
 6649     address entry = __ pc();
 6650     __ enter();
 6651 
 6652     // Put the 0th to 3rd powers of 31 together into a single SIMD register. The register will be
 6653     // used in the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the
 6654     // register's value shouldn't change throughout both loops.
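          // For reference, expanding the usual hash recurrence h = 31*h + a[i]
          // over 4 int elements gives
          //   h*31^4 + a0*31^3 + a1*31^2 + a2*31 + a3,
          // i.e. a dot product of the data with the <31^3, 31^2, 31, 1> lanes
          // kept here.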
 6655     __ movw(rscratch1, intpow(31U, 3));
 6656     __ mov(vpow, Assembler::S, 0, rscratch1);
 6657     __ movw(rscratch1, intpow(31U, 2));
 6658     __ mov(vpow, Assembler::S, 1, rscratch1);
 6659     __ movw(rscratch1, intpow(31U, 1));
 6660     __ mov(vpow, Assembler::S, 2, rscratch1);
 6661     __ movw(rscratch1, intpow(31U, 0));
 6662     __ mov(vpow, Assembler::S, 3, rscratch1);
 6663 
 6664     __ mov(vmul0, Assembler::T16B, 0);
 6665     __ mov(vmul0, Assembler::S, 3, result);
 6666 
 6667     __ andr(rscratch2, cnt, (uf - 1) * vf);
 6668     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
 6669 
 6670     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
 6671     __ mov(vpowm, Assembler::S, 0, rscratch1);
 6672 
 6673     // SMALL LOOP
 6674     __ bind(SMALL_LOOP);
 6675 
 6676     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
 6677     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6678     __ subsw(rscratch2, rscratch2, vf);
 6679 
 6680     if (load_arrangement == Assembler::T8B) {
 6681       // Extend 8B to 8H to be able to use vector multiply
 6682       // instructions
 6683       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6684       if (is_signed_subword_type(eltype)) {
 6685         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6686       } else {
 6687         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6688       }
 6689     }
 6690 
 6691     switch (load_arrangement) {
 6692     case Assembler::T4S:
 6693       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6694       break;
 6695     case Assembler::T8B:
 6696     case Assembler::T8H:
 6697       assert(is_subword_type(eltype), "subword type expected");
 6698       if (is_signed_subword_type(eltype)) {
 6699         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6700       } else {
 6701         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6702       }
 6703       break;
 6704     default:
 6705       __ should_not_reach_here();
 6706     }
 6707 
 6708     // Process the upper half of a vector
 6709     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6710       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6711       if (is_signed_subword_type(eltype)) {
 6712         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6713       } else {
 6714         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6715       }
 6716     }
 6717 
 6718     __ br(Assembler::HI, SMALL_LOOP);
 6719 
 6720     // SMALL LOOP'S EPILOGUE
 6721     __ lsr(rscratch2, cnt, exact_log2(evf));
 6722     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
 6723 
 6724     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6725     __ addv(vmul0, Assembler::T4S, vmul0);
 6726     __ umov(result, vmul0, Assembler::S, 0);
 6727 
 6728     // TAIL
 6729     __ bind(TAIL);
 6730 
 6731     // The andr computes cnt % vf. The subtract, shifted left by 3 (each load + madd pair is 8
 6732     // bytes of code), jumps past vf - 1 - (cnt % vf) such pairs, i.e. only cnt % vf pairs execute.
 6733     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
 6734     __ andr(rscratch2, cnt, vf - 1);
 6735     __ bind(TAIL_SHORTCUT);
 6736     __ adr(rscratch1, BR_BASE);
 6737     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
 6738     __ movw(rscratch2, 0x1f);
 6739     __ br(rscratch1);
 6740 
 6741     for (size_t i = 0; i < vf - 1; ++i) {
 6742       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
 6743                                    eltype);
 6744       __ maddw(result, result, rscratch2, rscratch1);
 6745     }
 6746     __ bind(BR_BASE);
 6747 
 6748     __ leave();
 6749     __ ret(lr);
 6750 
 6751     // LARGE LOOP
 6752     __ bind(LARGE_LOOP_PREHEADER);
 6753 
 6754     __ lsr(rscratch2, cnt, exact_log2(evf));
 6755 
 6756     if (multiply_by_halves) {
 6757       // 31^4 - multiplier between lower and upper parts of a register
 6758       __ movw(rscratch1, intpow(31U, vf / 2));
 6759       __ mov(vpowm, Assembler::S, 1, rscratch1);
 6760       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
 6761       __ movw(rscratch1, intpow(31U, evf - vf / 2));
 6762       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6763     } else {
 6764       // 31^16
 6765       __ movw(rscratch1, intpow(31U, evf));
 6766       __ mov(vpowm, Assembler::S, 0, rscratch1);
 6767     }
 6768 
 6769     __ mov(vmul3, Assembler::T16B, 0);
 6770     __ mov(vmul2, Assembler::T16B, 0);
 6771     __ mov(vmul1, Assembler::T16B, 0);
 6772 
 6773     __ bind(LARGE_LOOP);
 6774 
 6775     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
 6776     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
 6777     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
 6778     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
 6779 
 6780     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
 6781            Address(__ post(ary, evf * type2aelembytes(eltype))));
 6782 
 6783     if (load_arrangement == Assembler::T8B) {
 6784       // Extend 8B to 8H to be able to use vector multiply
 6785       // instructions
 6786       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
 6787       if (is_signed_subword_type(eltype)) {
 6788         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6789         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6790         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6791         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6792       } else {
 6793         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
 6794         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
 6795         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
 6796         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
 6797       }
 6798     }
 6799 
 6800     switch (load_arrangement) {
 6801     case Assembler::T4S:
 6802       __ addv(vmul3, load_arrangement, vmul3, vdata3);
 6803       __ addv(vmul2, load_arrangement, vmul2, vdata2);
 6804       __ addv(vmul1, load_arrangement, vmul1, vdata1);
 6805       __ addv(vmul0, load_arrangement, vmul0, vdata0);
 6806       break;
 6807     case Assembler::T8B:
 6808     case Assembler::T8H:
 6809       assert(is_subword_type(eltype), "subword type expected");
 6810       if (is_signed_subword_type(eltype)) {
 6811         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6812         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6813         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6814         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6815       } else {
 6816         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
 6817         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
 6818         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
 6819         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
 6820       }
 6821       break;
 6822     default:
 6823       __ should_not_reach_here();
 6824     }
 6825 
 6826     // Process the upper half of a vector
 6827     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
 6828       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
 6829       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
 6830       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
 6831       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
 6832       if (is_signed_subword_type(eltype)) {
 6833         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6834         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6835         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6836         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6837       } else {
 6838         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
 6839         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
 6840         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
 6841         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
 6842       }
 6843     }
 6844 
 6845     __ subsw(rscratch2, rscratch2, 1);
 6846     __ br(Assembler::HI, LARGE_LOOP);
 6847 
 6848     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
 6849     __ addv(vmul3, Assembler::T4S, vmul3);
 6850     __ umov(result, vmul3, Assembler::S, 0);
 6851 
 6852     __ mov(rscratch2, intpow(31U, vf));
 6853 
 6854     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
 6855     __ addv(vmul2, Assembler::T4S, vmul2);
 6856     __ umov(rscratch1, vmul2, Assembler::S, 0);
 6857     __ maddw(result, result, rscratch2, rscratch1);
 6858 
 6859     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
 6860     __ addv(vmul1, Assembler::T4S, vmul1);
 6861     __ umov(rscratch1, vmul1, Assembler::S, 0);
 6862     __ maddw(result, result, rscratch2, rscratch1);
 6863 
 6864     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
 6865     __ addv(vmul0, Assembler::T4S, vmul0);
 6866     __ umov(rscratch1, vmul0, Assembler::S, 0);
 6867     __ maddw(result, result, rscratch2, rscratch1);
 6868 
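    // Any leftover elements (cnt is not necessarily a multiple of vf, which is
    // a power of two here) are handled by the scalar tail at TAIL_SHORTCUT,
    // bound earlier in this stub.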
 6869     __ andr(rscratch2, cnt, vf - 1);
 6870     __ cbnz(rscratch2, TAIL_SHORTCUT);
 6871 
 6872     __ leave();
 6873     __ ret(lr);
 6874 
 6875     return entry;
 6876   }
 6877 
 6878   address generate_dsin_dcos(bool isCos) {
 6879     __ align(CodeEntryAlignment);
 6880     StubGenStubId stub_id = (isCos ? StubGenStubId::dcos_id : StubGenStubId::dsin_id);
 6881     StubCodeMark mark(this, stub_id);
 6882     address start = __ pc();
 6883     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
 6884         (address)StubRoutines::aarch64::_two_over_pi,
 6885         (address)StubRoutines::aarch64::_pio2,
 6886         (address)StubRoutines::aarch64::_dsin_coef,
 6887         (address)StubRoutines::aarch64::_dcos_coef);
 6888     return start;
 6889   }
 6890 
  // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
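  //
  // Descriptive note: loads 16 Latin1 bytes through tmp2 and widens them to
  // UTF-16 by zipping with the zero register vtmpZ, then compares them against
  // 16 UTF-16 characters loaded through cnt1, one 8-byte double-word (4 chars)
  // at a time, branching to DIFF1/DIFF2 on the first mismatch. The UTF-16 loads
  // are software-pipelined: the caller pre-loads the first double-word into
  // tmp3, and the routine leaves the next one pre-loaded on exit.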
 6892   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
 6893       Label &DIFF2) {
 6894     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
 6895     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
 6896 
 6897     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
 6898     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6899     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
    // vtmp3 now holds the lower 8 Latin1 characters widened to UTF-16 (the
    // upper 8 are widened below with zip2), giving 32 bytes of UTF-16 data in
    // total across vtmp:vtmp3
 6901 
 6902     __ fmovd(tmpL, vtmp3);
 6903     __ eor(rscratch2, tmp3, tmpL);
 6904     __ cbnz(rscratch2, DIFF2);
 6905 
 6906     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6907     __ umov(tmpL, vtmp3, __ D, 1);
 6908     __ eor(rscratch2, tmpU, tmpL);
 6909     __ cbnz(rscratch2, DIFF1);
 6910 
 6911     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
 6912     __ ldr(tmpU, Address(__ post(cnt1, 8)));
 6913     __ fmovd(tmpL, vtmp);
 6914     __ eor(rscratch2, tmp3, tmpL);
 6915     __ cbnz(rscratch2, DIFF2);
 6916 
 6917     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6918     __ umov(tmpL, vtmp, __ D, 1);
 6919     __ eor(rscratch2, tmpU, tmpL);
 6920     __ cbnz(rscratch2, DIFF1);
 6921   }
 6922 
 6923   // r0  = result
 6924   // r1  = str1
 6925   // r2  = cnt1
 6926   // r3  = str2
 6927   // r4  = cnt2
 6928   // r10 = tmp1
 6929   // r11 = tmp2
 6930   address generate_compare_long_string_different_encoding(bool isLU) {
 6931     __ align(CodeEntryAlignment);
 6932     StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id);
 6933     StubCodeMark mark(this, stub_id);
 6934     address entry = __ pc();
 6935     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
 6936         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
 6937         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
 6938     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 6939         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
 6940     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
 6941     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
 6942 
 6943     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
 6944 
 6945     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
 6948     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 6949     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
 6950     __ add(str2, str2, isLU ? wordSize : wordSize/2);
 6951     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
 6952     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
 6953     __ eor(rscratch2, tmp1, tmp2);
 6954     __ mov(rscratch1, tmp2);
 6955     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
 6956     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
 6957              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
 6958     __ push(spilled_regs, sp);
 6959     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
 6960     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
 6961 
 6962     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 6963 
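    // With software prefetching enabled, the large loop below compares 64
    // characters per iteration (4 x 16), issuing one prefetch for the Latin1
    // data and two for the UTF-16 data (which spans twice as many bytes); the
    // exit threshold keeps it from prefetching past the end of the strings.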
 6964     if (SoftwarePrefetchHintDistance >= 0) {
 6965       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6966       __ br(__ LT, NO_PREFETCH);
 6967       __ bind(LARGE_LOOP_PREFETCH);
 6968         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
 6969         __ mov(tmp4, 2);
 6970         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6971         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
 6972           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6973           __ subs(tmp4, tmp4, 1);
 6974           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
 6975           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
 6976           __ mov(tmp4, 2);
 6977         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
 6978           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6979           __ subs(tmp4, tmp4, 1);
 6980           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
 6981           __ sub(cnt2, cnt2, 64);
 6982           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
 6983           __ br(__ GE, LARGE_LOOP_PREFETCH);
 6984     }
 6985     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
 6986     __ bind(NO_PREFETCH);
 6987     __ subs(cnt2, cnt2, 16);
 6988     __ br(__ LT, TAIL);
 6989     __ align(OptoLoopAlignment);
 6990     __ bind(SMALL_LOOP); // smaller loop
 6991       __ subs(cnt2, cnt2, 16);
 6992       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
 6993       __ br(__ GE, SMALL_LOOP);
 6994       __ cmn(cnt2, (u1)16);
 6995       __ br(__ EQ, LOAD_LAST);
 6996     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
 6997       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
 6998       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
 6999       __ ldr(tmp3, Address(cnt1, -8));
 7000       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
 7001       __ b(LOAD_LAST);
 7002     __ bind(DIFF2);
 7003       __ mov(tmpU, tmp3);
 7004     __ bind(DIFF1);
 7005       __ pop(spilled_regs, sp);
 7006       __ b(CALCULATE_DIFFERENCE);
 7007     __ bind(LOAD_LAST);
      // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
      // compare_string_16_x_LU, so there is no need to load them again.
 7010       __ mov(tmpU, tmp3);
 7011       __ pop(spilled_regs, sp);
 7012 
      // tmp2 now points at the last 4 Latin1 characters
 7014       __ ldrs(vtmp, Address(tmp2));
 7015       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
 7016       __ fmovd(tmpL, vtmp);
 7017 
 7018       __ eor(rscratch2, tmpU, tmpL);
 7019       __ cbz(rscratch2, DONE);
 7020 
 7021     // Find the first different characters in the longwords and
 7022     // compute their difference.
 7023     __ bind(CALCULATE_DIFFERENCE);
 7024       __ rev(rscratch2, rscratch2);
 7025       __ clz(rscratch2, rscratch2);
 7026       __ andr(rscratch2, rscratch2, -16);
 7027       __ lsrv(tmp1, tmp1, rscratch2);
 7028       __ uxthw(tmp1, tmp1);
 7029       __ lsrv(rscratch1, rscratch1, rscratch2);
 7030       __ uxthw(rscratch1, rscratch1);
 7031       __ subw(result, tmp1, rscratch1);
 7032     __ bind(DONE);
 7033       __ ret(lr);
 7034     return entry;
 7035   }
 7036 
 7037   // r0 = input (float16)
 7038   // v0 = result (float)
 7039   // v1 = temporary float register
 7040   address generate_float16ToFloat() {
 7041     __ align(CodeEntryAlignment);
 7042     StubGenStubId stub_id = StubGenStubId::hf2f_id;
 7043     StubCodeMark mark(this, stub_id);
 7044     address entry = __ pc();
 7045     BLOCK_COMMENT("Entry:");
 7046     __ flt16_to_flt(v0, r0, v1);
 7047     __ ret(lr);
 7048     return entry;
 7049   }
 7050 
 7051   // v0 = input (float)
 7052   // r0 = result (float16)
 7053   // v1 = temporary float register
 7054   address generate_floatToFloat16() {
 7055     __ align(CodeEntryAlignment);
 7056     StubGenStubId stub_id = StubGenStubId::f2hf_id;
 7057     StubCodeMark mark(this, stub_id);
 7058     address entry = __ pc();
 7059     BLOCK_COMMENT("Entry:");
 7060     __ flt_to_flt16(r0, v0, v1);
 7061     __ ret(lr);
 7062     return entry;
 7063   }
 7064 
 7065   address generate_method_entry_barrier() {
 7066     __ align(CodeEntryAlignment);
 7067     StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id;
 7068     StubCodeMark mark(this, stub_id);
 7069 
 7070     Label deoptimize_label;
 7071 
 7072     address start = __ pc();
 7073 
 7074     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 7075 
 7076     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
 7077       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
 7078       // We can get here despite the nmethod being good, if we have not
 7079       // yet applied our cross modification fence (or data fence).
 7080       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
 7081       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
 7082       __ ldrw(rscratch2, rscratch2);
 7083       __ strw(rscratch2, thread_epoch_addr);
 7084       __ isb();
 7085       __ membar(__ LoadLoad);
 7086     }
 7087 
 7088     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
 7089 
 7090     __ enter();
 7091     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
 7092 
 7093     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
 7094 
 7095     __ push_call_clobbered_registers();
 7096 
 7097     __ mov(c_rarg0, rscratch2);
 7098     __ call_VM_leaf
 7099          (CAST_FROM_FN_PTR
 7100           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
 7101 
 7102     __ reset_last_Java_frame(true);
 7103 
 7104     __ mov(rscratch1, r0);
 7105 
 7106     __ pop_call_clobbered_registers();
 7107 
 7108     __ cbnz(rscratch1, deoptimize_label);
 7109 
 7110     __ leave();
 7111     __ ret(lr);
 7112 
 7113     __ BIND(deoptimize_label);
 7114 
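    // A non-zero result from the barrier runtime call means we must deoptimize;
    // the four words reserved above now hold the returned {sp, fp, lr, pc}, so
    // restore that frame state and continue at the new pc.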
 7115     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
 7116     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
 7117 
 7118     __ mov(sp, rscratch1);
 7119     __ br(rscratch2);
 7120 
 7121     return start;
 7122   }
 7123 
 7124   // r0  = result
 7125   // r1  = str1
 7126   // r2  = cnt1
 7127   // r3  = str2
 7128   // r4  = cnt2
 7129   // r10 = tmp1
 7130   // r11 = tmp2
 7131   address generate_compare_long_string_same_encoding(bool isLL) {
 7132     __ align(CodeEntryAlignment);
 7133     StubGenStubId stub_id = (isLL ? StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id);
 7134     StubCodeMark mark(this, stub_id);
 7135     address entry = __ pc();
 7136     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7137         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
 7138 
 7139     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
 7140 
    // Exit the large loop when fewer than 64 bytes are left to read or when we
    // are about to prefetch memory beyond the array boundary.
 7143     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
 7144 
    // The caller pre-loads 8 bytes from each string (into tmp1 and tmp2) before
    // jumping to this stub, so do the first comparison directly.
 7146     __ eor(rscratch2, tmp1, tmp2);
 7147     __ cbnz(rscratch2, CAL_DIFFERENCE);
 7148 
 7149     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
 7150     // update pointers, because of previous read
 7151     __ add(str1, str1, wordSize);
 7152     __ add(str2, str2, wordSize);
 7153     if (SoftwarePrefetchHintDistance >= 0) {
 7154       __ align(OptoLoopAlignment);
 7155       __ bind(LARGE_LOOP_PREFETCH);
 7156         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
 7157         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
 7158 
 7159         for (int i = 0; i < 4; i++) {
 7160           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
 7161           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
 7162           __ cmp(tmp1, tmp2);
 7163           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7164           __ br(Assembler::NE, DIFF);
 7165         }
 7166         __ sub(cnt2, cnt2, isLL ? 64 : 32);
 7167         __ add(str1, str1, 64);
 7168         __ add(str2, str2, 64);
 7169         __ subs(rscratch2, cnt2, largeLoopExitCondition);
 7170         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
 7171         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
 7172     }
 7173 
 7174     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
 7175     __ br(Assembler::LE, LESS16);
 7176     __ align(OptoLoopAlignment);
 7177     __ bind(LOOP_COMPARE16);
 7178       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7179       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7180       __ cmp(tmp1, tmp2);
 7181       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7182       __ br(Assembler::NE, DIFF);
 7183       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7184       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7185       __ br(Assembler::LT, LESS16);
 7186 
 7187       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
 7188       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
 7189       __ cmp(tmp1, tmp2);
 7190       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
 7191       __ br(Assembler::NE, DIFF);
 7192       __ sub(cnt2, cnt2, isLL ? 16 : 8);
 7193       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
 7194       __ br(Assembler::GE, LOOP_COMPARE16);
 7195       __ cbz(cnt2, LENGTH_DIFF);
 7196 
 7197     __ bind(LESS16);
      // compare the next 8 bytes (8 Latin1 or 4 UTF-16 characters)
 7199       __ subs(cnt2, cnt2, isLL ? 8 : 4);
 7200       __ br(Assembler::LE, LESS8);
 7201       __ ldr(tmp1, Address(__ post(str1, 8)));
 7202       __ ldr(tmp2, Address(__ post(str2, 8)));
 7203       __ eor(rscratch2, tmp1, tmp2);
 7204       __ cbnz(rscratch2, CAL_DIFFERENCE);
 7205       __ sub(cnt2, cnt2, isLL ? 8 : 4);
 7206 
 7207     __ bind(LESS8); // directly load last 8 bytes
 7208       if (!isLL) {
 7209         __ add(cnt2, cnt2, cnt2);
 7210       }
 7211       __ ldr(tmp1, Address(str1, cnt2));
 7212       __ ldr(tmp2, Address(str2, cnt2));
 7213       __ eor(rscratch2, tmp1, tmp2);
 7214       __ cbz(rscratch2, LENGTH_DIFF);
 7215       __ b(CAL_DIFFERENCE);
 7216 
 7217     __ bind(DIFF);
 7218       __ cmp(tmp1, tmp2);
 7219       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
 7220       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
 7221       // reuse rscratch2 register for the result of eor instruction
 7222       __ eor(rscratch2, tmp1, tmp2);
 7223 
 7224     __ bind(CAL_DIFFERENCE);
 7225       __ rev(rscratch2, rscratch2);
 7226       __ clz(rscratch2, rscratch2);
 7227       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
 7228       __ lsrv(tmp1, tmp1, rscratch2);
 7229       __ lsrv(tmp2, tmp2, rscratch2);
 7230       if (isLL) {
 7231         __ uxtbw(tmp1, tmp1);
 7232         __ uxtbw(tmp2, tmp2);
 7233       } else {
 7234         __ uxthw(tmp1, tmp1);
 7235         __ uxthw(tmp2, tmp2);
 7236       }
 7237       __ subw(result, tmp1, tmp2);
 7238 
 7239     __ bind(LENGTH_DIFF);
 7240       __ ret(lr);
 7241     return entry;
 7242   }
 7243 
 7244   enum string_compare_mode {
 7245     LL,
 7246     LU,
 7247     UL,
 7248     UU,
 7249   };
 7250 
 7251   // The following registers are declared in aarch64.ad
 7252   // r0  = result
 7253   // r1  = str1
 7254   // r2  = cnt1
 7255   // r3  = str2
 7256   // r4  = cnt2
 7257   // r10 = tmp1
 7258   // r11 = tmp2
 7259   // z0  = ztmp1
 7260   // z1  = ztmp2
 7261   // p0  = pgtmp1
 7262   // p1  = pgtmp2
 7263   address generate_compare_long_string_sve(string_compare_mode mode) {
 7264     StubGenStubId stub_id;
 7265     switch (mode) {
 7266       case LL: stub_id = StubGenStubId::compare_long_string_LL_id;  break;
 7267       case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break;
 7268       case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break;
 7269       case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break;
 7270       default: ShouldNotReachHere();
 7271     }
 7272 
 7273     __ align(CodeEntryAlignment);
 7274     address entry = __ pc();
 7275     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
 7276              tmp1 = r10, tmp2 = r11;
 7277 
 7278     Label LOOP, DONE, MISMATCH;
 7279     Register vec_len = tmp1;
 7280     Register idx = tmp2;
 7281     // The minimum of the string lengths has been stored in cnt2.
 7282     Register cnt = cnt2;
 7283     FloatRegister ztmp1 = z0, ztmp2 = z1;
 7284     PRegister pgtmp1 = p0, pgtmp2 = p1;
 7285 
 7286 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
 7287     switch (mode) {                                                            \
 7288       case LL:                                                                 \
 7289         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
 7290         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
 7291         break;                                                                 \
 7292       case LU:                                                                 \
 7293         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
 7294         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7295         break;                                                                 \
 7296       case UL:                                                                 \
 7297         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7298         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
 7299         break;                                                                 \
 7300       case UU:                                                                 \
 7301         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
 7302         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
 7303         break;                                                                 \
 7304       default:                                                                 \
 7305         ShouldNotReachHere();                                                  \
 7306     }
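    // LOAD_PAIR loads one vector's worth of characters from each string under
    // the governing predicate pgtmp1. For the mixed-encoding cases the Latin1
    // side is loaded with ld1b into H-sized lanes, so both operands end up as
    // 16-bit characters and can be compared lane by lane.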
 7307 
 7308     StubCodeMark mark(this, stub_id);
 7309 
 7310     __ mov(idx, 0);
 7311     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7312 
 7313     if (mode == LL) {
 7314       __ sve_cntb(vec_len);
 7315     } else {
 7316       __ sve_cnth(vec_len);
 7317     }
 7318 
 7319     __ sub(rscratch1, cnt, vec_len);
 7320 
 7321     __ bind(LOOP);
 7322 
 7323       // main loop
 7324       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 7325       __ add(idx, idx, vec_len);
 7326       // Compare strings.
 7327       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7328       __ br(__ NE, MISMATCH);
 7329       __ cmp(idx, rscratch1);
 7330       __ br(__ LT, LOOP);
 7331 
 7332     // post loop, last iteration
 7333     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
 7334 
 7335     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
 7336     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
 7337     __ br(__ EQ, DONE);
 7338 
 7339     __ bind(MISMATCH);
 7340 
    // Crop the predicate to the lanes before the first mismatch.
 7342     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
 7343     // Extract the first different characters of each string.
 7344     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
 7345     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
 7346 
 7347     // Compute the difference of the first different characters.
 7348     __ sub(result, rscratch1, rscratch2);
 7349 
 7350     __ bind(DONE);
 7351     __ ret(lr);
 7352 #undef LOAD_PAIR
 7353     return entry;
 7354   }
 7355 
 7356   void generate_compare_long_strings() {
 7357     if (UseSVE == 0) {
 7358       StubRoutines::aarch64::_compare_long_string_LL
 7359           = generate_compare_long_string_same_encoding(true);
 7360       StubRoutines::aarch64::_compare_long_string_UU
 7361           = generate_compare_long_string_same_encoding(false);
 7362       StubRoutines::aarch64::_compare_long_string_LU
 7363           = generate_compare_long_string_different_encoding(true);
 7364       StubRoutines::aarch64::_compare_long_string_UL
 7365           = generate_compare_long_string_different_encoding(false);
 7366     } else {
 7367       StubRoutines::aarch64::_compare_long_string_LL
 7368           = generate_compare_long_string_sve(LL);
 7369       StubRoutines::aarch64::_compare_long_string_UU
 7370           = generate_compare_long_string_sve(UU);
 7371       StubRoutines::aarch64::_compare_long_string_LU
 7372           = generate_compare_long_string_sve(LU);
 7373       StubRoutines::aarch64::_compare_long_string_UL
 7374           = generate_compare_long_string_sve(UL);
 7375     }
 7376   }
 7377 
 7378   // R0 = result
 7379   // R1 = str2
 7380   // R2 = cnt1
 7381   // R3 = str1
 7382   // R4 = cnt2
 7383   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
 7384   //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
  // in order to skip the initial load (helps on systems with one load pipeline)
  // 2) we can use a "fast" algorithm for finding the first occurrence of a single
  // character with fewer branches (one branch per loaded register instead of one
  // per symbol), which is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it can
  // be used to search for every occurrence of the 1st character, saving a few
  // loads compared with a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  // re-uses/re-initializes/compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
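  //
  // A scalar sketch of the match-detection trick behind those constants
  // (illustrative only, LL case; the UU/UL cases use the 16-bit constants):
  // after XOR-ing a loaded chunk with the broadcast first pattern character, a
  // matching position becomes an all-zero byte, which the classic SWAR test
  // exposes as a set high bit:
  //   chunk ^= first * 0x0101010101010101;   // broadcast and compare
  //   tmp    = chunk - 0x0101010101010101;   // borrows out of zero bytes
  //   mask   = chunk | 0x7f7f7f7f7f7f7f7f;
  //   hits   = tmp & ~mask;                  // 0x80 set where a byte was zero
  // Bits above the first real match can be spurious (borrow propagation), but
  // every candidate is verified by the compare loops below anyway.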
 7399   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
 7400     StubGenStubId stub_id;
 7401     if (str1_isL) {
 7402       if (str2_isL) {
 7403         stub_id = StubGenStubId::string_indexof_linear_ll_id;
 7404       } else {
 7405         stub_id = StubGenStubId::string_indexof_linear_ul_id;
 7406       }
 7407     } else {
 7408       if (str2_isL) {
 7409         ShouldNotReachHere();
 7410       } else {
 7411         stub_id = StubGenStubId::string_indexof_linear_uu_id;
 7412       }
 7413     }
 7414     __ align(CodeEntryAlignment);
 7415     StubCodeMark mark(this, stub_id);
 7416     address entry = __ pc();
 7417 
 7418     int str1_chr_size = str1_isL ? 1 : 2;
 7419     int str2_chr_size = str2_isL ? 1 : 2;
 7420     int str1_chr_shift = str1_isL ? 0 : 1;
 7421     int str2_chr_shift = str2_isL ? 0 : 1;
 7422     bool isL = str1_isL && str2_isL;
    // parameters
 7424     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
 7425     // temporary registers
 7426     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
 7427     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
 7428     // redefinitions
 7429     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
 7430 
 7431     __ push(spilled_regs, sp);
 7432     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
 7433         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
 7434         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
 7435         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
 7436         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
 7437         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
 7438     // Read whole register from str1. It is safe, because length >=8 here
 7439     __ ldr(ch1, Address(str1));
 7440     // Read whole register from str2. It is safe, because length >=8 here
 7441     __ ldr(ch2, Address(str2));
 7442     __ sub(cnt2, cnt2, cnt1);
 7443     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
 7444     if (str1_isL != str2_isL) {
 7445       __ eor(v0, __ T16B, v0, v0);
 7446     }
 7447     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
 7448     __ mul(first, first, tmp1);
 7449     // check if we have less than 1 register to check
 7450     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
 7451     if (str1_isL != str2_isL) {
 7452       __ fmovd(v1, ch1);
 7453     }
 7454     __ br(__ LE, L_SMALL);
 7455     __ eor(ch2, first, ch2);
 7456     if (str1_isL != str2_isL) {
 7457       __ zip1(v1, __ T16B, v1, v0);
 7458     }
 7459     __ sub(tmp2, ch2, tmp1);
 7460     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7461     __ bics(tmp2, tmp2, ch2);
 7462     if (str1_isL != str2_isL) {
 7463       __ fmovd(ch1, v1);
 7464     }
 7465     __ br(__ NE, L_HAS_ZERO);
 7466     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7467     __ add(result, result, wordSize/str2_chr_size);
 7468     __ add(str2, str2, wordSize);
 7469     __ br(__ LT, L_POST_LOOP);
 7470     __ BIND(L_LOOP);
 7471       __ ldr(ch2, Address(str2));
 7472       __ eor(ch2, first, ch2);
 7473       __ sub(tmp2, ch2, tmp1);
 7474       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7475       __ bics(tmp2, tmp2, ch2);
 7476       __ br(__ NE, L_HAS_ZERO);
 7477     __ BIND(L_LOOP_PROCEED);
 7478       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
 7479       __ add(str2, str2, wordSize);
 7480       __ add(result, result, wordSize/str2_chr_size);
 7481       __ br(__ GE, L_LOOP);
 7482     __ BIND(L_POST_LOOP);
 7483       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
 7484       __ br(__ LE, NOMATCH);
 7485       __ ldr(ch2, Address(str2));
 7486       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7487       __ eor(ch2, first, ch2);
 7488       __ sub(tmp2, ch2, tmp1);
 7489       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7490       __ mov(tmp4, -1); // all bits set
 7491       __ b(L_SMALL_PROCEED);
 7492     __ align(OptoLoopAlignment);
 7493     __ BIND(L_SMALL);
 7494       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
 7495       __ eor(ch2, first, ch2);
 7496       if (str1_isL != str2_isL) {
 7497         __ zip1(v1, __ T16B, v1, v0);
 7498       }
 7499       __ sub(tmp2, ch2, tmp1);
 7500       __ mov(tmp4, -1); // all bits set
 7501       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
 7502       if (str1_isL != str2_isL) {
 7503         __ fmovd(ch1, v1); // move converted 4 symbols
 7504       }
 7505     __ BIND(L_SMALL_PROCEED);
 7506       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
 7507       __ bic(tmp2, tmp2, ch2);
 7508       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
 7509       __ rbit(tmp2, tmp2);
 7510       __ br(__ EQ, NOMATCH);
 7511     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
 7513       __ cmp(cnt1, u1(wordSize/str2_chr_size));
 7514       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
 7515       if (str2_isL) { // LL
 7516         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7517         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7518         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7519         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7520         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7521       } else {
 7522         __ mov(ch2, 0xE); // all bits in byte set except last one
 7523         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7524         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7525         __ lslv(tmp2, tmp2, tmp4);
 7526         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7527         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7528         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7529         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7530       }
 7531       __ cmp(ch1, ch2);
 7532       __ mov(tmp4, wordSize/str2_chr_size);
 7533       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7534     __ BIND(L_SMALL_CMP_LOOP);
 7535       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7536                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7537       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7538                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7539       __ add(tmp4, tmp4, 1);
 7540       __ cmp(tmp4, cnt1);
 7541       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
 7542       __ cmp(first, ch2);
 7543       __ br(__ EQ, L_SMALL_CMP_LOOP);
 7544     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
 7545       __ cbz(tmp2, NOMATCH); // no more matches. exit
 7546       __ clz(tmp4, tmp2);
 7547       __ add(result, result, 1); // advance index
 7548       __ add(str2, str2, str2_chr_size); // advance pointer
 7549       __ b(L_SMALL_HAS_ZERO_LOOP);
 7550     __ align(OptoLoopAlignment);
 7551     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
 7552       __ cmp(first, ch2);
 7553       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7554       __ b(DONE);
 7555     __ align(OptoLoopAlignment);
 7556     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
 7557       if (str2_isL) { // LL
 7558         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
 7559         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
 7560         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
 7561         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
 7562         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7563       } else {
 7564         __ mov(ch2, 0xE); // all bits in byte set except last one
 7565         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7566         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7567         __ lslv(tmp2, tmp2, tmp4);
 7568         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7569         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7570         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
 7571         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7572       }
 7573       __ cmp(ch1, ch2);
 7574       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
 7575       __ b(DONE);
 7576     __ align(OptoLoopAlignment);
 7577     __ BIND(L_HAS_ZERO);
 7578       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
      // Now, compress the two counters (cnt2 and cnt1) into one register.
      // This is fine because both counters are 32-bit and are not changed in
      // this loop; just restore them on exit. So cnt1 can be re-used in this loop.
 7583       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
 7584       __ sub(result, result, 1);
 7585     __ BIND(L_HAS_ZERO_LOOP);
 7586       __ mov(cnt1, wordSize/str2_chr_size);
 7587       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7588       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
 7589       if (str2_isL) {
 7590         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7591         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7592         __ lslv(tmp2, tmp2, tmp4);
 7593         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7594         __ add(tmp4, tmp4, 1);
 7595         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7596         __ lsl(tmp2, tmp2, 1);
 7597         __ mov(tmp4, wordSize/str2_chr_size);
 7598       } else {
 7599         __ mov(ch2, 0xE);
 7600         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7601         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7602         __ lslv(tmp2, tmp2, tmp4);
 7603         __ add(tmp4, tmp4, 1);
 7604         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7605         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7606         __ lsl(tmp2, tmp2, 1);
 7607         __ mov(tmp4, wordSize/str2_chr_size);
 7608         __ sub(str2, str2, str2_chr_size);
 7609       }
 7610       __ cmp(ch1, ch2);
 7611       __ mov(tmp4, wordSize/str2_chr_size);
 7612       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7613     __ BIND(L_CMP_LOOP);
 7614       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
 7615                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
 7616       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
 7617                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
 7618       __ add(tmp4, tmp4, 1);
 7619       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
 7620       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
 7621       __ cmp(cnt1, ch2);
 7622       __ br(__ EQ, L_CMP_LOOP);
 7623     __ BIND(L_CMP_LOOP_NOMATCH);
 7624       // here we're not matched
 7625       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
 7626       __ clz(tmp4, tmp2);
 7627       __ add(str2, str2, str2_chr_size); // advance pointer
 7628       __ b(L_HAS_ZERO_LOOP);
 7629     __ align(OptoLoopAlignment);
 7630     __ BIND(L_CMP_LOOP_LAST_CMP);
 7631       __ cmp(cnt1, ch2);
 7632       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7633       __ b(DONE);
 7634     __ align(OptoLoopAlignment);
 7635     __ BIND(L_CMP_LOOP_LAST_CMP2);
 7636       if (str2_isL) {
 7637         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
 7638         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7639         __ lslv(tmp2, tmp2, tmp4);
 7640         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7641         __ add(tmp4, tmp4, 1);
 7642         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7643         __ lsl(tmp2, tmp2, 1);
 7644       } else {
 7645         __ mov(ch2, 0xE);
 7646         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
 7647         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
 7648         __ lslv(tmp2, tmp2, tmp4);
 7649         __ add(tmp4, tmp4, 1);
 7650         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
 7651         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
 7652         __ lsl(tmp2, tmp2, 1);
 7653         __ sub(str2, str2, str2_chr_size);
 7654       }
 7655       __ cmp(ch1, ch2);
 7656       __ br(__ NE, L_CMP_LOOP_NOMATCH);
 7657       __ b(DONE);
 7658     __ align(OptoLoopAlignment);
 7659     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
      // the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
      // so result was increased by at most wordSize/str2_chr_size - 1 and the
      // respective high bits weren't changed. L_LOOP_PROCEED will increase
      // result by the number of analyzed characters, so we can just reset the lower
      // bits of result here (clear the 2 lower bits for UU/UL and 3 bits for LL).
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3 (UU/UL)
      // is the index of the last analyzed substring inside the current octet, so str2
      // is at the respective start address and needs to be advanced to the next octet.
 7670       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
 7671       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
 7672       __ bfm(result, zr, 0, 2 - str2_chr_shift);
 7673       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
 7674       __ movw(cnt2, cnt2);
 7675       __ b(L_LOOP_PROCEED);
 7676     __ align(OptoLoopAlignment);
 7677     __ BIND(NOMATCH);
 7678       __ mov(result, -1);
 7679     __ BIND(DONE);
 7680       __ pop(spilled_regs, sp);
 7681       __ ret(lr);
 7682     return entry;
 7683   }
 7684 
 7685   void generate_string_indexof_stubs() {
 7686     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
 7687     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
 7688     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
 7689   }
 7690 
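  // Inflate 2 x 16 Latin1 bytes held in src1/src2 to 32 UTF-16 characters and
  // store them: zip1/zip2 against v0 (zero, per the caller's register contract)
  // insert a zero byte after every source byte, and a single st1 writes the
  // resulting 64 bytes through the post-incremented dst (r1).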
 7691   void inflate_and_store_2_fp_registers(bool generatePrfm,
 7692       FloatRegister src1, FloatRegister src2) {
 7693     Register dst = r1;
 7694     __ zip1(v1, __ T16B, src1, v0);
 7695     __ zip2(v2, __ T16B, src1, v0);
 7696     if (generatePrfm) {
 7697       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
 7698     }
 7699     __ zip1(v3, __ T16B, src2, v0);
 7700     __ zip2(v4, __ T16B, src2, v0);
 7701     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
 7702   }
 7703 
 7704   // R0 = src
 7705   // R1 = dst
 7706   // R2 = len
 7707   // R3 = len >> 3
 7708   // V0 = 0
 7709   // v1 = loaded 8 bytes
 7710   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
 7711   address generate_large_byte_array_inflate() {
 7712     __ align(CodeEntryAlignment);
 7713     StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id;
 7714     StubCodeMark mark(this, stub_id);
 7715     address entry = __ pc();
 7716     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
 7717     Register src = r0, dst = r1, len = r2, octetCounter = r3;
 7718     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
 7719 
    // Do one more 8-byte read so that the address is 16-byte aligned in most cases;
    // this also lets us use a single store instruction
 7722     __ ldrd(v2, __ post(src, 8));
 7723     __ sub(octetCounter, octetCounter, 2);
 7724     __ zip1(v1, __ T16B, v1, v0);
 7725     __ zip1(v2, __ T16B, v2, v0);
 7726     __ st1(v1, v2, __ T16B, __ post(dst, 32));
 7727     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7728     __ subs(rscratch1, octetCounter, large_loop_threshold);
 7729     __ br(__ LE, LOOP_START);
 7730     __ b(LOOP_PRFM_START);
 7731     __ bind(LOOP_PRFM);
 7732       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7733     __ bind(LOOP_PRFM_START);
 7734       __ prfm(Address(src, SoftwarePrefetchHintDistance));
 7735       __ sub(octetCounter, octetCounter, 8);
 7736       __ subs(rscratch1, octetCounter, large_loop_threshold);
 7737       inflate_and_store_2_fp_registers(true, v3, v4);
 7738       inflate_and_store_2_fp_registers(true, v5, v6);
 7739       __ br(__ GT, LOOP_PRFM);
 7740       __ cmp(octetCounter, (u1)8);
 7741       __ br(__ LT, DONE);
 7742     __ bind(LOOP);
 7743       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
 7744       __ bind(LOOP_START);
 7745       __ sub(octetCounter, octetCounter, 8);
 7746       __ cmp(octetCounter, (u1)8);
 7747       inflate_and_store_2_fp_registers(false, v3, v4);
 7748       inflate_and_store_2_fp_registers(false, v5, v6);
 7749       __ br(__ GE, LOOP);
 7750     __ bind(DONE);
 7751       __ ret(lr);
 7752     return entry;
 7753   }
 7754 
 7755   /**
 7756    *  Arguments:
 7757    *
 7758    *  Input:
 7759    *  c_rarg0   - current state address
 7760    *  c_rarg1   - H key address
 7761    *  c_rarg2   - data address
 7762    *  c_rarg3   - number of blocks
 7763    *
 7764    *  Output:
 7765    *  Updated state at c_rarg0
 7766    */
 7767   address generate_ghash_processBlocks() {
 7768     // Bafflingly, GCM uses little-endian for the byte order, but
 7769     // big-endian for the bit order.  For example, the polynomial 1 is
 7770     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
 7771     //
 7772     // So, we must either reverse the bytes in each word and do
 7773     // everything big-endian or reverse the bits in each byte and do
 7774     // it little-endian.  On AArch64 it's more idiomatic to reverse
 7775     // the bits in each byte (we have an instruction, RBIT, to do
 7776     // that) and keep the data in little-endian bit order through the
 7777     // calculation, bit-reversing the inputs and outputs.
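    //
    // In GHASH terms, the loop below computes, for each 16-byte block X,
    //   state <- (state ^ X) * H   in GF(2^128),
    // with the product reduced modulo x^128 + x^7 + x^2 + x + 1 (whose
    // low-order bits form the 0x87 constant emitted just below).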
 7778 
 7779     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id;
 7780     StubCodeMark mark(this, stub_id);
 7781     __ align(wordSize * 2);
 7782     address p = __ pc();
 7783     __ emit_int64(0x87);  // The low-order bits of the field
 7784                           // polynomial (i.e. p = z^7+z^2+z+1)
 7785                           // repeated in the low and high parts of a
 7786                           // 128-bit vector
 7787     __ emit_int64(0x87);
 7788 
 7789     __ align(CodeEntryAlignment);
 7790     address start = __ pc();
 7791 
 7792     Register state   = c_rarg0;
 7793     Register subkeyH = c_rarg1;
 7794     Register data    = c_rarg2;
 7795     Register blocks  = c_rarg3;
 7796 
 7797     FloatRegister vzr = v30;
 7798     __ eor(vzr, __ T16B, vzr, vzr); // zero register
 7799 
 7800     __ ldrq(v24, p);    // The field polynomial
 7801 
 7802     __ ldrq(v0, Address(state));
 7803     __ ldrq(v1, Address(subkeyH));
 7804 
 7805     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
 7806     __ rbit(v0, __ T16B, v0);
 7807     __ rev64(v1, __ T16B, v1);
 7808     __ rbit(v1, __ T16B, v1);
 7809 
 7810     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
 7811     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
 7812 
 7813     {
 7814       Label L_ghash_loop;
 7815       __ bind(L_ghash_loop);
 7816 
 7817       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
 7818                                                  // reversing each byte
 7819       __ rbit(v2, __ T16B, v2);
 7820       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
 7821 
 7822       // Multiply state in v2 by subkey in v1
 7823       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
 7824                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
 7825                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
 7826       // Reduce v7:v5 by the field polynomial
 7827       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
 7828 
 7829       __ sub(blocks, blocks, 1);
 7830       __ cbnz(blocks, L_ghash_loop);
 7831     }
 7832 
 7833     // The bit-reversed result is at this point in v0
 7834     __ rev64(v0, __ T16B, v0);
 7835     __ rbit(v0, __ T16B, v0);
 7836 
 7837     __ st1(v0, __ T16B, state);
 7838     __ ret(lr);
 7839 
 7840     return start;
 7841   }
 7842 
 7843   address generate_ghash_processBlocks_wide() {
 7844     address small = generate_ghash_processBlocks();
 7845 
 7846     StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id;
 7847     StubCodeMark mark(this, stub_id);
 7848     __ align(wordSize * 2);
 7849     address p = __ pc();
 7850     __ emit_int64(0x87);  // The low-order bits of the field
 7851                           // polynomial (i.e. p = z^7+z^2+z+1)
 7852                           // repeated in the low and high parts of a
 7853                           // 128-bit vector
 7854     __ emit_int64(0x87);
 7855 
 7856     __ align(CodeEntryAlignment);
 7857     address start = __ pc();
 7858 
 7859     Register state   = c_rarg0;
 7860     Register subkeyH = c_rarg1;
 7861     Register data    = c_rarg2;
 7862     Register blocks  = c_rarg3;
 7863 
 7864     const int unroll = 4;
 7865 
 7866     __ cmp(blocks, (unsigned char)(unroll * 2));
 7867     __ br(__ LT, small);
 7868 
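    // The unrolled implementation may clobber v8..v15, whose low 64 bits are
    // callee-saved under the AAPCS64 procedure call standard, so spill and
    // reload them around it.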
 7869     if (unroll > 1) {
      // Save state before entering routine
 7871       __ sub(sp, sp, 4 * 16);
 7872       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
 7873       __ sub(sp, sp, 4 * 16);
 7874       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
 7875     }
 7876 
 7877     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
 7878 
 7879     if (unroll > 1) {
 7880       // And restore state
 7881       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
 7882       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
 7883     }
 7884 
 7885     __ cmp(blocks, (unsigned char)0);
 7886     __ br(__ GT, small);
 7887 
 7888     __ ret(lr);
 7889 
 7890     return start;
 7891   }
 7892 
 7893   void generate_base64_encode_simdround(Register src, Register dst,
 7894         FloatRegister codec, u8 size) {
 7895 
 7896     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
 7897     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
 7898     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
 7899 
 7900     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 7901 
 7902     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
 7903 
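    // Split each 3-byte group into four 6-bit indices (in0/in1/in2 hold byte 0,
    // 1 and 2 of every group after the de-interleaving ld3):
    //   ind0 = in0 >> 2
    //   ind1 = ((in0 & 0x3) << 4) | (in1 >> 4)
    //   ind2 = ((in1 & 0xf) << 2) | (in2 >> 6)
    //   ind3 = in2 & 0x3f
    // then map each index through the 64-entry alphabet held in the four table
    // registers starting at codec.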
 7904     __ ushr(ind0, arrangement, in0,  2);
 7905 
 7906     __ ushr(ind1, arrangement, in1,  2);
 7907     __ shl(in0,   arrangement, in0,  6);
 7908     __ orr(ind1,  arrangement, ind1, in0);
 7909     __ ushr(ind1, arrangement, ind1, 2);
 7910 
 7911     __ ushr(ind2, arrangement, in2,  4);
 7912     __ shl(in1,   arrangement, in1,  4);
 7913     __ orr(ind2,  arrangement, in1,  ind2);
 7914     __ ushr(ind2, arrangement, ind2, 2);
 7915 
 7916     __ shl(ind3,  arrangement, in2,  2);
 7917     __ ushr(ind3, arrangement, ind3, 2);
 7918 
 7919     __ tbl(out0,  arrangement, codec,  4, ind0);
 7920     __ tbl(out1,  arrangement, codec,  4, ind1);
 7921     __ tbl(out2,  arrangement, codec,  4, ind2);
 7922     __ tbl(out3,  arrangement, codec,  4, ind3);
 7923 
 7924     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
 7925   }
 7926 
  /**
 7928    *  Arguments:
 7929    *
 7930    *  Input:
 7931    *  c_rarg0   - src_start
 7932    *  c_rarg1   - src_offset
 7933    *  c_rarg2   - src_length
 7934    *  c_rarg3   - dest_start
 7935    *  c_rarg4   - dest_offset
 7936    *  c_rarg5   - isURL
 7937    *
 7938    */
 7939   address generate_base64_encodeBlock() {
 7940 
 7941     static const char toBase64[64] = {
 7942       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7943       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7944       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7945       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7946       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 7947     };
 7948 
 7949     static const char toBase64URL[64] = {
 7950       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 7951       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
 7952       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
 7953       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
 7954       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
 7955     };
 7956 
 7957     __ align(CodeEntryAlignment);
 7958     StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id;
 7959     StubCodeMark mark(this, stub_id);
 7960     address start = __ pc();
 7961 
 7962     Register src   = c_rarg0;  // source array
 7963     Register soff  = c_rarg1;  // source start offset
 7964     Register send  = c_rarg2;  // source end offset
 7965     Register dst   = c_rarg3;  // dest array
 7966     Register doff  = c_rarg4;  // position for writing to dest array
 7967     Register isURL = c_rarg5;  // Base64 or URL character set
 7968 
 7969     // c_rarg6 and c_rarg7 are free to use as temps
 7970     Register codec  = c_rarg6;
 7971     Register length = c_rarg7;
 7972 
 7973     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
 7974 
 7975     __ add(src, src, soff);
 7976     __ add(dst, dst, doff);
 7977     __ sub(length, send, soff);
 7978 
 7979     // load the codec base address
 7980     __ lea(codec, ExternalAddress((address) toBase64));
 7981     __ cbz(isURL, ProcessData);
 7982     __ lea(codec, ExternalAddress((address) toBase64URL));
 7983 
 7984     __ BIND(ProcessData);
 7985 
    // too short to form a SIMD loop; fall back to the scalar 3-byte loop
 7987     __ cmp(length, (u1)24);
 7988     __ br(Assembler::LT, Process3B);
 7989 
 7990     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
 7991 
 7992     __ BIND(Process48B);
 7993     __ cmp(length, (u1)48);
 7994     __ br(Assembler::LT, Process24B);
 7995     generate_base64_encode_simdround(src, dst, v0, 16);
 7996     __ sub(length, length, 48);
 7997     __ b(Process48B);
 7998 
 7999     __ BIND(Process24B);
 8000     __ cmp(length, (u1)24);
 8001     __ br(Assembler::LT, SIMDExit);
 8002     generate_base64_encode_simdround(src, dst, v0, 8);
 8003     __ sub(length, length, 24);
 8004 
 8005     __ BIND(SIMDExit);
 8006     __ cbz(length, Exit);
 8007 
 8008     __ BIND(Process3B);
 8009     //  3 src bytes, 24 bits
 8010     __ ldrb(r10, __ post(src, 1));
 8011     __ ldrb(r11, __ post(src, 1));
 8012     __ ldrb(r12, __ post(src, 1));
 8013     __ orrw(r11, r11, r10, Assembler::LSL, 8);
 8014     __ orrw(r12, r12, r11, Assembler::LSL, 8);
 8015     // codec index
 8016     __ ubfmw(r15, r12, 18, 23);
 8017     __ ubfmw(r14, r12, 12, 17);
 8018     __ ubfmw(r13, r12, 6,  11);
 8019     __ andw(r12,  r12, 63);
 8020     // get the code based on the codec
 8021     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
 8022     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
 8023     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
 8024     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
 8025     __ strb(r15, __ post(dst, 1));
 8026     __ strb(r14, __ post(dst, 1));
 8027     __ strb(r13, __ post(dst, 1));
 8028     __ strb(r12, __ post(dst, 1));
 8029     __ sub(length, length, 3);
 8030     __ cbnz(length, Process3B);
 8031 
 8032     __ BIND(Exit);
 8033     __ ret(lr);
 8034 
 8035     return start;
 8036   }
 8037 
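  // Decode 4 * size characters per round. The 6-bit values are recovered with a
  // split table lookup: codecL is indexed by the raw input byte (tbl yields 0
  // for out-of-range indices), codecH by the byte after the unsigned saturating
  // subtract below, and the two halves are OR-ed together. Lanes whose combined
  // value is still above 63 mark illegal characters; umaxv folds those per-lane
  // flags into one scalar so a single cbz can take the fast path.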
 8038   void generate_base64_decode_simdround(Register src, Register dst,
 8039         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
 8040 
 8041     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
 8042     FloatRegister out0 = v20, out1 = v21, out2 = v22;
 8043 
 8044     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
 8045     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
 8046 
 8047     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
 8048 
 8049     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
 8050 
 8051     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
 8052 
    // We need an unsigned saturating subtract to make sure that all input values
    // in the range [0, 63] map to 0 in the higher-half lookup
 8055     __ uqsubv(decH0, __ T16B, in0, v27);
 8056     __ uqsubv(decH1, __ T16B, in1, v27);
 8057     __ uqsubv(decH2, __ T16B, in2, v27);
 8058     __ uqsubv(decH3, __ T16B, in3, v27);
 8059 
 8060     // lower half lookup
 8061     __ tbl(decL0, arrangement, codecL, 4, in0);
 8062     __ tbl(decL1, arrangement, codecL, 4, in1);
 8063     __ tbl(decL2, arrangement, codecL, 4, in2);
 8064     __ tbl(decL3, arrangement, codecL, 4, in3);
 8065 
 8066     // higher half lookup
 8067     __ tbx(decH0, arrangement, codecH, 4, decH0);
 8068     __ tbx(decH1, arrangement, codecH, 4, decH1);
 8069     __ tbx(decH2, arrangement, codecH, 4, decH2);
 8070     __ tbx(decH3, arrangement, codecH, 4, decH3);
 8071 
 8072     // combine lower and higher
 8073     __ orr(decL0, arrangement, decL0, decH0);
 8074     __ orr(decL1, arrangement, decL1, decH1);
 8075     __ orr(decL2, arrangement, decL2, decH2);
 8076     __ orr(decL3, arrangement, decL3, decH3);
 8077 
    // check for illegal inputs: values larger than 63 (the maximum of 6 bits)
 8079     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
 8080     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
 8081     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
 8082     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
 8083     __ orr(in0, arrangement, decH0, decH1);
 8084     __ orr(in1, arrangement, decH2, decH3);
 8085     __ orr(in2, arrangement, in0,   in1);
 8086     __ umaxv(in3, arrangement, in2);
 8087     __ umov(rscratch2, in3, __ B, 0);
 8088 
 8089     // get the data to output
 8090     __ shl(out0,  arrangement, decL0, 2);
 8091     __ ushr(out1, arrangement, decL1, 4);
 8092     __ orr(out0,  arrangement, out0,  out1);
 8093     __ shl(out1,  arrangement, decL1, 4);
 8094     __ ushr(out2, arrangement, decL2, 2);
 8095     __ orr(out1,  arrangement, out1,  out2);
 8096     __ shl(out2,  arrangement, decL2, 6);
 8097     __ orr(out2,  arrangement, out2,  decL3);
 8098 
 8099     __ cbz(rscratch2, NoIllegalData);
 8100 
 8101     // handle illegal input
 8102     __ umov(r10, in2, __ D, 0);
 8103     if (size == 16) {
 8104       __ cbnz(r10, ErrorInLowerHalf);
 8105 
 8106       // illegal input is in higher half, store the lower half now.
 8107       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
 8108 
 8109       __ umov(r10, in2,  __ D, 1);
 8110       __ umov(r11, out0, __ D, 1);
 8111       __ umov(r12, out1, __ D, 1);
 8112       __ umov(r13, out2, __ D, 1);
 8113       __ b(StoreLegalData);
 8114 
 8115       __ BIND(ErrorInLowerHalf);
 8116     }
 8117     __ umov(r11, out0, __ D, 0);
 8118     __ umov(r12, out1, __ D, 0);
 8119     __ umov(r13, out2, __ D, 0);
 8120 
 8121     __ BIND(StoreLegalData);
 8122     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
 8123     __ strb(r11, __ post(dst, 1));
 8124     __ strb(r12, __ post(dst, 1));
 8125     __ strb(r13, __ post(dst, 1));
 8126     __ lsr(r10, r10, 8);
 8127     __ lsr(r11, r11, 8);
 8128     __ lsr(r12, r12, 8);
 8129     __ lsr(r13, r13, 8);
 8130     __ b(StoreLegalData);
 8131 
 8132     __ BIND(NoIllegalData);
 8133     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
 8134   }
 8135 
 8136 
  /**
 8138    *  Arguments:
 8139    *
 8140    *  Input:
 8141    *  c_rarg0   - src_start
 8142    *  c_rarg1   - src_offset
 8143    *  c_rarg2   - src_length
 8144    *  c_rarg3   - dest_start
 8145    *  c_rarg4   - dest_offset
 8146    *  c_rarg5   - isURL
 8147    *  c_rarg6   - isMIME
 8148    *
 8149    */
 8150   address generate_base64_decodeBlock() {
 8151 
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the
    // section titled "Base64 decoding".

    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
    // java.util.Base64, except that the trailing character '=' is also treated as an
    // illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] = -2,
    // while fromBase(URL)64ForNoSIMD['='] = 255 here.
 8159     static const uint8_t fromBase64ForNoSIMD[256] = {
 8160       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8161       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8162       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8163        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8164       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8165        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
 8166       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8167        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8168       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8169       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8170       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8171       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8172       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8173       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8174       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8175       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8176     };
 8177 
 8178     static const uint8_t fromBase64URLForNoSIMD[256] = {
 8179       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8180       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8181       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8182        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8183       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
 8184        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
 8185       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
 8186        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
 8187       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8188       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8189       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8190       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8191       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8192       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8193       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8194       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8195     };
 8196 
 8197     // A legal Base64 code value is in the range [0, 127].  We need two lookups
 8198     // with tbl/tbx and combine them to get the decoded data. The 1st table vector
 8199     // lookup uses tbl, where out-of-range indices are set to 0 in the destination. The 2nd
 8200     // table vector lookup uses tbx, where out-of-range indices leave the destination
 8201     // unchanged. Input [64..126] is mapped to index [65, 127] in the second lookup.
 8202     // The value at index 64 is set to 0, so that we know we already got the
 8203     // decoded data from the 1st lookup.
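    // In C-like pseudocode, the combine amounts to roughly the following for
    // an input byte x in [0, 127] (illustrative sketch; codecL/codecH stand
    // for the register groups holding the first and second 64 table entries):
    //
    //   lo  = (x <= 63) ? codecL[x]      : 0;   // tbl: out-of-range index reads as 0
    //   hi  = (x >= 64) ? codecH[x - 63] : 0;   // tbx: x == 63 hits the 0 value at index 64
    //   dec = lo | hi;                          // any result > 63 is flagged as illegal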
 8204     static const uint8_t fromBase64ForSIMD[128] = {
 8205       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8206       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8207       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
 8208        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8209         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8210        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8211       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8212        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8213     };
 8214 
 8215     static const uint8_t fromBase64URLForSIMD[128] = {
 8216       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8217       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
 8218       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
 8219        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
 8220         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
 8221        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
 8222        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
 8223        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
 8224     };
 8225 
 8226     __ align(CodeEntryAlignment);
 8227     StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id;
 8228     StubCodeMark mark(this, stub_id);
 8229     address start = __ pc();
 8230 
 8231     Register src    = c_rarg0;  // source array
 8232     Register soff   = c_rarg1;  // source start offset
 8233     Register send   = c_rarg2;  // source end offset
 8234     Register dst    = c_rarg3;  // dest array
 8235     Register doff   = c_rarg4;  // position for writing to dest array
 8236     Register isURL  = c_rarg5;  // Base64 or URL character set
 8237     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
 8238 
 8239     Register length = send;    // reuse send as length of source data to process
 8240 
 8241     Register simd_codec   = c_rarg6;
 8242     Register nosimd_codec = c_rarg7;
 8243 
 8244     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
 8245 
 8246     __ enter();
 8247 
 8248     __ add(src, src, soff);
 8249     __ add(dst, dst, doff);
 8250 
 8251     __ mov(doff, dst); // remember the initial dst, to compute the number of bytes written
 8252 
 8253     __ sub(length, send, soff);
 8254     __ bfm(length, zr, 0, 1); // clear the low two bits: round length down to a multiple of 4
 8255 
 8256     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
 8257     __ cbz(isURL, ProcessData);
 8258     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
 8259 
 8260     __ BIND(ProcessData);
 8261     __ mov(rscratch1, length);
 8262     __ cmp(length, (u1)144); // 144 = 80 + 64
 8263     __ br(Assembler::LT, Process4B);
 8264 
 8265     // In the MIME case, the line length cannot be more than 76
 8266     // bytes (see RFC 2045). This is too short a block for SIMD
 8267     // to be worthwhile, so we use non-SIMD here.
 8268     __ movw(rscratch1, 79); // 80 - 1: pre-process the first 80 bytes with the scalar loop below
 8269 
 8270     __ BIND(Process4B);
 8271     __ ldrw(r14, __ post(src, 4));
 8272     __ ubfxw(r10, r14, 0,  8);
 8273     __ ubfxw(r11, r14, 8,  8);
 8274     __ ubfxw(r12, r14, 16, 8);
 8275     __ ubfxw(r13, r14, 24, 8);
 8276     // look up the decoded values
 8277     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
 8278     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
 8279     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
 8280     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
 8281     // error detection, 255u indicates an illegal input
 8282     __ orrw(r14, r10, r11);
 8283     __ orrw(r15, r12, r13);
 8284     __ orrw(r14, r14, r15);
 8285     __ tbnz(r14, 7, Exit);
 8286     // recover the data
 8287     __ lslw(r14, r10, 10);
 8288     __ bfiw(r14, r11, 4, 6);
 8289     __ bfmw(r14, r12, 2, 5);
 8290     __ rev16w(r14, r14);
 8291     __ bfiw(r13, r12, 6, 2);
 8292     __ strh(r14, __ post(dst, 2));
 8293     __ strb(r13, __ post(dst, 1));
 8294     // non-simd loop
 8295     __ subsw(rscratch1, rscratch1, 4);
 8296     __ br(Assembler::GT, Process4B);
 8297 
 8298     // if we exited from the 80-byte pre-processing above (rscratch1 started at 79),
 8299     // rscratch1 == -1; otherwise, rscratch1 == 0.
 8300     __ cbzw(rscratch1, Exit);
 8301     __ sub(length, length, 80);
 8302 
 8303     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
 8304     __ cbz(isURL, SIMDEnter);
 8305     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
 8306 
 8307     __ BIND(SIMDEnter);
 8308     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
 8309     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
 8310     __ mov(rscratch1, 63);
 8311     __ dup(v27, __ T16B, rscratch1);
 8312 
 8313     __ BIND(Process64B);
 8314     __ cmp(length, (u1)64);
 8315     __ br(Assembler::LT, Process32B);
 8316     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
 8317     __ sub(length, length, 64);
 8318     __ b(Process64B);
 8319 
 8320     __ BIND(Process32B);
 8321     __ cmp(length, (u1)32);
 8322     __ br(Assembler::LT, SIMDExit);
 8323     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
 8324     __ sub(length, length, 32);
 8325     __ b(Process32B);
 8326 
 8327     __ BIND(SIMDExit);
 8328     __ cbz(length, Exit);
 8329     __ movw(rscratch1, length);
 8330     __ b(Process4B);
 8331 
 8332     __ BIND(Exit);
 8333     __ sub(c_rarg0, dst, doff); // return value: the number of bytes written to dst
 8334 
 8335     __ leave();
 8336     __ ret(lr);
 8337 
 8338     return start;
 8339   }
 8340 
 8341   // Support for spin waits.
 8342   address generate_spin_wait() {
 8343     __ align(CodeEntryAlignment);
 8344     StubGenStubId stub_id = StubGenStubId::spin_wait_id;
 8345     StubCodeMark mark(this, stub_id);
 8346     address start = __ pc();
 8347 
 8348     __ spin_wait();
 8349     __ ret(lr);
 8350 
 8351     return start;
 8352   }
 8353 
 8354   void generate_lookup_secondary_supers_table_stub() {
 8355     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id;
 8356     StubCodeMark mark(this, stub_id);
 8357 
 8358     const Register
 8359       r_super_klass  = r0,
 8360       r_array_base   = r1,
 8361       r_array_length = r2,
 8362       r_array_index  = r3,
 8363       r_sub_klass    = r4,
 8364       r_bitmap       = rscratch2,
 8365       result         = r5;
 8366     const FloatRegister
 8367       vtemp          = v0;
 8368 
 8369     for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
 8370       StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
 8371       Label L_success;
 8372       __ enter();
 8373       __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
 8374                                              r_array_base, r_array_length, r_array_index,
 8375                                              vtemp, result, slot,
 8376                                              /*stub_is_near*/true);
 8377       __ leave();
 8378       __ ret(lr);
 8379     }
 8380   }
 8381 
 8382   // Slow path implementation for UseSecondarySupersTable.
 8383   address generate_lookup_secondary_supers_table_slow_path_stub() {
 8384     StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id;
 8385     StubCodeMark mark(this, stub_id);
 8386 
 8387     address start = __ pc();
 8388     const Register
 8389       r_super_klass  = r0,        // argument
 8390       r_array_base   = r1,        // argument
 8391       temp1          = r2,        // temp
 8392       r_array_index  = r3,        // argument
 8393       r_bitmap       = rscratch2, // argument
 8394       result         = r5;        // argument
 8395 
 8396     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
 8397     __ ret(lr);
 8398 
 8399     return start;
 8400   }
 8401 
 8402 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
 8403 
 8404   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
 8405   //
 8406   // If LSE is in use, generate LSE versions of all the stubs. The
 8407   // non-LSE versions are in atomic_aarch64.S.
 8408 
 8409   // class AtomicStubMark records the entry point of a stub and the
 8410   // stub pointer which will point to it. The stub pointer is set to
 8411   // the entry point when ~AtomicStubMark() is called, which must be
 8412   // after ICache::invalidate_range. This ensures safe publication of
 8413   // the generated code.
 8414   class AtomicStubMark {
 8415     address _entry_point;
 8416     aarch64_atomic_stub_t *_stub;
 8417     MacroAssembler *_masm;
 8418   public:
 8419     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
 8420       _masm = masm;
 8421       __ align(32);
 8422       _entry_point = __ pc();
 8423       _stub = stub;
 8424     }
 8425     ~AtomicStubMark() {
 8426       *_stub = (aarch64_atomic_stub_t)_entry_point;
 8427     }
 8428   };
 8429 
 8430   // NB: For memory_order_conservative we need a trailing membar after
 8431   // LSE atomic operations but not a leading membar.
 8432   //
 8433   // We don't need a leading membar because a clause in the Arm ARM
 8434   // says:
 8435   //
 8436   //   Barrier-ordered-before
 8437   //
 8438   //   Barrier instructions order prior Memory effects before subsequent
 8439   //   Memory effects generated by the same Observer. A read or a write
 8440   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
 8441   //   Observer if and only if RW1 appears in program order before RW2
 8442   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
 8443   //   instruction with both Acquire and Release semantics.
 8444   //
 8445   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
 8446   // and Release semantics, therefore we don't need a leading
 8447   // barrier. However, there is no corresponding Barrier-ordered-after
 8448   // relationship, therefore we need a trailing membar to prevent a
 8449   // later store or load from being reordered with the store in an
 8450   // atomic instruction.
 8451   //
 8452   // This was checked by using the herd7 consistency model simulator
 8453   // (http://diy.inria.fr/) with this test case:
 8454   //
 8455   // AArch64 LseCas
 8456   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
 8457   // P0 | P1;
 8458   // LDR W4, [X2] | MOV W3, #0;
 8459   // DMB LD       | MOV W4, #1;
 8460   // LDR W3, [X1] | CASAL W3, W4, [X1];
 8461   //              | DMB ISH;
 8462   //              | STR W4, [X2];
 8463   // exists
 8464   // (0:X3=0 /\ 0:X4=1)
 8465   //
 8466   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
 8467   // with the store to x in P1. Without the DMB in P1 this may happen.
 8468   //
 8469   // At the time of writing we don't know of any AArch64 hardware that
 8470   // reorders stores in this way, but the Reference Manual permits it.
 8471 
 8472   void gen_cas_entry(Assembler::operand_size size,
 8473                      atomic_memory_order order) {
 8474     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
 8475       exchange_val = c_rarg2;
 8476     bool acquire, release;
 8477     switch (order) {
 8478       case memory_order_relaxed:
 8479         acquire = false;
 8480         release = false;
 8481         break;
 8482       case memory_order_release:
 8483         acquire = false;
 8484         release = true;
 8485         break;
 8486       default:
 8487         acquire = true;
 8488         release = true;
 8489         break;
 8490     }
 8491     __ mov(prev, compare_val);
 8492     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
 8493     if (order == memory_order_conservative) {
 8494       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8495     }
 8496     if (size == Assembler::xword) {
 8497       __ mov(r0, prev);
 8498     } else {
 8499       __ movw(r0, prev);
 8500     }
 8501     __ ret(lr);
 8502   }
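  // Roughly, for memory_order_conservative the CAS stub above behaves like
  // the following C sketch (illustrative only; casal/full_fence are not real
  // functions, they stand for the emitted instructions):
  //
  //   T cas(T* ptr, T compare_val, T exchange_val) {
  //     T prev = compare_val;
  //     casal(ptr, &prev, exchange_val);   // LSE CAS with acquire+release semantics
  //     full_fence();                      // trailing StoreStore|StoreLoad barrier
  //     return prev;
  //   }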
 8503 
 8504   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
 8505     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8506     // If not relaxed, then default to conservative.  Relaxed is the only
 8507     // case we use enough to be worth specializing.
 8508     if (order == memory_order_relaxed) {
 8509       __ ldadd(size, incr, prev, addr);
 8510     } else {
 8511       __ ldaddal(size, incr, prev, addr);
 8512       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8513     }
 8514     if (size == Assembler::xword) {
 8515       __ mov(r0, prev);
 8516     } else {
 8517       __ movw(r0, prev);
 8518     }
 8519     __ ret(lr);
 8520   }
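  // Similarly, the fetch-and-add stub is approximately (illustrative sketch;
  // ldadd/full_fence stand for the emitted instructions):
  //
  //   T fetch_add(T* addr, T incr) {
  //     T prev = ldadd(addr, incr);        // ldadd if relaxed, ldaddal otherwise
  //     if (order != memory_order_relaxed) full_fence();
  //     return prev;
  //   }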
 8521 
 8522   void gen_swpal_entry(Assembler::operand_size size) {
 8523     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
 8524     __ swpal(size, incr, prev, addr);
 8525     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
 8526     if (size == Assembler::xword) {
 8527       __ mov(r0, prev);
 8528     } else {
 8529       __ movw(r0, prev);
 8530     }
 8531     __ ret(lr);
 8532   }
 8533 
 8534   void generate_atomic_entry_points() {
 8535     if (! UseLSE) {
 8536       return;
 8537     }
 8538     __ align(CodeEntryAlignment);
 8539     StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id;
 8540     StubCodeMark mark(this, stub_id);
 8541     address first_entry = __ pc();
 8542 
 8543     // ADD, memory_order_conservative
 8544     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
 8545     gen_ldadd_entry(Assembler::word, memory_order_conservative);
 8546     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
 8547     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
 8548 
 8549     // ADD, memory_order_relaxed
 8550     AtomicStubMark mark_fetch_add_4_relaxed
 8551       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
 8552     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
 8553     AtomicStubMark mark_fetch_add_8_relaxed
 8554       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
 8555     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
 8556 
 8557     // XCHG, memory_order_conservative
 8558     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
 8559     gen_swpal_entry(Assembler::word);
 8560     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
 8561     gen_swpal_entry(Assembler::xword);
 8562 
 8563     // CAS, memory_order_conservative
 8564     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
 8565     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
 8566     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
 8567     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
 8568     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
 8569     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
 8570 
 8571     // CAS, memory_order_relaxed
 8572     AtomicStubMark mark_cmpxchg_1_relaxed
 8573       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
 8574     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
 8575     AtomicStubMark mark_cmpxchg_4_relaxed
 8576       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
 8577     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
 8578     AtomicStubMark mark_cmpxchg_8_relaxed
 8579       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
 8580     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
 8581 
 8582     AtomicStubMark mark_cmpxchg_4_release
 8583       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
 8584     gen_cas_entry(MacroAssembler::word, memory_order_release);
 8585     AtomicStubMark mark_cmpxchg_8_release
 8586       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
 8587     gen_cas_entry(MacroAssembler::xword, memory_order_release);
 8588 
 8589     AtomicStubMark mark_cmpxchg_4_seq_cst
 8590       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
 8591     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
 8592     AtomicStubMark mark_cmpxchg_8_seq_cst
 8593       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
 8594     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
 8595 
 8596     ICache::invalidate_range(first_entry, __ pc() - first_entry);
 8597   }
 8598 #endif // LINUX
 8599 
 8600   address generate_cont_thaw(Continuation::thaw_kind kind) {
 8601     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
 8602     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
 8603 
 8604     address start = __ pc();
 8605 
 8606     if (return_barrier) {
 8607       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
 8608       __ mov(sp, rscratch1);
 8609     }
 8610     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8611 
 8612     if (return_barrier) {
 8613       // preserve possible return value from a method returning to the return barrier
 8614       __ fmovd(rscratch1, v0);
 8615       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8616     }
 8617 
 8618     __ movw(c_rarg1, (return_barrier ? 1 : 0));
 8619     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
 8620     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
 8621 
 8622     if (return_barrier) {
 8623       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8624       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8625       __ fmovd(v0, rscratch1);
 8626     }
 8627     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
 8628 
 8629 
 8630     Label thaw_success;
 8631     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
 8632     __ cbnz(rscratch2, thaw_success);
 8633     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
 8634     __ br(rscratch1);
 8635     __ bind(thaw_success);
 8636 
 8637     // make room for the thawed frames
 8638     __ sub(rscratch1, sp, rscratch2);
 8639     __ andr(rscratch1, rscratch1, -16); // align
 8640     __ mov(sp, rscratch1);
 8641 
 8642     if (return_barrier) {
 8643       // save original return value -- again
 8644       __ fmovd(rscratch1, v0);
 8645       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
 8646     }
 8647 
 8648     // If we want, we can templatize thaw by kind, and have three different entries
 8649     __ movw(c_rarg1, (uint32_t)kind);
 8650 
 8651     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
 8652     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
 8653 
 8654     if (return_barrier) {
 8655       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
 8656       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
 8657       __ fmovd(v0, rscratch1);
 8658     } else {
 8659       __ mov(r0, zr); // return 0 (success) from doYield
 8660     }
 8661 
 8662     // we're now on the yield frame (which is at an address above us because sp has been pushed down)
 8663     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
 8664     __ mov(rfp, sp);
 8665 
 8666     if (return_barrier_exception) {
 8667       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
 8668       __ authenticate_return_address(c_rarg1);
 8669       __ verify_oop(r0);
 8670       // save return value containing the exception oop in callee-saved R19
 8671       __ mov(r19, r0);
 8672 
 8673       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
 8674 
 8675       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
 8676       // __ reinitialize_ptrue();
 8677 
 8678       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
 8679 
 8680       __ mov(r1, r0); // the exception handler
 8681       __ mov(r0, r19); // restore return value containing the exception oop
 8682       __ verify_oop(r0);
 8683 
 8684       __ leave();
 8685       __ mov(r3, lr);
 8686       __ br(r1); // the exception handler
 8687     } else {
 8688       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
 8689       __ leave();
 8690       __ ret(lr);
 8691     }
 8692 
 8693     return start;
 8694   }
 8695 
 8696   address generate_cont_thaw() {
 8697     if (!Continuations::enabled()) return nullptr;
 8698 
 8699     StubGenStubId stub_id = StubGenStubId::cont_thaw_id;
 8700     StubCodeMark mark(this, stub_id);
 8701     address start = __ pc();
 8702     generate_cont_thaw(Continuation::thaw_top);
 8703     return start;
 8704   }
 8705 
 8706   address generate_cont_returnBarrier() {
 8707     if (!Continuations::enabled()) return nullptr;
 8708 
 8709     // TODO: will probably need multiple return barriers depending on return type
 8710     StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id;
 8711     StubCodeMark mark(this, stub_id);
 8712     address start = __ pc();
 8713 
 8714     generate_cont_thaw(Continuation::thaw_return_barrier);
 8715 
 8716     return start;
 8717   }
 8718 
 8719   address generate_cont_returnBarrier_exception() {
 8720     if (!Continuations::enabled()) return nullptr;
 8721 
 8722     StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id;
 8723     StubCodeMark mark(this, stub_id);
 8724     address start = __ pc();
 8725 
 8726     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
 8727 
 8728     return start;
 8729   }
 8730 
 8731   address generate_cont_preempt_stub() {
 8732     if (!Continuations::enabled()) return nullptr;
 8733     StubGenStubId stub_id = StubGenStubId::cont_preempt_id;
 8734     StubCodeMark mark(this, stub_id);
 8735     address start = __ pc();
 8736 
 8737     __ reset_last_Java_frame(true);
 8738 
 8739     // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
 8740     __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
 8741     __ mov(sp, rscratch2);
 8742 
 8743     Label preemption_cancelled;
 8744     __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8745     __ cbnz(rscratch1, preemption_cancelled);
 8746 
 8747     // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
 8748     SharedRuntime::continuation_enter_cleanup(_masm);
 8749     __ leave();
 8750     __ ret(lr);
 8751 
 8752     // We acquired the monitor after freezing the frames so call thaw to continue execution.
 8753     __ bind(preemption_cancelled);
 8754     __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
 8755     __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
 8756     __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
 8757     __ ldr(rscratch1, Address(rscratch1));
 8758     __ br(rscratch1);
 8759 
 8760     return start;
 8761   }
 8762 
 8763   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
 8764   // are represented as long[5], with BITS_PER_LIMB = 26.
 8765   // Pack five 26-bit limbs into three 64-bit registers.
 8766   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
 8767     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
 8768     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
 8769     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
 8770     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
 8771 
 8772     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
 8773     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
 8774     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
 8775     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
 8776 
 8777     if (dest2->is_valid()) {
 8778       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8779     } else {
 8780 #ifdef ASSERT
 8781       Label OK;
 8782       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
 8783       __ br(__ EQ, OK);
 8784       __ stop("high bits of Poly1305 integer should be zero");
 8785       __ should_not_reach_here();
 8786       __ bind(OK);
 8787 #endif
 8788     }
 8789   }
 8790 
 8791   // As above, but return only a 128-bit integer, packed into two
 8792   // 64-bit registers.
 8793   void pack_26(Register dest0, Register dest1, Register src) {
 8794     pack_26(dest0, dest1, noreg, src);
 8795   }
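  // In C, approximately (illustrative sketch of the packing above):
  //
  //   void pack_26(julong &d0, julong &d1, julong &d2, const julong *src) {
  //     d0 = src[0] | (src[1] << 26) | (src[2] << 52);          // 26 + 26 + 12 bits
  //     d1 = (src[2] >> 12) | (src[3] << 14) | (src[4] << 40);  // 14 + 26 + 24 bits
  //     d2 = src[4] >> 24;                                      // at most 2 bits
  //   }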
 8796 
 8797   // Multiply and multiply-accumulate unsigned 64-bit registers.
 8798   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
 8799     __ mul(prod_lo, n, m);
 8800     __ umulh(prod_hi, n, m);
 8801   }
 8802   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
 8803     wide_mul(rscratch1, rscratch2, n, m);
 8804     __ adds(sum_lo, sum_lo, rscratch1);
 8805     __ adc(sum_hi, sum_hi, rscratch2);
 8806   }
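  // In C, approximately (illustrative):
  //
  //   wide_mul:   (prod_hi:prod_lo)  = (unsigned __int128)n * m;
  //   wide_madd:  (sum_hi:sum_lo)   += (unsigned __int128)n * m;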
 8807 
 8808   // Poly1305, RFC 7539
 8809 
 8810   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
 8811   // description of the tricks used to simplify and accelerate this
 8812   // computation.
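  // Per 16-byte block, the accumulator update in the stub below is, in
  // C-like pseudocode (illustrative sketch; U is the 130-bit accumulator,
  // R the clamped 128-bit key):
  //
  //   S = U + block + 2^128;            // the extra bit is the S_2 + 1 below
  //   U = (S * R) mod (2^130 - 5);      // partial reduction, using RR = (R >> 2) * 5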
 8813 
 8814   address generate_poly1305_processBlocks() {
 8815     __ align(CodeEntryAlignment);
 8816     StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id;
 8817     StubCodeMark mark(this, stub_id);
 8818     address start = __ pc();
 8819     Label here;
 8820     __ enter();
 8821     RegSet callee_saved = RegSet::range(r19, r28);
 8822     __ push(callee_saved, sp);
 8823 
 8824     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
 8825 
 8826     // Arguments
 8827     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
 8828 
 8829     // R_n is the 128-bit randomly-generated key, packed into two
 8830     // registers.  The caller passes this key to us as long[5], with
 8831     // BITS_PER_LIMB = 26.
 8832     const Register R_0 = *++regs, R_1 = *++regs;
 8833     pack_26(R_0, R_1, r_start);
 8834 
 8835     // RR_n is (R_n >> 2) * 5
 8836     const Register RR_0 = *++regs, RR_1 = *++regs;
 8837     __ lsr(RR_0, R_0, 2);
 8838     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
 8839     __ lsr(RR_1, R_1, 2);
 8840     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
 8841 
 8842     // U_n is the current checksum
 8843     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
 8844     pack_26(U_0, U_1, U_2, acc_start);
 8845 
 8846     static constexpr int BLOCK_LENGTH = 16;
 8847     Label DONE, LOOP;
 8848 
 8849     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
 8850     __ br(Assembler::LT, DONE); {
 8851       __ bind(LOOP);
 8852 
 8853       // S_n is to be the sum of U_n and the next block of data
 8854       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
 8855       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
 8856       __ adds(S_0, U_0, S_0);
 8857       __ adcs(S_1, U_1, S_1);
 8858       __ adc(S_2, U_2, zr);
 8859       __ add(S_2, S_2, 1); // add the 2^128 bit appended to each full 16-byte block
 8860 
 8861       const Register U_0HI = *++regs, U_1HI = *++regs;
 8862 
 8863       // NB: this logic depends on some of the special properties of
 8864       // Poly1305 keys. In particular, because we know that the top
 8865       // four bits of R_0 and R_1 are zero, we can add together
 8866       // partial products without any risk of needing to propagate a
 8867       // carry out.
 8868       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
 8869       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
 8870       __ andr(U_2, R_0, 3);
 8871       __ mul(U_2, S_2, U_2);
 8872 
 8873       // Recycle registers S_0, S_1, S_2
 8874       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
 8875 
 8876       // Partial reduction mod 2**130 - 5
 8877       __ adds(U_1, U_0HI, U_1);
 8878       __ adc(U_2, U_1HI, U_2);
 8879       // Sum now in U_2:U_1:U_0.
 8880       // Dead: U_0HI, U_1HI.
 8881       regs = (regs.remaining() + U_0HI + U_1HI).begin();
 8882 
 8883       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
 8884 
 8885       // First, U_2:U_1:U_0 += (U_2 >> 2)
 8886       __ lsr(rscratch1, U_2, 2);
 8887       __ andr(U_2, U_2, (u8)3);
 8888       __ adds(U_0, U_0, rscratch1);
 8889       __ adcs(U_1, U_1, zr);
 8890       __ adc(U_2, U_2, zr);
 8891       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
 8892       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
 8893       __ adcs(U_1, U_1, zr);
 8894       __ adc(U_2, U_2, zr);
 8895 
 8896       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
 8897       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
 8898       __ br(~ Assembler::LT, LOOP);
 8899     }
 8900 
 8901     // Further reduce modulo 2^130 - 5
 8902     __ lsr(rscratch1, U_2, 2);
 8903     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = (U_2 >> 2) * 5
 8904     __ adds(U_0, U_0, rscratch1); // U_0 += (U_2 >> 2) * 5
 8905     __ adcs(U_1, U_1, zr);
 8906     __ andr(U_2, U_2, (u1)3);
 8907     __ adc(U_2, U_2, zr);
 8908 
 8909     // Unpack the sum into five 26-bit limbs and write to memory.
 8910     __ ubfiz(rscratch1, U_0, 0, 26);
 8911     __ ubfx(rscratch2, U_0, 26, 26);
 8912     __ stp(rscratch1, rscratch2, Address(acc_start));
 8913     __ ubfx(rscratch1, U_0, 52, 12);
 8914     __ bfi(rscratch1, U_1, 12, 14);
 8915     __ ubfx(rscratch2, U_1, 14, 26);
 8916     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
 8917     __ ubfx(rscratch1, U_1, 40, 24);
 8918     __ bfi(rscratch1, U_2, 24, 3);
 8919     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
 8920 
 8921     __ bind(DONE);
 8922     __ pop(callee_saved, sp);
 8923     __ leave();
 8924     __ ret(lr);
 8925 
 8926     return start;
 8927   }
 8928 
 8929   // exception handler for upcall stubs
 8930   address generate_upcall_stub_exception_handler() {
 8931     StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id;
 8932     StubCodeMark mark(this, stub_id);
 8933     address start = __ pc();
 8934 
 8935     // Native caller has no idea how to handle exceptions,
 8936     // so we just crash here. Up to callee to catch exceptions.
 8937     __ verify_oop(r0);
 8938     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
 8939     __ blr(rscratch1);
 8940     __ should_not_reach_here();
 8941 
 8942     return start;
 8943   }
 8944 
 8945   // load Method* target of MethodHandle
 8946   // j_rarg0 = jobject receiver
 8947   // rmethod = result
 8948   address generate_upcall_stub_load_target() {
 8949     StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id;
 8950     StubCodeMark mark(this, stub_id);
 8951     address start = __ pc();
 8952 
 8953     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
 8954     // Load target method from receiver
 8955     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
 8956     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
 8957     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
 8958     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
 8959                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
 8960                       noreg, noreg);
 8961     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
 8962 
 8963     __ ret(lr);
 8964 
 8965     return start;
 8966   }
 8967 
 8968 #undef __
 8969 #define __ masm->
 8970 
 8971   class MontgomeryMultiplyGenerator : public MacroAssembler {
 8972 
 8973     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
 8974       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
 8975 
 8976     RegSet _toSave;
 8977     bool _squaring;
 8978 
 8979   public:
 8980     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
 8981       : MacroAssembler(as->code()), _squaring(squaring) {
 8982 
 8983       // Register allocation
 8984 
 8985       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
 8986       Pa_base = *regs;       // Argument registers
 8987       if (squaring)
 8988         Pb_base = Pa_base;
 8989       else
 8990         Pb_base = *++regs;
 8991       Pn_base = *++regs;
 8992       Rlen= *++regs;
 8993       inv = *++regs;
 8994       Pm_base = *++regs;
 8995 
 8996                           // Working registers:
 8997       Ra =  *++regs;        // The current digit of a, b, n, and m.
 8998       Rb =  *++regs;
 8999       Rm =  *++regs;
 9000       Rn =  *++regs;
 9001 
 9002       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
 9003       Pb =  *++regs;
 9004       Pm =  *++regs;
 9005       Pn =  *++regs;
 9006 
 9007       t0 =  *++regs;        // Three registers which form a
 9008       t1 =  *++regs;        // triple-precision accumulator.
 9009       t2 =  *++regs;
 9010 
 9011       Ri =  *++regs;        // Inner and outer loop indexes.
 9012       Rj =  *++regs;
 9013 
 9014       Rhi_ab = *++regs;     // Product registers: low and high parts
 9015       Rlo_ab = *++regs;     // of a*b and m*n.
 9016       Rhi_mn = *++regs;
 9017       Rlo_mn = *++regs;
 9018 
 9019       // r19 and up are callee-saved.
 9020       _toSave = RegSet::range(r19, *regs) + Pm_base;
 9021     }
 9022 
 9023   private:
 9024     void save_regs() {
 9025       push(_toSave, sp);
 9026     }
 9027 
 9028     void restore_regs() {
 9029       pop(_toSave, sp);
 9030     }
 9031 
 9032     template <typename T>
 9033     void unroll_2(Register count, T block) {
 9034       Label loop, end, odd;
 9035       tbnz(count, 0, odd);
 9036       cbz(count, end);
 9037       align(16);
 9038       bind(loop);
 9039       (this->*block)();
 9040       bind(odd);
 9041       (this->*block)();
 9042       subs(count, count, 2);
 9043       br(Assembler::GT, loop);
 9044       bind(end);
 9045     }
 9046 
 9047     template <typename T>
 9048     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
 9049       Label loop, end, odd;
 9050       tbnz(count, 0, odd);
 9051       cbz(count, end);
 9052       align(16);
 9053       bind(loop);
 9054       (this->*block)(d, s, tmp);
 9055       bind(odd);
 9056       (this->*block)(d, s, tmp);
 9057       subs(count, count, 2);
 9058       br(Assembler::GT, loop);
 9059       bind(end);
 9060     }
 9061 
 9062     void pre1(RegisterOrConstant i) {
 9063       block_comment("pre1");
 9064       // Pa = Pa_base;
 9065       // Pb = Pb_base + i;
 9066       // Pm = Pm_base;
 9067       // Pn = Pn_base + i;
 9068       // Ra = *Pa;
 9069       // Rb = *Pb;
 9070       // Rm = *Pm;
 9071       // Rn = *Pn;
 9072       ldr(Ra, Address(Pa_base));
 9073       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9074       ldr(Rm, Address(Pm_base));
 9075       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9076       lea(Pa, Address(Pa_base));
 9077       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
 9078       lea(Pm, Address(Pm_base));
 9079       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9080 
 9081       // Zero the m*n result.
 9082       mov(Rhi_mn, zr);
 9083       mov(Rlo_mn, zr);
 9084     }
 9085 
 9086     // The core multiply-accumulate step of a Montgomery
 9087     // multiplication.  The idea is to schedule operations as a
 9088     // pipeline so that instructions with long latencies (loads and
 9089     // multiplies) have time to complete before their results are
 9090     // used.  This benefits in-order implementations of the
 9091     // architecture most, but out-of-order ones also benefit.
 9092     void step() {
 9093       block_comment("step");
 9094       // MACC(Ra, Rb, t0, t1, t2);
 9095       // Ra = *++Pa;
 9096       // Rb = *--Pb;
 9097       umulh(Rhi_ab, Ra, Rb);
 9098       mul(Rlo_ab, Ra, Rb);
 9099       ldr(Ra, pre(Pa, wordSize));
 9100       ldr(Rb, pre(Pb, -wordSize));
 9101       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
 9102                                        // previous iteration.
 9103       // MACC(Rm, Rn, t0, t1, t2);
 9104       // Rm = *++Pm;
 9105       // Rn = *--Pn;
 9106       umulh(Rhi_mn, Rm, Rn);
 9107       mul(Rlo_mn, Rm, Rn);
 9108       ldr(Rm, pre(Pm, wordSize));
 9109       ldr(Rn, pre(Pn, -wordSize));
 9110       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9111     }
 9112 
 9113     void post1() {
 9114       block_comment("post1");
 9115 
 9116       // MACC(Ra, Rb, t0, t1, t2);
 9117       // Ra = *++Pa;
 9118       // Rb = *--Pb;
 9119       umulh(Rhi_ab, Ra, Rb);
 9120       mul(Rlo_ab, Ra, Rb);
 9121       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9122       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9123 
 9124       // *Pm = Rm = t0 * inv;
 9125       mul(Rm, t0, inv);
 9126       str(Rm, Address(Pm));
 9127 
 9128       // MACC(Rm, Rn, t0, t1, t2);
 9129       // t0 = t1; t1 = t2; t2 = 0;
 9130       umulh(Rhi_mn, Rm, Rn);
 9131 
 9132 #ifndef PRODUCT
 9133       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9134       {
 9135         mul(Rlo_mn, Rm, Rn);
 9136         add(Rlo_mn, t0, Rlo_mn);
 9137         Label ok;
 9138         cbz(Rlo_mn, ok); {
 9139           stop("broken Montgomery multiply");
 9140         } bind(ok);
 9141       }
 9142 #endif
 9143       // We have very carefully set things up so that
 9144       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9145       // the lower half of Rm * Rn because we know the result already:
 9146       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9147       // t0 != 0.  So, rather than do a mul and an adds we just set
 9148       // the carry flag iff t0 is nonzero.
 9149       //
 9150       // mul(Rlo_mn, Rm, Rn);
 9151       // adds(zr, t0, Rlo_mn);
 9152       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9153       adcs(t0, t1, Rhi_mn);
 9154       adc(t1, t2, zr);
 9155       mov(t2, zr);
 9156     }
 9157 
 9158     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
 9159       block_comment("pre2");
 9160       // Pa = Pa_base + i-len;
 9161       // Pb = Pb_base + len;
 9162       // Pm = Pm_base + i-len;
 9163       // Pn = Pn_base + len;
 9164 
 9165       if (i.is_register()) {
 9166         sub(Rj, i.as_register(), len);
 9167       } else {
 9168         mov(Rj, i.as_constant());
 9169         sub(Rj, Rj, len);
 9170       }
 9171       // Rj == i-len
 9172 
 9173       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
 9174       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
 9175       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9176       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
 9177 
 9178       // Ra = *++Pa;
 9179       // Rb = *--Pb;
 9180       // Rm = *++Pm;
 9181       // Rn = *--Pn;
 9182       ldr(Ra, pre(Pa, wordSize));
 9183       ldr(Rb, pre(Pb, -wordSize));
 9184       ldr(Rm, pre(Pm, wordSize));
 9185       ldr(Rn, pre(Pn, -wordSize));
 9186 
 9187       mov(Rhi_mn, zr);
 9188       mov(Rlo_mn, zr);
 9189     }
 9190 
 9191     void post2(RegisterOrConstant i, RegisterOrConstant len) {
 9192       block_comment("post2");
 9193       if (i.is_constant()) {
 9194         mov(Rj, i.as_constant()-len.as_constant());
 9195       } else {
 9196         sub(Rj, i.as_register(), len);
 9197       }
 9198 
 9199       adds(t0, t0, Rlo_mn); // The pending m*n, low part
 9200 
 9201       // As soon as we know the least significant digit of our result,
 9202       // store it.
 9203       // Pm_base[i-len] = t0;
 9204       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
 9205 
 9206       // t0 = t1; t1 = t2; t2 = 0;
 9207       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
 9208       adc(t1, t2, zr);
 9209       mov(t2, zr);
 9210     }
 9211 
 9212     // A carry in t0 after Montgomery multiplication means that we
 9213     // should subtract multiples of n from our result in m.  We'll
 9214     // keep doing that until there is no carry.
 9215     void normalize(RegisterOrConstant len) {
 9216       block_comment("normalize");
 9217       // while (t0)
 9218       //   t0 = sub(Pm_base, Pn_base, t0, len);
 9219       Label loop, post, again;
 9220       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
 9221       cbz(t0, post); {
 9222         bind(again); {
 9223           mov(i, zr);
 9224           mov(cnt, len);
 9225           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9226           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9227           subs(zr, zr, zr); // set carry flag, i.e. no borrow
 9228           align(16);
 9229           bind(loop); {
 9230             sbcs(Rm, Rm, Rn);
 9231             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9232             add(i, i, 1);
 9233             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
 9234             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
 9235             sub(cnt, cnt, 1);
 9236           } cbnz(cnt, loop);
 9237           sbc(t0, t0, zr);
 9238         } cbnz(t0, again);
 9239       } bind(post);
 9240     }
 9241 
 9242     // Move memory at s to d, reversing words.
 9243     //    Increments d to end of copied memory
 9244     //    Destroys tmp1, tmp2
 9245     //    Preserves len
 9246     //    Leaves s pointing to the address which was in d at start
 9247     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
 9248       assert(tmp1->encoding() < r19->encoding(), "register corruption");
 9249       assert(tmp2->encoding() < r19->encoding(), "register corruption");
 9250 
 9251       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
 9252       mov(tmp1, len);
 9253       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
 9254       sub(s, d, len, ext::uxtw, LogBytesPerWord);
 9255     }
 9256     // where
 9257     void reverse1(Register d, Register s, Register tmp) {
 9258       ldr(tmp, pre(s, -wordSize));
 9259       ror(tmp, tmp, 32);
 9260       str(tmp, post(d, wordSize));
 9261     }
 9262 
 9263     void step_squaring() {
 9264       // An extra ACC
 9265       step();
 9266       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9267     }
 9268 
 9269     void last_squaring(RegisterOrConstant i) {
 9270       Label dont;
 9271       // if ((i & 1) == 0) {
 9272       tbnz(i.as_register(), 0, dont); {
 9273         // MACC(Ra, Rb, t0, t1, t2);
 9274         // Ra = *++Pa;
 9275         // Rb = *--Pb;
 9276         umulh(Rhi_ab, Ra, Rb);
 9277         mul(Rlo_ab, Ra, Rb);
 9278         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
 9279       } bind(dont);
 9280     }
 9281 
 9282     void extra_step_squaring() {
 9283       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9284 
 9285       // MACC(Rm, Rn, t0, t1, t2);
 9286       // Rm = *++Pm;
 9287       // Rn = *--Pn;
 9288       umulh(Rhi_mn, Rm, Rn);
 9289       mul(Rlo_mn, Rm, Rn);
 9290       ldr(Rm, pre(Pm, wordSize));
 9291       ldr(Rn, pre(Pn, -wordSize));
 9292     }
 9293 
 9294     void post1_squaring() {
 9295       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
 9296 
 9297       // *Pm = Rm = t0 * inv;
 9298       mul(Rm, t0, inv);
 9299       str(Rm, Address(Pm));
 9300 
 9301       // MACC(Rm, Rn, t0, t1, t2);
 9302       // t0 = t1; t1 = t2; t2 = 0;
 9303       umulh(Rhi_mn, Rm, Rn);
 9304 
 9305 #ifndef PRODUCT
 9306       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
 9307       {
 9308         mul(Rlo_mn, Rm, Rn);
 9309         add(Rlo_mn, t0, Rlo_mn);
 9310         Label ok;
 9311         cbz(Rlo_mn, ok); {
 9312           stop("broken Montgomery multiply");
 9313         } bind(ok);
 9314       }
 9315 #endif
 9316       // We have very carefully set things up so that
 9317       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
 9318       // the lower half of Rm * Rn because we know the result already:
 9319       // it must be -t0.  t0 + (-t0) must generate a carry iff
 9320       // t0 != 0.  So, rather than do a mul and an adds we just set
 9321       // the carry flag iff t0 is nonzero.
 9322       //
 9323       // mul(Rlo_mn, Rm, Rn);
 9324       // adds(zr, t0, Rlo_mn);
 9325       subs(zr, t0, 1); // Set carry iff t0 is nonzero
 9326       adcs(t0, t1, Rhi_mn);
 9327       adc(t1, t2, zr);
 9328       mov(t2, zr);
 9329     }
 9330 
 9331     void acc(Register Rhi, Register Rlo,
 9332              Register t0, Register t1, Register t2) {
 9333       adds(t0, t0, Rlo);
 9334       adcs(t1, t1, Rhi);
 9335       adc(t2, t2, zr);
 9336     }
 9337 
 9338   public:
 9339     /**
 9340      * Fast Montgomery multiplication.  The derivation of the
 9341      * algorithm is in A Cryptographic Library for the Motorola
 9342      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
 9343      *
 9344      * Arguments:
 9345      *
 9346      * Inputs for multiplication:
 9347      *   c_rarg0   - int array elements a
 9348      *   c_rarg1   - int array elements b
 9349      *   c_rarg2   - int array elements n (the modulus)
 9350      *   c_rarg3   - int length
 9351      *   c_rarg4   - int inv
 9352      *   c_rarg5   - int array elements m (the result)
 9353      *
 9354      * Inputs for squaring:
 9355      *   c_rarg0   - int array elements a
 9356      *   c_rarg1   - int array elements n (the modulus)
 9357      *   c_rarg2   - int length
 9358      *   c_rarg3   - int inv
 9359      *   c_rarg4   - int array elements m (the result)
 9360      *
 9361      */
 9362     address generate_multiply() {
 9363       Label argh, nothing;
 9364       bind(argh);
 9365       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9366 
 9367       align(CodeEntryAlignment);
 9368       address entry = pc();
 9369 
 9370       cbzw(Rlen, nothing);
 9371 
 9372       enter();
 9373 
 9374       // Make room.
 9375       cmpw(Rlen, 512);
 9376       br(Assembler::HI, argh);
 9377       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9378       andr(sp, Ra, -2 * wordSize);
 9379 
 9380       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9381 
 9382       {
 9383         // Copy input args, reversing as we go.  We use Ra as a
 9384         // temporary variable.
 9385         reverse(Ra, Pa_base, Rlen, t0, t1);
 9386         if (!_squaring)
 9387           reverse(Ra, Pb_base, Rlen, t0, t1);
 9388         reverse(Ra, Pn_base, Rlen, t0, t1);
 9389       }
 9390 
 9391       // Push all call-saved registers and also Pm_base which we'll need
 9392       // at the end.
 9393       save_regs();
 9394 
 9395 #ifndef PRODUCT
 9396       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
 9397       {
 9398         ldr(Rn, Address(Pn_base, 0));
 9399         mul(Rlo_mn, Rn, inv);
 9400         subs(zr, Rlo_mn, -1);
 9401         Label ok;
 9402         br(EQ, ok); {
 9403           stop("broken inverse in Montgomery multiply");
 9404         } bind(ok);
 9405       }
 9406 #endif
 9407 
 9408       mov(Pm_base, Ra);
 9409 
 9410       mov(t0, zr);
 9411       mov(t1, zr);
 9412       mov(t2, zr);
 9413 
 9414       block_comment("for (int i = 0; i < len; i++) {");
 9415       mov(Ri, zr); {
 9416         Label loop, end;
 9417         cmpw(Ri, Rlen);
 9418         br(Assembler::GE, end);
 9419 
 9420         bind(loop);
 9421         pre1(Ri);
 9422 
 9423         block_comment("  for (j = i; j; j--) {"); {
 9424           movw(Rj, Ri);
 9425           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9426         } block_comment("  } // j");
 9427 
 9428         post1();
 9429         addw(Ri, Ri, 1);
 9430         cmpw(Ri, Rlen);
 9431         br(Assembler::LT, loop);
 9432         bind(end);
 9433         block_comment("} // i");
 9434       }
 9435 
 9436       block_comment("for (int i = len; i < 2*len; i++) {");
 9437       mov(Ri, Rlen); {
 9438         Label loop, end;
 9439         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9440         br(Assembler::GE, end);
 9441 
 9442         bind(loop);
 9443         pre2(Ri, Rlen);
 9444 
 9445         block_comment("  for (j = len*2-i-1; j; j--) {"); {
 9446           lslw(Rj, Rlen, 1);
 9447           subw(Rj, Rj, Ri);
 9448           subw(Rj, Rj, 1);
 9449           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
 9450         } block_comment("  } // j");
 9451 
 9452         post2(Ri, Rlen);
 9453         addw(Ri, Ri, 1);
 9454         cmpw(Ri, Rlen, Assembler::LSL, 1);
 9455         br(Assembler::LT, loop);
 9456         bind(end);
 9457       }
 9458       block_comment("} // i");
 9459 
 9460       normalize(Rlen);
 9461 
 9462       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9463       restore_regs();  // Restore caller's Pm_base
 9464 
 9465       // Copy our result into caller's Pm_base
 9466       reverse(Pm_base, Ra, Rlen, t0, t1);
 9467 
 9468       leave();
 9469       bind(nothing);
 9470       ret(lr);
 9471 
 9472       return entry;
 9473     }
 9474     // In C, approximately:
 9475 
 9476     // void
 9477     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
 9478     //                     julong Pn_base[], julong Pm_base[],
 9479     //                     julong inv, int len) {
 9480     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9481     //   julong *Pa, *Pb, *Pn, *Pm;
 9482     //   julong Ra, Rb, Rn, Rm;
 9483 
 9484     //   int i;
 9485 
 9486     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9487 
 9488     //   for (i = 0; i < len; i++) {
 9489     //     int j;
 9490 
 9491     //     Pa = Pa_base;
 9492     //     Pb = Pb_base + i;
 9493     //     Pm = Pm_base;
 9494     //     Pn = Pn_base + i;
 9495 
 9496     //     Ra = *Pa;
 9497     //     Rb = *Pb;
 9498     //     Rm = *Pm;
 9499     //     Rn = *Pn;
 9500 
 9501     //     int iters = i;
 9502     //     for (j = 0; iters--; j++) {
 9503     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9504     //       MACC(Ra, Rb, t0, t1, t2);
 9505     //       Ra = *++Pa;
 9506     //       Rb = *--Pb;
 9507     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9508     //       MACC(Rm, Rn, t0, t1, t2);
 9509     //       Rm = *++Pm;
 9510     //       Rn = *--Pn;
 9511     //     }
 9512 
 9513     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
 9514     //     MACC(Ra, Rb, t0, t1, t2);
 9515     //     *Pm = Rm = t0 * inv;
 9516     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9517     //     MACC(Rm, Rn, t0, t1, t2);
 9518 
 9519     //     assert(t0 == 0, "broken Montgomery multiply");
 9520 
 9521     //     t0 = t1; t1 = t2; t2 = 0;
 9522     //   }
 9523 
 9524     //   for (i = len; i < 2*len; i++) {
 9525     //     int j;
 9526 
 9527     //     Pa = Pa_base + i-len;
 9528     //     Pb = Pb_base + len;
 9529     //     Pm = Pm_base + i-len;
 9530     //     Pn = Pn_base + len;
 9531 
 9532     //     Ra = *++Pa;
 9533     //     Rb = *--Pb;
 9534     //     Rm = *++Pm;
 9535     //     Rn = *--Pn;
 9536 
 9537     //     int iters = len*2-i-1;
 9538     //     for (j = i-len+1; iters--; j++) {
 9539     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
 9540     //       MACC(Ra, Rb, t0, t1, t2);
 9541     //       Ra = *++Pa;
 9542     //       Rb = *--Pb;
 9543     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9544     //       MACC(Rm, Rn, t0, t1, t2);
 9545     //       Rm = *++Pm;
 9546     //       Rn = *--Pn;
 9547     //     }
 9548 
 9549     //     Pm_base[i-len] = t0;
 9550     //     t0 = t1; t1 = t2; t2 = 0;
 9551     //   }
 9552 
 9553     //   while (t0)
 9554     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9555     // }
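          //
          // Note: MACC(A, B, t0, t1, t2) above accumulates the 128-bit product
          // A*B into the triple-precision accumulator t0:t1:t2, and MACC2 (used
          // by the montgomery_square pseudo-code further down) accumulates
          // 2*A*B.  A minimal C++ sketch, assuming a compiler-provided 128-bit
          // integer type; this is illustrative only, not the exact helpers used
          // by HotSpot's shared C implementation:
          //
          //   static inline void MACC(julong a, julong b,
          //                           julong &t0, julong &t1, julong &t2) {
          //     unsigned __int128 p = (unsigned __int128)a * b + t0;
          //     t0 = (julong)p;                // low 64 bits
          //     p = (p >> 64) + t1;            // propagate the carry into t1
          //     t1 = (julong)p;
          //     t2 += (julong)(p >> 64);       // and any remaining carry into t2
          //   }
          //
          //   static inline void MACC2(julong a, julong b,
          //                            julong &t0, julong &t1, julong &t2) {
          //     MACC(a, b, t0, t1, t2);        // accumulate the product twice,
          //     MACC(a, b, t0, t1, t2);        // i.e. add 2*a*b
          //   }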
 9556 
 9557     /**
 9558      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
 9559      * multiplies than Montgomery multiplication so it should be up to
 9560      * 25% faster.  However, its loop control is more complex and it
 9561      * may actually run slower on some machines.
 9562      *
 9563      * Arguments:
 9564      *
 9565      * Inputs:
 9566      *   c_rarg0   - int array elements a
 9567      *   c_rarg1   - int array elements n (the modulus)
 9568      *   c_rarg2   - int length
 9569      *   c_rarg3   - int inv
 9570      *   c_rarg4   - int array elements m (the result)
 9571      *
 9572      */
 9573     address generate_square() {
 9574       Label argh;
 9575       bind(argh);
 9576       stop("MontgomeryMultiply total_allocation must be <= 8192");
 9577 
 9578       align(CodeEntryAlignment);
 9579       address entry = pc();
 9580 
 9581       enter();
 9582 
 9583       // Make room.
 9584       cmpw(Rlen, 512);
 9585       br(Assembler::HI, argh);
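            // Reserve 16 bytes per int element below sp (at most 512 * 16 = 8192
            // bytes, matching the guard above) and re-align sp down to 16 bytes.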
 9586       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
 9587       andr(sp, Ra, -2 * wordSize);
 9588 
 9589       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
 9590 
 9591       {
 9592         // Copy input args, reversing as we go.  We use Ra as a
 9593         // temporary variable.
 9594         reverse(Ra, Pa_base, Rlen, t0, t1);
 9595         reverse(Ra, Pn_base, Rlen, t0, t1);
 9596       }
 9597 
 9598       // Push all call-saved registers and also Pm_base which we'll need
 9599       // at the end.
 9600       save_regs();
 9601 
 9602       mov(Pm_base, Ra);
 9603 
 9604       mov(t0, zr);
 9605       mov(t1, zr);
 9606       mov(t2, zr);
 9607 
 9608       block_comment("for (int i = 0; i < len; i++) {");
 9609       mov(Ri, zr); {
 9610         Label loop, end;
 9611         bind(loop);
 9612         cmp(Ri, Rlen);
 9613         br(Assembler::GE, end);
 9614 
 9615         pre1(Ri);
 9616 
 9617         block_comment("  for (j = (i+1)/2; j; j--) {"); {
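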
 9618           add(Rj, Ri, 1);
 9619           lsr(Rj, Rj, 1);
 9620           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9621         } block_comment("  } // j");
 9622 
 9623         last_squaring(Ri);
 9624 
 9625         block_comment("  for (j = i/2; j; j--) {"); {
 9626           lsr(Rj, Ri, 1);
 9627           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9628         } block_comment("  } // j");
 9629 
 9630         post1_squaring();
 9631         add(Ri, Ri, 1);
 9632         cmp(Ri, Rlen);
 9633         br(Assembler::LT, loop);
 9634 
 9635         bind(end);
 9636         block_comment("} // i");
 9637       }
 9638 
 9639       block_comment("for (int i = len; i < 2*len; i++) {");
 9640       mov(Ri, Rlen); {
 9641         Label loop, end;
 9642         bind(loop);
 9643         cmp(Ri, Rlen, Assembler::LSL, 1);
 9644         br(Assembler::GE, end);
 9645 
 9646         pre2(Ri, Rlen);
 9647 
 9648         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
 9649           lsl(Rj, Rlen, 1);
 9650           sub(Rj, Rj, Ri);
 9651           sub(Rj, Rj, 1);
 9652           lsr(Rj, Rj, 1);
 9653           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
 9654         } block_comment("  } // j");
 9655 
 9656         last_squaring(Ri);
 9657 
 9658         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
 9659           lsl(Rj, Rlen, 1);
 9660           sub(Rj, Rj, Ri);
 9661           lsr(Rj, Rj, 1);
 9662           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
 9663         } block_comment("  } // j");
 9664 
 9665         post2(Ri, Rlen);
 9666         add(Ri, Ri, 1);
 9667         cmp(Ri, Rlen, Assembler::LSL, 1);
 9668 
 9669         br(Assembler::LT, loop);
 9670         bind(end);
 9671         block_comment("} // i");
 9672       }
 9673 
 9674       normalize(Rlen);
 9675 
 9676       mov(Ra, Pm_base);  // Save Pm_base in Ra
 9677       restore_regs();  // Restore caller's Pm_base
 9678 
 9679       // Copy our result into caller's Pm_base
 9680       reverse(Pm_base, Ra, Rlen, t0, t1);
 9681 
 9682       leave();
 9683       ret(lr);
 9684 
 9685       return entry;
 9686     }
 9687     // In C, approximately:
 9688 
 9689     // void
 9690     // montgomery_square(julong Pa_base[], julong Pn_base[],
 9691     //                   julong Pm_base[], julong inv, int len) {
 9692     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
 9693     //   julong *Pa, *Pb, *Pn, *Pm;
 9694     //   julong Ra, Rb, Rn, Rm;
 9695 
 9696     //   int i;
 9697 
 9698     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
 9699 
 9700     //   for (i = 0; i < len; i++) {
 9701     //     int j;
 9702 
 9703     //     Pa = Pa_base;
 9704     //     Pb = Pa_base + i;
 9705     //     Pm = Pm_base;
 9706     //     Pn = Pn_base + i;
 9707 
 9708     //     Ra = *Pa;
 9709     //     Rb = *Pb;
 9710     //     Rm = *Pm;
 9711     //     Rn = *Pn;
 9712 
 9713     //     int iters = (i+1)/2;
 9714     //     for (j = 0; iters--; j++) {
 9715     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9716     //       MACC2(Ra, Rb, t0, t1, t2);
 9717     //       Ra = *++Pa;
 9718     //       Rb = *--Pb;
 9719     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9720     //       MACC(Rm, Rn, t0, t1, t2);
 9721     //       Rm = *++Pm;
 9722     //       Rn = *--Pn;
 9723     //     }
 9724     //     if ((i & 1) == 0) {
 9725     //       assert(Ra == Pa_base[j], "must be");
 9726     //       MACC(Ra, Ra, t0, t1, t2);
 9727     //     }
 9728     //     iters = i/2;
 9729     //     assert(iters == i-j, "must be");
 9730     //     for (; iters--; j++) {
 9731     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9732     //       MACC(Rm, Rn, t0, t1, t2);
 9733     //       Rm = *++Pm;
 9734     //       Rn = *--Pn;
 9735     //     }
 9736 
 9737     //     *Pm = Rm = t0 * inv;
 9738     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
 9739     //     MACC(Rm, Rn, t0, t1, t2);
 9740 
 9741     //     assert(t0 == 0, "broken Montgomery multiply");
 9742 
 9743     //     t0 = t1; t1 = t2; t2 = 0;
 9744     //   }
 9745 
 9746     //   for (i = len; i < 2*len; i++) {
 9747     //     int start = i-len+1;
 9748     //     int end = start + (len - start)/2;
 9749     //     int j;
 9750 
 9751     //     Pa = Pa_base + i-len;
 9752     //     Pb = Pa_base + len;
 9753     //     Pm = Pm_base + i-len;
 9754     //     Pn = Pn_base + len;
 9755 
 9756     //     Ra = *++Pa;
 9757     //     Rb = *--Pb;
 9758     //     Rm = *++Pm;
 9759     //     Rn = *--Pn;
 9760 
 9761     //     int iters = (2*len-i-1)/2;
 9762     //     assert(iters == end-start, "must be");
 9763     //     for (j = start; iters--; j++) {
 9764     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
 9765     //       MACC2(Ra, Rb, t0, t1, t2);
 9766     //       Ra = *++Pa;
 9767     //       Rb = *--Pb;
 9768     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9769     //       MACC(Rm, Rn, t0, t1, t2);
 9770     //       Rm = *++Pm;
 9771     //       Rn = *--Pn;
 9772     //     }
 9773     //     if ((i & 1) == 0) {
 9774     //       assert(Ra == Pa_base[j], "must be");
 9775     //       MACC(Ra, Ra, t0, t1, t2);
 9776     //     }
 9777     //     iters =  (2*len-i)/2;
 9778     //     assert(iters == len-j, "must be");
 9779     //     for (; iters--; j++) {
 9780     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
 9781     //       MACC(Rm, Rn, t0, t1, t2);
 9782     //       Rm = *++Pm;
 9783     //       Rn = *--Pn;
 9784     //     }
 9785     //     Pm_base[i-len] = t0;
 9786     //     t0 = t1; t1 = t2; t2 = 0;
 9787     //   }
 9788 
 9789     //   while (t0)
 9790     //     t0 = sub(Pm_base, Pn_base, t0, len);
 9791     // }
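          //
          // Why squaring needs ~25% fewer multiplies: when Pa == Pb, the
          // off-diagonal products a[j]*a[i-j] and a[i-j]*a[j] are equal, so
          // MACC2 computes each such product once and adds it twice.  That
          // roughly halves the a*a multiplies while the m*n multiplies are
          // unchanged, so the total drops from about 2*len^2 to about
          // 1.5*len^2 64x64-bit multiplies - asymptotically 25% fewer, as
          // noted above.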
 9792   };
 9793 
 9794   void generate_vector_math_stubs() {
 9795     // Get native vector math stub routine addresses
 9796     void* libsleef = nullptr;
 9797     char ebuf[1024];
 9798     char dll_name[JVM_MAXPATHLEN];
 9799     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
 9800       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
 9801     }
 9802     if (libsleef == nullptr) {
 9803       log_info(library)("Failed to load native vector math library, %s!", ebuf);
 9804       return;
 9805     }
 9806     // Method naming convention
 9807     //   All the methods are named as <OP><T><N>_<U><suffix>
 9808     //   Where:
 9809     //     <OP>     is the operation name, e.g. sin
 9810     //     <T>      is optional to indicate float/double
 9811     //              "f/d" for vector float/double operation
 9812     //     <N>      is the number of elements in the vector
 9813     //              "2/4" for neon, and "x" for sve
 9814     //     <U>      is the precision level
 9815     //              "u10/u05" represents 1.0/0.5 ULP error bounds
 9816     //               We use "u10" for all operations by default,
 9817     //               but for functions that do not have u10 support, we use "u05" instead
 9818     //     <suffix> indicates neon/sve
 9819     //              "sve/advsimd" for sve/neon implementations
 9820     //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
 9821     //          cosd2_u10advsimd is the method for computing a 2-element vector double cos using NEON instructions
 9822     //
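          // For example, for the "exp" operation the lookups below build
          // "expfx_u10sve" / "expdx_u10sve" for SVE, and "expf4_u10advsimd" /
          // "expd2_u10advsimd" for NEON (names shown here only to illustrate the
          // pattern built by the snprintf calls below).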
 9823     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
 9824 
 9825     // Math vector stubs implemented with SVE for scalable vector size.
 9826     if (UseSVE > 0) {
 9827       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9828         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
 9829         // Skip "tanh" because there is a performance regression
 9830         if (vop == VectorSupport::VECTOR_OP_TANH) {
 9831           continue;
 9832         }
 9833 
 9834         // The native library does not support u10 level of "hypot".
 9835         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9836 
 9837         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
 9838         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9839 
 9840         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
 9841         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
 9842       }
 9843     }
 9844 
 9845     // Math vector stubs implemented with NEON for 64/128 bits vector size.
 9846     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
 9847       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
 9848       // Skip "tanh" because there is a performance regression
 9849       if (vop == VectorSupport::VECTOR_OP_TANH) {
 9850         continue;
 9851       }
 9852 
 9853       // The native library does not support u10 level of "hypot".
 9854       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
 9855 
 9856       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
 9857       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
 9858 
 9859       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
 9860       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9861 
 9862       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
 9863       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
 9864     }
 9865   }
 9866 
 9867   // Called from the interpreter or compiled code either to load the
 9868   // multiple returned values of the inline type instance being
 9869   // returned into registers, or to store the returned values into a
 9870   // newly allocated inline type instance.
 9871   address generate_return_value_stub(address destination, const char* name, bool has_res) {
 9872     // We need to save all registers the calling convention may use so
 9873     // that the runtime call can read or update those registers. This
 9874     // needs to be in sync with SharedRuntime::java_return_convention().
 9875     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
 9876     enum layout {
 9877       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
 9878       j_rarg6_off, j_rarg6_2,
 9879       j_rarg5_off, j_rarg5_2,
 9880       j_rarg4_off, j_rarg4_2,
 9881       j_rarg3_off, j_rarg3_2,
 9882       j_rarg2_off, j_rarg2_2,
 9883       j_rarg1_off, j_rarg1_2,
 9884       j_rarg0_off, j_rarg0_2,
 9885 
 9886       j_farg7_off, j_farg7_2,
 9887       j_farg6_off, j_farg6_2,
 9888       j_farg5_off, j_farg5_2,
 9889       j_farg4_off, j_farg4_2,
 9890       j_farg3_off, j_farg3_2,
 9891       j_farg2_off, j_farg2_2,
 9892       j_farg1_off, j_farg1_2,
 9893       j_farg0_off, j_farg0_2,
 9894 
 9895       rfp_off, rfp_off2,
 9896       return_off, return_off2,
 9897 
 9898       framesize // inclusive of return address
 9899     };
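          // With this layout the frame occupies 36 32-bit slots: 16 for the
          // eight integer argument registers, 16 for the eight FP argument
          // registers, plus 2 each for rfp and the return address - 144 bytes
          // (18 words), which is already 16-byte aligned.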
 9900 
 9901     CodeBuffer code(name, 512, 64);
 9902     MacroAssembler* masm = new MacroAssembler(&code);
 9903 
 9904     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
 9905     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
 9906     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
 9907     int frame_size_in_words = frame_size_in_bytes / wordSize;
 9908 
 9909     OopMapSet* oop_maps = new OopMapSet();
 9910     OopMap* map = new OopMap(frame_size_in_slots, 0);
 9911 
 9912     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
 9913     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
 9914     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
 9915     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
 9916     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
 9917     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
 9918     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
 9919     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
 9920 
 9921     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
 9922     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
 9923     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
 9924     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
 9925     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
 9926     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
 9927     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
 9928     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
 9929 
 9930     address start = __ pc();
 9931 
 9932     __ enter(); // Save FP and LR before call
 9933 
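          // Spill the Java argument registers in pairs, FP args first and then
          // integer args.  Because sp is pre-decremented, the last pair pushed
          // (j_rarg7/j_rarg6) ends up at the lowest slots, which is what the
          // layout enum and the oop map offsets above describe.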
 9934     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
 9935     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
 9936     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
 9937     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
 9938 
 9939     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
 9940     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
 9941     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
 9942     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
 9943 
 9944     int frame_complete = __ offset();
 9945 
 9946     // Set up last_Java_sp and last_Java_fp
 9947     address the_pc = __ pc();
 9948     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
 9949 
 9950     // Call runtime
 9951     __ mov(c_rarg1, r0);
 9952     __ mov(c_rarg0, rthread);
 9953 
 9954     __ mov(rscratch1, destination);
 9955     __ blr(rscratch1);
 9956 
 9957     oop_maps->add_gc_map(the_pc - start, map);
 9958 
 9959     __ reset_last_Java_frame(false);
 9960 
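          // Reload the argument registers in the reverse order of the spills
          // above, popping each pair with a post-incremented sp.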
 9961     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
 9962     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
 9963     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
 9964     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
 9965 
 9966     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
 9967     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
 9968     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
 9969     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
 9970 
 9971     __ leave();
 9972 
 9973     // check for pending exceptions
 9974     Label pending;
 9975     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 9976     __ cbnz(rscratch1, pending);
 9977 
 9978     if (has_res) {
 9979       __ get_vm_result(r0, rthread);
 9980     }
 9981 
 9982     __ ret(lr);
 9983 
 9984     __ bind(pending);
 9985     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
 9986 
 9987     // -------------
 9988     // make sure all code is generated
 9989     masm->flush();
 9990 
 9991     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
 9992     return stub->entry_point();
 9993   }
 9994 
 9995   // Initialization
 9996   void generate_initial_stubs() {
 9997     // Generates the initial stubs and initializes the entry points.
 9998 
 9999     // Entry points that exist on all platforms. Note: this is code
10000     // that could be shared among different platforms - however the
10001     // benefit seems to be smaller than the disadvantage of having a
10002     // much more complicated generator structure. See also the comment
10003     // in stubRoutines.hpp.
10004 
10005     StubRoutines::_forward_exception_entry = generate_forward_exception();
10006 
10007     StubRoutines::_call_stub_entry =
10008       generate_call_stub(StubRoutines::_call_stub_return_address);
10009 
10010     // Referenced by megamorphic calls.
10011     StubRoutines::_catch_exception_entry = generate_catch_exception();
10012 
10013     // Initialize table for copy memory (arraycopy) check.
10014     if (UnsafeMemoryAccess::_table == nullptr) {
10015       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
10016     }
10017 
10018     if (UseCRC32Intrinsics) {
10019       // Set the table address before generating the stub that uses it.
10020       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
10021       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
10022     }
10023 
10024     if (UseCRC32CIntrinsics) {
10025       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
10026     }
10027 
10028     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
10029       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
10030     }
10031 
10032     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
10033       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
10034     }
10035 
10036     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
10037         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
10038       StubRoutines::_hf2f = generate_float16ToFloat();
10039       StubRoutines::_f2hf = generate_floatToFloat16();
10040     }
10041 
10042     if (InlineTypeReturnedAsFields) {
10043       StubRoutines::_load_inline_type_fields_in_regs =
10044          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
10045       StubRoutines::_store_inline_type_fields_to_buf =
10046          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
10047     }
10048 
10049   }
10050 
10051   void generate_continuation_stubs() {
10052     // Continuation stubs:
10053     StubRoutines::_cont_thaw          = generate_cont_thaw();
10054     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
10055     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
10056     StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
10057   }
10058 
10059   void generate_final_stubs() {
10060     // support for verify_oop (must happen after universe_init)
10061     if (VerifyOops) {
10062       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
10063     }
10064 
10065     // arraycopy stubs used by compilers
10066     generate_arraycopy_stubs();
10067 
10068     StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
10069 
10070     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
10071 
10072     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
10073     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
10074 
10075 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10076 
10077     generate_atomic_entry_points();
10078 
10079 #endif // LINUX
10080 
10081 #ifdef COMPILER2
10082     if (UseSecondarySupersTable) {
10083       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
10084       if (!InlineSecondarySupersTest) {
10085         generate_lookup_secondary_supers_table_stub();
10086       }
10087     }
10088 #endif
10089 
10090     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
10091   }
10092 
10093   void generate_compiler_stubs() {
10094 #if COMPILER2_OR_JVMCI
10095 
10096     if (UseSVE == 0) {
10097       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id);
10098     }
10099 
10100     // array equals stub for large arrays.
10101     if (!UseSimpleArrayEquals) {
10102       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
10103     }
10104 
10105     // arrays_hashcode stubs for large arrays.
10106     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
10107     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
10108     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
10109     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
10110     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
10111 
10112     // byte_array_inflate stub for large arrays.
10113     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
10114 
10115     // countPositives stub for large arrays.
10116     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
10117 
10118     generate_compare_long_strings();
10119 
10120     generate_string_indexof_stubs();
10121 
10122 #ifdef COMPILER2
10123     if (UseMultiplyToLenIntrinsic) {
10124       StubRoutines::_multiplyToLen = generate_multiplyToLen();
10125     }
10126 
10127     if (UseSquareToLenIntrinsic) {
10128       StubRoutines::_squareToLen = generate_squareToLen();
10129     }
10130 
10131     if (UseMulAddIntrinsic) {
10132       StubRoutines::_mulAdd = generate_mulAdd();
10133     }
10134 
10135     if (UseSIMDForBigIntegerShiftIntrinsics) {
10136       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
10137       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
10138     }
10139 
10140     if (UseMontgomeryMultiplyIntrinsic) {
10141       StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id;
10142       StubCodeMark mark(this, stub_id);
10143       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
10144       StubRoutines::_montgomeryMultiply = g.generate_multiply();
10145     }
10146 
10147     if (UseMontgomerySquareIntrinsic) {
10148       StubGenStubId stub_id = StubGenStubId::montgomerySquare_id;
10149       StubCodeMark mark(this, stub_id);
10150       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
10151       // We use generate_multiply() rather than generate_square()
10152       // because it's faster for the sizes of modulus we care about.
10153       StubRoutines::_montgomerySquare = g.generate_multiply();
10154     }
10155 
10156     generate_vector_math_stubs();
10157 
10158 #endif // COMPILER2
10159 
10160     if (UseChaCha20Intrinsics) {
10161       StubRoutines::_chacha20Block = generate_chacha20Block_qrpar();
10162     }
10163 
10164     if (UseDilithiumIntrinsics) {
10165       StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
10166       StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
10167       StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
10168       StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
10169       StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
10170     }
10171 
10172     if (UseBASE64Intrinsics) {
10173         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
10174         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
10175     }
10176 
10177     // data cache line writeback
10178     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
10179     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
10180 
10181     if (UseAESIntrinsics) {
10182       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
10183       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
10184       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
10185       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
10186       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
10187     }
10188     if (UseGHASHIntrinsics) {
10189       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
10190       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
10191     }
10192     if (UseAESIntrinsics && UseGHASHIntrinsics) {
10193       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
10194     }
10195 
10196     if (UseMD5Intrinsics) {
10197       StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubGenStubId::md5_implCompress_id);
10198       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id);
10199     }
10200     if (UseSHA1Intrinsics) {
10201       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id);
10202       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id);
10203     }
10204     if (UseSHA256Intrinsics) {
10205       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id);
10206       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id);
10207     }
10208     if (UseSHA512Intrinsics) {
10209       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id);
10210       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id);
10211     }
10212     if (UseSHA3Intrinsics) {
10213       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id);
10214       StubRoutines::_double_keccak         = generate_double_keccak();
10215       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id);
10216     }
10217 
10218     if (UsePoly1305Intrinsics) {
10219       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
10220     }
10221 
10222     // generate Adler32 intrinsics code
10223     if (UseAdler32Intrinsics) {
10224       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
10225     }
10226 
10227 #endif // COMPILER2_OR_JVMCI
10228   }
10229 
10230  public:
10231   StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) {
10232     switch(blob_id) {
10233     case initial_id:
10234       generate_initial_stubs();
10235       break;
10236      case continuation_id:
10237       generate_continuation_stubs();
10238       break;
10239     case compiler_id:
10240       generate_compiler_stubs();
10241       break;
10242     case final_id:
10243       generate_final_stubs();
10244       break;
10245     default:
10246       fatal("unexpected blob id: %d", blob_id);
10247       break;
10248     };
10249   }
10250 }; // end class declaration
10251 
10252 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) {
10253   StubGenerator g(code, blob_id);
10254 }
10255 
10256 
10257 #if defined (LINUX)
10258 
10259 // Define pointers to atomic stubs and initialize them to point to the
10260 // code in atomic_aarch64.S.
10261 
10262 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
10263   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
10264     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
10265   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
10266     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
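      // For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to
      //
      //   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
      //     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
      //   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
      //     = aarch64_atomic_fetch_add_4_default_impl;
      //
      // i.e. each invocation below declares the default implementation from
      // atomic_aarch64.S and defines a function pointer initialized to it;
      // generate_atomic_entry_points() may later repoint these pointers at
      // generated stubs.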
10267 
10268 DEFAULT_ATOMIC_OP(fetch_add, 4, )
10269 DEFAULT_ATOMIC_OP(fetch_add, 8, )
10270 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
10271 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
10272 DEFAULT_ATOMIC_OP(xchg, 4, )
10273 DEFAULT_ATOMIC_OP(xchg, 8, )
10274 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
10275 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
10276 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
10277 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
10278 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
10279 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
10280 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
10281 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
10282 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
10283 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
10284 
10285 #undef DEFAULT_ATOMIC_OP
10286 
10287 #endif // LINUX