1 /*
   2  * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "asm/register.hpp"
  30 #include "atomic_aarch64.hpp"
  31 #include "compiler/oopMap.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/barrierSetAssembler.hpp"
  34 #include "gc/shared/gc_globals.hpp"
  35 #include "gc/shared/tlab_globals.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "memory/universe.hpp"
  38 #include "nativeInst_aarch64.hpp"
  39 #include "oops/instanceOop.hpp"
  40 #include "oops/method.hpp"
  41 #include "oops/objArrayKlass.hpp"
  42 #include "oops/oop.inline.hpp"
  43 #include "prims/methodHandles.hpp"
  44 #include "prims/upcallLinker.hpp"
  45 #include "runtime/arguments.hpp"
  46 #include "runtime/atomic.hpp"
  47 #include "runtime/continuation.hpp"
  48 #include "runtime/continuationEntry.inline.hpp"
  49 #include "runtime/frame.inline.hpp"
  50 #include "runtime/handles.inline.hpp"
  51 #include "runtime/javaThread.hpp"
  52 #include "runtime/sharedRuntime.hpp"
  53 #include "runtime/stubCodeGenerator.hpp"
  54 #include "runtime/stubRoutines.hpp"
  55 #include "utilities/align.hpp"
  56 #include "utilities/checkedCast.hpp"
  57 #include "utilities/debug.hpp"
  58 #include "utilities/globalDefinitions.hpp"
  59 #include "utilities/intpow.hpp"
  60 #include "utilities/powerOfTwo.hpp"
  61 #ifdef COMPILER2
  62 #include "opto/runtime.hpp"
  63 #endif
  64 #if INCLUDE_ZGC
  65 #include "gc/z/zThreadLocalData.hpp"
  66 #endif
  67 
  68 // Declaration and definition of StubGenerator (no .hpp file).
  69 // For a more detailed description of the stub routine structure
  70 // see the comment in stubRoutines.hpp
  71 
  72 #undef __
  73 #define __ _masm->
  74 
  75 #ifdef PRODUCT
  76 #define BLOCK_COMMENT(str) /* nothing */
  77 #else
  78 #define BLOCK_COMMENT(str) __ block_comment(str)
  79 #endif
  80 
  81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  82 
  83 // Stub Code definitions
  84 
  85 class StubGenerator: public StubCodeGenerator {
  86  private:
  87 
  88 #ifdef PRODUCT
  89 #define inc_counter_np(counter) ((void)0)
  90 #else
  91   void inc_counter_np_(uint& counter) {
  92     __ incrementw(ExternalAddress((address)&counter));
  93   }
  94 #define inc_counter_np(counter) \
  95   BLOCK_COMMENT("inc_counter " #counter); \
  96   inc_counter_np_(counter);
  97 #endif
  98 
  99   // Call stubs are used to call Java from C
 100   //
 101   // Arguments:
 102   //    c_rarg0:   call wrapper address                   address
 103   //    c_rarg1:   result                                 address
 104   //    c_rarg2:   result type                            BasicType
 105   //    c_rarg3:   method                                 Method*
 106   //    c_rarg4:   (interpreter) entry point              address
 107   //    c_rarg5:   parameters                             intptr_t*
 108   //    c_rarg6:   parameter size (in words)              int
 109   //    c_rarg7:   thread                                 Thread*
 110   //
 111   // There is no return from the stub itself as any Java result
 112   // is written to result
 113   //
 114   // we save r30 (lr) as the return PC at the base of the frame and
 115   // link r29 (fp) below it as the frame pointer installing sp (r31)
 116   // into fp.
 117   //
 118   // we save r0-r7, which accounts for all the c arguments.
 119   //
 120   // TODO: strictly do we need to save them all? they are treated as
 121   // volatile by C so could we omit saving the ones we are going to
 122   // place in global registers (thread? method?) or those we only use
 123   // during setup of the Java call?
 124   //
 125   // we don't need to save r8 which C uses as an indirect result location
 126   // return register.
 127   //
 128   // we don't need to save r9-r15 which both C and Java treat as
 129   // volatile
 130   //
 131   // we don't need to save r16-18 because Java does not use them
 132   //
 133   // we save r19-r28 which Java uses as scratch registers and C
 134   // expects to be callee-save
 135   //
 136   // we save the bottom 64 bits of each value stored in v8-v15; it is
 137   // the responsibility of the caller to preserve larger values.
 138   //
 139   // so the stub frame looks like this when we enter Java code
 140   //
 141   //     [ return_from_Java     ] <--- sp
 142   //     [ argument word n      ]
 143   //      ...
 144   // -29 [ argument word 1      ]
 145   // -28 [ saved Floating-point Control Register ] <--- sp_after_call
 146   // -26 [ saved v15            ]
 147   // -25 [ saved v14            ]
 148   // -24 [ saved v13            ]
 149   // -23 [ saved v12            ]
 150   // -22 [ saved v11            ]
 151   // -21 [ saved v10            ]
 152   // -20 [ saved v9             ]
 153   // -19 [ saved v8             ]
 154   // -18 [ saved r28            ]
 155   // -17 [ saved r27            ]
 156   // -16 [ saved r26            ]
 157   // -15 [ saved r25            ]
 158   // -14 [ saved r24            ]
 159   // -13 [ saved r23            ]
 160   // -12 [ saved r22            ]
 161   // -11 [ saved r21            ]
 162   // -10 [ saved r20            ]
 163   //  -9 [ saved r19            ]
 164   //  -8 [ call wrapper    (r0) ]
 165   //  -7 [ result          (r1) ]
 166   //  -6 [ result type     (r2) ]
 167   //  -5 [ method          (r3) ]
 168   //  -4 [ entry point     (r4) ]
 169   //  -3 [ parameters      (r5) ]
 170   //  -2 [ parameter size  (r6) ]
 171   //  -1 [ thread (r7)          ]
 172   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 173   //   1 [ saved lr       (r30) ]
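       //
       // For illustration only (a sketch of the C++-side contract; see the
       // CallStub typedef in stubRoutines.hpp and JavaCalls::call_helper):
       //
       //   typedef void (*CallStub)(address   link,               // c_rarg0: call wrapper
       //                            intptr_t* result,             // c_rarg1
       //                            int       result_type,        // c_rarg2: BasicType
       //                            Method*   method,             // c_rarg3
       //                            address   entry_point,        // c_rarg4
       //                            intptr_t* parameters,         // c_rarg5
       //                            int       size_of_parameters, // c_rarg6: words
       //                            TRAPS);                       // c_rarg7: thread
       //
       // JavaCalls::call_helper reaches this stub roughly as
       //   StubRoutines::call_stub()(link, result, type, method(), entry_point,
       //                             parameter_address, args->size_of_parameters(), CHECK);
       // so each c_rargN above carries one argument of that call.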
 174 
 175   // Call stub stack layout word offsets from fp
 176   enum call_stub_layout {
 177     sp_after_call_off  = -28,
 178 
 179     fpcr_off           = sp_after_call_off,
 180     d15_off            = -26,
 181     d13_off            = -24,
 182     d11_off            = -22,
 183     d9_off             = -20,
 184 
 185     r28_off            = -18,
 186     r26_off            = -16,
 187     r24_off            = -14,
 188     r22_off            = -12,
 189     r20_off            = -10,
 190     call_wrapper_off   =  -8,
 191     result_off         =  -7,
 192     result_type_off    =  -6,
 193     method_off         =  -5,
 194     entry_point_off    =  -4,
 195     parameter_size_off =  -2,
 196     thread_off         =  -1,
 197     fp_f               =   0,
 198     retaddr_off        =   1,
 199   };
 200 
 201   address generate_call_stub(address& return_address) {
 202     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 203            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 204            "adjust this code");
 205 
 206     StubCodeMark mark(this, "StubRoutines", "call_stub");
 207     address start = __ pc();
 208 
 209     const Address sp_after_call (rfp, sp_after_call_off * wordSize);
 210 
 211     const Address fpcr_save     (rfp, fpcr_off           * wordSize);
 212     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 213     const Address result        (rfp, result_off         * wordSize);
 214     const Address result_type   (rfp, result_type_off    * wordSize);
 215     const Address method        (rfp, method_off         * wordSize);
 216     const Address entry_point   (rfp, entry_point_off    * wordSize);
 217     const Address parameter_size(rfp, parameter_size_off * wordSize);
 218 
 219     const Address thread        (rfp, thread_off         * wordSize);
 220 
 221     const Address d15_save      (rfp, d15_off * wordSize);
 222     const Address d13_save      (rfp, d13_off * wordSize);
 223     const Address d11_save      (rfp, d11_off * wordSize);
 224     const Address d9_save       (rfp, d9_off * wordSize);
 225 
 226     const Address r28_save      (rfp, r28_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r24_save      (rfp, r24_off * wordSize);
 229     const Address r22_save      (rfp, r22_off * wordSize);
 230     const Address r20_save      (rfp, r20_off * wordSize);
 231 
 232     // stub code
 233 
 234     address aarch64_entry = __ pc();
 235 
 236     // set up frame and move sp to end of save area
 237     __ enter();
 238     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 239 
 240     // save register parameters and Java scratch/global registers
 241     // n.b. we save thread even though it gets installed in
 242     // rthread because we want to sanity check rthread later
 243     __ str(c_rarg7,  thread);
 244     __ strw(c_rarg6, parameter_size);
 245     __ stp(c_rarg4, c_rarg5,  entry_point);
 246     __ stp(c_rarg2, c_rarg3,  result_type);
 247     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 248 
 249     __ stp(r20, r19,   r20_save);
 250     __ stp(r22, r21,   r22_save);
 251     __ stp(r24, r23,   r24_save);
 252     __ stp(r26, r25,   r26_save);
 253     __ stp(r28, r27,   r28_save);
 254 
 255     __ stpd(v9,  v8,   d9_save);
 256     __ stpd(v11, v10,  d11_save);
 257     __ stpd(v13, v12,  d13_save);
 258     __ stpd(v15, v14,  d15_save);
 259 
 260     __ get_fpcr(rscratch1);
 261     __ str(rscratch1, fpcr_save);
 262     // Set FPCR to the state we need. We do want Round to Nearest. We
 263     // don't want non-IEEE rounding modes or floating-point traps.
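         // For reference (per the Arm architecture FPCR layout): DN is bit 25,
         // FZ is bit 24, RMode is bits 23:22 (0b00 == Round to Nearest), and the
         // trap-enable bits IOE/DZE/OFE/UFE/IXE are bits 8..12.  The two bfi
         // instructions below therefore clear bit ranges [25:22] and [12:8].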
 264     __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
 265     __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
 266     __ set_fpcr(rscratch1);
 267 
 268     // install the Java thread in its global register now that we have saved
 269     // whatever value it held
 270     __ mov(rthread, c_rarg7);
 271     // And method
 272     __ mov(rmethod, c_rarg3);
 273 
 274     // set up the heapbase register
 275     __ reinit_heapbase();
 276 
 277 #ifdef ASSERT
 278     // make sure we have no pending exceptions
 279     {
 280       Label L;
 281       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 282       __ cmp(rscratch1, (u1)NULL_WORD);
 283       __ br(Assembler::EQ, L);
 284       __ stop("StubRoutines::call_stub: entered with pending exception");
 285       __ BIND(L);
 286     }
 287 #endif
 288     // pass parameters if any
 289     __ mov(esp, sp);
 290     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 291     __ andr(sp, rscratch1, -2 * wordSize);
 292 
 293     BLOCK_COMMENT("pass parameters if any");
 294     Label parameters_done;
 295     // parameter count is still in c_rarg6
 296     // and parameter pointer identifying param 1 is in c_rarg5
 297     __ cbzw(c_rarg6, parameters_done);
 298 
 299     address loop = __ pc();
 300     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 301     __ subsw(c_rarg6, c_rarg6, 1);
 302     __ push(rscratch1);
 303     __ br(Assembler::GT, loop);
 304 
 305     __ BIND(parameters_done);
 306 
 307     // call Java entry -- passing Method* and current sp
 308     //      rmethod: Method*
 309     //      r19_sender_sp: sender sp
 310     BLOCK_COMMENT("call Java function");
 311     __ mov(r19_sender_sp, sp);
 312     __ blr(c_rarg4);
 313 
 314     // we do this here because the notify will already have been done
 315     // if we get to the next instruction via an exception
 316     //
 317     // n.b. adding this instruction here affects the calculation of
 318     // whether or not a routine returns to the call stub (used when
 319     // doing stack walks) since the normal test is to check the return
 320     // pc against the address saved below. so we may need to allow for
 321     // this extra instruction in the check.
 322 
 323     // save current address for use by exception handling code
 324 
 325     return_address = __ pc();
 326 
 327     // store result depending on type (everything that is not
 328     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 329     // n.b. this assumes Java returns an integral result in r0
 330     // and a floating result in j_farg0
 331     // All of j_rargN may be used to return inline type fields so be careful
 332     // not to clobber those.
 333     // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
 334     // assignment of Rresult below.
 335     Register Rresult = r14, Rresult_type = r15;
 336     __ ldr(Rresult, result);
 337     Label is_long, is_float, is_double, check_prim, exit;
 338     __ ldr(Rresult_type, result_type);
 339     __ cmp(Rresult_type, (u1)T_OBJECT);
 340     __ br(Assembler::EQ, check_prim);
 341     __ cmp(Rresult_type, (u1)T_LONG);
 342     __ br(Assembler::EQ, is_long);
 343     __ cmp(Rresult_type, (u1)T_FLOAT);
 344     __ br(Assembler::EQ, is_float);
 345     __ cmp(Rresult_type, (u1)T_DOUBLE);
 346     __ br(Assembler::EQ, is_double);
 347 
 348     // handle T_INT case
 349     __ strw(r0, Address(Rresult));
 350 
 351     __ BIND(exit);
 352 
 353     // pop parameters
 354     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 355 
 356 #ifdef ASSERT
 357     // verify that threads correspond
 358     {
 359       Label L, S;
 360       __ ldr(rscratch1, thread);
 361       __ cmp(rthread, rscratch1);
 362       __ br(Assembler::NE, S);
 363       __ get_thread(rscratch1);
 364       __ cmp(rthread, rscratch1);
 365       __ br(Assembler::EQ, L);
 366       __ BIND(S);
 367       __ stop("StubRoutines::call_stub: threads must correspond");
 368       __ BIND(L);
 369     }
 370 #endif
 371 
 372     __ pop_cont_fastpath(rthread);
 373 
 374     // restore callee-save registers
 375     __ ldpd(v15, v14,  d15_save);
 376     __ ldpd(v13, v12,  d13_save);
 377     __ ldpd(v11, v10,  d11_save);
 378     __ ldpd(v9,  v8,   d9_save);
 379 
 380     __ ldp(r28, r27,   r28_save);
 381     __ ldp(r26, r25,   r26_save);
 382     __ ldp(r24, r23,   r24_save);
 383     __ ldp(r22, r21,   r22_save);
 384     __ ldp(r20, r19,   r20_save);
 385 
 386     // restore fpcr
 387     __ ldr(rscratch1,  fpcr_save);
 388     __ set_fpcr(rscratch1);
 389 
 390     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 391     __ ldrw(c_rarg2, result_type);
 392     __ ldr(c_rarg3,  method);
 393     __ ldp(c_rarg4, c_rarg5,  entry_point);
 394     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 395 
 396     // leave frame and return to caller
 397     __ leave();
 398     __ ret(lr);
 399 
 400     // handle return types different from T_INT
 401     __ BIND(check_prim);
 402     if (InlineTypeReturnedAsFields) {
 403       // Check for scalarized return value
 404       __ tbz(r0, 0, is_long);
 405       // Load pack handler address
 406       __ andr(rscratch1, r0, -2);
 407       __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
 408       __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
 409       __ blr(rscratch1);
 410       __ b(exit);
 411     }
 412 
 413     __ BIND(is_long);
 414     __ str(r0, Address(Rresult, 0));
 415     __ br(Assembler::AL, exit);
 416 
 417     __ BIND(is_float);
 418     __ strs(j_farg0, Address(Rresult, 0));
 419     __ br(Assembler::AL, exit);
 420 
 421     __ BIND(is_double);
 422     __ strd(j_farg0, Address(Rresult, 0));
 423     __ br(Assembler::AL, exit);
 424 
 425     return start;
 426   }
 427 
 428   // Return point for a Java call if there's an exception thrown in
 429   // Java code.  The exception is caught and transformed into a
 430   // pending exception stored in JavaThread that can be tested from
 431   // within the VM.
 432   //
 433   // Note: Usually the parameters are removed by the callee. In case
 434   // of an exception crossing an activation frame boundary, that is
 435   // not the case if the callee is compiled code => need to set up the
 436   // sp.
 437   //
 438   // r0: exception oop
 439 
 440   address generate_catch_exception() {
 441     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 442     address start = __ pc();
 443 
 444     // same as in generate_call_stub():
 445     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 446     const Address thread        (rfp, thread_off         * wordSize);
 447 
 448 #ifdef ASSERT
 449     // verify that threads correspond
 450     {
 451       Label L, S;
 452       __ ldr(rscratch1, thread);
 453       __ cmp(rthread, rscratch1);
 454       __ br(Assembler::NE, S);
 455       __ get_thread(rscratch1);
 456       __ cmp(rthread, rscratch1);
 457       __ br(Assembler::EQ, L);
 458       __ bind(S);
 459       __ stop("StubRoutines::catch_exception: threads must correspond");
 460       __ bind(L);
 461     }
 462 #endif
 463 
 464     // set pending exception
 465     __ verify_oop(r0);
 466 
 467     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 468     __ mov(rscratch1, (address)__FILE__);
 469     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 470     __ movw(rscratch1, (int)__LINE__);
 471     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 472 
 473     // complete return to VM
 474     assert(StubRoutines::_call_stub_return_address != nullptr,
 475            "_call_stub_return_address must have been generated before");
 476     __ b(StubRoutines::_call_stub_return_address);
 477 
 478     return start;
 479   }
 480 
 481   // Continuation point for runtime calls returning with a pending
 482   // exception.  The pending exception check happened in the runtime
 483   // or native call stub.  The pending exception in Thread is
 484   // converted into a Java-level exception.
 485   //
 486   // Contract with Java-level exception handlers:
 487   // r0: exception
 488   // r3: throwing pc
 489   //
 490   // NOTE: At entry of this stub, exception-pc must be in LR !!
 491 
 492   // NOTE: this is always used as a jump target within generated code
 493   // so it just needs to be generated code with no prolog
 494 
 495   address generate_forward_exception() {
 496     StubCodeMark mark(this, "StubRoutines", "forward exception");
 497     address start = __ pc();
 498 
 499     // Upon entry, LR points to the return address returning into
 500     // Java (interpreted or compiled) code; i.e., the return address
 501     // becomes the throwing pc.
 502     //
 503     // Arguments pushed before the runtime call are still on the stack
 504     // but the exception handler will reset the stack pointer ->
 505     // ignore them.  A potential result in registers can be ignored as
 506     // well.
 507 
 508 #ifdef ASSERT
 509     // make sure this code is only executed if there is a pending exception
 510     {
 511       Label L;
 512       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 513       __ cbnz(rscratch1, L);
 514       __ stop("StubRoutines::forward exception: no pending exception (1)");
 515       __ bind(L);
 516     }
 517 #endif
 518 
 519     // compute exception handler into r19
 520 
 521     // call the VM to find the handler address associated with the
 522     // caller address. pass thread in r0 and caller pc (ret address)
 523     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 524     // the stack.
 525     __ mov(c_rarg1, lr);
 526     // lr will be trashed by the VM call so we move it to R19
 527     // (callee-saved) because we also need to pass it to the handler
 528     // returned by this call.
 529     __ mov(r19, lr);
 530     BLOCK_COMMENT("call exception_handler_for_return_address");
 531     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 532                          SharedRuntime::exception_handler_for_return_address),
 533                     rthread, c_rarg1);
 534     // Reinitialize the ptrue predicate register, in case the external runtime
 535     // call clobbers ptrue reg, as we may return to SVE compiled code.
 536     __ reinitialize_ptrue();
 537 
 538     // we should not really care that lr is no longer the callee
 539     // address. we saved the value the handler needs in r19 so we can
 540     // just copy it to r3. however, the C2 handler will push its own
 541     // frame and then call into the VM, and the VM code asserts that
 542     // the PC for the frame above the handler belongs to a compiled
 543     // Java method. So, we restore lr here to satisfy that assert.
 544     __ mov(lr, r19);
 545     // setup r0 & r3 & clear pending exception
 546     __ mov(r3, r19);
 547     __ mov(r19, r0);
 548     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 549     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 550 
 551 #ifdef ASSERT
 552     // make sure exception is set
 553     {
 554       Label L;
 555       __ cbnz(r0, L);
 556       __ stop("StubRoutines::forward exception: no pending exception (2)");
 557       __ bind(L);
 558     }
 559 #endif
 560 
 561     // continue at exception handler
 562     // r0: exception
 563     // r3: throwing pc
 564     // r19: exception handler
 565     __ verify_oop(r0);
 566     __ br(r19);
 567 
 568     return start;
 569   }
 570 
 571   // Non-destructive plausibility checks for oops
 572   //
 573   // Arguments:
 574   //    r0: oop to verify
 575   //    rscratch1: error message
 576   //
 577   // Stack after saving c_rarg3:
 578   //    [tos + 0]: saved c_rarg3
 579   //    [tos + 1]: saved c_rarg2
 580   //    [tos + 2]: saved lr
 581   //    [tos + 3]: saved rscratch2
 582   //    [tos + 4]: saved r0
 583   //    [tos + 5]: saved rscratch1
 584   address generate_verify_oop() {
 585 
 586     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 587     address start = __ pc();
 588 
 589     Label exit, error;
 590 
 591     // save c_rarg2 and c_rarg3
 592     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 593 
 594     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 595     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 596     __ ldr(c_rarg3, Address(c_rarg2));
 597     __ add(c_rarg3, c_rarg3, 1);
 598     __ str(c_rarg3, Address(c_rarg2));
 599 
 600     // object is in r0
 601     // make sure object is 'reasonable'
 602     __ cbz(r0, exit); // if obj is null it is OK
 603 
 604     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
 605     bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
 606 
 607     // return if everything seems ok
 608     __ bind(exit);
 609 
 610     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 611     __ ret(lr);
 612 
 613     // handle errors
 614     __ bind(error);
 615     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 616 
 617     __ push(RegSet::range(r0, r29), sp);
 618     // debug(char* msg, int64_t pc, int64_t regs[])
 619     __ mov(c_rarg0, rscratch1);      // pass address of error message
 620     __ mov(c_rarg1, lr);             // pass return address
 621     __ mov(c_rarg2, sp);             // pass address of regs on stack
 622 #ifndef PRODUCT
 623     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 624 #endif
 625     BLOCK_COMMENT("call MacroAssembler::debug");
 626     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 627     __ blr(rscratch1);
 628     __ hlt(0);
 629 
 630     return start;
 631   }
 632 
 633   // Generate indices for iota vector.
 634   address generate_iota_indices(const char *stub_name) {
 635     __ align(CodeEntryAlignment);
 636     StubCodeMark mark(this, "StubRoutines", stub_name);
 637     address start = __ pc();
 638     // B
 639     __ emit_data64(0x0706050403020100, relocInfo::none);
 640     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 641     // H
 642     __ emit_data64(0x0003000200010000, relocInfo::none);
 643     __ emit_data64(0x0007000600050004, relocInfo::none);
 644     // S
 645     __ emit_data64(0x0000000100000000, relocInfo::none);
 646     __ emit_data64(0x0000000300000002, relocInfo::none);
 647     // D
 648     __ emit_data64(0x0000000000000000, relocInfo::none);
 649     __ emit_data64(0x0000000000000001, relocInfo::none);
 650     // S - FP
 651     __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
 652     __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
 653     // D - FP
 654     __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
 655     __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
 656     return start;
 657   }
 658 
 659   // The inner part of zero_words().  This is the bulk operation,
 660   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 661   // caller is responsible for zeroing the last few words.
 662   //
 663   // Inputs:
 664   // r10: the HeapWord-aligned base address of an array to zero.
 665   // r11: the count in HeapWords, r11 > 0.
 666   //
 667   // Returns r10 and r11, adjusted for the caller to clear.
 668   // r10: the base address of the tail of words left to clear.
 669   // r11: the number of words in the tail.
 670   //      r11 < MacroAssembler::zero_words_block_size.
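       //
       // (For context, and assuming the usual caller: MacroAssembler::zero_words
       // branches here for the bulk of a large fill and then clears the remaining
       // tail of fewer than zero_words_block_size words itself, using the r10/r11
       // values this stub returns.)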
 671 
 672   address generate_zero_blocks() {
 673     Label done;
 674     Label base_aligned;
 675 
 676     Register base = r10, cnt = r11;
 677 
 678     __ align(CodeEntryAlignment);
 679     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 680     address start = __ pc();
 681 
 682     if (UseBlockZeroing) {
 683       int zva_length = VM_Version::zva_length();
 684 
 685       // Ensure ZVA length can be divided by 16. This is required by
 686       // the subsequent operations.
 687       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 688 
 689       __ tbz(base, 3, base_aligned);
 690       __ str(zr, Address(__ post(base, 8)));
 691       __ sub(cnt, cnt, 1);
 692       __ bind(base_aligned);
 693 
 694       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 695       // alignment.
 696       Label small;
 697       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
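           // zva_length and BlockZeroingLowLimit are byte counts while cnt is a
           // word count, so low_limit is shifted right by 3 (8 bytes per word)
           // before the comparison below.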
 698       __ subs(rscratch1, cnt, low_limit >> 3);
 699       __ br(Assembler::LT, small);
 700       __ zero_dcache_blocks(base, cnt);
 701       __ bind(small);
 702     }
 703 
 704     {
 705       // Number of stp instructions we'll unroll
 706       const int unroll =
 707         MacroAssembler::zero_words_block_size / 2;
 708       // Clear the remaining blocks.
 709       Label loop;
 710       __ subs(cnt, cnt, unroll * 2);
 711       __ br(Assembler::LT, done);
 712       __ bind(loop);
 713       for (int i = 0; i < unroll; i++)
 714         __ stp(zr, zr, __ post(base, 16));
 715       __ subs(cnt, cnt, unroll * 2);
 716       __ br(Assembler::GE, loop);
 717       __ bind(done);
 718       __ add(cnt, cnt, unroll * 2);
 719     }
 720 
 721     __ ret(lr);
 722 
 723     return start;
 724   }
 725 
 726 
 727   typedef enum {
 728     copy_forwards = 1,
 729     copy_backwards = -1
 730   } copy_direction;
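       // (In generate_copy_longs below, word offsets are scaled by
       // unit = wordSize * direction, so the same addressing expressions walk
       // memory upwards for copy_forwards and downwards for copy_backwards.)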
 731 
 732   // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
 733   // for arraycopy stubs.
 734   class ArrayCopyBarrierSetHelper : StackObj {
 735     BarrierSetAssembler* _bs_asm;
 736     MacroAssembler* _masm;
 737     DecoratorSet _decorators;
 738     BasicType _type;
 739     Register _gct1;
 740     Register _gct2;
 741     Register _gct3;
 742     FloatRegister _gcvt1;
 743     FloatRegister _gcvt2;
 744     FloatRegister _gcvt3;
 745 
 746   public:
 747     ArrayCopyBarrierSetHelper(MacroAssembler* masm,
 748                               DecoratorSet decorators,
 749                               BasicType type,
 750                               Register gct1,
 751                               Register gct2,
 752                               Register gct3,
 753                               FloatRegister gcvt1,
 754                               FloatRegister gcvt2,
 755                               FloatRegister gcvt3)
 756       : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
 757         _masm(masm),
 758         _decorators(decorators),
 759         _type(type),
 760         _gct1(gct1),
 761         _gct2(gct2),
 762         _gct3(gct3),
 763         _gcvt1(gcvt1),
 764         _gcvt2(gcvt2),
 765         _gcvt3(gcvt3) {
 766     }
 767 
 768     void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
 769       _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
 770                             dst1, dst2, src,
 771                             _gct1, _gct2, _gcvt1);
 772     }
 773 
 774     void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
 775       _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
 776                              dst, src1, src2,
 777                              _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
 778     }
 779 
 780     void copy_load_at_16(Register dst1, Register dst2, Address src) {
 781       _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
 782                             dst1, dst2, src,
 783                             _gct1);
 784     }
 785 
 786     void copy_store_at_16(Address dst, Register src1, Register src2) {
 787       _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
 788                              dst, src1, src2,
 789                              _gct1, _gct2, _gct3);
 790     }
 791 
 792     void copy_load_at_8(Register dst, Address src) {
 793       _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
 794                             dst, noreg, src,
 795                             _gct1);
 796     }
 797 
 798     void copy_store_at_8(Address dst, Register src) {
 799       _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
 800                              dst, src, noreg,
 801                              _gct1, _gct2, _gct3);
 802     }
 803   };
 804 
 805   // Bulk copy of blocks of 8 words.
 806   //
 807   // count is a count of words.
 808   //
 809   // Precondition: count >= 8
 810   //
 811   // Postconditions:
 812   //
 813   // The least significant bit of count contains the remaining count
 814   // of words to copy.  The rest of count is trash.
 815   //
 816   // s and d are adjusted to point to the remaining words to copy
 817   //
 818   void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
 819                            copy_direction direction) {
 820     int unit = wordSize * direction;
 821     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 822 
 823     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 824       t4 = r7, t5 = r11, t6 = r12, t7 = r13;
 825     const Register stride = r14;
 826     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
 827     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
 828     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
 829 
 830     assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
 831     assert_different_registers(s, d, count, rscratch1, rscratch2);
 832 
 833     Label again, drain;
 834     const char *stub_name;
 835     if (direction == copy_forwards)
 836       stub_name = "forward_copy_longs";
 837     else
 838       stub_name = "backward_copy_longs";
 839 
 840     __ align(CodeEntryAlignment);
 841 
 842     StubCodeMark mark(this, "StubRoutines", stub_name);
 843 
 844     __ bind(start);
 845 
 846     Label unaligned_copy_long;
 847     if (AvoidUnalignedAccesses) {
 848       __ tbnz(d, 3, unaligned_copy_long);
 849     }
 850 
 851     if (direction == copy_forwards) {
 852       __ sub(s, s, bias);
 853       __ sub(d, d, bias);
 854     }
 855 
 856 #ifdef ASSERT
 857     // Make sure we are never given < 8 words
 858     {
 859       Label L;
 860       __ cmp(count, (u1)8);
 861       __ br(Assembler::GE, L);
 862       __ stop("genrate_copy_longs called with < 8 words");
 863       __ bind(L);
 864     }
 865 #endif
 866 
 867     // Fill 8 registers
 868     if (UseSIMDForMemoryOps) {
 869       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 870       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 871     } else {
 872       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 873       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 874       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 875       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 876     }
 877 
 878     __ subs(count, count, 16);
 879     __ br(Assembler::LO, drain);
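         // The subs above and the loop below form a software pipeline: each
         // iteration stores the eight words already held in registers while
         // loading the next eight.  Subtracting 16 (8 words in flight plus 8
         // more needed for another iteration) and branching to drain when fewer
         // remain ensures the drain code only stores what has been loaded.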
 880 
 881     int prefetch = PrefetchCopyIntervalInBytes;
 882     bool use_stride = false;
 883     if (direction == copy_backwards) {
 884        use_stride = prefetch > 256;
 885        prefetch = -prefetch;
 886        if (use_stride) __ mov(stride, prefetch);
 887     }
 888 
 889     __ bind(again);
 890 
 891     if (PrefetchCopyIntervalInBytes > 0)
 892       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 893 
 894     if (UseSIMDForMemoryOps) {
 895       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 896       bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
 897       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 898       bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
 899     } else {
 900       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 901       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 902       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 903       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
 904       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 905       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
 906       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 907       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
 908     }
 909 
 910     __ subs(count, count, 8);
 911     __ br(Assembler::HS, again);
 912 
 913     // Drain
 914     __ bind(drain);
 915     if (UseSIMDForMemoryOps) {
 916       bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
 917       bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
 918     } else {
 919       bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 920       bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
 921       bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
 922       bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
 923     }
 924 
 925     {
 926       Label L1, L2;
 927       __ tbz(count, exact_log2(4), L1);
 928       if (UseSIMDForMemoryOps) {
 929         bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
 930         bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
 931       } else {
 932         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
 933         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
 934         bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
 935         bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
 936       }
 937       __ bind(L1);
 938 
 939       if (direction == copy_forwards) {
 940         __ add(s, s, bias);
 941         __ add(d, d, bias);
 942       }
 943 
 944       __ tbz(count, 1, L2);
 945       bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 946       bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
 947       __ bind(L2);
 948     }
 949 
 950     __ ret(lr);
 951 
 952     if (AvoidUnalignedAccesses) {
 953       Label drain, again;
 954       // Register order for storing. Order is different for backward copy.
 955 
 956       __ bind(unaligned_copy_long);
 957 
 958       // source address is even aligned, target odd aligned
 959       //
 960       // when forward copying word pairs we read long pairs at offsets
 961       // {0, 2, 4, 6} (in long words). when backwards copying we read
 962       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 963       // address by -2 in the forwards case so we can compute the
 964       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 965       // or -1.
 966       //
 967       // when forward copying we need to store 1 word, 3 pairs and
 968       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 969       // zero offset we adjust the destination by -1 word, which means
 970       // we have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
 971       //
 972       // When backwards copying we need to store 1 word, 3 pairs and
 973       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 974       // offsets {1, 3, 5, 7, 8} * unit.
 975 
 976       if (direction == copy_forwards) {
 977         __ sub(s, s, 16);
 978         __ sub(d, d, 8);
 979       }
 980 
 981       // Fill 8 registers
 982       //
 983       // for forwards copy s was offset by -16 from the original input
 984       // value of s so the register contents are at these offsets
 985       // relative to the 64 byte block addressed by that original input
 986       // and so on for each successive 64 byte block when s is updated
 987       //
 988       // t0 at offset 0,  t1 at offset 8
 989       // t2 at offset 16, t3 at offset 24
 990       // t4 at offset 32, t5 at offset 40
 991       // t6 at offset 48, t7 at offset 56
 992 
 993       // for backwards copy s was not offset so the register contents
 994       // are at these offsets into the preceding 64 byte block
 995       // relative to that original input and so on for each successive
 996       // preceding 64 byte block when s is updated. this explains the
 997       // slightly counter-intuitive looking pattern of register usage
 998       // in the stp instructions for backwards copy.
 999       //
1000       // t0 at offset -16, t1 at offset -8
1001       // t2 at offset -32, t3 at offset -24
1002       // t4 at offset -48, t5 at offset -40
1003       // t6 at offset -64, t7 at offset -56
1004 
1005       bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1006       bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1007       bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1008       bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1009 
1010       __ subs(count, count, 16);
1011       __ br(Assembler::LO, drain);
1012 
1013       int prefetch = PrefetchCopyIntervalInBytes;
1014       bool use_stride = false;
1015       if (direction == copy_backwards) {
1016          use_stride = prefetch > 256;
1017          prefetch = -prefetch;
1018          if (use_stride) __ mov(stride, prefetch);
1019       }
1020 
1021       __ bind(again);
1022 
1023       if (PrefetchCopyIntervalInBytes > 0)
1024         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1025 
1026       if (direction == copy_forwards) {
1027        // allowing for the offset of -8 the store instructions place
1028        // registers into the target 64 byte block at the following
1029        // offsets
1030        //
1031        // t0 at offset 0
1032        // t1 at offset 8,  t2 at offset 16
1033        // t3 at offset 24, t4 at offset 32
1034        // t5 at offset 40, t6 at offset 48
1035        // t7 at offset 56
1036 
1037         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1038         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1039         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1040         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1041         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1042         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1043         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1044         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1045         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1046       } else {
1047        // d was not offset when we started so the registers are
1048        // written into the 64 byte block preceding d with the following
1049        // offsets
1050        //
1051        // t1 at offset -8
1052        // t3 at offset -24, t0 at offset -16
1053        // t5 at offset -40, t2 at offset -32
1054        // t7 at offset -56, t4 at offset -48
1055        //                   t6 at offset -64
1056        //
1057        // note that this matches the offsets previously noted for the
1058        // loads
1059 
1060         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1061         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1062         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1063         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1064         bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1065         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1066         bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1067         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1068         bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1069       }
1070 
1071       __ subs(count, count, 8);
1072       __ br(Assembler::HS, again);
1073 
1074       // Drain
1075       //
1076       // this uses the same pattern of offsets and register arguments
1077       // as above
1078       __ bind(drain);
1079       if (direction == copy_forwards) {
1080         bs.copy_store_at_8(Address(d, 1 * unit), t0);
1081         bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1082         bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1083         bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1084         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1085       } else {
1086         bs.copy_store_at_8(Address(d, 1 * unit), t1);
1087         bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1088         bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1089         bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1090         bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1091       }
1092       // now we need to copy any remaining part block which may
1093       // include a 4 word subblock and/or a 2 word subblock.
1094       // bits 2 and 1 in the count are the tell-tale for whether we
1095       // have each such subblock
1096       {
1097         Label L1, L2;
1098         __ tbz(count, exact_log2(4), L1);
1099        // this is the same as above but copying only 4 longs hence
1100        // with only one intervening stp between the str instructions
1101        // but note that the offsets and registers still follow the
1102        // same pattern
1103         bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1104         bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1105         if (direction == copy_forwards) {
1106           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1107           bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1108           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1109         } else {
1110           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1111           bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1112           bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1113         }
1114         __ bind(L1);
1115 
1116         __ tbz(count, 1, L2);
1117        // this is the same as above but copying only 2 longs hence
1118        // there is no intervening stp between the str instructions
1119        // but note that the offset and register patterns are still
1120        // the same
1121         bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1122         if (direction == copy_forwards) {
1123           bs.copy_store_at_8(Address(d, 1 * unit), t0);
1124           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1125         } else {
1126           bs.copy_store_at_8(Address(d, 1 * unit), t1);
1127           bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1128         }
1129         __ bind(L2);
1130 
1131        // for forwards copy we need to re-adjust the offsets we
1132        // applied so that s and d follow the last words written
1133 
1134        if (direction == copy_forwards) {
1135          __ add(s, s, 16);
1136          __ add(d, d, 8);
1137        }
1138 
1139       }
1140 
1141       __ ret(lr);
1142       }
1143   }
1144 
1145   // Small copy: less than 16 bytes.
1146   //
1147   // NB: Ignores all of the bits of count which represent more than 15
1148   // bytes, so a caller doesn't have to mask them.
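       //
       // Worked example (granularity == 1, a byte copy): for count == 13
       // (binary 1101) the tests below copy 8 bytes (bit 3 set), then 4 bytes
       // (bit 2 set), skip the 2-byte copy (bit 1 clear) and finish with the
       // last byte (bit 0 set).  For larger granularities the same sub-copies
       // are selected by correspondingly lower bits of count, e.g. for an int
       // copy (granularity == 4) bit 1 of count selects the 8-byte move.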
1149 
1150   void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1151     bool is_backwards = step < 0;
1152     size_t granularity = uabs(step);
1153     int direction = is_backwards ? -1 : 1;
1154 
1155     Label Lword, Lint, Lshort, Lbyte;
1156 
1157     assert(granularity
1158            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1159 
1160     const Register t0 = r3;
1161     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1162     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1163 
1164     // ??? I don't know if this bit-test-and-branch is the right thing
1165     // to do.  It does a lot of jumping, resulting in several
1166     // mispredicted branches.  It might make more sense to do this
1167     // with something like Duff's device with a single computed branch.
1168 
1169     __ tbz(count, 3 - exact_log2(granularity), Lword);
1170     bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1171     bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1172     __ bind(Lword);
1173 
1174     if (granularity <= sizeof (jint)) {
1175       __ tbz(count, 2 - exact_log2(granularity), Lint);
1176       __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1177       __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1178       __ bind(Lint);
1179     }
1180 
1181     if (granularity <= sizeof (jshort)) {
1182       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1183       __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1184       __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1185       __ bind(Lshort);
1186     }
1187 
1188     if (granularity <= sizeof (jbyte)) {
1189       __ tbz(count, 0, Lbyte);
1190       __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1191       __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1192       __ bind(Lbyte);
1193     }
1194   }
1195 
1196   Label copy_f, copy_b;
1197   Label copy_obj_f, copy_obj_b;
1198   Label copy_obj_uninit_f, copy_obj_uninit_b;
1199 
1200   // All-singing all-dancing memory copy.
1201   //
1202   // Copy count units of memory from s to d.  The size of a unit is
1203   // step, which can be positive or negative depending on the direction
1204   // of copy.  If is_aligned is false, we align the source address.
1205   //
1206 
1207   void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1208                    Register s, Register d, Register count, int step) {
1209     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1210     bool is_backwards = step < 0;
1211     unsigned int granularity = uabs(step);
1212     const Register t0 = r3, t1 = r4;
1213 
1214     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter
1215     // because we always load all the data before writing anything.
1216     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1217     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1218     const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1219     const Register send = r17, dend = r16;
1220     const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1221     const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1222     ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1223 
1224     if (PrefetchCopyIntervalInBytes > 0)
1225       __ prfm(Address(s, 0), PLDL1KEEP);
1226     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1227     __ br(Assembler::HI, copy_big);
1228 
1229     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1230     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
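         // send/dend point one element past the end of the source/destination,
         // so negative offsets from them address the tail of the copy.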
1231 
1232     __ cmp(count, u1(16/granularity));
1233     __ br(Assembler::LS, copy16);
1234 
1235     __ cmp(count, u1(64/granularity));
1236     __ br(Assembler::HI, copy80);
1237 
1238     __ cmp(count, u1(32/granularity));
1239     __ br(Assembler::LS, copy32);
1240 
1241     // 33..64 bytes
1242     if (UseSIMDForMemoryOps) {
1243       bs.copy_load_at_32(v0, v1, Address(s, 0));
1244       bs.copy_load_at_32(v2, v3, Address(send, -32));
1245       bs.copy_store_at_32(Address(d, 0), v0, v1);
1246       bs.copy_store_at_32(Address(dend, -32), v2, v3);
1247     } else {
1248       bs.copy_load_at_16(t0, t1, Address(s, 0));
1249       bs.copy_load_at_16(t2, t3, Address(s, 16));
1250       bs.copy_load_at_16(t4, t5, Address(send, -32));
1251       bs.copy_load_at_16(t6, t7, Address(send, -16));
1252 
1253       bs.copy_store_at_16(Address(d, 0), t0, t1);
1254       bs.copy_store_at_16(Address(d, 16), t2, t3);
1255       bs.copy_store_at_16(Address(dend, -32), t4, t5);
1256       bs.copy_store_at_16(Address(dend, -16), t6, t7);
1257     }
1258     __ b(finish);
1259 
1260     // 17..32 bytes
1261     __ bind(copy32);
1262     bs.copy_load_at_16(t0, t1, Address(s, 0));
1263     bs.copy_load_at_16(t6, t7, Address(send, -16));
1264 
1265     bs.copy_store_at_16(Address(d, 0), t0, t1);
1266     bs.copy_store_at_16(Address(dend, -16), t6, t7);
1267     __ b(finish);
1268 
1269     // 65..80/96 bytes
1270     // (96 bytes if SIMD because we do 32 bytes per instruction)
1271     __ bind(copy80);
1272     if (UseSIMDForMemoryOps) {
1273       bs.copy_load_at_32(v0, v1, Address(s, 0));
1274       bs.copy_load_at_32(v2, v3, Address(s, 32));
1275       // Unaligned pointers can be an issue for copying.
1276       // The issue is more likely when the granularity of the data is
1277       // less than 4 (sizeof(jint)): pointers for arrays of jint are at least
1278       // 4 byte aligned, and pointers for arrays of jlong are 8 byte aligned.
1279       // The largest performance drop has been seen for the range 65-80 bytes.
1280       // For such cases, using a pair of ldp/stp instead of a third pair of
1281       // ldpq/stpq fixes the performance issue.
1282       if (granularity < sizeof (jint)) {
1283         Label copy96;
1284         __ cmp(count, u1(80/granularity));
1285         __ br(Assembler::HI, copy96);
1286         bs.copy_load_at_16(t0, t1, Address(send, -16));
1287 
1288         bs.copy_store_at_32(Address(d, 0), v0, v1);
1289         bs.copy_store_at_32(Address(d, 32), v2, v3);
1290 
1291         bs.copy_store_at_16(Address(dend, -16), t0, t1);
1292         __ b(finish);
1293 
1294         __ bind(copy96);
1295       }
1296       bs.copy_load_at_32(v4, v5, Address(send, -32));
1297 
1298       bs.copy_store_at_32(Address(d, 0), v0, v1);
1299       bs.copy_store_at_32(Address(d, 32), v2, v3);
1300 
1301       bs.copy_store_at_32(Address(dend, -32), v4, v5);
1302     } else {
1303       bs.copy_load_at_16(t0, t1, Address(s, 0));
1304       bs.copy_load_at_16(t2, t3, Address(s, 16));
1305       bs.copy_load_at_16(t4, t5, Address(s, 32));
1306       bs.copy_load_at_16(t6, t7, Address(s, 48));
1307       bs.copy_load_at_16(t8, t9, Address(send, -16));
1308 
1309       bs.copy_store_at_16(Address(d, 0), t0, t1);
1310       bs.copy_store_at_16(Address(d, 16), t2, t3);
1311       bs.copy_store_at_16(Address(d, 32), t4, t5);
1312       bs.copy_store_at_16(Address(d, 48), t6, t7);
1313       bs.copy_store_at_16(Address(dend, -16), t8, t9);
1314     }
1315     __ b(finish);
1316 
1317     // 0..16 bytes
1318     __ bind(copy16);
1319     __ cmp(count, u1(8/granularity));
1320     __ br(Assembler::LO, copy8);
1321 
1322     // 8..16 bytes
1323     bs.copy_load_at_8(t0, Address(s, 0));
1324     bs.copy_load_at_8(t1, Address(send, -8));
1325     bs.copy_store_at_8(Address(d, 0), t0);
1326     bs.copy_store_at_8(Address(dend, -8), t1);
1327     __ b(finish);
1328 
1329     if (granularity < 8) {
1330       // 4..7 bytes
1331       __ bind(copy8);
1332       __ tbz(count, 2 - exact_log2(granularity), copy4);
1333       __ ldrw(t0, Address(s, 0));
1334       __ ldrw(t1, Address(send, -4));
1335       __ strw(t0, Address(d, 0));
1336       __ strw(t1, Address(dend, -4));
1337       __ b(finish);
1338       if (granularity < 4) {
1339         // 0..3 bytes
1340         __ bind(copy4);
1341         __ cbz(count, finish); // get rid of 0 case
1342         if (granularity == 2) {
1343           __ ldrh(t0, Address(s, 0));
1344           __ strh(t0, Address(d, 0));
1345         } else { // granularity == 1
1346           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1347           // the first and last byte.
1348           // Handle the 3 byte case by loading and storing base + count/2
1349           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1350           // This does mean that in the 1 byte case we load/store the same
1351           // byte 3 times.
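               // Concretely (count is halved by the lsr below before it is used
               // as an offset):
               //   count == 1: t0, t1 and t2 all read s[0] and all write d[0]
               //   count == 2: s[0]->d[0], s[1]->d[1] (via send-1), s[1]->d[1] again
               //   count == 3: s[0]->d[0], s[2]->d[2] (via send-1), s[1]->d[1]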
1352           __ lsr(count, count, 1);
1353           __ ldrb(t0, Address(s, 0));
1354           __ ldrb(t1, Address(send, -1));
1355           __ ldrb(t2, Address(s, count));
1356           __ strb(t0, Address(d, 0));
1357           __ strb(t1, Address(dend, -1));
1358           __ strb(t2, Address(d, count));
1359         }
1360         __ b(finish);
1361       }
1362     }
1363 
1364     __ bind(copy_big);
1365     if (is_backwards) {
1366       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1367       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1368     }
1369 
1370     // Now that we've got the small case out of the way, we can align the
1371     // source address on a 2-word boundary.
1372 
1373     // Here we will materialize a count in r15, which is used by copy_memory_small
1374     // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1375     // Up until here, we have used t9, which aliases r15, but from here on, that register
1376     // can not be used as a temp register, as it contains the count.
1377 
1378     Label aligned;
1379 
1380     if (is_aligned) {
1381       // We may have to adjust by 1 word to get s 2-word-aligned.
1382       __ tbz(s, exact_log2(wordSize), aligned);
1383       bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1384       bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1385       __ sub(count, count, wordSize/granularity);
1386     } else {
1387       if (is_backwards) {
1388         __ andr(r15, s, 2 * wordSize - 1);
1389       } else {
1390         __ neg(r15, s);
1391         __ andr(r15, r15, 2 * wordSize - 1);
1392       }
1393       // r15 is the byte adjustment needed to align s.
1394       __ cbz(r15, aligned);
1395       int shift = exact_log2(granularity);
1396       if (shift > 0) {
1397         __ lsr(r15, r15, shift);
1398       }
1399       __ sub(count, count, r15);
1400 
1401 #if 0
1402       // ?? This code is only correct for a disjoint copy.  It may or
1403       // may not make sense to use it in that case.
1404 
1405       // Copy the first pair; s and d may not be aligned.
1406       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1407       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1408 
1409       // Align s and d, adjust count
1410       if (is_backwards) {
1411         __ sub(s, s, r15);
1412         __ sub(d, d, r15);
1413       } else {
1414         __ add(s, s, r15);
1415         __ add(d, d, r15);
1416       }
1417 #else
1418       copy_memory_small(decorators, type, s, d, r15, step);
1419 #endif
1420     }
1421 
1422     __ bind(aligned);
1423 
1424     // s is now 2-word-aligned.
1425 
1426     // We have a count of units and some trailing bytes. Adjust the
1427     // count and do a bulk copy of words. If the shift is zero
1428     // perform a move instead to benefit from zero latency moves.
1429     int shift = exact_log2(wordSize/granularity);
1430     if (shift > 0) {
1431       __ lsr(r15, count, shift);
1432     } else {
1433       __ mov(r15, count);
1434     }
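         // r15 now holds the count in 8-byte-word units for the bulk copy stubs
         // called below; 'count' itself is left unchanged so that the trailing
         // sub-word part can be finished by copy_memory_small afterwards.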
1435     if (direction == copy_forwards) {
1436       if (type != T_OBJECT) {
1437         __ bl(copy_f);
1438       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1439         __ bl(copy_obj_uninit_f);
1440       } else {
1441         __ bl(copy_obj_f);
1442       }
1443     } else {
1444       if (type != T_OBJECT) {
1445         __ bl(copy_b);
1446       } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1447         __ bl(copy_obj_uninit_b);
1448       } else {
1449         __ bl(copy_obj_b);
1450       }
1451     }
1452 
1453     // And the tail.
1454     copy_memory_small(decorators, type, s, d, count, step);
1455 
1456     if (granularity >= 8) __ bind(copy8);
1457     if (granularity >= 4) __ bind(copy4);
1458     __ bind(finish);
1459   }
1460 
1461 
1462   void clobber_registers() {
1463 #ifdef ASSERT
1464     RegSet clobbered
1465       = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1466     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1467     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1468     for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1469       __ mov(*it, rscratch1);
1470     }
1471 #endif
1472 
1473   }
1474 
1475   // Scan over array at a for count oops, verifying each one.
1476   // Preserves a and count, clobbers rscratch1 and rscratch2.
1477   void verify_oop_array (int size, Register a, Register count, Register temp) {
1478     Label loop, end;
1479     __ mov(rscratch1, a);
1480     __ mov(rscratch2, zr);
1481     __ bind(loop);
1482     __ cmp(rscratch2, count);
1483     __ br(Assembler::HS, end);
1484     if (size == wordSize) {
1485       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1486       __ verify_oop(temp);
1487     } else {
1488       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1489       __ decode_heap_oop(temp); // calls verify_oop
1490     }
1491     __ add(rscratch2, rscratch2, 1);
1492     __ b(loop);
1493     __ bind(end);
1494   }
1495 
1496   // Arguments:
1497   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1498   //             ignored
1499   //   is_oop  - true => oop array, so generate store check code
1500   //   name    - stub name string
1501   //
1502   // Inputs:
1503   //   c_rarg0   - source array address
1504   //   c_rarg1   - destination array address
1505   //   c_rarg2   - element count, treated as ssize_t, can be zero
1506   //
1507   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1508   // the hardware handle it.  The two dwords within qwords that span
1509   // cache line boundaries will still be loaded and stored atomically.
1510   //
1511   // Side Effects:
1512   //   disjoint_int_copy_entry is set to the no-overlap entry point
1513   //   used by generate_conjoint_int_oop_copy().
1514   //
1515   address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
1516                                   const char *name, bool dest_uninitialized = false) {
1517     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1518     RegSet saved_reg = RegSet::of(s, d, count);
1519     __ align(CodeEntryAlignment);
1520     StubCodeMark mark(this, "StubRoutines", name);
1521     address start = __ pc();
1522     __ enter();
1523 
1524     if (entry != nullptr) {
1525       *entry = __ pc();
1526       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1527       BLOCK_COMMENT("Entry:");
1528     }
1529 
1530     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1531     if (dest_uninitialized) {
1532       decorators |= IS_DEST_UNINITIALIZED;
1533     }
1534     if (aligned) {
1535       decorators |= ARRAYCOPY_ALIGNED;
1536     }
1537 
1538     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1539     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1540 
1541     if (is_oop) {
1542       // save regs before copy_memory
1543       __ push(RegSet::of(d, count), sp);
1544     }
1545     {
1546       // UnsafeMemoryAccess page error: continue after unsafe access
1547       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1548       UnsafeMemoryAccessMark umam(this, add_entry, true);
1549       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1550     }
1551 
1552     if (is_oop) {
1553       __ pop(RegSet::of(d, count), sp);
1554       if (VerifyOops)
1555         verify_oop_array(size, d, count, r16);
1556     }
1557 
1558     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1559 
1560     __ leave();
1561     __ mov(r0, zr); // return 0
1562     __ ret(lr);
1563     return start;
1564   }
1565 
1566   // Arguments:
1567   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1568   //             ignored
1569   //   is_oop  - true => oop array, so generate store check code
1570   //   name    - stub name string
1571   //
1572   // Inputs:
1573   //   c_rarg0   - source array address
1574   //   c_rarg1   - destination array address
1575   //   c_rarg2   - element count, treated as ssize_t, can be zero
1576   //
1577   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1578   // the hardware handle it.  The two dwords within qwords that span
1579   // cache line boundaries will still be loaded and stored atomically.
1580   //
1581   address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
1582                                  address *entry, const char *name,
1583                                  bool dest_uninitialized = false) {
1584     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1585     RegSet saved_regs = RegSet::of(s, d, count);
1586     StubCodeMark mark(this, "StubRoutines", name);
1587     address start = __ pc();
1588     __ enter();
1589 
1590     if (entry != nullptr) {
1591       *entry = __ pc();
1592       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1593       BLOCK_COMMENT("Entry:");
1594     }
1595 
1596     // use fwd copy when (d-s) above_equal (count*size)
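         // (The unsigned compare also sends the d < s case to the forward copy:
         //  d - s wraps to a large unsigned value, and copying forwards is always
         //  safe when the destination starts below the source.)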
1597     __ sub(rscratch1, d, s);
1598     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1599     __ br(Assembler::HS, nooverlap_target);
1600 
1601     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1602     if (dest_uninitialized) {
1603       decorators |= IS_DEST_UNINITIALIZED;
1604     }
1605     if (aligned) {
1606       decorators |= ARRAYCOPY_ALIGNED;
1607     }
1608 
1609     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1610     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1611 
1612     if (is_oop) {
1613       // save regs before copy_memory
1614       __ push(RegSet::of(d, count), sp);
1615     }
1616     {
1617       // UnsafeMemoryAccess page error: continue after unsafe access
1618       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1619       UnsafeMemoryAccessMark umam(this, add_entry, true);
1620       copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1621     }
1622     if (is_oop) {
1623       __ pop(RegSet::of(d, count), sp);
1624       if (VerifyOops)
1625         verify_oop_array(size, d, count, r16);
1626     }
1627     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1628     __ leave();
1629     __ mov(r0, zr); // return 0
1630     __ ret(lr);
1631     return start;
1632   }
1633 
1634   // Arguments:
1635   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1636   //             ignored
1637   //   name    - stub name string
1638   //
1639   // Inputs:
1640   //   c_rarg0   - source array address
1641   //   c_rarg1   - destination array address
1642   //   c_rarg2   - element count, treated as ssize_t, can be zero
1643   //
1644   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1645   // we let the hardware handle it.  The one to eight bytes within words,
1646   // dwords or qwords that span cache line boundaries will still be loaded
1647   // and stored atomically.
1648   //
1656   // Side Effects:
1657   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1658   //   used by generate_conjoint_byte_copy().
1659   //
1660   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1661     const bool not_oop = false;
1662     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1663   }
1664 
1665   // Arguments:
1666   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1667   //             ignored
1668   //   name    - stub name string
1669   //
1670   // Inputs:
1671   //   c_rarg0   - source array address
1672   //   c_rarg1   - destination array address
1673   //   c_rarg2   - element count, treated as ssize_t, can be zero
1674   //
1675   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1676   // we let the hardware handle it.  The one to eight bytes within words,
1677   // dwords or qwords that span cache line boundaries will still be loaded
1678   // and stored atomically.
1679   //
1680   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1681                                       address* entry, const char *name) {
1682     const bool not_oop = false;
1683     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1684   }
1685 
1686   // Arguments:
1687   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1688   //             ignored
1689   //   name    - stub name string
1690   //
1691   // Inputs:
1692   //   c_rarg0   - source array address
1693   //   c_rarg1   - destination array address
1694   //   c_rarg2   - element count, treated as ssize_t, can be zero
1695   //
1696   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1697   // let the hardware handle it.  The two or four words within dwords
1698   // or qwords that span cache line boundaries will still be loaded
1699   // and stored atomically.
1700   //
1701   // Side Effects:
1702   //   disjoint_short_copy_entry is set to the no-overlap entry point
1703   //   used by generate_conjoint_short_copy().
1704   //
1705   address generate_disjoint_short_copy(bool aligned,
1706                                        address* entry, const char *name) {
1707     const bool not_oop = false;
1708     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1709   }
1710 
1711   // Arguments:
1712   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1713   //             ignored
1714   //   name    - stub name string
1715   //
1716   // Inputs:
1717   //   c_rarg0   - source array address
1718   //   c_rarg1   - destination array address
1719   //   c_rarg2   - element count, treated as ssize_t, can be zero
1720   //
1721   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1722   // let the hardware handle it.  The two or four words within dwords
1723   // or qwords that span cache line boundaries will still be loaded
1724   // and stored atomically.
1725   //
1726   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1727                                        address *entry, const char *name) {
1728     const bool not_oop = false;
1729     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1730   }
1731 
1732   // Arguments:
1733   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1734   //             ignored
1735   //   name    - stub name string
1736   //
1737   // Inputs:
1738   //   c_rarg0   - source array address
1739   //   c_rarg1   - destination array address
1740   //   c_rarg2   - element count, treated as ssize_t, can be zero
1741   //
1742   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1743   // the hardware handle it.  The two dwords within qwords that span
1744   // cache line boundaries will still be loaded and stored atomically.
1745   //
1746   // Side Effects:
1747   //   disjoint_int_copy_entry is set to the no-overlap entry point
1748   //   used by generate_conjoint_int_oop_copy().
1749   //
1750   address generate_disjoint_int_copy(bool aligned, address *entry,
1751                                          const char *name, bool dest_uninitialized = false) {
1752     const bool not_oop = false;
1753     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1754   }
1755 
1756   // Arguments:
1757   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1758   //             ignored
1759   //   name    - stub name string
1760   //
1761   // Inputs:
1762   //   c_rarg0   - source array address
1763   //   c_rarg1   - destination array address
1764   //   c_rarg2   - element count, treated as ssize_t, can be zero
1765   //
1766   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1767   // the hardware handle it.  The two dwords within qwords that span
1768   // cache line boundaries will still be loaded and stored atomically.
1769   //
1770   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1771                                      address *entry, const char *name,
1772                                      bool dest_uninitialized = false) {
1773     const bool not_oop = false;
1774     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1775   }
1776 
1777 
1778   // Arguments:
1779   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1780   //             ignored
1781   //   name    - stub name string
1782   //
1783   // Inputs:
1784   //   c_rarg0   - source array address
1785   //   c_rarg1   - destination array address
1786   //   c_rarg2   - element count, treated as size_t, can be zero
1787   //
1788   // Side Effects:
1789   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1790   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1791   //
1792   address generate_disjoint_long_copy(bool aligned, address *entry,
1793                                           const char *name, bool dest_uninitialized = false) {
1794     const bool not_oop = false;
1795     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1796   }
1797 
1798   // Arguments:
1799   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1800   //             ignored
1801   //   name    - stub name string
1802   //
1803   // Inputs:
1804   //   c_rarg0   - source array address
1805   //   c_rarg1   - destination array address
1806   //   c_rarg2   - element count, treated as size_t, can be zero
1807   //
1808   address generate_conjoint_long_copy(bool aligned,
1809                                       address nooverlap_target, address *entry,
1810                                       const char *name, bool dest_uninitialized = false) {
1811     const bool not_oop = false;
1812     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1813   }
1814 
1815   // Arguments:
1816   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1817   //             ignored
1818   //   name    - stub name string
1819   //
1820   // Inputs:
1821   //   c_rarg0   - source array address
1822   //   c_rarg1   - destination array address
1823   //   c_rarg2   - element count, treated as size_t, can be zero
1824   //
1825   // Side Effects:
1826   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1827   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1828   //
1829   address generate_disjoint_oop_copy(bool aligned, address *entry,
1830                                      const char *name, bool dest_uninitialized) {
1831     const bool is_oop = true;
1832     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1833     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1834   }
1835 
1836   // Arguments:
1837   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1838   //             ignored
1839   //   name    - stub name string
1840   //
1841   // Inputs:
1842   //   c_rarg0   - source array address
1843   //   c_rarg1   - destination array address
1844   //   c_rarg2   - element count, treated as size_t, can be zero
1845   //
1846   address generate_conjoint_oop_copy(bool aligned,
1847                                      address nooverlap_target, address *entry,
1848                                      const char *name, bool dest_uninitialized) {
1849     const bool is_oop = true;
1850     const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1851     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1852                                   name, dest_uninitialized);
1853   }
1854 
1855 
1856   // Helper for generating a dynamic type check.
1857   // Smashes rscratch1, rscratch2.
1858   void generate_type_check(Register sub_klass,
1859                            Register super_check_offset,
1860                            Register super_klass,
1861                            Register temp1,
1862                            Register temp2,
1863                            Register result,
1864                            Label& L_success) {
1865     assert_different_registers(sub_klass, super_check_offset, super_klass);
1866 
1867     BLOCK_COMMENT("type_check:");
1868 
1869     Label L_miss;
1870 
1871     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
1872                                      super_check_offset);
1873     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
1874 
1875     // Fall through on failure!
1876     __ BIND(L_miss);
1877   }
1878 
1879   //
1880   //  Generate checkcasting array copy stub
1881   //
1882   //  Input:
1883   //    c_rarg0   - source array address
1884   //    c_rarg1   - destination array address
1885   //    c_rarg2   - element count, treated as ssize_t, can be zero
1886   //    c_rarg3   - size_t ckoff (super_check_offset)
1887   //    c_rarg4   - oop ckval (super_klass)
1888   //
1889   //  Output:
1890   //    r0 ==  0  -  success
1891   //    r0 == -1^K - failure, where K is partial transfer count
1892   //
1893   address generate_checkcast_copy(const char *name, address *entry,
1894                                   bool dest_uninitialized = false) {
1895 
1896     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1897 
1898     // Input registers (after setup_arg_regs)
1899     const Register from        = c_rarg0;   // source array address
1900     const Register to          = c_rarg1;   // destination array address
1901     const Register count       = c_rarg2;   // elements count
1902     const Register ckoff       = c_rarg3;   // super_check_offset
1903     const Register ckval       = c_rarg4;   // super_klass
1904 
1905     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1906     RegSet wb_post_saved_regs = RegSet::of(count);
1907 
1908     // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1909     const Register copied_oop  = r22;       // actual oop copied
1910     const Register count_save  = r21;       // orig elements count
1911     const Register start_to    = r20;       // destination array start address
1912     const Register r19_klass   = r19;       // oop._klass
1913 
1914     // Registers used as gc temps (r5, r6, r7 are save-on-call)
1915     const Register gct1 = r5, gct2 = r6, gct3 = r7;
1916 
1917     //---------------------------------------------------------------
1918     // Assembler stub will be used for this call to arraycopy
1919     // if the two arrays are subtypes of Object[] but the
1920     // destination array type is not equal to or a supertype
1921     // of the source type.  Each element must be separately
1922     // checked.
1923 
1924     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1925                                copied_oop, r19_klass, count_save);
1926 
1927     __ align(CodeEntryAlignment);
1928     StubCodeMark mark(this, "StubRoutines", name);
1929     address start = __ pc();
1930 
1931     __ enter(); // required for proper stackwalking of RuntimeStub frame
1932 
1933 #ifdef ASSERT
1934     // caller guarantees that the arrays really are different
1935     // otherwise, we would have to make conjoint checks
1936     { Label L;
1937       __ b(L);                  // conjoint check not yet implemented
1938       __ stop("checkcast_copy within a single array");
1939       __ bind(L);
1940     }
1941 #endif //ASSERT
1942 
1943     // Caller of this entry point must set up the argument registers.
1944     if (entry != nullptr) {
1945       *entry = __ pc();
1946       BLOCK_COMMENT("Entry:");
1947     }
1948 
1949      // Empty array:  Nothing to do.
1950     __ cbz(count, L_done);
1951     __ push(RegSet::of(r19, r20, r21, r22), sp);
1952 
1953 #ifdef ASSERT
1954     BLOCK_COMMENT("assert consistent ckoff/ckval");
1955     // The ckoff and ckval must be mutually consistent,
1956     // even though caller generates both.
1957     { Label L;
1958       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1959       __ ldrw(start_to, Address(ckval, sco_offset));
1960       __ cmpw(ckoff, start_to);
1961       __ br(Assembler::EQ, L);
1962       __ stop("super_check_offset inconsistent");
1963       __ bind(L);
1964     }
1965 #endif //ASSERT
1966 
1967     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1968     bool is_oop = true;
1969     int element_size = UseCompressedOops ? 4 : 8;
1970     if (dest_uninitialized) {
1971       decorators |= IS_DEST_UNINITIALIZED;
1972     }
1973 
1974     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1975     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1976 
1977     // save the original count
1978     __ mov(count_save, count);
1979 
1980     // Copy from low to high addresses
1981     __ mov(start_to, to);              // Save destination array start address
1982     __ b(L_load_element);
1983 
1984     // ======== begin loop ========
1985     // (Loop is rotated; its entry is L_load_element.)
1986     // Loop control:
1987     //   for (; count != 0; count--) {
1988     //     copied_oop = load_heap_oop(from++);
1989     //     ... generate_type_check ...;
1990     //     store_heap_oop(to++, copied_oop);
1991     //   }
1992     __ align(OptoLoopAlignment);
1993 
1994     __ BIND(L_store_element);
1995     bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1996                       __ post(to, element_size), copied_oop, noreg,
1997                       gct1, gct2, gct3);
1998     __ sub(count, count, 1);
1999     __ cbz(count, L_do_card_marks);
2000 
2001     // ======== loop entry is here ========
2002     __ BIND(L_load_element);
2003     bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
2004                      copied_oop, noreg, __ post(from, element_size),
2005                      gct1);
2006     __ cbz(copied_oop, L_store_element);
2007 
2008     __ load_klass(r19_klass, copied_oop); // query the object klass
2009 
2010     BLOCK_COMMENT("type_check:");
2011     generate_type_check(/*sub_klass*/r19_klass,
2012                         /*super_check_offset*/ckoff,
2013                         /*super_klass*/ckval,
2014                         /*r_array_base*/gct1,
2015                         /*temp2*/gct2,
2016                         /*result*/r10, L_store_element);
2017 
2018     // Fall through on failure!
2019 
2020     // ======== end loop ========
2021 
2022     // It was a real error; we must depend on the caller to finish the job.
2023     // Register count = remaining oops, count_orig = total oops.
2024     // Emit GC store barriers for the oops we have copied and report
2025     // their number to the caller.
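         // (count_save - count gives K, the number of oops already copied; eon
         //  with zr is a bitwise NOT, turning K into -1^K as documented in the
         //  stub header, while the flags set by subs tell us whether anything
         //  was copied at all.)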
2026 
2027     __ subs(count, count_save, count);     // K = partially copied oop count
2028     __ eon(count, count, zr);              // report (-1^K) to caller
2029     __ br(Assembler::EQ, L_done_pop);
2030 
2031     __ BIND(L_do_card_marks);
2032     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2033 
2034     __ bind(L_done_pop);
2035     __ pop(RegSet::of(r19, r20, r21, r22), sp);
2036     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2037 
2038     __ bind(L_done);
2039     __ mov(r0, count);
2040     __ leave();
2041     __ ret(lr);
2042 
2043     return start;
2044   }
2045 
2046   // Perform range checks on the proposed arraycopy.
2047   // Kills temp, but nothing else.
2048   // Also, clean the sign bits of src_pos and dst_pos.
2049   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2050                               Register src_pos, // source position (c_rarg1)
2051                               Register dst,     // destination array oop (c_rarg2)
2052                               Register dst_pos, // destination position (c_rarg3)
2053                               Register length,
2054                               Register temp,
2055                               Label& L_failed) {
2056     BLOCK_COMMENT("arraycopy_range_checks:");
2057 
2058     assert_different_registers(rscratch1, temp);
2059 
2060     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2061     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2062     __ addw(temp, length, src_pos);
2063     __ cmpw(temp, rscratch1);
2064     __ br(Assembler::HI, L_failed);
2065 
2066     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2067     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2068     __ addw(temp, length, dst_pos);
2069     __ cmpw(temp, rscratch1);
2070     __ br(Assembler::HI, L_failed);
2071 
2072     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
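         // (movw is the 32-bit register move; writing the W form of a register
         //  zeroes bits 63:32, which is exactly the zero-extension we need.)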
2073     __ movw(src_pos, src_pos);
2074     __ movw(dst_pos, dst_pos);
2075 
2076     BLOCK_COMMENT("arraycopy_range_checks done");
2077   }
2078 
2079   // These stubs get called from some dumb test routine.
2080   // I'll write them properly when they're called from
2081   // something that's actually doing something.
2082   static void fake_arraycopy_stub(address src, address dst, int count) {
2083     assert(count == 0, "huh?");
2084   }
2085 
2086 
2087   //
2088   //  Generate 'unsafe' array copy stub
2089   //  Though just as safe as the other stubs, it takes an unscaled
2090   //  size_t argument instead of an element count.
2091   //
2092   //  Input:
2093   //    c_rarg0   - source array address
2094   //    c_rarg1   - destination array address
2095   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2096   //
2097   // Examines the alignment of the operands and dispatches
2098   // to a long, int, short, or byte copy loop.
2099   //
2100   address generate_unsafe_copy(const char *name,
2101                                address byte_copy_entry,
2102                                address short_copy_entry,
2103                                address int_copy_entry,
2104                                address long_copy_entry) {
2105     Label L_long_aligned, L_int_aligned, L_short_aligned;
2106     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2107 
2108     __ align(CodeEntryAlignment);
2109     StubCodeMark mark(this, "StubRoutines", name);
2110     address start = __ pc();
2111     __ enter(); // required for proper stackwalking of RuntimeStub frame
2112 
2113     // bump this on entry, not on exit:
2114     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2115 
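         // OR the source address, destination address and byte count together:
         // a low-order bit is clear in the result only if it is clear in all
         // three, so the tests below pick the widest element size at which
         // every access stays aligned.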
2116     __ orr(rscratch1, s, d);
2117     __ orr(rscratch1, rscratch1, count);
2118 
2119     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2120     __ cbz(rscratch1, L_long_aligned);
2121     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2122     __ cbz(rscratch1, L_int_aligned);
2123     __ tbz(rscratch1, 0, L_short_aligned);
2124     __ b(RuntimeAddress(byte_copy_entry));
2125 
2126     __ BIND(L_short_aligned);
2127     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2128     __ b(RuntimeAddress(short_copy_entry));
2129     __ BIND(L_int_aligned);
2130     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2131     __ b(RuntimeAddress(int_copy_entry));
2132     __ BIND(L_long_aligned);
2133     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2134     __ b(RuntimeAddress(long_copy_entry));
2135 
2136     return start;
2137   }
2138 
2139   //
2140   //  Generate generic array copy stubs
2141   //
2142   //  Input:
2143   //    c_rarg0    -  src oop
2144   //    c_rarg1    -  src_pos (32-bits)
2145   //    c_rarg2    -  dst oop
2146   //    c_rarg3    -  dst_pos (32-bits)
2147   //    c_rarg4    -  element count (32-bits)
2148   //
2149   //  Output:
2150   //    r0 ==  0  -  success
2151   //    r0 == -1^K - failure, where K is partial transfer count
2152   //
2153   address generate_generic_copy(const char *name,
2154                                 address byte_copy_entry, address short_copy_entry,
2155                                 address int_copy_entry, address oop_copy_entry,
2156                                 address long_copy_entry, address checkcast_copy_entry) {
2157 
2158     Label L_failed, L_objArray;
2159     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2160 
2161     // Input registers
2162     const Register src        = c_rarg0;  // source array oop
2163     const Register src_pos    = c_rarg1;  // source position
2164     const Register dst        = c_rarg2;  // destination array oop
2165     const Register dst_pos    = c_rarg3;  // destination position
2166     const Register length     = c_rarg4;
2167 
2168 
2169     // Registers used as temps
2170     const Register dst_klass  = c_rarg5;
2171 
2172     __ align(CodeEntryAlignment);
2173 
2174     StubCodeMark mark(this, "StubRoutines", name);
2175 
2176     address start = __ pc();
2177 
2178     __ enter(); // required for proper stackwalking of RuntimeStub frame
2179 
2180     // bump this on entry, not on exit:
2181     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2182 
2183     //-----------------------------------------------------------------------
2184     // Assembler stub will be used for this call to arraycopy
2185     // if the following conditions are met:
2186     //
2187     // (1) src and dst must not be null.
2188     // (2) src_pos must not be negative.
2189     // (3) dst_pos must not be negative.
2190     // (4) length  must not be negative.
2191     // (5) src klass and dst klass should be the same and not null.
2192     // (6) src and dst should be arrays.
2193     // (7) src_pos + length must not exceed length of src.
2194     // (8) dst_pos + length must not exceed length of dst.
2195     //
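         // Any check that fails branches to L_failed, which returns -1
         // (i.e. -1^0: no elements copied), leaving the caller to handle
         // the copy some other way.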
2196 
2197     //  if (src == nullptr) return -1;
2198     __ cbz(src, L_failed);
2199 
2200     //  if (src_pos < 0) return -1;
2201     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2202 
2203     //  if (dst == nullptr) return -1;
2204     __ cbz(dst, L_failed);
2205 
2206     //  if (dst_pos < 0) return -1;
2207     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2208 
2209     // registers used as temp
2210     const Register scratch_length    = r16; // elements count to copy
2211     const Register scratch_src_klass = r17; // array klass
2212     const Register lh                = r15; // layout helper
2213 
2214     //  if (length < 0) return -1;
2215     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2216     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2217 
2218     __ load_klass(scratch_src_klass, src);
2219 #ifdef ASSERT
2220     //  assert(src->klass() != nullptr);
2221     {
2222       BLOCK_COMMENT("assert klasses not null {");
2223       Label L1, L2;
2224       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
2225       __ bind(L1);
2226       __ stop("broken null klass");
2227       __ bind(L2);
2228       __ load_klass(rscratch1, dst);
2229       __ cbz(rscratch1, L1);     // this would be broken also
2230       BLOCK_COMMENT("} assert klasses not null done");
2231     }
2232 #endif
2233 
2234     // Load layout helper (32-bits)
2235     //
2236     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2237     // 32        30    24            16              8     2                 0
2238     //
2239     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2240     //
2241 
2242     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2243 
2244     // Handle objArrays completely differently...
2245     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2246     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2247     __ movw(rscratch1, objArray_lh);
2248     __ eorw(rscratch2, lh, rscratch1);
2249     __ cbzw(rscratch2, L_objArray);
2250 
2251     //  if (src->klass() != dst->klass()) return -1;
2252     __ load_klass(rscratch2, dst);
2253     __ eor(rscratch2, rscratch2, scratch_src_klass);
2254     __ cbnz(rscratch2, L_failed);
2255 
2256     // Check for flat inline type array -> return -1
2257     __ test_flat_array_oop(src, rscratch2, L_failed);
2258 
2259     // Check for null-free (non-flat) inline type array -> handle as object array
2260     __ test_null_free_array_oop(src, rscratch2, L_objArray);
2261 
2262     //  if (!src->is_Array()) return -1;
2263     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2264 
2265     // At this point, it is known to be a typeArray (array_tag 0x3).
2266 #ifdef ASSERT
2267     {
2268       BLOCK_COMMENT("assert primitive array {");
2269       Label L;
2270       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2271       __ cmpw(lh, rscratch2);
2272       __ br(Assembler::GE, L);
2273       __ stop("must be a primitive array");
2274       __ bind(L);
2275       BLOCK_COMMENT("} assert primitive array done");
2276     }
2277 #endif
2278 
2279     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2280                            rscratch2, L_failed);
2281 
2282     // TypeArrayKlass
2283     //
2284     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2285     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2286     //
2287 
2288     const Register rscratch1_offset = rscratch1;    // array offset
2289     const Register r15_elsize = lh; // element size
2290 
2291     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2292            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2293     __ add(src, src, rscratch1_offset);           // src array offset
2294     __ add(dst, dst, rscratch1_offset);           // dst array offset
2295     BLOCK_COMMENT("choose copy loop based on element size");
2296 
2297     // next registers should be set before the jump to corresponding stub
2298     const Register from     = c_rarg0;  // source array address
2299     const Register to       = c_rarg1;  // destination array address
2300     const Register count    = c_rarg2;  // elements count
2301 
2302     // 'from', 'to' and 'count' must be set in this order, since they are
2303     // the same registers as 'src', 'src_pos' and 'dst'.
2304 
2305     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2306 
2307     // The possible values of elsize are 0-3, i.e. exact_log2(element
2308     // size in bytes).  We do a simple bitwise binary search.
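         // Dispatch on the two low bits of the (still unmasked) layout helper,
         // which equal log2(element size) for a primitive array:
         //   00 -> byte copy (fall through below)
         //   01 -> L_copy_shorts
         //   10 -> L_copy_ints
         //   11 -> L_copy_ints, then on to L_copy_longs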
2309   __ BIND(L_copy_bytes);
2310     __ tbnz(r15_elsize, 1, L_copy_ints);
2311     __ tbnz(r15_elsize, 0, L_copy_shorts);
2312     __ lea(from, Address(src, src_pos));// src_addr
2313     __ lea(to,   Address(dst, dst_pos));// dst_addr
2314     __ movw(count, scratch_length); // length
2315     __ b(RuntimeAddress(byte_copy_entry));
2316 
2317   __ BIND(L_copy_shorts);
2318     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2319     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2320     __ movw(count, scratch_length); // length
2321     __ b(RuntimeAddress(short_copy_entry));
2322 
2323   __ BIND(L_copy_ints);
2324     __ tbnz(r15_elsize, 0, L_copy_longs);
2325     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2326     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2327     __ movw(count, scratch_length); // length
2328     __ b(RuntimeAddress(int_copy_entry));
2329 
2330   __ BIND(L_copy_longs);
2331 #ifdef ASSERT
2332     {
2333       BLOCK_COMMENT("assert long copy {");
2334       Label L;
2335       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2336       __ cmpw(r15_elsize, LogBytesPerLong);
2337       __ br(Assembler::EQ, L);
2338       __ stop("must be long copy, but elsize is wrong");
2339       __ bind(L);
2340       BLOCK_COMMENT("} assert long copy done");
2341     }
2342 #endif
2343     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2344     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2345     __ movw(count, scratch_length); // length
2346     __ b(RuntimeAddress(long_copy_entry));
2347 
2348     // ObjArrayKlass
2349   __ BIND(L_objArray);
2350     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2351 
2352     Label L_plain_copy, L_checkcast_copy;
2353     //  test array classes for subtyping
2354     __ load_klass(r15, dst);
2355     __ cmp(scratch_src_klass, r15); // usual case is exact equality
2356     __ br(Assembler::NE, L_checkcast_copy);
2357 
2358     // Identically typed arrays can be copied without element-wise checks.
2359     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2360                            rscratch2, L_failed);
2361 
2362     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2363     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2364     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2365     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2366     __ movw(count, scratch_length); // length
2367   __ BIND(L_plain_copy);
2368     __ b(RuntimeAddress(oop_copy_entry));
2369 
2370   __ BIND(L_checkcast_copy);
2371     // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
2372     {
2373       // Before looking at dst.length, make sure dst is also an objArray.
2374       __ ldrw(rscratch1, Address(r15, lh_offset));
2375       __ movw(rscratch2, objArray_lh);
2376       __ eorw(rscratch1, rscratch1, rscratch2);
2377       __ cbnzw(rscratch1, L_failed);
2378 
2379       // It is safe to examine both src.length and dst.length.
2380       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2381                              r15, L_failed);
2382 
2383       __ load_klass(dst_klass, dst); // reload
2384 
2385       // Marshal the base address arguments now, freeing registers.
2386       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2387       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2388       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2389       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2390       __ movw(count, length);           // length (reloaded)
2391       Register sco_temp = c_rarg3;      // this register is free now
2392       assert_different_registers(from, to, count, sco_temp,
2393                                  dst_klass, scratch_src_klass);
2394       // assert_clean_int(count, sco_temp);
2395 
2396       // Generate the type check.
2397       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2398       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2399 
2400       // Smashes rscratch1, rscratch2
2401       generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2402                           L_plain_copy);
2403 
2404       // Fetch destination element klass from the ObjArrayKlass header.
2405       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2406       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2407       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2408 
2409       // the checkcast_copy loop needs two extra arguments:
2410       assert(c_rarg3 == sco_temp, "#3 already in place");
2411       // Set up arguments for checkcast_copy_entry.
2412       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2413       __ b(RuntimeAddress(checkcast_copy_entry));
2414     }
2415 
2416   __ BIND(L_failed);
2417     __ mov(r0, -1);
2418     __ leave();   // required for proper stackwalking of RuntimeStub frame
2419     __ ret(lr);
2420 
2421     return start;
2422   }
2423 
2424   //
2425   // Generate stub for array fill. If "aligned" is true, the
2426   // "to" address is assumed to be heapword aligned.
2427   //
2428   // Arguments for generated stub:
2429   //   to:    c_rarg0
2430   //   value: c_rarg1
2431   //   count: c_rarg2 treated as signed
2432   //
2433   address generate_fill(BasicType t, bool aligned, const char *name) {
2434     __ align(CodeEntryAlignment);
2435     StubCodeMark mark(this, "StubRoutines", name);
2436     address start = __ pc();
2437 
2438     BLOCK_COMMENT("Entry:");
2439 
2440     const Register to        = c_rarg0;  // source array address
2441     const Register value     = c_rarg1;  // value
2442     const Register count     = c_rarg2;  // elements count
2443 
2444     const Register bz_base = r10;        // base for block_zero routine
2445     const Register cnt_words = r11;      // temp register
2446 
2447     __ enter();
2448 
2449     Label L_fill_elements, L_exit1;
2450 
2451     int shift = -1;
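         // shift ends up as log2(element size in bytes): 0 for T_BYTE, 1 for
         // T_SHORT, 2 for T_INT; it is used to convert between element and
         // byte counts below.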
2452     switch (t) {
2453       case T_BYTE:
2454         shift = 0;
2455         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2456         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2457         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2458         __ br(Assembler::LO, L_fill_elements);
2459         break;
2460       case T_SHORT:
2461         shift = 1;
2462         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2463         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2464         __ br(Assembler::LO, L_fill_elements);
2465         break;
2466       case T_INT:
2467         shift = 2;
2468         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2469         __ br(Assembler::LO, L_fill_elements);
2470         break;
2471       default: ShouldNotReachHere();
2472     }
2473 
2474     // Align source address at 8 bytes address boundary.
2475     Label L_skip_align1, L_skip_align2, L_skip_align4;
2476     if (!aligned) {
2477       switch (t) {
2478         case T_BYTE:
2479           // One byte misalignment happens only for byte arrays.
2480           __ tbz(to, 0, L_skip_align1);
2481           __ strb(value, Address(__ post(to, 1)));
2482           __ subw(count, count, 1);
2483           __ bind(L_skip_align1);
2484           // Fallthrough
2485         case T_SHORT:
2486           // Two bytes misalignment happens only for byte and short (char) arrays.
2487           __ tbz(to, 1, L_skip_align2);
2488           __ strh(value, Address(__ post(to, 2)));
2489           __ subw(count, count, 2 >> shift);
2490           __ bind(L_skip_align2);
2491           // Fallthrough
2492         case T_INT:
2493           // Align to 8 bytes, we know we are 4 byte aligned to start.
2494           __ tbz(to, 2, L_skip_align4);
2495           __ strw(value, Address(__ post(to, 4)));
2496           __ subw(count, count, 4 >> shift);
2497           __ bind(L_skip_align4);
2498           break;
2499         default: ShouldNotReachHere();
2500       }
2501     }
2502 
2503     //
2504     //  Fill large chunks
2505     //
2506     __ lsrw(cnt_words, count, 3 - shift); // number of words
2507     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2508     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2509     if (UseBlockZeroing) {
2510       Label non_block_zeroing, rest;
2511       // If the fill value is zero we can use the fast zero_words().
2512       __ cbnz(value, non_block_zeroing);
2513       __ mov(bz_base, to);
2514       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2515       address tpc = __ zero_words(bz_base, cnt_words);
2516       if (tpc == nullptr) {
2517         fatal("CodeCache is full at generate_fill");
2518       }
2519       __ b(rest);
2520       __ bind(non_block_zeroing);
2521       __ fill_words(to, cnt_words, value);
2522       __ bind(rest);
2523     } else {
2524       __ fill_words(to, cnt_words, value);
2525     }
2526 
2527     // Remaining count is less than 8 bytes. Fill it by a single store.
2528     // Note that the total length is no less than 8 bytes.
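         // The single 8-byte store below is anchored at the end of the array,
         // so it overlaps bytes that were already filled; that is harmless
         // because every byte of 'value' holds the same fill pattern.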
2529     if (t == T_BYTE || t == T_SHORT) {
2530       Label L_exit1;
2531       __ cbzw(count, L_exit1);
2532       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2533       __ str(value, Address(to, -8));    // overwrite some elements
2534       __ bind(L_exit1);
2535       __ leave();
2536       __ ret(lr);
2537     }
2538 
2539     // Handle copies less than 8 bytes.
2540     Label L_fill_2, L_fill_4, L_exit2;
2541     __ bind(L_fill_elements);
2542     switch (t) {
2543       case T_BYTE:
2544         __ tbz(count, 0, L_fill_2);
2545         __ strb(value, Address(__ post(to, 1)));
2546         __ bind(L_fill_2);
2547         __ tbz(count, 1, L_fill_4);
2548         __ strh(value, Address(__ post(to, 2)));
2549         __ bind(L_fill_4);
2550         __ tbz(count, 2, L_exit2);
2551         __ strw(value, Address(to));
2552         break;
2553       case T_SHORT:
2554         __ tbz(count, 0, L_fill_4);
2555         __ strh(value, Address(__ post(to, 2)));
2556         __ bind(L_fill_4);
2557         __ tbz(count, 1, L_exit2);
2558         __ strw(value, Address(to));
2559         break;
2560       case T_INT:
2561         __ cbzw(count, L_exit2);
2562         __ strw(value, Address(to));
2563         break;
2564       default: ShouldNotReachHere();
2565     }
2566     __ bind(L_exit2);
2567     __ leave();
2568     __ ret(lr);
2569     return start;
2570   }
2571 
2572   address generate_data_cache_writeback() {
2573     const Register line        = c_rarg0;  // address of line to write back
2574 
2575     __ align(CodeEntryAlignment);
2576 
2577     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2578 
2579     address start = __ pc();
2580     __ enter();
2581     __ cache_wb(Address(line, 0));
2582     __ leave();
2583     __ ret(lr);
2584 
2585     return start;
2586   }
2587 
2588   address generate_data_cache_writeback_sync() {
2589     const Register is_pre     = c_rarg0;  // pre or post sync
2590 
2591     __ align(CodeEntryAlignment);
2592 
2593     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2594 
2595     // pre wbsync is a no-op
2596     // post wbsync translates to an sfence
2597 
2598     Label skip;
2599     address start = __ pc();
2600     __ enter();
2601     __ cbnz(is_pre, skip);
2602     __ cache_wbsync(false);
2603     __ bind(skip);
2604     __ leave();
2605     __ ret(lr);
2606 
2607     return start;
2608   }
2609 
2610   void generate_arraycopy_stubs() {
2611     address entry;
2612     address entry_jbyte_arraycopy;
2613     address entry_jshort_arraycopy;
2614     address entry_jint_arraycopy;
2615     address entry_oop_arraycopy;
2616     address entry_jlong_arraycopy;
2617     address entry_checkcast_arraycopy;
2618 
2619     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards);
2620     generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards);
2621 
2622     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards);
2623     generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards);
2624 
2625     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards);
2626     generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards);
2627 
2628     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2629 
2630     //*** jbyte
2631     // Always need aligned and unaligned versions
2632     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2633                                                                                   "jbyte_disjoint_arraycopy");
2634     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2635                                                                                   &entry_jbyte_arraycopy,
2636                                                                                   "jbyte_arraycopy");
2637     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2638                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2639     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, nullptr,
2640                                                                                   "arrayof_jbyte_arraycopy");
2641 
2642     //*** jshort
2643     // Always need aligned and unaligned versions
2644     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2645                                                                                     "jshort_disjoint_arraycopy");
2646     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2647                                                                                     &entry_jshort_arraycopy,
2648                                                                                     "jshort_arraycopy");
2649     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2650                                                                                     "arrayof_jshort_disjoint_arraycopy");
2651     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, nullptr,
2652                                                                                     "arrayof_jshort_arraycopy");
2653 
2654     //*** jint
2655     // Aligned versions
2656     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2657                                                                                 "arrayof_jint_disjoint_arraycopy");
2658     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2659                                                                                 "arrayof_jint_arraycopy");
2660     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2661     // entry_jint_arraycopy always points to the unaligned version
2662     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2663                                                                                 "jint_disjoint_arraycopy");
2664     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2665                                                                                 &entry_jint_arraycopy,
2666                                                                                 "jint_arraycopy");
2667 
2668     //*** jlong
2669     // It is always aligned
2670     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2671                                                                                   "arrayof_jlong_disjoint_arraycopy");
2672     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2673                                                                                   "arrayof_jlong_arraycopy");
2674     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2675     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2676 
2677     //*** oops
2678     {
2679       // With compressed oops we need unaligned versions; notice that
2680       // we overwrite entry_oop_arraycopy.
2681       bool aligned = !UseCompressedOops;
2682 
2683       StubRoutines::_arrayof_oop_disjoint_arraycopy
2684         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2685                                      /*dest_uninitialized*/false);
2686       StubRoutines::_arrayof_oop_arraycopy
2687         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2688                                      /*dest_uninitialized*/false);
2689       // Aligned versions without pre-barriers
2690       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2691         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2692                                      /*dest_uninitialized*/true);
2693       StubRoutines::_arrayof_oop_arraycopy_uninit
2694         = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit",
2695                                      /*dest_uninitialized*/true);
2696     }
2697 
2698     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2699     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2700     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2701     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2702 
2703     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2704     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr,
2705                                                                         /*dest_uninitialized*/true);
2706 
2707     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2708                                                               entry_jbyte_arraycopy,
2709                                                               entry_jshort_arraycopy,
2710                                                               entry_jint_arraycopy,
2711                                                               entry_jlong_arraycopy);
2712 
2713     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2714                                                                entry_jbyte_arraycopy,
2715                                                                entry_jshort_arraycopy,
2716                                                                entry_jint_arraycopy,
2717                                                                entry_oop_arraycopy,
2718                                                                entry_jlong_arraycopy,
2719                                                                entry_checkcast_arraycopy);
2720 
2721     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2722     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2723     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2724     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2725     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2726     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2727   }
2728 
2729   void generate_math_stubs() { Unimplemented(); }
2730 
2731   // Arguments:
2732   //
2733   // Inputs:
2734   //   c_rarg0   - source byte array address
2735   //   c_rarg1   - destination byte array address
2736   //   c_rarg2   - K (key) in little endian int array
2737   //
2738   address generate_aescrypt_encryptBlock() {
2739     __ align(CodeEntryAlignment);
2740     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2741 
2742     const Register from        = c_rarg0;  // source array address
2743     const Register to          = c_rarg1;  // destination array address
2744     const Register key         = c_rarg2;  // key array address
2745     const Register keylen      = rscratch1;
2746 
2747     address start = __ pc();
2748     __ enter();
2749 
2750     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2751 
2752     __ aesenc_loadkeys(key, keylen);
2753     __ aesecb_encrypt(from, to, keylen);
2754 
2755     __ mov(r0, 0);
2756 
2757     __ leave();
2758     __ ret(lr);
2759 
2760     return start;
2761   }
2762 
2763   // Arguments:
2764   //
2765   // Inputs:
2766   //   c_rarg0   - source byte array address
2767   //   c_rarg1   - destination byte array address
2768   //   c_rarg2   - K (key) in little endian int array
2769   //
2770   address generate_aescrypt_decryptBlock() {
2771     assert(UseAES, "need AES cryptographic extension support");
2772     __ align(CodeEntryAlignment);
2773     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2774     Label L_doLast;
2775 
2776     const Register from        = c_rarg0;  // source array address
2777     const Register to          = c_rarg1;  // destination array address
2778     const Register key         = c_rarg2;  // key array address
2779     const Register keylen      = rscratch1;
2780 
2781     address start = __ pc();
2782     __ enter(); // required for proper stackwalking of RuntimeStub frame
2783 
2784     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2785 
2786     __ aesecb_decrypt(from, to, key, keylen);
2787 
2788     __ mov(r0, 0);
2789 
2790     __ leave();
2791     __ ret(lr);
2792 
2793     return start;
2794   }
2795 
2796   // Arguments:
2797   //
2798   // Inputs:
2799   //   c_rarg0   - source byte array address
2800   //   c_rarg1   - destination byte array address
2801   //   c_rarg2   - K (key) in little endian int array
2802   //   c_rarg3   - r vector byte array address
2803   //   c_rarg4   - input length
2804   //
2805   // Output:
2806   //   x0        - input length
2807   //
2808   address generate_cipherBlockChaining_encryptAESCrypt() {
2809     assert(UseAES, "need AES cryptographic extension support");
2810     __ align(CodeEntryAlignment);
2811     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2812 
2813     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2814 
2815     const Register from        = c_rarg0;  // source array address
2816     const Register to          = c_rarg1;  // destination array address
2817     const Register key         = c_rarg2;  // key array address
2818     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector,
2819                                            // and left holding the last ciphertext block
2820     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2821     const Register keylen      = rscratch1;
2822 
2823     address start = __ pc();
2824 
2825       __ enter();
2826 
2827       __ movw(rscratch2, len_reg);
2828 
2829       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2830 
2831       __ ld1(v0, __ T16B, rvec);
2832 
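           // The expanded key is 44/52/60 4-byte words for AES-128/192/256.
           // Shorter keys skip the extra round-key loads below; the flags set
           // by the compare below stay live inside L_aes_loop and select the
           // matching number of rounds per block.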
2833       __ cmpw(keylen, 52);
2834       __ br(Assembler::CC, L_loadkeys_44);
2835       __ br(Assembler::EQ, L_loadkeys_52);
2836 
2837       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2838       __ rev32(v17, __ T16B, v17);
2839       __ rev32(v18, __ T16B, v18);
2840     __ BIND(L_loadkeys_52);
2841       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2842       __ rev32(v19, __ T16B, v19);
2843       __ rev32(v20, __ T16B, v20);
2844     __ BIND(L_loadkeys_44);
2845       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2846       __ rev32(v21, __ T16B, v21);
2847       __ rev32(v22, __ T16B, v22);
2848       __ rev32(v23, __ T16B, v23);
2849       __ rev32(v24, __ T16B, v24);
2850       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2851       __ rev32(v25, __ T16B, v25);
2852       __ rev32(v26, __ T16B, v26);
2853       __ rev32(v27, __ T16B, v27);
2854       __ rev32(v28, __ T16B, v28);
2855       __ ld1(v29, v30, v31, __ T16B, key);
2856       __ rev32(v29, __ T16B, v29);
2857       __ rev32(v30, __ T16B, v30);
2858       __ rev32(v31, __ T16B, v31);
2859 
2860     __ BIND(L_aes_loop);
2861       __ ld1(v1, __ T16B, __ post(from, 16));
2862       __ eor(v0, __ T16B, v0, v1);
2863 
2864       __ br(Assembler::CC, L_rounds_44);
2865       __ br(Assembler::EQ, L_rounds_52);
2866 
2867       __ aese(v0, v17); __ aesmc(v0, v0);
2868       __ aese(v0, v18); __ aesmc(v0, v0);
2869     __ BIND(L_rounds_52);
2870       __ aese(v0, v19); __ aesmc(v0, v0);
2871       __ aese(v0, v20); __ aesmc(v0, v0);
2872     __ BIND(L_rounds_44);
2873       __ aese(v0, v21); __ aesmc(v0, v0);
2874       __ aese(v0, v22); __ aesmc(v0, v0);
2875       __ aese(v0, v23); __ aesmc(v0, v0);
2876       __ aese(v0, v24); __ aesmc(v0, v0);
2877       __ aese(v0, v25); __ aesmc(v0, v0);
2878       __ aese(v0, v26); __ aesmc(v0, v0);
2879       __ aese(v0, v27); __ aesmc(v0, v0);
2880       __ aese(v0, v28); __ aesmc(v0, v0);
2881       __ aese(v0, v29); __ aesmc(v0, v0);
2882       __ aese(v0, v30);
2883       __ eor(v0, __ T16B, v0, v31);
2884 
2885       __ st1(v0, __ T16B, __ post(to, 16));
2886 
2887       __ subw(len_reg, len_reg, 16);
2888       __ cbnzw(len_reg, L_aes_loop);
2889 
2890       __ st1(v0, __ T16B, rvec);
2891 
2892       __ mov(r0, rscratch2);
2893 
2894       __ leave();
2895       __ ret(lr);
2896 
2897     return start;
2898   }
2899 
2900   // Arguments:
2901   //
2902   // Inputs:
2903   //   c_rarg0   - source byte array address
2904   //   c_rarg1   - destination byte array address
2905   //   c_rarg2   - K (key) in little endian int array
2906   //   c_rarg3   - r vector byte array address
2907   //   c_rarg4   - input length
2908   //
2909   // Output:
2910   //   r0        - input length
2911   //
2912   address generate_cipherBlockChaining_decryptAESCrypt() {
2913     assert(UseAES, "need AES cryptographic extension support");
2914     __ align(CodeEntryAlignment);
2915     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2916 
2917     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2918 
2919     const Register from        = c_rarg0;  // source array address
2920     const Register to          = c_rarg1;  // destination array address
2921     const Register key         = c_rarg2;  // key array address
2922     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector,
2923                                            // and left holding the last input ciphertext block
2924     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2925     const Register keylen      = rscratch1;
2926 
2927     address start = __ pc();
2928 
2929       __ enter();
2930 
2931       __ movw(rscratch2, len_reg);
2932 
2933       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2934 
2935       __ ld1(v2, __ T16B, rvec);
2936 
2937       __ ld1(v31, __ T16B, __ post(key, 16));
2938       __ rev32(v31, __ T16B, v31);
2939 
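           // v31 (loaded above) holds the first 16 bytes of the key array and
           // is applied in the final eor of each block. keylen is 44/52/60
           // words for AES-128/192/256; the flags set by the compare below
           // select both the extra round-key loads and the number of rounds
           // run per block in L_aes_loop.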
2940       __ cmpw(keylen, 52);
2941       __ br(Assembler::CC, L_loadkeys_44);
2942       __ br(Assembler::EQ, L_loadkeys_52);
2943 
2944       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2945       __ rev32(v17, __ T16B, v17);
2946       __ rev32(v18, __ T16B, v18);
2947     __ BIND(L_loadkeys_52);
2948       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2949       __ rev32(v19, __ T16B, v19);
2950       __ rev32(v20, __ T16B, v20);
2951     __ BIND(L_loadkeys_44);
2952       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2953       __ rev32(v21, __ T16B, v21);
2954       __ rev32(v22, __ T16B, v22);
2955       __ rev32(v23, __ T16B, v23);
2956       __ rev32(v24, __ T16B, v24);
2957       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2958       __ rev32(v25, __ T16B, v25);
2959       __ rev32(v26, __ T16B, v26);
2960       __ rev32(v27, __ T16B, v27);
2961       __ rev32(v28, __ T16B, v28);
2962       __ ld1(v29, v30, __ T16B, key);
2963       __ rev32(v29, __ T16B, v29);
2964       __ rev32(v30, __ T16B, v30);
2965 
2966     __ BIND(L_aes_loop);
2967       __ ld1(v0, __ T16B, __ post(from, 16));
2968       __ orr(v1, __ T16B, v0, v0);
2969 
2970       __ br(Assembler::CC, L_rounds_44);
2971       __ br(Assembler::EQ, L_rounds_52);
2972 
2973       __ aesd(v0, v17); __ aesimc(v0, v0);
2974       __ aesd(v0, v18); __ aesimc(v0, v0);
2975     __ BIND(L_rounds_52);
2976       __ aesd(v0, v19); __ aesimc(v0, v0);
2977       __ aesd(v0, v20); __ aesimc(v0, v0);
2978     __ BIND(L_rounds_44);
2979       __ aesd(v0, v21); __ aesimc(v0, v0);
2980       __ aesd(v0, v22); __ aesimc(v0, v0);
2981       __ aesd(v0, v23); __ aesimc(v0, v0);
2982       __ aesd(v0, v24); __ aesimc(v0, v0);
2983       __ aesd(v0, v25); __ aesimc(v0, v0);
2984       __ aesd(v0, v26); __ aesimc(v0, v0);
2985       __ aesd(v0, v27); __ aesimc(v0, v0);
2986       __ aesd(v0, v28); __ aesimc(v0, v0);
2987       __ aesd(v0, v29); __ aesimc(v0, v0);
2988       __ aesd(v0, v30);
2989       __ eor(v0, __ T16B, v0, v31);
2990       __ eor(v0, __ T16B, v0, v2);
2991 
2992       __ st1(v0, __ T16B, __ post(to, 16));
2993       __ orr(v2, __ T16B, v1, v1);
2994 
2995       __ subw(len_reg, len_reg, 16);
2996       __ cbnzw(len_reg, L_aes_loop);
2997 
2998       __ st1(v2, __ T16B, rvec);
2999 
3000       __ mov(r0, rscratch2);
3001 
3002       __ leave();
3003       __ ret(lr);
3004 
3005     return start;
3006   }
3007 
3008   // Big-endian 128-bit + 64-bit -> 128-bit addition.
3009   // Inputs: a 128-bit value in `in` (preserved) and a 64-bit increment in `inc`.
3010   // The least-significant 64-bit word is in the upper dword of each vector.
3011   // inc is also preserved; its lower dword must be zero.
3012   // Output: result
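       // For example, if the add in the least-significant dword wraps, the
       // unsigned-higher compare leaves all-ones (-1) in that lane; the
       // 8-byte ext moves it into the most-significant lane, and subtracting
       // -1 there propagates the carry.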
3013   void be_add_128_64(FloatRegister result, FloatRegister in,
3014                      FloatRegister inc, FloatRegister tmp) {
3015     assert_different_registers(result, tmp, inc);
3016 
3017     __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
3018                                            // input
3019     __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
3020     __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
3021                                            // MSD == 0 (must be!) to LSD
3022     __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
3023   }
3024 
3025   // CTR AES crypt.
3026   // Arguments:
3027   //
3028   // Inputs:
3029   //   c_rarg0   - source byte array address
3030   //   c_rarg1   - destination byte array address
3031   //   c_rarg2   - K (key) in little endian int array
3032   //   c_rarg3   - counter vector byte array address
3033   //   c_rarg4   - input length
3034   //   c_rarg5   - saved encryptedCounter start
3035   //   c_rarg6   - saved used length
3036   //
3037   // Output:
3038   //   r0       - input length
3039   //
3040   address generate_counterMode_AESCrypt() {
3041     const Register in = c_rarg0;
3042     const Register out = c_rarg1;
3043     const Register key = c_rarg2;
3044     const Register counter = c_rarg3;
3045     const Register saved_len = c_rarg4, len = r10;
3046     const Register saved_encrypted_ctr = c_rarg5;
3047     const Register used_ptr = c_rarg6, used = r12;
3048 
3049     const Register offset = r7;
3050     const Register keylen = r11;
3051 
3052     const unsigned char block_size = 16;
3053     const int bulk_width = 4;
3054     // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3055     // performance with larger data sizes, but it also means that the
3056     // fast path isn't used until there are at least 8 blocks, so up to
3057     // 127 bytes of data are processed on the slow path. For that
3058     // reason, and also so as not to blow away too much icache, 4
3059     // blocks seems like a sensible compromise.
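         // (With the bulk_width == 4 used here, the bulk path engages at
         // 64 bytes and at most 63 trailing bytes take the slow path.)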
3060 
3061     // Algorithm:
3062     //
3063     //    if (len == 0) {
3064     //        goto DONE;
3065     //    }
3066     //    int result = len;
3067     //    do {
3068     //        if (used >= blockSize) {
3069     //            if (len >= bulk_width * blockSize) {
3070     //                CTR_large_block();
3071     //                if (len == 0)
3072     //                    goto DONE;
3073     //            }
3074     //            for (;;) {
3075     //                16ByteVector v0 = counter;
3076     //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3077     //                used = 0;
3078     //                if (len < blockSize)
3079     //                    break;    /* goto NEXT */
3080     //                16ByteVector v1 = load16Bytes(in, offset);
3081     //                v1 = v1 ^ encryptedCounter;
3082     //                store16Bytes(v1, out, offset);
3083     //                used = blockSize;
3084     //                offset += blockSize;
3085     //                len -= blockSize;
3086     //                if (len == 0)
3087     //                    goto DONE;
3088     //            }
3089     //        }
3090     //      NEXT:
3091     //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3092     //        len--;
3093     //    } while (len != 0);
3094     //  DONE:
3095     //    return result;
3096     //
3097     // CTR_large_block()
3098     //    Wide bulk encryption of whole blocks.
3099 
3100     __ align(CodeEntryAlignment);
3101     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3102     const address start = __ pc();
3103     __ enter();
3104 
3105     Label DONE, CTR_large_block, large_block_return;
3106     __ ldrw(used, Address(used_ptr));
3107     __ cbzw(saved_len, DONE);
3108 
3109     __ mov(len, saved_len);
3110     __ mov(offset, 0);
3111 
3112     // Compute #rounds for AES based on the length of the key array
3113     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3114 
3115     __ aesenc_loadkeys(key, keylen);
3116 
3117     {
3118       Label L_CTR_loop, NEXT;
3119 
3120       __ bind(L_CTR_loop);
3121 
3122       __ cmp(used, block_size);
3123       __ br(__ LO, NEXT);
3124 
3125       // Maybe we have a lot of data
3126       __ subsw(rscratch1, len, bulk_width * block_size);
3127       __ br(__ HS, CTR_large_block);
3128       __ BIND(large_block_return);
3129       __ cbzw(len, DONE);
3130 
3131       // Setup the counter
3132       __ movi(v4, __ T4S, 0);
3133       __ movi(v5, __ T4S, 1);
3134       __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3135 
3136       // 128-bit big-endian increment
3137       __ ld1(v0, __ T16B, counter);
3138       __ rev64(v16, __ T16B, v0);
3139       be_add_128_64(v16, v16, v4, /*tmp*/v5);
3140       __ rev64(v16, __ T16B, v16);
3141       __ st1(v16, __ T16B, counter);
3142       // Previous counter value is in v0
3143       // v4 contains { 0, 1 }
3144 
3145       {
3146         // We have fewer than bulk_width blocks of data left. Encrypt
3147         // them one by one until there is less than a full block
3148         // remaining, being careful to save both the encrypted counter
3149         // and the counter.
3150 
3151         Label inner_loop;
3152         __ bind(inner_loop);
3153         // Counter to encrypt is in v0
3154         __ aesecb_encrypt(noreg, noreg, keylen);
3155         __ st1(v0, __ T16B, saved_encrypted_ctr);
3156 
3157         // Do we have a remaining full block?
3158 
3159         __ mov(used, 0);
3160         __ cmp(len, block_size);
3161         __ br(__ LO, NEXT);
3162 
3163         // Yes, we have a full block
3164         __ ldrq(v1, Address(in, offset));
3165         __ eor(v1, __ T16B, v1, v0);
3166         __ strq(v1, Address(out, offset));
3167         __ mov(used, block_size);
3168         __ add(offset, offset, block_size);
3169 
3170         __ subw(len, len, block_size);
3171         __ cbzw(len, DONE);
3172 
3173         // Increment the counter, store it back
3174         __ orr(v0, __ T16B, v16, v16);
3175         __ rev64(v16, __ T16B, v16);
3176         be_add_128_64(v16, v16, v4, /*tmp*/v5);
3177         __ rev64(v16, __ T16B, v16);
3178         __ st1(v16, __ T16B, counter); // Save the incremented counter back
3179 
3180         __ b(inner_loop);
3181       }
3182 
3183       __ BIND(NEXT);
3184 
3185       // Encrypt a single byte, and loop.
3186       // We expect this to be a rare event.
3187       __ ldrb(rscratch1, Address(in, offset));
3188       __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3189       __ eor(rscratch1, rscratch1, rscratch2);
3190       __ strb(rscratch1, Address(out, offset));
3191       __ add(offset, offset, 1);
3192       __ add(used, used, 1);
3193       __ subw(len, len, 1);
3194       __ cbnzw(len, L_CTR_loop);
3195     }
3196 
3197     __ bind(DONE);
3198     __ strw(used, Address(used_ptr));
3199     __ mov(r0, saved_len);
3200 
3201     __ leave(); // required for proper stackwalking of RuntimeStub frame
3202     __ ret(lr);
3203 
3204     // Bulk encryption
3205 
3206     __ BIND(CTR_large_block);
3207     assert(bulk_width == 4 || bulk_width == 8, "must be");
3208 
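         // v8..v15 have callee-saved lower halves under the AArch64 procedure
         // call standard and are clobbered by the bulk loop, so spill them
         // around it.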
3209     if (bulk_width == 8) {
3210       __ sub(sp, sp, 4 * 16);
3211       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3212     }
3213     __ sub(sp, sp, 4 * 16);
3214     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3215     RegSet saved_regs = (RegSet::of(in, out, offset)
3216                          + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3217     __ push(saved_regs, sp);
3218     __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
3219     __ add(in, in, offset);
3220     __ add(out, out, offset);
3221 
3222     // Keys should already be loaded into the correct registers
3223 
3224     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3225     __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3226 
3227     // AES/CTR loop
3228     {
3229       Label L_CTR_loop;
3230       __ BIND(L_CTR_loop);
3231 
3232       // Setup the counters
3233       __ movi(v8, __ T4S, 0);
3234       __ movi(v9, __ T4S, 1);
3235       __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3236 
3237       for (int i = 0; i < bulk_width; i++) {
3238         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3239         __ rev64(v0_ofs, __ T16B, v16);
3240         be_add_128_64(v16, v16, v8, /*tmp*/v9);
3241       }
3242 
3243       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3244 
3245       // Encrypt the counters
3246       __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3247 
3248       if (bulk_width == 8) {
3249         __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3250       }
3251 
3252       // XOR the encrypted counters with the inputs
3253       for (int i = 0; i < bulk_width; i++) {
3254         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3255         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3256         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3257       }
3258 
3259       // Write the encrypted data
3260       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3261       if (bulk_width == 8) {
3262         __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3263       }
3264 
3265       __ subw(len, len, 16 * bulk_width);
3266       __ cbnzw(len, L_CTR_loop);
3267     }
3268 
3269     // Save the counter back where it goes
3270     __ rev64(v16, __ T16B, v16);
3271     __ st1(v16, __ T16B, counter);
3272 
3273     __ pop(saved_regs, sp);
3274 
3275     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3276     if (bulk_width == 8) {
3277       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3278     }
3279 
3280     __ andr(rscratch1, len, -16 * bulk_width);
3281     __ sub(len, len, rscratch1);
3282     __ add(offset, offset, rscratch1);
3283     __ mov(used, 16);
3284     __ strw(used, Address(used_ptr));
3285     __ b(large_block_return);
3286 
3287     return start;
3288   }
3289 
3290   // Vector AES Galois Counter Mode implementation. Parameters:
3291   //
3292   // in = c_rarg0
3293   // len = c_rarg1
3294   // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3295   // out = c_rarg3
3296   // key = c_rarg4
3297   // state = c_rarg5 - GHASH.state
3298   // subkeyHtbl = c_rarg6 - powers of H
3299   // counter = c_rarg7 - 16 bytes of CTR
3300   // return - number of processed bytes
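       //
       // The stub first encrypts with an 8-way unrolled AES/CTR pass (the
       // length is rounded down to a multiple of 128 bytes up front), then
       // runs GHASH over the ciphertext buffer (ct) with a 4-way unrolled
       // loop, and returns the number of bytes processed.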
3301   address generate_galoisCounterMode_AESCrypt() {
3302     address ghash_polynomial = __ pc();
3303     __ emit_int64(0x87);  // The low-order bits of the field
3304                           // polynomial (i.e. p = z^7+z^2+z+1)
3305                           // repeated in the low and high parts of a
3306                           // 128-bit vector
3307     __ emit_int64(0x87);
3308 
3309     __ align(CodeEntryAlignment);
3310     StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
3311     address start = __ pc();
3312     __ enter();
3313 
3314     const Register in = c_rarg0;
3315     const Register len = c_rarg1;
3316     const Register ct = c_rarg2;
3317     const Register out = c_rarg3;
3319 
3320     const Register key = c_rarg4;
3321     const Register state = c_rarg5;
3322 
3323     const Register subkeyHtbl = c_rarg6;
3324 
3325     const Register counter = c_rarg7; // updated with the incremented counter in the end
3326 
3327     const Register keylen = r10;
3328     // Save state before entering routine
3329     __ sub(sp, sp, 4 * 16);
3330     __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3331     __ sub(sp, sp, 4 * 16);
3332     __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3333 
3334     // __ andr(len, len, -512);
3335     __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
3336     __ str(len, __ pre(sp, -2 * wordSize));
3337 
3338     Label DONE;
3339     __ cbz(len, DONE);
3340 
3341     // Compute #rounds for AES based on the length of the key array
3342     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3343 
3344     __ aesenc_loadkeys(key, keylen);
3345     __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3346     __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3347 
3348     // AES/CTR loop
3349     {
3350       Label L_CTR_loop;
3351       __ BIND(L_CTR_loop);
3352 
3353       // Setup the counters
3354       __ movi(v8, __ T4S, 0);
3355       __ movi(v9, __ T4S, 1);
3356       __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3357 
3358       assert(v0->encoding() < v8->encoding(), "");
3359       for (int i = v0->encoding(); i < v8->encoding(); i++) {
3360         FloatRegister f = as_FloatRegister(i);
3361         __ rev32(f, __ T16B, v16);
3362         __ addv(v16, __ T4S, v16, v8);
3363       }
3364 
3365       __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3366 
3367       // Encrypt the counters
3368       __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3369 
3370       __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3371 
3372       // XOR the encrypted counters with the inputs
3373       for (int i = 0; i < 8; i++) {
3374         FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3375         FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3376         __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3377       }
3378       __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3379       __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3380 
3381       __ subw(len, len, 16 * 8);
3382       __ cbnzw(len, L_CTR_loop);
3383     }
3384 
3385     __ rev32(v16, __ T16B, v16);
3386     __ st1(v16, __ T16B, counter);
3387 
3388     __ ldr(len, Address(sp));
3389     __ lsr(len, len, exact_log2(16));  // We want the count of blocks
3390 
3391     // GHASH/CTR loop
3392     __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3393                                 len, /*unrolls*/4);
3394 
3395 #ifdef ASSERT
3396     { Label L;
3397       __ cmp(len, (unsigned char)0);
3398       __ br(Assembler::EQ, L);
3399       __ stop("stubGenerator: abort");
3400       __ bind(L);
3401     }
3402 #endif
3403 
3404     __ bind(DONE);
3405     // Return the number of bytes processed
3406     __ ldr(r0, __ post(sp, 2 * wordSize));
3407 
3408     __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3409     __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3410 
3411     __ leave(); // required for proper stackwalking of RuntimeStub frame
3412     __ ret(lr);
3413     return start;
3414   }
3415 
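       // Caches one 64-byte input block in eight 64-bit general-purpose
       // registers so the MD5 rounds can extract 4-byte message words without
       // reloading them from memory.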
3416   class Cached64Bytes {
3417   private:
3418     MacroAssembler *_masm;
3419     Register _regs[8];
3420 
3421   public:
3422     Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3423       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3424       auto it = rs.begin();
3425       for (auto &r: _regs) {
3426         r = *it;
3427         ++it;
3428       }
3429     }
3430 
3431     void gen_loads(Register base) {
3432       for (int i = 0; i < 8; i += 2) {
3433         __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3434       }
3435     }
3436 
3437     // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3438     void extract_u32(Register dest, int i) {
3439       __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3440     }
3441   };
3442 
3443   // Utility routines for md5.
3444   // Clobbers r10 and r11.
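       // Each helper performs one MD5 step:
       //   r1 = r2 + rotl32(r1 + f(r2, r3, r4) + X[k] + t, s)
       // where f is F(x,y,z) = (x & y) | (~x & z) for FF,
       // G(x,y,z) = (x & z) | (y & ~z) for GG, H(x,y,z) = x ^ y ^ z for HH,
       // and I(x,y,z) = y ^ (x | ~z) for II (see RFC 1321).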
3445   void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3446               int k, int s, int t) {
3447     Register rscratch3 = r10;
3448     Register rscratch4 = r11;
3449 
3450     __ eorw(rscratch3, r3, r4);
3451     __ movw(rscratch2, t);
3452     __ andw(rscratch3, rscratch3, r2);
3453     __ addw(rscratch4, r1, rscratch2);
3454     reg_cache.extract_u32(rscratch1, k);
3455     __ eorw(rscratch3, rscratch3, r4);
3456     __ addw(rscratch4, rscratch4, rscratch1);
3457     __ addw(rscratch3, rscratch3, rscratch4);
3458     __ rorw(rscratch2, rscratch3, 32 - s);
3459     __ addw(r1, rscratch2, r2);
3460   }
3461 
3462   void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3463               int k, int s, int t) {
3464     Register rscratch3 = r10;
3465     Register rscratch4 = r11;
3466 
3467     reg_cache.extract_u32(rscratch1, k);
3468     __ movw(rscratch2, t);
3469     __ addw(rscratch4, r1, rscratch2);
3470     __ addw(rscratch4, rscratch4, rscratch1);
3471     __ bicw(rscratch2, r3, r4);
3472     __ andw(rscratch3, r2, r4);
3473     __ addw(rscratch2, rscratch2, rscratch4);
3474     __ addw(rscratch2, rscratch2, rscratch3);
3475     __ rorw(rscratch2, rscratch2, 32 - s);
3476     __ addw(r1, rscratch2, r2);
3477   }
3478 
3479   void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3480               int k, int s, int t) {
3481     Register rscratch3 = r10;
3482     Register rscratch4 = r11;
3483 
3484     __ eorw(rscratch3, r3, r4);
3485     __ movw(rscratch2, t);
3486     __ addw(rscratch4, r1, rscratch2);
3487     reg_cache.extract_u32(rscratch1, k);
3488     __ eorw(rscratch3, rscratch3, r2);
3489     __ addw(rscratch4, rscratch4, rscratch1);
3490     __ addw(rscratch3, rscratch3, rscratch4);
3491     __ rorw(rscratch2, rscratch3, 32 - s);
3492     __ addw(r1, rscratch2, r2);
3493   }
3494 
3495   void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3496               int k, int s, int t) {
3497     Register rscratch3 = r10;
3498     Register rscratch4 = r11;
3499 
3500     __ movw(rscratch3, t);
3501     __ ornw(rscratch2, r2, r4);
3502     __ addw(rscratch4, r1, rscratch3);
3503     reg_cache.extract_u32(rscratch1, k);
3504     __ eorw(rscratch3, rscratch2, r3);
3505     __ addw(rscratch4, rscratch4, rscratch1);
3506     __ addw(rscratch3, rscratch3, rscratch4);
3507     __ rorw(rscratch2, rscratch3, 32 - s);
3508     __ addw(r1, rscratch2, r2);
3509   }
3510 
3511   // Arguments:
3512   //
3513   // Inputs:
3514   //   c_rarg0   - byte[]  source+offset
3515   //   c_rarg1   - int[]   MD5.state
3516   //   c_rarg2   - int     offset
3517   //   c_rarg3   - int     limit
3518   //
3519   address generate_md5_implCompress(bool multi_block, const char *name) {
3520     __ align(CodeEntryAlignment);
3521     StubCodeMark mark(this, "StubRoutines", name);
3522     address start = __ pc();
3523 
3524     Register buf       = c_rarg0;
3525     Register state     = c_rarg1;
3526     Register ofs       = c_rarg2;
3527     Register limit     = c_rarg3;
3528     Register a         = r4;
3529     Register b         = r5;
3530     Register c         = r6;
3531     Register d         = r7;
3532     Register rscratch3 = r10;
3533     Register rscratch4 = r11;
3534 
3535     Register state_regs[2] = { r12, r13 };
3536     RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3537     Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers
3538 
3539     __ push(saved_regs, sp);
3540 
3541     __ ldp(state_regs[0], state_regs[1], Address(state));
3542     __ ubfx(a, state_regs[0],  0, 32);
3543     __ ubfx(b, state_regs[0], 32, 32);
3544     __ ubfx(c, state_regs[1],  0, 32);
3545     __ ubfx(d, state_regs[1], 32, 32);
3546 
3547     Label md5_loop;
3548     __ BIND(md5_loop);
3549 
3550     reg_cache.gen_loads(buf);
3551 
3552     // Round 1
3553     md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
3554     md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
3555     md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
3556     md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
3557     md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
3558     md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
3559     md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
3560     md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
3561     md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
3562     md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
3563     md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3564     md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3565     md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
3566     md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3567     md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3568     md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3569 
3570     // Round 2
3571     md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
3572     md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
3573     md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3574     md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
3575     md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
3576     md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
3577     md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3578     md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
3579     md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
3580     md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
3581     md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
3582     md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
3583     md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
3584     md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
3585     md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
3586     md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3587 
3588     // Round 3
3589     md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
3590     md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
3591     md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3592     md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3593     md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
3594     md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
3595     md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
3596     md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3597     md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
3598     md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
3599     md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
3600     md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
3601     md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
3602     md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3603     md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3604     md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);
3605 
3606     // Round 4
3607     md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
3608     md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
3609     md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3610     md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
3611     md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
3612     md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
3613     md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3614     md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
3615     md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
3616     md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3617     md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
3618     md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3619     md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
3620     md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3621     md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
3622     md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);
3623 
3624     __ addw(a, state_regs[0], a);
3625     __ ubfx(rscratch2, state_regs[0], 32, 32);
3626     __ addw(b, rscratch2, b);
3627     __ addw(c, state_regs[1], c);
3628     __ ubfx(rscratch4, state_regs[1], 32, 32);
3629     __ addw(d, rscratch4, d);
3630 
3631     __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3632     __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3633 
3634     if (multi_block) {
3635       __ add(buf, buf, 64);
3636       __ add(ofs, ofs, 64);
3637       __ cmp(ofs, limit);
3638       __ br(Assembler::LE, md5_loop);
3639       __ mov(c_rarg0, ofs); // return ofs
3640     }
3641 
3642     // write hash values back in the correct order
3643     __ stp(state_regs[0], state_regs[1], Address(state));
3644 
3645     __ pop(saved_regs, sp);
3646 
3647     __ ret(lr);
3648 
3649     return start;
3650   }
3651 
3652   // Arguments:
3653   //
3654   // Inputs:
3655   //   c_rarg0   - byte[]  source+offset
3656   //   c_rarg1   - int[]   SHA.state
3657   //   c_rarg2   - int     offset
3658   //   c_rarg3   - int     limit
3659   //
3660   address generate_sha1_implCompress(bool multi_block, const char *name) {
3661     __ align(CodeEntryAlignment);
3662     StubCodeMark mark(this, "StubRoutines", name);
3663     address start = __ pc();
3664 
3665     Register buf   = c_rarg0;
3666     Register state = c_rarg1;
3667     Register ofs   = c_rarg2;
3668     Register limit = c_rarg3;
3669 
3670     Label keys;
3671     Label sha1_loop;
3672 
3673     // load the keys into v0..v3
3674     __ adr(rscratch1, keys);
3675     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3676     // load 5 words state into v6, v7
3677     __ ldrq(v6, Address(state, 0));
3678     __ ldrs(v7, Address(state, 16));
3679 
3680 
3681     __ BIND(sha1_loop);
3682     // load 64 bytes of data into v16..v19
3683     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3684     __ rev32(v16, __ T16B, v16);
3685     __ rev32(v17, __ T16B, v17);
3686     __ rev32(v18, __ T16B, v18);
3687     __ rev32(v19, __ T16B, v19);
3688 
3689     // do the sha1
3690     __ addv(v4, __ T4S, v16, v0);
3691     __ orr(v20, __ T16B, v6, v6);
3692 
3693     FloatRegister d0 = v16;
3694     FloatRegister d1 = v17;
3695     FloatRegister d2 = v18;
3696     FloatRegister d3 = v19;
3697 
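         // 20 iterations of four SHA-1 rounds each: sha1c covers the Ch
         // rounds (0..19), sha1p the Parity rounds (20..39 and 60..79) and
         // sha1m the Maj rounds (40..59); sha1su0/sha1su1 extend the message
         // schedule as we go.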
3698     for (int round = 0; round < 20; round++) {
3699       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3700       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3701       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3702       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3703       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3704 
3705       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3706       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3707       __ sha1h(tmp2, __ T4S, v20);
3708       if (round < 5)
3709         __ sha1c(v20, __ T4S, tmp3, tmp4);
3710       else if (round < 10 || round >= 15)
3711         __ sha1p(v20, __ T4S, tmp3, tmp4);
3712       else
3713         __ sha1m(v20, __ T4S, tmp3, tmp4);
3714       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3715 
3716       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3717     }
3718 
3719     __ addv(v7, __ T2S, v7, v21);
3720     __ addv(v6, __ T4S, v6, v20);
3721 
3722     if (multi_block) {
3723       __ add(ofs, ofs, 64);
3724       __ cmp(ofs, limit);
3725       __ br(Assembler::LE, sha1_loop);
3726       __ mov(c_rarg0, ofs); // return ofs
3727     }
3728 
3729     __ strq(v6, Address(state, 0));
3730     __ strs(v7, Address(state, 16));
3731 
3732     __ ret(lr);
3733 
3734     __ bind(keys);
3735     __ emit_int32(0x5a827999);
3736     __ emit_int32(0x6ed9eba1);
3737     __ emit_int32(0x8f1bbcdc);
3738     __ emit_int32(0xca62c1d6);
3739 
3740     return start;
3741   }
3742 
3743 
3744   // Arguments:
3745   //
3746   // Inputs:
3747   //   c_rarg0   - byte[]  source+offset
3748   //   c_rarg1   - int[]   SHA.state
3749   //   c_rarg2   - int     offset
3750   //   c_rarg3   - int     limit
3751   //
3752   address generate_sha256_implCompress(bool multi_block, const char *name) {
3753     static const uint32_t round_consts[64] = {
3754       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3755       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3756       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3757       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3758       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3759       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3760       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3761       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3762       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3763       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3764       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3765       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3766       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3767       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3768       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3769       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3770     };
3771     __ align(CodeEntryAlignment);
3772     StubCodeMark mark(this, "StubRoutines", name);
3773     address start = __ pc();
3774 
3775     Register buf   = c_rarg0;
3776     Register state = c_rarg1;
3777     Register ofs   = c_rarg2;
3778     Register limit = c_rarg3;
3779 
3780     Label sha1_loop;
3781 
3782     __ stpd(v8, v9, __ pre(sp, -32));
3783     __ stpd(v10, v11, Address(sp, 16));
3784 
3785 // dga == v0
3786 // dgb == v1
3787 // dg0 == v2
3788 // dg1 == v3
3789 // dg2 == v4
3790 // t0 == v6
3791 // t1 == v7
3792 
3793     // load 16 keys to v16..v31
3794     __ lea(rscratch1, ExternalAddress((address)round_consts));
3795     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3796     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3797     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3798     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3799 
3800     // load 8 words (256 bits) state
3801     __ ldpq(v0, v1, state);
3802 
3803     __ BIND(sha1_loop);
3804     // load 64 bytes of data into v8..v11
3805     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3806     __ rev32(v8, __ T16B, v8);
3807     __ rev32(v9, __ T16B, v9);
3808     __ rev32(v10, __ T16B, v10);
3809     __ rev32(v11, __ T16B, v11);
3810 
3811     __ addv(v6, __ T4S, v8, v16);
3812     __ orr(v2, __ T16B, v0, v0);
3813     __ orr(v3, __ T16B, v1, v1);
3814 
3815     FloatRegister d0 = v8;
3816     FloatRegister d1 = v9;
3817     FloatRegister d2 = v10;
3818     FloatRegister d3 = v11;
3819 
3820 
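         // 16 iterations of four SHA-256 rounds each; the first 12 also
         // extend the message schedule with sha256su0/sha256su1. The round
         // constants come from v16..v31 loaded above.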
3821     for (int round = 0; round < 16; round++) {
3822       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3823       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3824       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3825       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3826 
3827       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3828        __ orr(v4, __ T16B, v2, v2);
3829       if (round < 15)
3830         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3831       __ sha256h(v2, __ T4S, v3, tmp2);
3832       __ sha256h2(v3, __ T4S, v4, tmp2);
3833       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3834 
3835       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3836     }
3837 
3838     __ addv(v0, __ T4S, v0, v2);
3839     __ addv(v1, __ T4S, v1, v3);
3840 
3841     if (multi_block) {
3842       __ add(ofs, ofs, 64);
3843       __ cmp(ofs, limit);
3844       __ br(Assembler::LE, sha1_loop);
3845       __ mov(c_rarg0, ofs); // return ofs
3846     }
3847 
3848     __ ldpd(v10, v11, Address(sp, 16));
3849     __ ldpd(v8, v9, __ post(sp, 32));
3850 
3851     __ stpq(v0, v1, state);
3852 
3853     __ ret(lr);
3854 
3855     return start;
3856   }
3857 
3858   // Double rounds for sha512.
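       // Each call performs two of the 80 SHA-512 rounds with sha512h and
       // sha512h2; calls with dr < 32 also extend the message schedule
       // (sha512su0/sha512su1) and calls with dr < 36 preload the next pair
       // of round constants into vrc1.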
3859   void sha512_dround(int dr,
3860                      FloatRegister vi0, FloatRegister vi1,
3861                      FloatRegister vi2, FloatRegister vi3,
3862                      FloatRegister vi4, FloatRegister vrc0,
3863                      FloatRegister vrc1, FloatRegister vin0,
3864                      FloatRegister vin1, FloatRegister vin2,
3865                      FloatRegister vin3, FloatRegister vin4) {
3866       if (dr < 36) {
3867         __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
3868       }
3869       __ addv(v5, __ T2D, vrc0, vin0);
3870       __ ext(v6, __ T16B, vi2, vi3, 8);
3871       __ ext(v5, __ T16B, v5, v5, 8);
3872       __ ext(v7, __ T16B, vi1, vi2, 8);
3873       __ addv(vi3, __ T2D, vi3, v5);
3874       if (dr < 32) {
3875         __ ext(v5, __ T16B, vin3, vin4, 8);
3876         __ sha512su0(vin0, __ T2D, vin1);
3877       }
3878       __ sha512h(vi3, __ T2D, v6, v7);
3879       if (dr < 32) {
3880         __ sha512su1(vin0, __ T2D, vin2, v5);
3881       }
3882       __ addv(vi4, __ T2D, vi1, vi3);
3883       __ sha512h2(vi3, __ T2D, vi1, vi0);
3884   }
3885 
3886   // Arguments:
3887   //
3888   // Inputs:
3889   //   c_rarg0   - byte[]  source+offset
3890   //   c_rarg1   - int[]   SHA.state
3891   //   c_rarg2   - int     offset
3892   //   c_rarg3   - int     limit
3893   //
3894   address generate_sha512_implCompress(bool multi_block, const char *name) {
3895     static const uint64_t round_consts[80] = {
3896       0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
3897       0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
3898       0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
3899       0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
3900       0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
3901       0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
3902       0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
3903       0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
3904       0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
3905       0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
3906       0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
3907       0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
3908       0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
3909       0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
3910       0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
3911       0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
3912       0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
3913       0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
3914       0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
3915       0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
3916       0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
3917       0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
3918       0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
3919       0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
3920       0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
3921       0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
3922       0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
3923     };
3924 
3925     __ align(CodeEntryAlignment);
3926     StubCodeMark mark(this, "StubRoutines", name);
3927     address start = __ pc();
3928 
3929     Register buf   = c_rarg0;
3930     Register state = c_rarg1;
3931     Register ofs   = c_rarg2;
3932     Register limit = c_rarg3;
3933 
3934     __ stpd(v8, v9, __ pre(sp, -64));
3935     __ stpd(v10, v11, Address(sp, 16));
3936     __ stpd(v12, v13, Address(sp, 32));
3937     __ stpd(v14, v15, Address(sp, 48));
3938 
3939     Label sha512_loop;
3940 
3941     // load state
3942     __ ld1(v8, v9, v10, v11, __ T2D, state);
3943 
3944     // load first 4 round constants
3945     __ lea(rscratch1, ExternalAddress((address)round_consts));
3946     __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
3947 
3948     __ BIND(sha512_loop);
3949     // load 128B of data into v12..v19
3950     __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
3951     __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
3952     __ rev64(v12, __ T16B, v12);
3953     __ rev64(v13, __ T16B, v13);
3954     __ rev64(v14, __ T16B, v14);
3955     __ rev64(v15, __ T16B, v15);
3956     __ rev64(v16, __ T16B, v16);
3957     __ rev64(v17, __ T16B, v17);
3958     __ rev64(v18, __ T16B, v18);
3959     __ rev64(v19, __ T16B, v19);
3960 
3961     __ mov(rscratch2, rscratch1);
3962 
3963     __ mov(v0, __ T16B, v8);
3964     __ mov(v1, __ T16B, v9);
3965     __ mov(v2, __ T16B, v10);
3966     __ mov(v3, __ T16B, v11);
3967 
3968     sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
3969     sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
3970     sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
3971     sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
3972     sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
3973     sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
3974     sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
3975     sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
3976     sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
3977     sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
3978     sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
3979     sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
3980     sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
3981     sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
3982     sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
3983     sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
3984     sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
3985     sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
3986     sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
3987     sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
3988     sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
3989     sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
3990     sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
3991     sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
3992     sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
3993     sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
3994     sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
3995     sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
3996     sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
3997     sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
3998     sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
3999     sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4000     sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12,  v0,  v0,  v0,  v0);
4001     sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13,  v0,  v0,  v0,  v0);
4002     sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14,  v0,  v0,  v0,  v0);
4003     sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15,  v0,  v0,  v0,  v0);
4004     sha512_dround(36, v3, v0, v4, v2, v1, v24,  v0, v16,  v0,  v0,  v0,  v0);
4005     sha512_dround(37, v2, v3, v1, v4, v0, v25,  v0, v17,  v0,  v0,  v0,  v0);
4006     sha512_dround(38, v4, v2, v0, v1, v3, v26,  v0, v18,  v0,  v0,  v0,  v0);
4007     sha512_dround(39, v1, v4, v3, v0, v2, v27,  v0, v19,  v0,  v0,  v0,  v0);
4008 
4009     __ addv(v8, __ T2D, v8, v0);
4010     __ addv(v9, __ T2D, v9, v1);
4011     __ addv(v10, __ T2D, v10, v2);
4012     __ addv(v11, __ T2D, v11, v3);
4013 
4014     if (multi_block) {
4015       __ add(ofs, ofs, 128);
4016       __ cmp(ofs, limit);
4017       __ br(Assembler::LE, sha512_loop);
4018       __ mov(c_rarg0, ofs); // return ofs
4019     }
4020 
4021     __ st1(v8, v9, v10, v11, __ T2D, state);
4022 
4023     __ ldpd(v14, v15, Address(sp, 48));
4024     __ ldpd(v12, v13, Address(sp, 32));
4025     __ ldpd(v10, v11, Address(sp, 16));
4026     __ ldpd(v8, v9, __ post(sp, 64));
4027 
4028     __ ret(lr);
4029 
4030     return start;
4031   }
4032 
4033   // Arguments:
4034   //
4035   // Inputs:
4036   //   c_rarg0   - byte[]  source+offset
4037   //   c_rarg1   - byte[]  SHA.state
4038   //   c_rarg2   - int     block_size
4039   //   c_rarg3   - int     offset
4040   //   c_rarg4   - int     limit
4041   //
4042   address generate_sha3_implCompress(bool multi_block, const char *name) {
4043     static const uint64_t round_consts[24] = {
4044       0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4045       0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4046       0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4047       0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4048       0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4049       0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4050       0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4051       0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4052     };
4053 
4054     __ align(CodeEntryAlignment);
4055     StubCodeMark mark(this, "StubRoutines", name);
4056     address start = __ pc();
4057 
4058     Register buf           = c_rarg0;
4059     Register state         = c_rarg1;
4060     Register block_size    = c_rarg2;
4061     Register ofs           = c_rarg3;
4062     Register limit         = c_rarg4;
4063 
4064     Label sha3_loop, rounds24_loop;
4065     Label sha3_512_or_sha3_384, shake128;
4066 
4067     __ stpd(v8, v9, __ pre(sp, -64));
4068     __ stpd(v10, v11, Address(sp, 16));
4069     __ stpd(v12, v13, Address(sp, 32));
4070     __ stpd(v14, v15, Address(sp, 48));
4071 
4072     // load state
4073     __ add(rscratch1, state, 32);
4074     __ ld1(v0, v1, v2,  v3,  __ T1D, state);
4075     __ ld1(v4, v5, v6,  v7,  __ T1D, __ post(rscratch1, 32));
4076     __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4077     __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4078     __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4079     __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4080     __ ld1(v24, __ T1D, rscratch1);
4081 
4082     __ BIND(sha3_loop);
4083 
4084     // 24 keccak rounds
4085     __ movw(rscratch2, 24);
4086 
4087     // load round_constants base
4088     __ lea(rscratch1, ExternalAddress((address) round_consts));
4089 
4090     // load input
4091     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4092     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4093     __ eor(v0, __ T8B, v0, v25);
4094     __ eor(v1, __ T8B, v1, v26);
4095     __ eor(v2, __ T8B, v2, v27);
4096     __ eor(v3, __ T8B, v3, v28);
4097     __ eor(v4, __ T8B, v4, v29);
4098     __ eor(v5, __ T8B, v5, v30);
4099     __ eor(v6, __ T8B, v6, v31);
4100 
4101     // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4102     __ tbz(block_size, 7, sha3_512_or_sha3_384);
4103 
4104     __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4105     __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4106     __ eor(v7, __ T8B, v7, v25);
4107     __ eor(v8, __ T8B, v8, v26);
4108     __ eor(v9, __ T8B, v9, v27);
4109     __ eor(v10, __ T8B, v10, v28);
4110     __ eor(v11, __ T8B, v11, v29);
4111     __ eor(v12, __ T8B, v12, v30);
4112     __ eor(v13, __ T8B, v13, v31);
4113 
4114     __ ld1(v25, v26, v27,  __ T8B, __ post(buf, 24));
4115     __ eor(v14, __ T8B, v14, v25);
4116     __ eor(v15, __ T8B, v15, v26);
4117     __ eor(v16, __ T8B, v16, v27);
4118 
4119     // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4120     __ andw(c_rarg5, block_size, 48);
4121     __ cbzw(c_rarg5, rounds24_loop);
4122 
4123     __ tbnz(block_size, 5, shake128);
4124     // block_size == 144, bit5 == 0, SHA3-224
4125     __ ldrd(v28, __ post(buf, 8));
4126     __ eor(v17, __ T8B, v17, v28);
4127     __ b(rounds24_loop);
4128 
4129     __ BIND(shake128);
4130     __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4131     __ eor(v17, __ T8B, v17, v28);
4132     __ eor(v18, __ T8B, v18, v29);
4133     __ eor(v19, __ T8B, v19, v30);
4134     __ eor(v20, __ T8B, v20, v31);
4135     __ b(rounds24_loop); // block_size == 168, SHAKE128
4136 
4137     __ BIND(sha3_512_or_sha3_384);
4138     __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4139     __ eor(v7, __ T8B, v7, v25);
4140     __ eor(v8, __ T8B, v8, v26);
4141     __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4142 
4143     // SHA3-384
4144     __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4145     __ eor(v9,  __ T8B, v9,  v27);
4146     __ eor(v10, __ T8B, v10, v28);
4147     __ eor(v11, __ T8B, v11, v29);
4148     __ eor(v12, __ T8B, v12, v30);
4149 
4150     __ BIND(rounds24_loop);
4151     __ subw(rscratch2, rscratch2, 1);
4152 
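    // Keccak-f[1600] round (FIPS 202). Theta step, part 1: the eor3
    // (three-way xor) instructions below compute the five column parities
    // C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4] into v25..v29.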
4153     __ eor3(v29, __ T16B, v4, v9, v14);
4154     __ eor3(v26, __ T16B, v1, v6, v11);
4155     __ eor3(v28, __ T16B, v3, v8, v13);
4156     __ eor3(v25, __ T16B, v0, v5, v10);
4157     __ eor3(v27, __ T16B, v2, v7, v12);
4158     __ eor3(v29, __ T16B, v29, v19, v24);
4159     __ eor3(v26, __ T16B, v26, v16, v21);
4160     __ eor3(v28, __ T16B, v28, v18, v23);
4161     __ eor3(v25, __ T16B, v25, v15, v20);
4162     __ eor3(v27, __ T16B, v27, v17, v22);
4163 
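    // Theta step, part 2: rax1 computes n ^ rol64(m, 1), yielding the five
    // D[x] = C[x-1] ^ rol64(C[x+1], 1) values.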
4164     __ rax1(v30, __ T2D, v29, v26);
4165     __ rax1(v26, __ T2D, v26, v28);
4166     __ rax1(v28, __ T2D, v28, v25);
4167     __ rax1(v25, __ T2D, v25, v27);
4168     __ rax1(v27, __ T2D, v27, v29);
4169 
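    // Apply theta (xor each lane with its D[x]) combined with the rho rotations
    // and the pi lane permutation: xar(d, n, m, imm) computes ror64(n ^ m, imm),
    // so the immediates are written as (64 - r) to get the left rotation r of rho.
    // Lane (0, 0) has rotation 0, hence the plain eor for v0.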
4170     __ eor(v0, __ T16B, v0, v30);
4171     __ xar(v29, __ T2D, v1,  v25, (64 - 1));
4172     __ xar(v1,  __ T2D, v6,  v25, (64 - 44));
4173     __ xar(v6,  __ T2D, v9,  v28, (64 - 20));
4174     __ xar(v9,  __ T2D, v22, v26, (64 - 61));
4175     __ xar(v22, __ T2D, v14, v28, (64 - 39));
4176     __ xar(v14, __ T2D, v20, v30, (64 - 18));
4177     __ xar(v31, __ T2D, v2,  v26, (64 - 62));
4178     __ xar(v2,  __ T2D, v12, v26, (64 - 43));
4179     __ xar(v12, __ T2D, v13, v27, (64 - 25));
4180     __ xar(v13, __ T2D, v19, v28, (64 - 8));
4181     __ xar(v19, __ T2D, v23, v27, (64 - 56));
4182     __ xar(v23, __ T2D, v15, v30, (64 - 41));
4183     __ xar(v15, __ T2D, v4,  v28, (64 - 27));
4184     __ xar(v28, __ T2D, v24, v28, (64 - 14));
4185     __ xar(v24, __ T2D, v21, v25, (64 - 2));
4186     __ xar(v8,  __ T2D, v8,  v27, (64 - 55));
4187     __ xar(v4,  __ T2D, v16, v25, (64 - 45));
4188     __ xar(v16, __ T2D, v5,  v30, (64 - 36));
4189     __ xar(v5,  __ T2D, v3,  v27, (64 - 28));
4190     __ xar(v27, __ T2D, v18, v27, (64 - 21));
4191     __ xar(v3,  __ T2D, v17, v26, (64 - 15));
4192     __ xar(v25, __ T2D, v11, v25, (64 - 10));
4193     __ xar(v26, __ T2D, v7,  v26, (64 - 6));
4194     __ xar(v30, __ T2D, v10, v30, (64 - 3));
4195 
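    // Chi step: each bcax computes x ^ (y & ~z), implementing
    // A'[x,y] = A[x,y] ^ (~A[x+1,y] & A[x+2,y]) one row (five lanes) at a time.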
4196     __ bcax(v20, __ T16B, v31, v22, v8);
4197     __ bcax(v21, __ T16B, v8,  v23, v22);
4198     __ bcax(v22, __ T16B, v22, v24, v23);
4199     __ bcax(v23, __ T16B, v23, v31, v24);
4200     __ bcax(v24, __ T16B, v24, v8,  v31);
4201 
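    // Load the round constant for the iota step, replicated into both 64-bit lanes.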
4202     __ ld1r(v31, __ T2D, __ post(rscratch1, 8));
4203 
4204     __ bcax(v17, __ T16B, v25, v19, v3);
4205     __ bcax(v18, __ T16B, v3,  v15, v19);
4206     __ bcax(v19, __ T16B, v19, v16, v15);
4207     __ bcax(v15, __ T16B, v15, v25, v16);
4208     __ bcax(v16, __ T16B, v16, v3,  v25);
4209 
4210     __ bcax(v10, __ T16B, v29, v12, v26);
4211     __ bcax(v11, __ T16B, v26, v13, v12);
4212     __ bcax(v12, __ T16B, v12, v14, v13);
4213     __ bcax(v13, __ T16B, v13, v29, v14);
4214     __ bcax(v14, __ T16B, v14, v26, v29);
4215 
4216     __ bcax(v7, __ T16B, v30, v9,  v4);
4217     __ bcax(v8, __ T16B, v4,  v5,  v9);
4218     __ bcax(v9, __ T16B, v9,  v6,  v5);
4219     __ bcax(v5, __ T16B, v5,  v30, v6);
4220     __ bcax(v6, __ T16B, v6,  v4,  v30);
4221 
4222     __ bcax(v3, __ T16B, v27, v0,  v28);
4223     __ bcax(v4, __ T16B, v28, v1,  v0);
4224     __ bcax(v0, __ T16B, v0,  v2,  v1);
4225     __ bcax(v1, __ T16B, v1,  v27, v2);
4226     __ bcax(v2, __ T16B, v2,  v28, v27);
4227 
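    // Iota step: xor the round constant into lane (0, 0).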
4228     __ eor(v0, __ T16B, v0, v31);
4229 
4230     __ cbnzw(rscratch2, rounds24_loop);
4231 
4232     if (multi_block) {
4233       __ add(ofs, ofs, block_size);
4234       __ cmp(ofs, limit);
4235       __ br(Assembler::LE, sha3_loop);
4236       __ mov(c_rarg0, ofs); // return ofs
4237     }
4238 
4239     __ st1(v0, v1, v2,  v3,  __ T1D, __ post(state, 32));
4240     __ st1(v4, v5, v6,  v7,  __ T1D, __ post(state, 32));
4241     __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4242     __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4243     __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4244     __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4245     __ st1(v24, __ T1D, state);
4246 
4247     __ ldpd(v14, v15, Address(sp, 48));
4248     __ ldpd(v12, v13, Address(sp, 32));
4249     __ ldpd(v10, v11, Address(sp, 16));
4250     __ ldpd(v8, v9, __ post(sp, 64));
4251 
4252     __ ret(lr);
4253 
4254     return start;
4255   }
4256 
4257   /**
4258    *  Arguments:
4259    *
4260    * Inputs:
4261    *   c_rarg0   - int crc
4262    *   c_rarg1   - byte* buf
4263    *   c_rarg2   - int length
4264    *
4265    * Output:
4266    *       r0   - int crc result
4267    */
4268   address generate_updateBytesCRC32() {
4269     assert(UseCRC32Intrinsics, "what are we doing here?");
4270 
4271     __ align(CodeEntryAlignment);
4272     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4273 
4274     address start = __ pc();
4275 
4276     const Register crc   = c_rarg0;  // crc
4277     const Register buf   = c_rarg1;  // source java byte array address
4278     const Register len   = c_rarg2;  // length
4279     const Register table0 = c_rarg3; // crc_table address
4280     const Register table1 = c_rarg4;
4281     const Register table2 = c_rarg5;
4282     const Register table3 = c_rarg6;
4283     const Register tmp3 = c_rarg7;
4284 
4285     BLOCK_COMMENT("Entry:");
4286     __ enter(); // required for proper stackwalking of RuntimeStub frame
4287 
4288     __ kernel_crc32(crc, buf, len,
4289               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4290 
4291     __ leave(); // required for proper stackwalking of RuntimeStub frame
4292     __ ret(lr);
4293 
4294     return start;
4295   }
4296 
4297   // ChaCha20 block function.  This version parallelizes by loading
4298   // individual 32-bit state elements into vectors for four blocks
4299   // (e.g. all four blocks' worth of state[0] in one register, etc.)
4300   //
4301   // state (int[16]) = c_rarg0
4302   // keystream (byte[1024]) = c_rarg1
4303   // return - number of bytes of keystream (always 256)
4304   address generate_chacha20Block_blockpar() {
4305     Label L_twoRounds, L_cc20_const;
4306     // The constant data is broken into two 128-bit segments to be loaded
4307     // onto FloatRegisters.  The first 128 bits are a counter add overlay
4308     // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4309     // The second 128 bits are a table constant used for 8-bit left rotations.
4310     __ BIND(L_cc20_const);
4311     __ emit_int64(0x0000000100000000UL);
4312     __ emit_int64(0x0000000300000002UL);
4313     __ emit_int64(0x0605040702010003UL);
4314     __ emit_int64(0x0E0D0C0F0A09080BUL);
4315 
4316     __ align(CodeEntryAlignment);
4317     StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4318     address start = __ pc();
4319     __ enter();
4320 
4321     int i, j;
4322     const Register state = c_rarg0;
4323     const Register keystream = c_rarg1;
4324     const Register loopCtr = r10;
4325     const Register tmpAddr = r11;
4326 
4327     const FloatRegister stateFirst = v0;
4328     const FloatRegister stateSecond = v1;
4329     const FloatRegister stateThird = v2;
4330     const FloatRegister stateFourth = v3;
4331     const FloatRegister origCtrState = v28;
4332     const FloatRegister scratch = v29;
4333     const FloatRegister lrot8Tbl = v30;
4334 
4335     // Organize SIMD registers in an array that facilitates
4336     // putting repetitive opcodes into loop structures.  It is
4337     // important that each grouping of 4 registers is monotonically
4338     // increasing to support the requirements of multi-register
4339     // instructions (e.g. ld4r, st4, etc.)
4340     const FloatRegister workSt[16] = {
4341          v4,  v5,  v6,  v7, v16, v17, v18, v19,
4342         v20, v21, v22, v23, v24, v25, v26, v27
4343     };
4344 
4345     // Load from memory and interlace across 16 SIMD registers,
4346     // with each word from memory being broadcast to all lanes of
4347     // each successive SIMD register.
4348     //      Addr(0) -> All lanes in workSt[i]
4349     //      Addr(4) -> All lanes in workSt[i + 1], etc.
4350     __ mov(tmpAddr, state);
4351     for (i = 0; i < 16; i += 4) {
4352       __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4353           __ post(tmpAddr, 16));
4354     }
4355 
4356     // Pull in constant data.  The first 16 bytes are the add overlay
4357     // which is applied to the vector holding the counter (state[12]).
4358     // The second 16 bytes is the index register for the 8-bit left
4359     // rotation tbl instruction.
4360     __ adr(tmpAddr, L_cc20_const);
4361     __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
4362     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);
4363 
4364     // Set up the 10 iteration loop and perform all 8 quarter round ops
4365     __ mov(loopCtr, 10);
4366     __ BIND(L_twoRounds);
4367 
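    // Each loop iteration performs one column round followed by one diagonal
    // round, so the 10 iterations give ChaCha20's 20 rounds. cc20_quarter_round
    // is expected to implement the standard quarter round (RFC 8439):
    //   a += b; d ^= a; d <<<= 16;
    //   c += d; b ^= c; b <<<= 12;
    //   a += b; d ^= a; d <<<= 8;
    //   c += d; b ^= c; b <<<= 7;
    // one 32-bit lane per block, across all four blocks at once.

    // Column round: (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15)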
4368     __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
4369         scratch, lrot8Tbl);
4370     __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
4371         scratch, lrot8Tbl);
4372     __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
4373         scratch, lrot8Tbl);
4374     __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
4375         scratch, lrot8Tbl);
4376 
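    // Diagonal round: (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14)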
4377     __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
4378         scratch, lrot8Tbl);
4379     __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
4380         scratch, lrot8Tbl);
4381     __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
4382         scratch, lrot8Tbl);
4383     __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
4384         scratch, lrot8Tbl);
4385 
4386     // Decrement and iterate
4387     __ sub(loopCtr, loopCtr, 1);
4388     __ cbnz(loopCtr, L_twoRounds);
4389 
4390     __ mov(tmpAddr, state);
4391 
4392     // Add the starting state back to the post-loop keystream
4393     // state.  We read/interlace the state array from memory into
4394     // 4 registers similar to what we did in the beginning.  Then
4395     // add the counter overlay onto workSt[12] at the end.
4396     for (i = 0; i < 16; i += 4) {
4397       __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
4398           __ post(tmpAddr, 16));
4399       __ addv(workSt[i], __ T4S, workSt[i], stateFirst);
4400       __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
4401       __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
4402       __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
4403     }
4404     __ addv(workSt[12], __ T4S, workSt[12], origCtrState);    // Add ctr mask
4405 
4406     // Write to key stream, storing the same element out of workSt[0..15]
4407     // to consecutive 4-byte offsets in the key stream buffer, then repeating
4408     // for the next element position.
4409     for (i = 0; i < 4; i++) {
4410       for (j = 0; j < 16; j += 4) {
4411         __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4412             __ post(keystream, 16));
4413       }
4414     }
4415 
4416     __ mov(r0, 256);             // Return length of output keystream
4417     __ leave();
4418     __ ret(lr);
4419 
4420     return start;
4421   }
4422 
4423   /**
4424    *  Arguments:
4425    *
4426    * Inputs:
4427    *   c_rarg0   - int crc
4428    *   c_rarg1   - byte* buf
4429    *   c_rarg2   - int length
4430    *   c_rarg3   - int* table
4431    *
4432    * Output:
4433    *       r0   - int crc result
4434    */
4435   address generate_updateBytesCRC32C() {
4436     assert(UseCRC32CIntrinsics, "what are we doing here?");
4437 
4438     __ align(CodeEntryAlignment);
4439     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4440 
4441     address start = __ pc();
4442 
4443     const Register crc   = c_rarg0;  // crc
4444     const Register buf   = c_rarg1;  // source java byte array address
4445     const Register len   = c_rarg2;  // length
4446     const Register table0 = c_rarg3; // crc_table address
4447     const Register table1 = c_rarg4;
4448     const Register table2 = c_rarg5;
4449     const Register table3 = c_rarg6;
4450     const Register tmp3 = c_rarg7;
4451 
4452     BLOCK_COMMENT("Entry:");
4453     __ enter(); // required for proper stackwalking of RuntimeStub frame
4454 
4455     __ kernel_crc32c(crc, buf, len,
4456               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
4457 
4458     __ leave(); // required for proper stackwalking of RuntimeStub frame
4459     __ ret(lr);
4460 
4461     return start;
4462   }
4463 
4464   /**
4465    *  Arguments:
4466    *
4467    *  Inputs:
4468    *   c_rarg0   - int   adler
4469    *   c_rarg1   - byte* buff
4470    *   c_rarg2   - int   len
4471    *
4472    * Output:
4473    *   c_rarg0   - int adler result
4474    */
4475   address generate_updateBytesAdler32() {
4476     __ align(CodeEntryAlignment);
4477     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4478     address start = __ pc();
4479 
4480     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
4481 
4482     // Aliases
4483     Register adler  = c_rarg0;
4484     Register s1     = c_rarg0;
4485     Register s2     = c_rarg3;
4486     Register buff   = c_rarg1;
4487     Register len    = c_rarg2;
4488     Register nmax  = r4;
4489     Register base  = r5;
4490     Register count = r6;
4491     Register temp0 = rscratch1;
4492     Register temp1 = rscratch2;
4493     FloatRegister vbytes = v0;
4494     FloatRegister vs1acc = v1;
4495     FloatRegister vs2acc = v2;
4496     FloatRegister vtable = v3;
4497 
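    // Adler-32 keeps two running sums: s1 = 1 + b1 + ... + bn (mod BASE) and
    // s2 = s1_1 + s1_2 + ... + s1_n (mod BASE), with BASE = 65521; the final
    // checksum is (s2 << 16) | s1.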
4498     // Max number of bytes we can process before having to take the mod
4499     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4500     uint64_t BASE = 0xfff1;
4501     uint64_t NMAX = 0x15B0;
4502 
4503     __ mov(base, BASE);
4504     __ mov(nmax, NMAX);
4505 
4506     // Load accumulation coefficients for the upper 16 bits
4507     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
4508     __ ld1(vtable, __ T16B, Address(temp0));
4509 
4510     // s1 is initialized to the lower 16 bits of adler
4511     // s2 is initialized to the upper 16 bits of adler
4512     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
4513     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
4514 
4515     // The pipelined loop needs at least 16 elements for one iteration.
4516     // It checks the length itself, but it is more efficient to skip straight to the cleanup loop here.
4517     __ cmp(len, (u1)16);
4518     __ br(Assembler::HS, L_nmax);
4519     __ cbz(len, L_combine);
4520 
4521     __ bind(L_simple_by1_loop);
4522     __ ldrb(temp0, Address(__ post(buff, 1)));
4523     __ add(s1, s1, temp0);
4524     __ add(s2, s2, s1);
4525     __ subs(len, len, 1);
4526     __ br(Assembler::HI, L_simple_by1_loop);
4527 
4528     // s1 = s1 % BASE
4529     __ subs(temp0, s1, base);
4530     __ csel(s1, temp0, s1, Assembler::HS);
4531 
4532     // s2 = s2 % BASE
4533     __ lsr(temp0, s2, 16);
4534     __ lsl(temp1, temp0, 4);
4535     __ sub(temp1, temp1, temp0);
4536     __ add(s2, temp1, s2, ext::uxth);
4537 
4538     __ subs(temp0, s2, base);
4539     __ csel(s2, temp0, s2, Assembler::HS);
4540 
4541     __ b(L_combine);
4542 
4543     __ bind(L_nmax);
4544     __ subs(len, len, nmax);
4545     __ sub(count, nmax, 16);
4546     __ br(Assembler::LO, L_by16);
4547 
4548     __ bind(L_nmax_loop);
4549 
4550     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4551                                       vbytes, vs1acc, vs2acc, vtable);
4552 
4553     __ subs(count, count, 16);
4554     __ br(Assembler::HS, L_nmax_loop);
4555 
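    // Reduce modulo BASE without a division: since 2^16 mod BASE == 15, a value x
    // can be folded to (x >> 16) * 15 + (x & 0xffff); the lsl/sub pair below forms
    // (x >> 16) * 16 - (x >> 16). Two folds plus a final conditional subtract of
    // BASE suffice for the ranges produced by the NMAX-bounded accumulation above.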
4556     // s1 = s1 % BASE
4557     __ lsr(temp0, s1, 16);
4558     __ lsl(temp1, temp0, 4);
4559     __ sub(temp1, temp1, temp0);
4560     __ add(temp1, temp1, s1, ext::uxth);
4561 
4562     __ lsr(temp0, temp1, 16);
4563     __ lsl(s1, temp0, 4);
4564     __ sub(s1, s1, temp0);
4565     __ add(s1, s1, temp1, ext::uxth);
4566 
4567     __ subs(temp0, s1, base);
4568     __ csel(s1, temp0, s1, Assembler::HS);
4569 
4570     // s2 = s2 % BASE
4571     __ lsr(temp0, s2, 16);
4572     __ lsl(temp1, temp0, 4);
4573     __ sub(temp1, temp1, temp0);
4574     __ add(temp1, temp1, s2, ext::uxth);
4575 
4576     __ lsr(temp0, temp1, 16);
4577     __ lsl(s2, temp0, 4);
4578     __ sub(s2, s2, temp0);
4579     __ add(s2, s2, temp1, ext::uxth);
4580 
4581     __ subs(temp0, s2, base);
4582     __ csel(s2, temp0, s2, Assembler::HS);
4583 
4584     __ subs(len, len, nmax);
4585     __ sub(count, nmax, 16);
4586     __ br(Assembler::HS, L_nmax_loop);
4587 
4588     __ bind(L_by16);
4589     __ adds(len, len, count);
4590     __ br(Assembler::LO, L_by1);
4591 
4592     __ bind(L_by16_loop);
4593 
4594     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
4595                                       vbytes, vs1acc, vs2acc, vtable);
4596 
4597     __ subs(len, len, 16);
4598     __ br(Assembler::HS, L_by16_loop);
4599 
4600     __ bind(L_by1);
4601     __ adds(len, len, 15);
4602     __ br(Assembler::LO, L_do_mod);
4603 
4604     __ bind(L_by1_loop);
4605     __ ldrb(temp0, Address(__ post(buff, 1)));
4606     __ add(s1, temp0, s1);
4607     __ add(s2, s2, s1);
4608     __ subs(len, len, 1);
4609     __ br(Assembler::HS, L_by1_loop);
4610 
4611     __ bind(L_do_mod);
4612     // s1 = s1 % BASE
4613     __ lsr(temp0, s1, 16);
4614     __ lsl(temp1, temp0, 4);
4615     __ sub(temp1, temp1, temp0);
4616     __ add(temp1, temp1, s1, ext::uxth);
4617 
4618     __ lsr(temp0, temp1, 16);
4619     __ lsl(s1, temp0, 4);
4620     __ sub(s1, s1, temp0);
4621     __ add(s1, s1, temp1, ext::uxth);
4622 
4623     __ subs(temp0, s1, base);
4624     __ csel(s1, temp0, s1, Assembler::HS);
4625 
4626     // s2 = s2 % BASE
4627     __ lsr(temp0, s2, 16);
4628     __ lsl(temp1, temp0, 4);
4629     __ sub(temp1, temp1, temp0);
4630     __ add(temp1, temp1, s2, ext::uxth);
4631 
4632     __ lsr(temp0, temp1, 16);
4633     __ lsl(s2, temp0, 4);
4634     __ sub(s2, s2, temp0);
4635     __ add(s2, s2, temp1, ext::uxth);
4636 
4637     __ subs(temp0, s2, base);
4638     __ csel(s2, temp0, s2, Assembler::HS);
4639 
4640     // Combine lower bits and higher bits
4641     __ bind(L_combine);
4642     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
4643 
4644     __ ret(lr);
4645 
4646     return start;
4647   }
4648 
4649   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
4650           Register temp0, Register temp1, FloatRegister vbytes,
4651           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
4652     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
4653     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
4654     // In non-vectorized code, we update s1 and s2 as:
4655     //   s1 <- s1 + b1
4656     //   s2 <- s2 + s1
4657     //   s1 <- s1 + b2
4658     //   s2 <- s2 + s1
4659     //   ...
4660     //   s1 <- s1 + b16
4661     //   s2 <- s2 + s1
4662     // Putting above assignments together, we have:
4663     //   s1_new = s1 + b1 + b2 + ... + b16
4664     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
4665     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
4666     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
4667     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
4668 
4669     // s2 = s2 + s1 * 16
4670     __ add(s2, s2, s1, Assembler::LSL, 4);
4671 
4672     // vs1acc = b1 + b2 + b3 + ... + b16
4673     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
4674     __ umullv(vs2acc, __ T8B, vtable, vbytes);
4675     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
4676     __ uaddlv(vs1acc, __ T16B, vbytes);
4677     __ uaddlv(vs2acc, __ T8H, vs2acc);
4678 
4679     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
4680     __ fmovd(temp0, vs1acc);
4681     __ fmovd(temp1, vs2acc);
4682     __ add(s1, s1, temp0);
4683     __ add(s2, s2, temp1);
4684   }
4685 
4686   /**
4687    *  Arguments:
4688    *
4689    *  Input:
4690    *    c_rarg0   - x address
4691    *    c_rarg1   - x length
4692    *    c_rarg2   - y address
4693    *    c_rarg3   - y length
4694    *    c_rarg4   - z address
4695    */
4696   address generate_multiplyToLen() {
4697     __ align(CodeEntryAlignment);
4698     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4699 
4700     address start = __ pc();
4701     const Register x     = r0;
4702     const Register xlen  = r1;
4703     const Register y     = r2;
4704     const Register ylen  = r3;
4705     const Register z     = r4;
4706 
4707     const Register tmp0  = r5;
4708     const Register tmp1  = r10;
4709     const Register tmp2  = r11;
4710     const Register tmp3  = r12;
4711     const Register tmp4  = r13;
4712     const Register tmp5  = r14;
4713     const Register tmp6  = r15;
4714     const Register tmp7  = r16;
4715 
4716     BLOCK_COMMENT("Entry:");
4717     __ enter(); // required for proper stackwalking of RuntimeStub frame
4718     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4719     __ leave(); // required for proper stackwalking of RuntimeStub frame
4720     __ ret(lr);
4721 
4722     return start;
4723   }
4724 
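  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - z address
   *
   *  Squares x by delegating to multiply_to_len with y == x.
   */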
4725   address generate_squareToLen() {
4726     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
4727     // faster than multiply_to_len on some CPUs and slower on others, but
4728     // multiply_to_len shows slightly better results overall.
4729     __ align(CodeEntryAlignment);
4730     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4731     address start = __ pc();
4732 
4733     const Register x     = r0;
4734     const Register xlen  = r1;
4735     const Register z     = r2;
4736     const Register y     = r4; // == x
4737     const Register ylen  = r5; // == xlen
4738 
4739     const Register tmp0  = r3;
4740     const Register tmp1  = r10;
4741     const Register tmp2  = r11;
4742     const Register tmp3  = r12;
4743     const Register tmp4  = r13;
4744     const Register tmp5  = r14;
4745     const Register tmp6  = r15;
4746     const Register tmp7  = r16;
4747 
4748     RegSet spilled_regs = RegSet::of(y, ylen);
4749     BLOCK_COMMENT("Entry:");
4750     __ enter();
4751     __ push(spilled_regs, sp);
4752     __ mov(y, x);
4753     __ mov(ylen, xlen);
4754     __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
4755     __ pop(spilled_regs, sp);
4756     __ leave();
4757     __ ret(lr);
4758     return start;
4759   }
4760 
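  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - out address
   *    c_rarg1   - in address
   *    c_rarg2   - offset
   *    c_rarg3   - len
   *    c_rarg4   - k
   */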
4761   address generate_mulAdd() {
4762     __ align(CodeEntryAlignment);
4763     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4764 
4765     address start = __ pc();
4766 
4767     const Register out     = r0;
4768     const Register in      = r1;
4769     const Register offset  = r2;
4770     const Register len     = r3;
4771     const Register k       = r4;
4772 
4773     BLOCK_COMMENT("Entry:");
4774     __ enter();
4775     __ mul_add(out, in, offset, len, k);
4776     __ leave();
4777     __ ret(lr);
4778 
4779     return start;
4780   }
4781 
4782   // Arguments:
4783   //
4784   // Input:
4785   //   c_rarg0   - newArr address
4786   //   c_rarg1   - oldArr address
4787   //   c_rarg2   - newIdx
4788   //   c_rarg3   - shiftCount
4789   //   c_rarg4   - numIter
4790   //
4791   address generate_bigIntegerRightShift() {
4792     __ align(CodeEntryAlignment);
4793     StubCodeMark mark(this,  "StubRoutines", "bigIntegerRightShiftWorker");
4794     address start = __ pc();
4795 
4796     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4797 
4798     Register newArr        = c_rarg0;
4799     Register oldArr        = c_rarg1;
4800     Register newIdx        = c_rarg2;
4801     Register shiftCount    = c_rarg3;
4802     Register numIter       = c_rarg4;
4803     Register idx           = numIter;
4804 
4805     Register newArrCur     = rscratch1;
4806     Register shiftRevCount = rscratch2;
4807     Register oldArrCur     = r13;
4808     Register oldArrNext    = r14;
4809 
4810     FloatRegister oldElem0        = v0;
4811     FloatRegister oldElem1        = v1;
4812     FloatRegister newElem         = v2;
4813     FloatRegister shiftVCount     = v3;
4814     FloatRegister shiftVRevCount  = v4;
4815 
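    // Each output word combines two adjacent input words:
    //   newArr[i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
    // The SIMD loop below produces four output words per iteration, walking from the
    // high index downwards; smaller remainders are handled by the two-word loop and
    // the scalar ShiftThree/ShiftTwo/ShiftOne tails.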
4816     __ cbz(idx, Exit);
4817 
4818     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4819 
4820     // left shift count
4821     __ movw(shiftRevCount, 32);
4822     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4823 
4824     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar code
4825     __ cmp(numIter, (u1)4);
4826     __ br(Assembler::LT, ShiftThree);
4827 
4828     __ dup(shiftVCount,    __ T4S, shiftCount);
4829     __ dup(shiftVRevCount, __ T4S, shiftRevCount);
4830     __ negr(shiftVCount,   __ T4S, shiftVCount);
4831 
4832     __ BIND(ShiftSIMDLoop);
4833 
4834     // Calculate the load addresses
4835     __ sub(idx, idx, 4);
4836     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4837     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4838     __ add(oldArrCur,  oldArrNext, 4);
4839 
4840     // Load 4 words and process
4841     __ ld1(oldElem0,  __ T4S,  Address(oldArrCur));
4842     __ ld1(oldElem1,  __ T4S,  Address(oldArrNext));
4843     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4844     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4845     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4846     __ st1(newElem,   __ T4S,  Address(newArrCur));
4847 
4848     __ cmp(idx, (u1)4);
4849     __ br(Assembler::LT, ShiftTwoLoop);
4850     __ b(ShiftSIMDLoop);
4851 
4852     __ BIND(ShiftTwoLoop);
4853     __ cbz(idx, Exit);
4854     __ cmp(idx, (u1)1);
4855     __ br(Assembler::EQ, ShiftOne);
4856 
4857     // Calculate the load addresses
4858     __ sub(idx, idx, 2);
4859     __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
4860     __ add(newArrCur,  newArr, idx, Assembler::LSL, 2);
4861     __ add(oldArrCur,  oldArrNext, 4);
4862 
4863     // Load 2 words and process
4864     __ ld1(oldElem0,  __ T2S, Address(oldArrCur));
4865     __ ld1(oldElem1,  __ T2S, Address(oldArrNext));
4866     __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
4867     __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
4868     __ orr(newElem,   __ T8B, oldElem0, oldElem1);
4869     __ st1(newElem,   __ T2S, Address(newArrCur));
4870     __ b(ShiftTwoLoop);
4871 
4872     __ BIND(ShiftThree);
4873     __ tbz(idx, 1, ShiftOne);
4874     __ tbz(idx, 0, ShiftTwo);
4875     __ ldrw(r10,  Address(oldArr, 12));
4876     __ ldrw(r11,  Address(oldArr, 8));
4877     __ lsrvw(r10, r10, shiftCount);
4878     __ lslvw(r11, r11, shiftRevCount);
4879     __ orrw(r12,  r10, r11);
4880     __ strw(r12,  Address(newArr, 8));
4881 
4882     __ BIND(ShiftTwo);
4883     __ ldrw(r10,  Address(oldArr, 8));
4884     __ ldrw(r11,  Address(oldArr, 4));
4885     __ lsrvw(r10, r10, shiftCount);
4886     __ lslvw(r11, r11, shiftRevCount);
4887     __ orrw(r12,  r10, r11);
4888     __ strw(r12,  Address(newArr, 4));
4889 
4890     __ BIND(ShiftOne);
4891     __ ldrw(r10,  Address(oldArr, 4));
4892     __ ldrw(r11,  Address(oldArr));
4893     __ lsrvw(r10, r10, shiftCount);
4894     __ lslvw(r11, r11, shiftRevCount);
4895     __ orrw(r12,  r10, r11);
4896     __ strw(r12,  Address(newArr));
4897 
4898     __ BIND(Exit);
4899     __ ret(lr);
4900 
4901     return start;
4902   }
4903 
4904   // Arguments:
4905   //
4906   // Input:
4907   //   c_rarg0   - newArr address
4908   //   c_rarg1   - oldArr address
4909   //   c_rarg2   - newIdx
4910   //   c_rarg3   - shiftCount
4911   //   c_rarg4   - numIter
4912   //
4913   address generate_bigIntegerLeftShift() {
4914     __ align(CodeEntryAlignment);
4915     StubCodeMark mark(this,  "StubRoutines", "bigIntegerLeftShiftWorker");
4916     address start = __ pc();
4917 
4918     Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4919 
4920     Register newArr        = c_rarg0;
4921     Register oldArr        = c_rarg1;
4922     Register newIdx        = c_rarg2;
4923     Register shiftCount    = c_rarg3;
4924     Register numIter       = c_rarg4;
4925 
4926     Register shiftRevCount = rscratch1;
4927     Register oldArrNext    = rscratch2;
4928 
4929     FloatRegister oldElem0        = v0;
4930     FloatRegister oldElem1        = v1;
4931     FloatRegister newElem         = v2;
4932     FloatRegister shiftVCount     = v3;
4933     FloatRegister shiftVRevCount  = v4;
4934 
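    // Each output word combines two adjacent input words:
    //   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
    // The SIMD loop below produces four output words per iteration, walking upwards
    // from index 0; smaller remainders are handled by the two-word loop and the
    // scalar tails.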
4935     __ cbz(numIter, Exit);
4936 
4937     __ add(oldArrNext, oldArr, 4);
4938     __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
4939 
4940     // right shift count
4941     __ movw(shiftRevCount, 32);
4942     __ subw(shiftRevCount, shiftRevCount, shiftCount);
4943 
4944     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar code
4945     __ cmp(numIter, (u1)4);
4946     __ br(Assembler::LT, ShiftThree);
4947 
4948     __ dup(shiftVCount,     __ T4S, shiftCount);
4949     __ dup(shiftVRevCount,  __ T4S, shiftRevCount);
4950     __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
4951 
4952     __ BIND(ShiftSIMDLoop);
4953 
4954     // load 4 words and process
4955     __ ld1(oldElem0,  __ T4S,  __ post(oldArr, 16));
4956     __ ld1(oldElem1,  __ T4S,  __ post(oldArrNext, 16));
4957     __ ushl(oldElem0, __ T4S,  oldElem0, shiftVCount);
4958     __ ushl(oldElem1, __ T4S,  oldElem1, shiftVRevCount);
4959     __ orr(newElem,   __ T16B, oldElem0, oldElem1);
4960     __ st1(newElem,   __ T4S,  __ post(newArr, 16));
4961     __ sub(numIter,   numIter, 4);
4962 
4963     __ cmp(numIter, (u1)4);
4964     __ br(Assembler::LT, ShiftTwoLoop);
4965     __ b(ShiftSIMDLoop);
4966 
4967     __ BIND(ShiftTwoLoop);
4968     __ cbz(numIter, Exit);
4969     __ cmp(numIter, (u1)1);
4970     __ br(Assembler::EQ, ShiftOne);
4971 
4972     // load 2 words and process
4973     __ ld1(oldElem0,  __ T2S,  __ post(oldArr, 8));
4974     __ ld1(oldElem1,  __ T2S,  __ post(oldArrNext, 8));
4975     __ ushl(oldElem0, __ T2S,  oldElem0, shiftVCount);
4976     __ ushl(oldElem1, __ T2S,  oldElem1, shiftVRevCount);
4977     __ orr(newElem,   __ T8B,  oldElem0, oldElem1);
4978     __ st1(newElem,   __ T2S,  __ post(newArr, 8));
4979     __ sub(numIter,   numIter, 2);
4980     __ b(ShiftTwoLoop);
4981 
4982     __ BIND(ShiftThree);
4983     __ ldrw(r10,  __ post(oldArr, 4));
4984     __ ldrw(r11,  __ post(oldArrNext, 4));
4985     __ lslvw(r10, r10, shiftCount);
4986     __ lsrvw(r11, r11, shiftRevCount);
4987     __ orrw(r12,  r10, r11);
4988     __ strw(r12,  __ post(newArr, 4));
4989     __ tbz(numIter, 1, Exit);
4990     __ tbz(numIter, 0, ShiftOne);
4991 
4992     __ BIND(ShiftTwo);
4993     __ ldrw(r10,  __ post(oldArr, 4));
4994     __ ldrw(r11,  __ post(oldArrNext, 4));
4995     __ lslvw(r10, r10, shiftCount);
4996     __ lsrvw(r11, r11, shiftRevCount);
4997     __ orrw(r12,  r10, r11);
4998     __ strw(r12,  __ post(newArr, 4));
4999 
5000     __ BIND(ShiftOne);
5001     __ ldrw(r10,  Address(oldArr));
5002     __ ldrw(r11,  Address(oldArrNext));
5003     __ lslvw(r10, r10, shiftCount);
5004     __ lsrvw(r11, r11, shiftRevCount);
5005     __ orrw(r12,  r10, r11);
5006     __ strw(r12,  Address(newArr));
5007 
5008     __ BIND(Exit);
5009     __ ret(lr);
5010 
5011     return start;
5012   }
5013 
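  // Returns, in r0, the count of leading bytes of ary1 (r1) that are positive, i.e.
  // have their sign bit clear. If no negative byte is found, the full length is
  // returned; otherwise the stub may return a smaller count, but every byte it
  // counts is guaranteed to be positive. len is passed in r2 and a copy of it is
  // expected in r0 on entry (see the precondition below).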
5014   address generate_count_positives(address &count_positives_long) {
5015     const u1 large_loop_size = 64;
5016     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5017     int dcache_line = VM_Version::dcache_line_size();
5018 
5019     Register ary1 = r1, len = r2, result = r0;
5020 
5021     __ align(CodeEntryAlignment);
5022 
5023     StubCodeMark mark(this, "StubRoutines", "count_positives");
5024 
5025     address entry = __ pc();
5026 
5027     __ enter();
5028     // precondition: a copy of len is already in result
5029     // __ mov(result, len);
5030 
5031   Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
5032         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
5033 
5034   __ cmp(len, (u1)15);
5035   __ br(Assembler::GT, LEN_OVER_15);
5036   // The only case in which execution falls into this code is when the pointer is near
5037   // the end of a memory page and we have to avoid reading the next page
5038   __ add(ary1, ary1, len);
5039   __ subs(len, len, 8);
5040   __ br(Assembler::GT, LEN_OVER_8);
5041   __ ldr(rscratch2, Address(ary1, -8));
5042   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
5043   __ lsrv(rscratch2, rscratch2, rscratch1);
5044   __ tst(rscratch2, UPPER_BIT_MASK);
5045   __ csel(result, zr, result, Assembler::NE);
5046   __ leave();
5047   __ ret(lr);
5048   __ bind(LEN_OVER_8);
5049   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
5050   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load completes
5051   __ tst(rscratch2, UPPER_BIT_MASK);
5052   __ br(Assembler::NE, RET_NO_POP);
5053   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
5054   __ lsrv(rscratch1, rscratch1, rscratch2);
5055   __ tst(rscratch1, UPPER_BIT_MASK);
5056   __ bind(RET_NO_POP);
5057   __ csel(result, zr, result, Assembler::NE);
5058   __ leave();
5059   __ ret(lr);
5060 
5061   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
5062   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
5063 
5064   count_positives_long = __ pc(); // 2nd entry point
5065 
5066   __ enter();
5067 
5068   __ bind(LEN_OVER_15);
5069     __ push(spilled_regs, sp);
5070     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
5071     __ cbz(rscratch2, ALIGNED);
5072     __ ldp(tmp6, tmp1, Address(ary1));
5073     __ mov(tmp5, 16);
5074     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
5075     __ add(ary1, ary1, rscratch1);
5076     __ orr(tmp6, tmp6, tmp1);
5077     __ tst(tmp6, UPPER_BIT_MASK);
5078     __ br(Assembler::NE, RET_ADJUST);
5079     __ sub(len, len, rscratch1);
5080 
5081   __ bind(ALIGNED);
5082     __ cmp(len, large_loop_size);
5083     __ br(Assembler::LT, CHECK_16);
5084     // Perform a 16-byte load with an early return in the pre-loop to handle the
5085     // case where an initially aligned large array has negative values in its
5086     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
5087     // worst case, which is slower. Cases with negative bytes further ahead are not
5088     // affected much; in fact they become faster due to the early loads and the
5089     // fewer instructions and branches in LARGE_LOOP.
5090     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
5091     __ sub(len, len, 16);
5092     __ orr(tmp6, tmp6, tmp1);
5093     __ tst(tmp6, UPPER_BIT_MASK);
5094     __ br(Assembler::NE, RET_ADJUST_16);
5095     __ cmp(len, large_loop_size);
5096     __ br(Assembler::LT, CHECK_16);
5097 
5098     if (SoftwarePrefetchHintDistance >= 0
5099         && SoftwarePrefetchHintDistance >= dcache_line) {
5100       // initial prefetch
5101       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
5102     }
5103   __ bind(LARGE_LOOP);
5104     if (SoftwarePrefetchHintDistance >= 0) {
5105       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
5106     }
5107     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
5108     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp),
5109     // it is better to generate 7 * orr(...) followed by a single tst(...) + br(...),
5110     // which saves instructions and has fewer branches. The drawback is that this
5111     // disables early return, so all 64 bytes are loaded and checked every time.
5112     __ ldp(tmp2, tmp3, Address(ary1));
5113     __ ldp(tmp4, tmp5, Address(ary1, 16));
5114     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
5115     __ ldp(tmp6, tmp1, Address(ary1, 48));
5116     __ add(ary1, ary1, large_loop_size);
5117     __ sub(len, len, large_loop_size);
5118     __ orr(tmp2, tmp2, tmp3);
5119     __ orr(tmp4, tmp4, tmp5);
5120     __ orr(rscratch1, rscratch1, rscratch2);
5121     __ orr(tmp6, tmp6, tmp1);
5122     __ orr(tmp2, tmp2, tmp4);
5123     __ orr(rscratch1, rscratch1, tmp6);
5124     __ orr(tmp2, tmp2, rscratch1);
5125     __ tst(tmp2, UPPER_BIT_MASK);
5126     __ br(Assembler::NE, RET_ADJUST_LONG);
5127     __ cmp(len, large_loop_size);
5128     __ br(Assembler::GE, LARGE_LOOP);
5129 
5130   __ bind(CHECK_16); // small 16-byte load pre-loop
5131     __ cmp(len, (u1)16);
5132     __ br(Assembler::LT, POST_LOOP16);
5133 
5134   __ bind(LOOP16); // small 16-byte load loop
5135     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
5136     __ sub(len, len, 16);
5137     __ orr(tmp2, tmp2, tmp3);
5138     __ tst(tmp2, UPPER_BIT_MASK);
5139     __ br(Assembler::NE, RET_ADJUST_16);
5140     __ cmp(len, (u1)16);
5141     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
5142 
5143   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
5144     __ cmp(len, (u1)8);
5145     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
5146     __ ldr(tmp3, Address(__ post(ary1, 8)));
5147     __ tst(tmp3, UPPER_BIT_MASK);
5148     __ br(Assembler::NE, RET_ADJUST);
5149     __ sub(len, len, 8);
5150 
5151   __ bind(POST_LOOP16_LOAD_TAIL);
5152     __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
5153     __ ldr(tmp1, Address(ary1));
5154     __ mov(tmp2, 64);
5155     __ sub(tmp4, tmp2, len, __ LSL, 3);
5156     __ lslv(tmp1, tmp1, tmp4);
5157     __ tst(tmp1, UPPER_BIT_MASK);
5158     __ br(Assembler::NE, RET_ADJUST);
5159     // Fallthrough
5160 
5161   __ bind(RET_LEN);
5162     __ pop(spilled_regs, sp);
5163     __ leave();
5164     __ ret(lr);
5165 
5166     // The difference (result - len) is the count of bytes that are
5167     // guaranteed to be positive
5168 
5169   __ bind(RET_ADJUST_LONG);
5170     __ add(len, len, (u1)(large_loop_size - 16));
5171   __ bind(RET_ADJUST_16);
5172     __ add(len, len, 16);
5173   __ bind(RET_ADJUST);
5174     __ pop(spilled_regs, sp);
5175     __ leave();
5176     __ sub(result, result, len);
5177     __ ret(lr);
5178 
5179     return entry;
5180   }
5181 
5182   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
5183         bool usePrefetch, Label &NOT_EQUAL) {
5184     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5185         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5186         tmp7 = r12, tmp8 = r13;
5187     Label LOOP;
5188 
5189     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5190     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5191     __ bind(LOOP);
5192     if (usePrefetch) {
5193       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5194       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5195     }
5196     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5197     __ eor(tmp1, tmp1, tmp2);
5198     __ eor(tmp3, tmp3, tmp4);
5199     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5200     __ orr(tmp1, tmp1, tmp3);
5201     __ cbnz(tmp1, NOT_EQUAL);
5202     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5203     __ eor(tmp5, tmp5, tmp6);
5204     __ eor(tmp7, tmp7, tmp8);
5205     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5206     __ orr(tmp5, tmp5, tmp7);
5207     __ cbnz(tmp5, NOT_EQUAL);
5208     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
5209     __ eor(tmp1, tmp1, tmp2);
5210     __ eor(tmp3, tmp3, tmp4);
5211     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
5212     __ orr(tmp1, tmp1, tmp3);
5213     __ cbnz(tmp1, NOT_EQUAL);
5214     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
5215     __ eor(tmp5, tmp5, tmp6);
5216     __ sub(cnt1, cnt1, 8 * wordSize);
5217     __ eor(tmp7, tmp7, tmp8);
5218     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
5219     // tmp6 is not used. MacroAssembler::subs is used here (rather than
5220     // cmp) because subs allows an unlimited range of immediate operands.
5221     __ subs(tmp6, cnt1, loopThreshold);
5222     __ orr(tmp5, tmp5, tmp7);
5223     __ cbnz(tmp5, NOT_EQUAL);
5224     __ br(__ GE, LOOP);
5225     // post-loop
5226     __ eor(tmp1, tmp1, tmp2);
5227     __ eor(tmp3, tmp3, tmp4);
5228     __ orr(tmp1, tmp1, tmp3);
5229     __ sub(cnt1, cnt1, 2 * wordSize);
5230     __ cbnz(tmp1, NOT_EQUAL);
5231   }
5232 
5233   void generate_large_array_equals_loop_simd(int loopThreshold,
5234         bool usePrefetch, Label &NOT_EQUAL) {
5235     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5236         tmp2 = rscratch2;
5237     Label LOOP;
5238 
5239     __ bind(LOOP);
5240     if (usePrefetch) {
5241       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
5242       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
5243     }
5244     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
5245     __ sub(cnt1, cnt1, 8 * wordSize);
5246     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
5247     __ subs(tmp1, cnt1, loopThreshold);
5248     __ eor(v0, __ T16B, v0, v4);
5249     __ eor(v1, __ T16B, v1, v5);
5250     __ eor(v2, __ T16B, v2, v6);
5251     __ eor(v3, __ T16B, v3, v7);
5252     __ orr(v0, __ T16B, v0, v1);
5253     __ orr(v1, __ T16B, v2, v3);
5254     __ orr(v0, __ T16B, v0, v1);
5255     __ umov(tmp1, v0, __ D, 0);
5256     __ umov(tmp2, v0, __ D, 1);
5257     __ orr(tmp1, tmp1, tmp2);
5258     __ cbnz(tmp1, NOT_EQUAL);
5259     __ br(__ GE, LOOP);
5260   }
5261 
5262   // a1 = r1 - array1 address
5263   // a2 = r2 - array2 address
5264   // result = r0 - return value. Already contains "false"
5265   // cnt1 = r10 - amount of elements left to check, reduced by wordSize
5266   // r3-r5 are reserved temporary registers
5267   // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
5268   address generate_large_array_equals() {
5269     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
5270         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
5271         tmp7 = r12, tmp8 = r13;
5272     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
5273         SMALL_LOOP, POST_LOOP;
5274     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
5275     // loop threshold that guarantees at least 32 of the prefetched bytes are actually used
5276     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5277     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5278     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5279     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5280         tmp5, tmp6, tmp7, tmp8);
5281 
5282     __ align(CodeEntryAlignment);
5283 
5284     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5285 
5286     address entry = __ pc();
5287     __ enter();
5288     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
5289     // also advance pointers to use post-increment instead of pre-increment
5290     __ add(a1, a1, wordSize);
5291     __ add(a2, a2, wordSize);
5292     if (AvoidUnalignedAccesses) {
5293       // Both implementations (SIMD/non-SIMD) use relatively large load
5294       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
5295       // time) on some CPUs when the address is not at least 16-byte aligned.
5296       // Arrays are currently 8-byte aligned, so if needed we do an additional
5297       // 8-byte load for the first address to make it 16-byte aligned.
5298       Label ALIGNED16;
5299       __ tbz(a1, 3, ALIGNED16);
5300       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5301       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5302       __ sub(cnt1, cnt1, wordSize);
5303       __ eor(tmp1, tmp1, tmp2);
5304       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5305       __ bind(ALIGNED16);
5306     }
5307     if (UseSIMDForArrayEquals) {
5308       if (SoftwarePrefetchHintDistance >= 0) {
5309         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5310         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5311         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5312             /* prfm = */ true, NOT_EQUAL);
5313         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5314         __ br(__ LT, TAIL);
5315       }
5316       __ bind(NO_PREFETCH_LARGE_LOOP);
5317       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5318           /* prfm = */ false, NOT_EQUAL);
5319     } else {
5320       __ push(spilled_regs, sp);
5321       if (SoftwarePrefetchHintDistance >= 0) {
5322         __ subs(tmp1, cnt1, prefetchLoopThreshold);
5323         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5324         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5325             /* prfm = */ true, NOT_EQUAL);
5326         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5327         __ br(__ LT, TAIL);
5328       }
5329       __ bind(NO_PREFETCH_LARGE_LOOP);
5330       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5331           /* prfm = */ false, NOT_EQUAL);
5332     }
5333     __ bind(TAIL);
5334       __ cbz(cnt1, EQUAL);
5335       __ subs(cnt1, cnt1, wordSize);
5336       __ br(__ LE, POST_LOOP);
5337     __ bind(SMALL_LOOP);
5338       __ ldr(tmp1, Address(__ post(a1, wordSize)));
5339       __ ldr(tmp2, Address(__ post(a2, wordSize)));
5340       __ subs(cnt1, cnt1, wordSize);
5341       __ eor(tmp1, tmp1, tmp2);
5342       __ cbnz(tmp1, NOT_EQUAL);
5343       __ br(__ GT, SMALL_LOOP);
5344     __ bind(POST_LOOP);
5345       __ ldr(tmp1, Address(a1, cnt1));
5346       __ ldr(tmp2, Address(a2, cnt1));
5347       __ eor(tmp1, tmp1, tmp2);
5348       __ cbnz(tmp1, NOT_EQUAL);
5349     __ bind(EQUAL);
5350       __ mov(result, true);
5351     __ bind(NOT_EQUAL);
5352       if (!UseSIMDForArrayEquals) {
5353         __ pop(spilled_regs, sp);
5354       }
5355     __ bind(NOT_EQUAL_NO_POP);
5356     __ leave();
5357     __ ret(lr);
5358     return entry;
5359   }
5360 
5361   // result = r0 - return value. Contains initial hashcode value on entry.
5362   // ary = r1 - array address
5363   // cnt = r2 - elements count
5364   // Clobbers: v0-v13, rscratch1, rscratch2
5365   address generate_large_arrays_hashcode(BasicType eltype) {
5366     const Register result = r0, ary = r1, cnt = r2;
5367     const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
5368     const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
5369     const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
5370     const FloatRegister vpowm = v13;
5371 
5372     ARRAYS_HASHCODE_REGISTERS;
5373 
5374     Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
5375 
5376     unsigned int vf; // vectorization factor
5377     bool multiply_by_halves;
5378     Assembler::SIMD_Arrangement load_arrangement;
5379     switch (eltype) {
5380     case T_BOOLEAN:
5381     case T_BYTE:
5382       load_arrangement = Assembler::T8B;
5383       multiply_by_halves = true;
5384       vf = 8;
5385       break;
5386     case T_CHAR:
5387     case T_SHORT:
5388       load_arrangement = Assembler::T8H;
5389       multiply_by_halves = true;
5390       vf = 8;
5391       break;
5392     case T_INT:
5393       load_arrangement = Assembler::T4S;
5394       multiply_by_halves = false;
5395       vf = 4;
5396       break;
5397     default:
5398       ShouldNotReachHere();
5399     }
5400 
5401     // Unroll factor
5402     const unsigned uf = 4;
5403 
5404     // Effective vectorization factor
5405     const unsigned evf = vf * uf;
5406 
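    // The stub evaluates the usual Java array hash, h = h * 31 + a[i], with the
    // incoming result as the initial h. It keeps several interleaved partial sums
    // in SIMD lanes: each iteration first scales the accumulators by 31^vf
    // (31^evf in the unrolled large loop) using vpowm, then adds the freshly
    // loaded elements. The epilogues recombine the lanes by multiplying with
    // vpow = <31^3, 31^2, 31, 1> and summing across the vector.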
5407     __ align(CodeEntryAlignment);
5408 
5409     const char *mark_name = "";
5410     switch (eltype) {
5411     case T_BOOLEAN:
5412       mark_name = "_large_arrays_hashcode_boolean";
5413       break;
5414     case T_BYTE:
5415       mark_name = "_large_arrays_hashcode_byte";
5416       break;
5417     case T_CHAR:
5418       mark_name = "_large_arrays_hashcode_char";
5419       break;
5420     case T_SHORT:
5421       mark_name = "_large_arrays_hashcode_short";
5422       break;
5423     case T_INT:
5424       mark_name = "_large_arrays_hashcode_int";
5425       break;
5426     default:
5427       mark_name = "_large_arrays_hashcode_incorrect_type";
5428       __ should_not_reach_here();
5429     };
5430 
5431     StubCodeMark mark(this, "StubRoutines", mark_name);
5432 
5433     address entry = __ pc();
5434     __ enter();
5435 
5436     // Put the 0th-3rd powers of 31 together into a single SIMD register. The register is used in
5437     // the SMALL and LARGE loops' epilogues. The initialization is hoisted here and the register's
5438     // value must not change throughout either loop.
5439     __ movw(rscratch1, intpow(31U, 3));
5440     __ mov(vpow, Assembler::S, 0, rscratch1);
5441     __ movw(rscratch1, intpow(31U, 2));
5442     __ mov(vpow, Assembler::S, 1, rscratch1);
5443     __ movw(rscratch1, intpow(31U, 1));
5444     __ mov(vpow, Assembler::S, 2, rscratch1);
5445     __ movw(rscratch1, intpow(31U, 0));
5446     __ mov(vpow, Assembler::S, 3, rscratch1);
5447 
5448     __ mov(vmul0, Assembler::T16B, 0);
5449     __ mov(vmul0, Assembler::S, 3, result);
5450 
5451     __ andr(rscratch2, cnt, (uf - 1) * vf);
5452     __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
5453 
5454     __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
5455     __ mov(vpowm, Assembler::S, 0, rscratch1);
5456 
5457     // SMALL LOOP
5458     __ bind(SMALL_LOOP);
5459 
5460     __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
5461     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5462     __ subsw(rscratch2, rscratch2, vf);
5463 
5464     if (load_arrangement == Assembler::T8B) {
5465       // Extend 8B to 8H to be able to use vector multiply
5466       // instructions
5467       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5468       if (is_signed_subword_type(eltype)) {
5469         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5470       } else {
5471         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5472       }
5473     }
5474 
5475     switch (load_arrangement) {
5476     case Assembler::T4S:
5477       __ addv(vmul0, load_arrangement, vmul0, vdata0);
5478       break;
5479     case Assembler::T8B:
5480     case Assembler::T8H:
5481       assert(is_subword_type(eltype), "subword type expected");
5482       if (is_signed_subword_type(eltype)) {
5483         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5484       } else {
5485         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5486       }
5487       break;
5488     default:
5489       __ should_not_reach_here();
5490     }
5491 
5492     // Process the upper half of a vector
5493     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5494       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5495       if (is_signed_subword_type(eltype)) {
5496         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5497       } else {
5498         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5499       }
5500     }
5501 
5502     __ br(Assembler::HI, SMALL_LOOP);
5503 
5504     // SMALL LOOP'S EPILOGUE
5505     __ lsr(rscratch2, cnt, exact_log2(evf));
5506     __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
5507 
5508     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5509     __ addv(vmul0, Assembler::T4S, vmul0);
5510     __ umov(result, vmul0, Assembler::S, 0);
5511 
5512     // TAIL
5513     __ bind(TAIL);
5514 
5515     // The andr computes cnt % vf. The subtract, scaled by 8 (shift by 3), jumps to a point
5516     // vf - 1 - (cnt % vf) pairs into the load + madd insns, i.e. only cnt % vf pairs are executed.
5517     assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
5518     __ andr(rscratch2, cnt, vf - 1);
5519     __ bind(TAIL_SHORTCUT);
5520     __ adr(rscratch1, BR_BASE);
5521     __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3);
5522     __ movw(rscratch2, 0x1f);
5523     __ br(rscratch1);
5524 
5525     for (size_t i = 0; i < vf - 1; ++i) {
5526       __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
5527                                    eltype);
5528       __ maddw(result, result, rscratch2, rscratch1);
5529     }
5530     __ bind(BR_BASE);
5531 
5532     __ leave();
5533     __ ret(lr);
5534 
5535     // LARGE LOOP
5536     __ bind(LARGE_LOOP_PREHEADER);
5537 
5538     __ lsr(rscratch2, cnt, exact_log2(evf));
5539 
5540     if (multiply_by_halves) {
5541       // 31^4 - multiplier between lower and upper parts of a register
5542       __ movw(rscratch1, intpow(31U, vf / 2));
5543       __ mov(vpowm, Assembler::S, 1, rscratch1);
5544       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
5545       __ movw(rscratch1, intpow(31U, evf - vf / 2));
5546       __ mov(vpowm, Assembler::S, 0, rscratch1);
5547     } else {
5548       // 31^16
5549       __ movw(rscratch1, intpow(31U, evf));
5550       __ mov(vpowm, Assembler::S, 0, rscratch1);
5551     }
5552 
5553     __ mov(vmul3, Assembler::T16B, 0);
5554     __ mov(vmul2, Assembler::T16B, 0);
5555     __ mov(vmul1, Assembler::T16B, 0);
5556 
5557     __ bind(LARGE_LOOP);
5558 
5559     __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
5560     __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
5561     __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
5562     __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
5563 
5564     __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
5565            Address(__ post(ary, evf * type2aelembytes(eltype))));
5566 
5567     if (load_arrangement == Assembler::T8B) {
5568       // Extend 8B to 8H to be able to use vector multiply
5569       // instructions
5570       assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
5571       if (is_signed_subword_type(eltype)) {
5572         __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5573         __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5574         __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5575         __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5576       } else {
5577         __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
5578         __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
5579         __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
5580         __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
5581       }
5582     }
5583 
5584     switch (load_arrangement) {
5585     case Assembler::T4S:
5586       __ addv(vmul3, load_arrangement, vmul3, vdata3);
5587       __ addv(vmul2, load_arrangement, vmul2, vdata2);
5588       __ addv(vmul1, load_arrangement, vmul1, vdata1);
5589       __ addv(vmul0, load_arrangement, vmul0, vdata0);
5590       break;
5591     case Assembler::T8B:
5592     case Assembler::T8H:
5593       assert(is_subword_type(eltype), "subword type expected");
5594       if (is_signed_subword_type(eltype)) {
5595         __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5596         __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5597         __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5598         __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5599       } else {
5600         __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
5601         __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
5602         __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
5603         __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
5604       }
5605       break;
5606     default:
5607       __ should_not_reach_here();
5608     }
5609 
5610     // Process the upper half of a vector
5611     if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
5612       __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
5613       __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
5614       __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
5615       __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
5616       if (is_signed_subword_type(eltype)) {
5617         __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5618         __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5619         __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5620         __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5621       } else {
5622         __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
5623         __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
5624         __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
5625         __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
5626       }
5627     }
5628 
5629     __ subsw(rscratch2, rscratch2, 1);
5630     __ br(Assembler::HI, LARGE_LOOP);
5631 
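    // Reduce the four vector accumulators to the scalar result. Each vmulN is
    // weighted by the per-lane powers of 31 held in vpow (assumed to have been
    // initialized earlier in this stub) and summed horizontally to a partial
    // hash SN; the partials are then chained with a 31^vf factor between them:
    //   result = ((S3 * 31^vf + S2) * 31^vf + S1) * 31^vf + S0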
5632     __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
5633     __ addv(vmul3, Assembler::T4S, vmul3);
5634     __ umov(result, vmul3, Assembler::S, 0);
5635 
5636     __ mov(rscratch2, intpow(31U, vf));
5637 
5638     __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
5639     __ addv(vmul2, Assembler::T4S, vmul2);
5640     __ umov(rscratch1, vmul2, Assembler::S, 0);
5641     __ maddw(result, result, rscratch2, rscratch1);
5642 
5643     __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
5644     __ addv(vmul1, Assembler::T4S, vmul1);
5645     __ umov(rscratch1, vmul1, Assembler::S, 0);
5646     __ maddw(result, result, rscratch2, rscratch1);
5647 
5648     __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
5649     __ addv(vmul0, Assembler::T4S, vmul0);
5650     __ umov(rscratch1, vmul0, Assembler::S, 0);
5651     __ maddw(result, result, rscratch2, rscratch1);
5652 
5653     __ andr(rscratch2, cnt, vf - 1);
5654     __ cbnz(rscratch2, TAIL_SHORTCUT);
5655 
5656     __ leave();
5657     __ ret(lr);
5658 
5659     return entry;
5660   }
5661 
5662   address generate_dsin_dcos(bool isCos) {
5663     __ align(CodeEntryAlignment);
5664     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
5665     address start = __ pc();
5666     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
5667         (address)StubRoutines::aarch64::_two_over_pi,
5668         (address)StubRoutines::aarch64::_pio2,
5669         (address)StubRoutines::aarch64::_dsin_coef,
5670         (address)StubRoutines::aarch64::_dcos_coef);
5671     return start;
5672   }
5673 
5674   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
5675   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
5676       Label &DIFF2) {
5677     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
5678     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
5679 
5680     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
5681     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5682     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
5683     // 16 Latin1 bytes loaded: converted to UTF-16 they occupy 32 bytes; the lower half is already in vtmp3, the upper half is still packed in vtmp
5684 
5685     __ fmovd(tmpL, vtmp3);
5686     __ eor(rscratch2, tmp3, tmpL);
5687     __ cbnz(rscratch2, DIFF2);
5688 
5689     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5690     __ umov(tmpL, vtmp3, __ D, 1);
5691     __ eor(rscratch2, tmpU, tmpL);
5692     __ cbnz(rscratch2, DIFF1);
5693 
5694     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
5695     __ ldr(tmpU, Address(__ post(cnt1, 8)));
5696     __ fmovd(tmpL, vtmp);
5697     __ eor(rscratch2, tmp3, tmpL);
5698     __ cbnz(rscratch2, DIFF2);
5699 
5700     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5701     __ umov(tmpL, vtmp, __ D, 1);
5702     __ eor(rscratch2, tmpU, tmpL);
5703     __ cbnz(rscratch2, DIFF1);
5704   }
5705 
5706   // r0  = result
5707   // r1  = str1
5708   // r2  = cnt1
5709   // r3  = str2
5710   // r4  = cnt2
5711   // r10 = tmp1
5712   // r11 = tmp2
5713   address generate_compare_long_string_different_encoding(bool isLU) {
5714     __ align(CodeEntryAlignment);
5715     StubCodeMark mark(this, "StubRoutines", isLU
5716         ? "compare_long_string_different_encoding LU"
5717         : "compare_long_string_different_encoding UL");
5718     address entry = __ pc();
5719     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
5720         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
5721         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
5722     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5723         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
5724     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
5725     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
5726 
5727     int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
5728 
5729     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
5730     // cnt2 == number of characters left to compare
5731     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
5732     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5733     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
5734     __ add(str2, str2, isLU ? wordSize : wordSize/2);
5735     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
5736     __ subw(cnt2, cnt2, 8); // 4 symbols are already loaded; the last 4 are a special case.
5737     __ eor(rscratch2, tmp1, tmp2);
5738     __ mov(rscratch1, tmp2);
5739     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
5740     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
5741              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
5742     __ push(spilled_regs, sp);
5743     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
5744     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
5745 
5746     __ ldr(tmp3, Address(__ post(cnt1, 8)));
5747 
5748     if (SoftwarePrefetchHintDistance >= 0) {
5749       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5750       __ br(__ LT, NO_PREFETCH);
5751       __ bind(LARGE_LOOP_PREFETCH);
5752         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
5753         __ mov(tmp4, 2);
5754         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5755         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
5756           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5757           __ subs(tmp4, tmp4, 1);
5758           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
5759           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
5760           __ mov(tmp4, 2);
5761         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
5762           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5763           __ subs(tmp4, tmp4, 1);
5764           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
5765           __ sub(cnt2, cnt2, 64);
5766           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
5767           __ br(__ GE, LARGE_LOOP_PREFETCH);
5768     }
5769     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
5770     __ bind(NO_PREFETCH);
5771     __ subs(cnt2, cnt2, 16);
5772     __ br(__ LT, TAIL);
5773     __ align(OptoLoopAlignment);
5774     __ bind(SMALL_LOOP); // smaller loop
5775       __ subs(cnt2, cnt2, 16);
5776       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
5777       __ br(__ GE, SMALL_LOOP);
5778       __ cmn(cnt2, (u1)16);
5779       __ br(__ EQ, LOAD_LAST);
5780     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
5781       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
5782       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
5783       __ ldr(tmp3, Address(cnt1, -8));
5784       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
5785       __ b(LOAD_LAST);
5786     __ bind(DIFF2);
5787       __ mov(tmpU, tmp3);
5788     __ bind(DIFF1);
5789       __ pop(spilled_regs, sp);
5790       __ b(CALCULATE_DIFFERENCE);
5791     __ bind(LOAD_LAST);
5792       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
5793       // No need to load them again.
5794       __ mov(tmpU, tmp3);
5795       __ pop(spilled_regs, sp);
5796 
5797       // tmp2 points to the address of the last 4 Latin1 characters right now
5798       __ ldrs(vtmp, Address(tmp2));
5799       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
5800       __ fmovd(tmpL, vtmp);
5801 
5802       __ eor(rscratch2, tmpU, tmpL);
5803       __ cbz(rscratch2, DONE);
5804 
5805     // Find the first different characters in the longwords and
5806     // compute their difference.
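    // At this point tmp1 and rscratch1 hold the two 8-byte chunks being compared
    // (both already in UTF-16 form) and rscratch2 holds their XOR. The chunks were
    // loaded little-endian, so the first differing character sits in the least
    // significant non-zero byte of the XOR: rev + clz yields its bit offset, which
    // "and -16" rounds down to a 16-bit character boundary before both chunks are
    // shifted right and the differing characters are extracted with uxthw.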
5807     __ bind(CALCULATE_DIFFERENCE);
5808       __ rev(rscratch2, rscratch2);
5809       __ clz(rscratch2, rscratch2);
5810       __ andr(rscratch2, rscratch2, -16);
5811       __ lsrv(tmp1, tmp1, rscratch2);
5812       __ uxthw(tmp1, tmp1);
5813       __ lsrv(rscratch1, rscratch1, rscratch2);
5814       __ uxthw(rscratch1, rscratch1);
5815       __ subw(result, tmp1, rscratch1);
5816     __ bind(DONE);
5817       __ ret(lr);
5818     return entry;
5819   }
5820 
5821   // r0 = input (float16)
5822   // v0 = result (float)
5823   // v1 = temporary float register
5824   address generate_float16ToFloat() {
5825     __ align(CodeEntryAlignment);
5826     StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
5827     address entry = __ pc();
5828     BLOCK_COMMENT("Entry:");
5829     __ flt16_to_flt(v0, r0, v1);
5830     __ ret(lr);
5831     return entry;
5832   }
5833 
5834   // v0 = input (float)
5835   // r0 = result (float16)
5836   // v1 = temporary float register
5837   address generate_floatToFloat16() {
5838     __ align(CodeEntryAlignment);
5839     StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
5840     address entry = __ pc();
5841     BLOCK_COMMENT("Entry:");
5842     __ flt_to_flt16(r0, v0, v1);
5843     __ ret(lr);
5844     return entry;
5845   }
5846 
5847   address generate_method_entry_barrier() {
5848     __ align(CodeEntryAlignment);
5849     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5850 
5851     Label deoptimize_label;
5852 
5853     address start = __ pc();
5854 
5855     BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
5856 
5857     if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
5858       BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5859       // We can get here despite the nmethod being good, if we have not
5860       // yet applied our cross modification fence (or data fence).
5861       Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
5862       __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
5863       __ ldrw(rscratch2, rscratch2);
5864       __ strw(rscratch2, thread_epoch_addr);
5865       __ isb();
5866       __ membar(__ LoadLoad);
5867     }
5868 
5869     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
5870 
5871     __ enter();
5872     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
5873 
5874     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
5875 
5876     __ push_call_clobbered_registers();
5877 
5878     __ mov(c_rarg0, rscratch2);
5879     __ call_VM_leaf
5880          (CAST_FROM_FN_PTR
5881           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
5882 
5883     __ reset_last_Java_frame(true);
5884 
5885     __ mov(rscratch1, r0);
5886 
5887     __ pop_call_clobbered_registers();
5888 
5889     __ cbnz(rscratch1, deoptimize_label);
5890 
5891     __ leave();
5892     __ ret(lr);
5893 
5894     __ BIND(deoptimize_label);
5895 
5896     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
5897     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
5898 
5899     __ mov(sp, rscratch1);
5900     __ br(rscratch2);
5901 
5902     return start;
5903   }
5904 
5905   // r0  = result
5906   // r1  = str1
5907   // r2  = cnt1
5908   // r3  = str2
5909   // r4  = cnt2
5910   // r10 = tmp1
5911   // r11 = tmp2
5912   address generate_compare_long_string_same_encoding(bool isLL) {
5913     __ align(CodeEntryAlignment);
5914     StubCodeMark mark(this, "StubRoutines", isLL
5915         ? "compare_long_string_same_encoding LL"
5916         : "compare_long_string_same_encoding UU");
5917     address entry = __ pc();
5918     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
5919         tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
5920 
5921     Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
5922 
5923     // exit from the large loop when fewer than 64 bytes are left to read or we're
5924     // about to prefetch memory past the end of the array
5925     int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
5926 
5927     // 8 bytes were already pre-loaded before jumping to the stub, so compare them directly
5928     __ eor(rscratch2, tmp1, tmp2);
5929     __ cbnz(rscratch2, CAL_DIFFERENCE);
5930 
5931     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
5932     // update pointers, because of previous read
5933     __ add(str1, str1, wordSize);
5934     __ add(str2, str2, wordSize);
5935     if (SoftwarePrefetchHintDistance >= 0) {
5936       __ align(OptoLoopAlignment);
5937       __ bind(LARGE_LOOP_PREFETCH);
5938         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
5939         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
5940 
5941         for (int i = 0; i < 4; i++) {
5942           __ ldp(tmp1, tmp1h, Address(str1, i * 16));
5943           __ ldp(tmp2, tmp2h, Address(str2, i * 16));
5944           __ cmp(tmp1, tmp2);
5945           __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5946           __ br(Assembler::NE, DIFF);
5947         }
5948         __ sub(cnt2, cnt2, isLL ? 64 : 32);
5949         __ add(str1, str1, 64);
5950         __ add(str2, str2, 64);
5951         __ subs(rscratch2, cnt2, largeLoopExitCondition);
5952         __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
5953         __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
5954     }
5955 
5956     __ subs(rscratch1, cnt2, isLL ? 16 : 8);
5957     __ br(Assembler::LE, LESS16);
5958     __ align(OptoLoopAlignment);
5959     __ bind(LOOP_COMPARE16);
5960       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5961       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5962       __ cmp(tmp1, tmp2);
5963       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5964       __ br(Assembler::NE, DIFF);
5965       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5966       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5967       __ br(Assembler::LT, LESS16);
5968 
5969       __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
5970       __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
5971       __ cmp(tmp1, tmp2);
5972       __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
5973       __ br(Assembler::NE, DIFF);
5974       __ sub(cnt2, cnt2, isLL ? 16 : 8);
5975       __ subs(rscratch2, cnt2, isLL ? 16 : 8);
5976       __ br(Assembler::GE, LOOP_COMPARE16);
5977       __ cbz(cnt2, LENGTH_DIFF);
5978 
5979     __ bind(LESS16);
5980       // compare the next 8 bytes (8 Latin1 or 4 UTF-16 chars) if available
5981       __ subs(cnt2, cnt2, isLL ? 8 : 4);
5982       __ br(Assembler::LE, LESS8);
5983       __ ldr(tmp1, Address(__ post(str1, 8)));
5984       __ ldr(tmp2, Address(__ post(str2, 8)));
5985       __ eor(rscratch2, tmp1, tmp2);
5986       __ cbnz(rscratch2, CAL_DIFFERENCE);
5987       __ sub(cnt2, cnt2, isLL ? 8 : 4);
5988 
5989     __ bind(LESS8); // directly load last 8 bytes
5990       if (!isLL) {
5991         __ add(cnt2, cnt2, cnt2);
5992       }
5993       __ ldr(tmp1, Address(str1, cnt2));
5994       __ ldr(tmp2, Address(str2, cnt2));
5995       __ eor(rscratch2, tmp1, tmp2);
5996       __ cbz(rscratch2, LENGTH_DIFF);
5997       __ b(CAL_DIFFERENCE);
5998 
5999     __ bind(DIFF);
6000       __ cmp(tmp1, tmp2);
6001       __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
6002       __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
6003       // reuse rscratch2 register for the result of eor instruction
6004       __ eor(rscratch2, tmp1, tmp2);
6005 
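    // Locate the first difference with the same rev + clz technique as in the
    // different-encoding stub, rounding the bit offset down to a byte (LL) or
    // 16-bit character (UU) boundary before extracting the differing characters.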
6006     __ bind(CAL_DIFFERENCE);
6007       __ rev(rscratch2, rscratch2);
6008       __ clz(rscratch2, rscratch2);
6009       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
6010       __ lsrv(tmp1, tmp1, rscratch2);
6011       __ lsrv(tmp2, tmp2, rscratch2);
6012       if (isLL) {
6013         __ uxtbw(tmp1, tmp1);
6014         __ uxtbw(tmp2, tmp2);
6015       } else {
6016         __ uxthw(tmp1, tmp1);
6017         __ uxthw(tmp2, tmp2);
6018       }
6019       __ subw(result, tmp1, tmp2);
6020 
6021     __ bind(LENGTH_DIFF);
6022       __ ret(lr);
6023     return entry;
6024   }
6025 
6026   enum string_compare_mode {
6027     LL,
6028     LU,
6029     UL,
6030     UU,
6031   };
6032 
6033   // The following registers are declared in aarch64.ad
6034   // r0  = result
6035   // r1  = str1
6036   // r2  = cnt1
6037   // r3  = str2
6038   // r4  = cnt2
6039   // r10 = tmp1
6040   // r11 = tmp2
6041   // z0  = ztmp1
6042   // z1  = ztmp2
6043   // p0  = pgtmp1
6044   // p1  = pgtmp2
6045   address generate_compare_long_string_sve(string_compare_mode mode) {
6046     __ align(CodeEntryAlignment);
6047     address entry = __ pc();
6048     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
6049              tmp1 = r10, tmp2 = r11;
6050 
6051     Label LOOP, DONE, MISMATCH;
6052     Register vec_len = tmp1;
6053     Register idx = tmp2;
6054     // The minimum of the string lengths has been stored in cnt2.
6055     Register cnt = cnt2;
6056     FloatRegister ztmp1 = z0, ztmp2 = z1;
6057     PRegister pgtmp1 = p0, pgtmp2 = p1;
6058 
6059 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
6060     switch (mode) {                                                            \
6061       case LL:                                                                 \
6062         __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
6063         __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
6064         break;                                                                 \
6065       case LU:                                                                 \
6066         __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
6067         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
6068         break;                                                                 \
6069       case UL:                                                                 \
6070         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
6071         __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
6072         break;                                                                 \
6073       case UU:                                                                 \
6074         __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
6075         __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
6076         break;                                                                 \
6077       default:                                                                 \
6078         ShouldNotReachHere();                                                  \
6079     }
6080 
6081     const char* stubname;
6082     switch (mode) {
6083       case LL: stubname = "compare_long_string_same_encoding LL";      break;
6084       case LU: stubname = "compare_long_string_different_encoding LU"; break;
6085       case UL: stubname = "compare_long_string_different_encoding UL"; break;
6086       case UU: stubname = "compare_long_string_same_encoding UU";      break;
6087       default: ShouldNotReachHere();
6088     }
6089 
6090     StubCodeMark mark(this, "StubRoutines", stubname);
6091 
6092     __ mov(idx, 0);
6093     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
6094 
6095     if (mode == LL) {
6096       __ sve_cntb(vec_len);
6097     } else {
6098       __ sve_cnth(vec_len);
6099     }
6100 
6101     __ sub(rscratch1, cnt, vec_len);
6102 
6103     __ bind(LOOP);
6104 
6105       // main loop
6106       LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
6107       __ add(idx, idx, vec_len);
6108       // Compare strings.
6109       __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6110       __ br(__ NE, MISMATCH);
6111       __ cmp(idx, rscratch1);
6112       __ br(__ LT, LOOP);
6113 
6114     // post loop, last iteration
6115     __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
6116 
6117     LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
6118     __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
6119     __ br(__ EQ, DONE);
6120 
6121     __ bind(MISMATCH);
6122 
6123     // Crop the predicate to locate the first mismatching element.
6124     __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
6125     // Extract the first different characters of each string.
6126     __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
6127     __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
6128 
6129     // Compute the difference of the first different characters.
6130     __ sub(result, rscratch1, rscratch2);
6131 
6132     __ bind(DONE);
6133     __ ret(lr);
6134 #undef LOAD_PAIR
6135     return entry;
6136   }
6137 
6138   void generate_compare_long_strings() {
6139     if (UseSVE == 0) {
6140       StubRoutines::aarch64::_compare_long_string_LL
6141           = generate_compare_long_string_same_encoding(true);
6142       StubRoutines::aarch64::_compare_long_string_UU
6143           = generate_compare_long_string_same_encoding(false);
6144       StubRoutines::aarch64::_compare_long_string_LU
6145           = generate_compare_long_string_different_encoding(true);
6146       StubRoutines::aarch64::_compare_long_string_UL
6147           = generate_compare_long_string_different_encoding(false);
6148     } else {
6149       StubRoutines::aarch64::_compare_long_string_LL
6150           = generate_compare_long_string_sve(LL);
6151       StubRoutines::aarch64::_compare_long_string_UU
6152           = generate_compare_long_string_sve(UU);
6153       StubRoutines::aarch64::_compare_long_string_LU
6154           = generate_compare_long_string_sve(LU);
6155       StubRoutines::aarch64::_compare_long_string_UL
6156           = generate_compare_long_string_sve(UL);
6157     }
6158   }
6159 
6160   // R0 = result
6161   // R1 = str2
6162   // R2 = cnt1
6163   // R3 = str1
6164   // R4 = cnt2
6165   // Clobbers: rscratch1, rscratch2, v0, v1, rflags
6166   //
6167   // This generic linear code uses a few additional ideas that make it faster:
6168   // 1) we can safely keep at least the 1st register of the pattern (since
6169   // length >= 8) and so skip the initial load (helps on systems with 1 ld pipeline)
6170   // 2) we can use a "fast" algorithm for finding the 1st character: search with
6171   // fewer branches (1 branch per loaded register instead of 1 branch per
6172   // symbol); this is where constants like
6173   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
6174   // 3) after loading and analyzing the 1st register of the source string, it can
6175   // be used to search for every occurrence of the 1st pattern character, saving
6176   // a few loads compared with a "simpler-but-slower" implementation
6177   // 4) in order to avoid lots of push/pop operations, the code below heavily
6178   // re-uses/re-initializes/compresses register values, which makes the code
6179   // larger and a bit less readable; however, most of the extra operations are
6180   // issued during loads or branches, so the penalty is minimal
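  //
  // The single-character search in 2) is the classic zero-lane detection trick:
  // with x = loaded_chars ^ (1st pattern character repeated in every lane),
  //   (x - 0x0101..01) & ~x & 0x8080..80
  // is non-zero iff some lane of x is zero, and its lowest set bit marks the
  // first such lane, i.e. the first occurrence of the pattern character. The
  // code computes the equivalent form (x - 0x0101..01) & ~(x | 0x7f7f..7f) with
  // sub/orr/bic(s), and uses 0x0001..0001 / 0x7fff..7fff for 16-bit lanes.
  // Candidate positions are then verified by the character-compare loops.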
6181   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
6182     const char* stubName = str1_isL
6183         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
6184         : "indexof_linear_uu";
6185     __ align(CodeEntryAlignment);
6186     StubCodeMark mark(this, "StubRoutines", stubName);
6187     address entry = __ pc();
6188 
6189     int str1_chr_size = str1_isL ? 1 : 2;
6190     int str2_chr_size = str2_isL ? 1 : 2;
6191     int str1_chr_shift = str1_isL ? 0 : 1;
6192     int str2_chr_shift = str2_isL ? 0 : 1;
6193     bool isL = str1_isL && str2_isL;
6194     // parameters
6195     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
6196     // temporary registers
6197     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
6198     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
6199     // redefinitions
6200     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
6201 
6202     __ push(spilled_regs, sp);
6203     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
6204         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
6205         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
6206         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
6207         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
6208         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
6209     // Read a whole register from str1. It is safe because length >= 8 here
6210     __ ldr(ch1, Address(str1));
6211     // Read a whole register from str2. It is safe because length >= 8 here
6212     __ ldr(ch2, Address(str2));
6213     __ sub(cnt2, cnt2, cnt1);
6214     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
6215     if (str1_isL != str2_isL) {
6216       __ eor(v0, __ T16B, v0, v0);
6217     }
6218     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
6219     __ mul(first, first, tmp1);
6220     // check if we have less than 1 register's worth of characters left to check
6221     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
6222     if (str1_isL != str2_isL) {
6223       __ fmovd(v1, ch1);
6224     }
6225     __ br(__ LE, L_SMALL);
6226     __ eor(ch2, first, ch2);
6227     if (str1_isL != str2_isL) {
6228       __ zip1(v1, __ T16B, v1, v0);
6229     }
6230     __ sub(tmp2, ch2, tmp1);
6231     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6232     __ bics(tmp2, tmp2, ch2);
6233     if (str1_isL != str2_isL) {
6234       __ fmovd(ch1, v1);
6235     }
6236     __ br(__ NE, L_HAS_ZERO);
6237     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
6238     __ add(result, result, wordSize/str2_chr_size);
6239     __ add(str2, str2, wordSize);
6240     __ br(__ LT, L_POST_LOOP);
6241     __ BIND(L_LOOP);
6242       __ ldr(ch2, Address(str2));
6243       __ eor(ch2, first, ch2);
6244       __ sub(tmp2, ch2, tmp1);
6245       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6246       __ bics(tmp2, tmp2, ch2);
6247       __ br(__ NE, L_HAS_ZERO);
6248     __ BIND(L_LOOP_PROCEED);
6249       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
6250       __ add(str2, str2, wordSize);
6251       __ add(result, result, wordSize/str2_chr_size);
6252       __ br(__ GE, L_LOOP);
6253     __ BIND(L_POST_LOOP);
6254       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
6255       __ br(__ LE, NOMATCH);
6256       __ ldr(ch2, Address(str2));
6257       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
6258       __ eor(ch2, first, ch2);
6259       __ sub(tmp2, ch2, tmp1);
6260       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6261       __ mov(tmp4, -1); // all bits set
6262       __ b(L_SMALL_PROCEED);
6263     __ align(OptoLoopAlignment);
6264     __ BIND(L_SMALL);
6265       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
6266       __ eor(ch2, first, ch2);
6267       if (str1_isL != str2_isL) {
6268         __ zip1(v1, __ T16B, v1, v0);
6269       }
6270       __ sub(tmp2, ch2, tmp1);
6271       __ mov(tmp4, -1); // all bits set
6272       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
6273       if (str1_isL != str2_isL) {
6274         __ fmovd(ch1, v1); // move converted 4 symbols
6275       }
6276     __ BIND(L_SMALL_PROCEED);
6277       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
6278       __ bic(tmp2, tmp2, ch2);
6279       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
6280       __ rbit(tmp2, tmp2);
6281       __ br(__ EQ, NOMATCH);
6282     __ BIND(L_SMALL_HAS_ZERO_LOOP);
6283       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
6284       __ cmp(cnt1, u1(wordSize/str2_chr_size));
6285       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
6286       if (str2_isL) { // LL
6287         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6288         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6289         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6290         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6291         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6292       } else {
6293         __ mov(ch2, 0xE); // mask to round the byte offset down to an even value (UTF-16 char boundary)
6294         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6295         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6296         __ lslv(tmp2, tmp2, tmp4);
6297         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6298         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6299         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6300         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6301       }
6302       __ cmp(ch1, ch2);
6303       __ mov(tmp4, wordSize/str2_chr_size);
6304       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6305     __ BIND(L_SMALL_CMP_LOOP);
6306       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6307                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6308       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6309                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6310       __ add(tmp4, tmp4, 1);
6311       __ cmp(tmp4, cnt1);
6312       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
6313       __ cmp(first, ch2);
6314       __ br(__ EQ, L_SMALL_CMP_LOOP);
6315     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
6316       __ cbz(tmp2, NOMATCH); // no more matches. exit
6317       __ clz(tmp4, tmp2);
6318       __ add(result, result, 1); // advance index
6319       __ add(str2, str2, str2_chr_size); // advance pointer
6320       __ b(L_SMALL_HAS_ZERO_LOOP);
6321     __ align(OptoLoopAlignment);
6322     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
6323       __ cmp(first, ch2);
6324       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6325       __ b(DONE);
6326     __ align(OptoLoopAlignment);
6327     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
6328       if (str2_isL) { // LL
6329         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
6330         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
6331         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
6332         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
6333         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6334       } else {
6335         __ mov(ch2, 0xE); // all bits in byte set except last one
6336         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6337         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6338         __ lslv(tmp2, tmp2, tmp4);
6339         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6340         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6341         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
6342         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6343       }
6344       __ cmp(ch1, ch2);
6345       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
6346       __ b(DONE);
6347     __ align(OptoLoopAlignment);
6348     __ BIND(L_HAS_ZERO);
6349       __ rbit(tmp2, tmp2);
6350       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
6351       // Now, compress the counters (cnt2 and cnt1) into one register.
6352       // This is fine because both counters are 32-bit and are not changed in this
6353       // loop; just restore them on exit. So, cnt1 can be re-used in this loop.
6354       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
6355       __ sub(result, result, 1);
6356     __ BIND(L_HAS_ZERO_LOOP);
6357       __ mov(cnt1, wordSize/str2_chr_size);
6358       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6359       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
6360       if (str2_isL) {
6361         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6362         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6363         __ lslv(tmp2, tmp2, tmp4);
6364         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6365         __ add(tmp4, tmp4, 1);
6366         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6367         __ lsl(tmp2, tmp2, 1);
6368         __ mov(tmp4, wordSize/str2_chr_size);
6369       } else {
6370         __ mov(ch2, 0xE);
6371         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6372         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6373         __ lslv(tmp2, tmp2, tmp4);
6374         __ add(tmp4, tmp4, 1);
6375         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6376         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6377         __ lsl(tmp2, tmp2, 1);
6378         __ mov(tmp4, wordSize/str2_chr_size);
6379         __ sub(str2, str2, str2_chr_size);
6380       }
6381       __ cmp(ch1, ch2);
6382       __ mov(tmp4, wordSize/str2_chr_size);
6383       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6384     __ BIND(L_CMP_LOOP);
6385       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
6386                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
6387       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
6388                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
6389       __ add(tmp4, tmp4, 1);
6390       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
6391       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
6392       __ cmp(cnt1, ch2);
6393       __ br(__ EQ, L_CMP_LOOP);
6394     __ BIND(L_CMP_LOOP_NOMATCH);
6395       // here the candidate position did not match
6396       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
6397       __ clz(tmp4, tmp2);
6398       __ add(str2, str2, str2_chr_size); // advance pointer
6399       __ b(L_HAS_ZERO_LOOP);
6400     __ align(OptoLoopAlignment);
6401     __ BIND(L_CMP_LOOP_LAST_CMP);
6402       __ cmp(cnt1, ch2);
6403       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6404       __ b(DONE);
6405     __ align(OptoLoopAlignment);
6406     __ BIND(L_CMP_LOOP_LAST_CMP2);
6407       if (str2_isL) {
6408         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
6409         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6410         __ lslv(tmp2, tmp2, tmp4);
6411         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6412         __ add(tmp4, tmp4, 1);
6413         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6414         __ lsl(tmp2, tmp2, 1);
6415       } else {
6416         __ mov(ch2, 0xE);
6417         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
6418         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
6419         __ lslv(tmp2, tmp2, tmp4);
6420         __ add(tmp4, tmp4, 1);
6421         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
6422         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
6423         __ lsl(tmp2, tmp2, 1);
6424         __ sub(str2, str2, str2_chr_size);
6425       }
6426       __ cmp(ch1, ch2);
6427       __ br(__ NE, L_CMP_LOOP_NOMATCH);
6428       __ b(DONE);
6429     __ align(OptoLoopAlignment);
6430     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
6431       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
6432       // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
6433       // so result was increased by at most wordSize/str2_chr_size - 1 and the
6434       // respective high bits were not changed. L_LOOP_PROCEED will increase result
6435       // by the number of analyzed characters, so we can just reset the lower bits
6436       // of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
6437       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
6438       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or & 3 (UU/UL) is
6439       // the index of the last analyzed substring inside the current octet, so str2
6440       // is at the respective start address; we need to advance it to the next octet.
6441       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
6442       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
6443       __ bfm(result, zr, 0, 2 - str2_chr_shift);
6444       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
6445       __ movw(cnt2, cnt2);
6446       __ b(L_LOOP_PROCEED);
6447     __ align(OptoLoopAlignment);
6448     __ BIND(NOMATCH);
6449       __ mov(result, -1);
6450     __ BIND(DONE);
6451       __ pop(spilled_regs, sp);
6452       __ ret(lr);
6453     return entry;
6454   }
6455 
6456   void generate_string_indexof_stubs() {
6457     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
6458     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
6459     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
6460   }
6461 
6462   void inflate_and_store_2_fp_registers(bool generatePrfm,
6463       FloatRegister src1, FloatRegister src2) {
6464     Register dst = r1;
6465     __ zip1(v1, __ T16B, src1, v0);
6466     __ zip2(v2, __ T16B, src1, v0);
6467     if (generatePrfm) {
6468       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
6469     }
6470     __ zip1(v3, __ T16B, src2, v0);
6471     __ zip2(v4, __ T16B, src2, v0);
6472     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
6473   }
6474 
6475   // R0 = src
6476   // R1 = dst
6477   // R2 = len
6478   // R3 = len >> 3
6479   // V0 = 0
6480   // v1 = loaded 8 bytes
6481   // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
6482   address generate_large_byte_array_inflate() {
6483     __ align(CodeEntryAlignment);
6484     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
6485     address entry = __ pc();
6486     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
6487     Register src = r0, dst = r1, len = r2, octetCounter = r3;
6488     const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
6489 
6490     // do one more 8-byte read so that the address is 16-byte aligned in most cases;
6491     // this also lets us use a single store instruction
6492     __ ldrd(v2, __ post(src, 8));
6493     __ sub(octetCounter, octetCounter, 2);
6494     __ zip1(v1, __ T16B, v1, v0);
6495     __ zip1(v2, __ T16B, v2, v0);
6496     __ st1(v1, v2, __ T16B, __ post(dst, 32));
6497     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6498     __ subs(rscratch1, octetCounter, large_loop_threshold);
6499     __ br(__ LE, LOOP_START);
6500     __ b(LOOP_PRFM_START);
6501     __ bind(LOOP_PRFM);
6502       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6503     __ bind(LOOP_PRFM_START);
6504       __ prfm(Address(src, SoftwarePrefetchHintDistance));
6505       __ sub(octetCounter, octetCounter, 8);
6506       __ subs(rscratch1, octetCounter, large_loop_threshold);
6507       inflate_and_store_2_fp_registers(true, v3, v4);
6508       inflate_and_store_2_fp_registers(true, v5, v6);
6509       __ br(__ GT, LOOP_PRFM);
6510       __ cmp(octetCounter, (u1)8);
6511       __ br(__ LT, DONE);
6512     __ bind(LOOP);
6513       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
6514       __ bind(LOOP_START);
6515       __ sub(octetCounter, octetCounter, 8);
6516       __ cmp(octetCounter, (u1)8);
6517       inflate_and_store_2_fp_registers(false, v3, v4);
6518       inflate_and_store_2_fp_registers(false, v5, v6);
6519       __ br(__ GE, LOOP);
6520     __ bind(DONE);
6521       __ ret(lr);
6522     return entry;
6523   }
6524 
6525   /**
6526    *  Arguments:
6527    *
6528    *  Input:
6529    *  c_rarg0   - current state address
6530    *  c_rarg1   - H key address
6531    *  c_rarg2   - data address
6532    *  c_rarg3   - number of blocks
6533    *
6534    *  Output:
6535    *  Updated state at c_rarg0
6536    */
6537   address generate_ghash_processBlocks() {
6538     // Bafflingly, GCM uses little-endian for the byte order, but
6539     // big-endian for the bit order.  For example, the polynomial 1 is
6540     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
6541     //
6542     // So, we must either reverse the bytes in each word and do
6543     // everything big-endian or reverse the bits in each byte and do
6544     // it little-endian.  On AArch64 it's more idiomatic to reverse
6545     // the bits in each byte (we have an instruction, RBIT, to do
6546     // that) and keep the data in little-endian bit order through the
6547     // calculation, bit-reversing the inputs and outputs.
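    //
    // With that representation, each loop iteration below performs the standard
    // GHASH update state <- (state ^ data[i]) * H in GF(2^128), reducing the
    // product modulo the field polynomial x^128 + x^7 + x^2 + x + 1, whose
    // low-order terms are the 0x87 constant emitted just below.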
6548 
6549     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
6550     __ align(wordSize * 2);
6551     address p = __ pc();
6552     __ emit_int64(0x87);  // The low-order bits of the field
6553                           // polynomial (i.e. p = z^7+z^2+z+1)
6554                           // repeated in the low and high parts of a
6555                           // 128-bit vector
6556     __ emit_int64(0x87);
6557 
6558     __ align(CodeEntryAlignment);
6559     address start = __ pc();
6560 
6561     Register state   = c_rarg0;
6562     Register subkeyH = c_rarg1;
6563     Register data    = c_rarg2;
6564     Register blocks  = c_rarg3;
6565 
6566     FloatRegister vzr = v30;
6567     __ eor(vzr, __ T16B, vzr, vzr); // zero register
6568 
6569     __ ldrq(v24, p);    // The field polynomial
6570 
6571     __ ldrq(v0, Address(state));
6572     __ ldrq(v1, Address(subkeyH));
6573 
6574     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
6575     __ rbit(v0, __ T16B, v0);
6576     __ rev64(v1, __ T16B, v1);
6577     __ rbit(v1, __ T16B, v1);
6578 
6579     __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
6580     __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
6581 
6582     {
6583       Label L_ghash_loop;
6584       __ bind(L_ghash_loop);
6585 
6586       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
6587                                                  // reversing each byte
6588       __ rbit(v2, __ T16B, v2);
6589       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
6590 
6591       // Multiply state in v2 by subkey in v1
6592       __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
6593                         /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
6594                         /*temps*/v6, v3, /*reuse/clobber b*/v2);
6595       // Reduce v7:v5 by the field polynomial
6596       __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
6597 
6598       __ sub(blocks, blocks, 1);
6599       __ cbnz(blocks, L_ghash_loop);
6600     }
6601 
6602     // The bit-reversed result is at this point in v0
6603     __ rev64(v0, __ T16B, v0);
6604     __ rbit(v0, __ T16B, v0);
6605 
6606     __ st1(v0, __ T16B, state);
6607     __ ret(lr);
6608 
6609     return start;
6610   }
6611 
6612   address generate_ghash_processBlocks_wide() {
6613     address small = generate_ghash_processBlocks();
6614 
6615     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
6616     __ align(wordSize * 2);
6617     address p = __ pc();
6618     __ emit_int64(0x87);  // The low-order bits of the field
6619                           // polynomial (i.e. p = z^7+z^2+z+1)
6620                           // repeated in the low and high parts of a
6621                           // 128-bit vector
6622     __ emit_int64(0x87);
6623 
6624     __ align(CodeEntryAlignment);
6625     address start = __ pc();
6626 
6627     Register state   = c_rarg0;
6628     Register subkeyH = c_rarg1;
6629     Register data    = c_rarg2;
6630     Register blocks  = c_rarg3;
6631 
6632     const int unroll = 4;
6633 
6634     __ cmp(blocks, (unsigned char)(unroll * 2));
6635     __ br(__ LT, small);
6636 
6637     if (unroll > 1) {
6638       // Save the SIMD state (v8..v15 are callee-saved) before entering the routine
6639       __ sub(sp, sp, 4 * 16);
6640       __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
6641       __ sub(sp, sp, 4 * 16);
6642       __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
6643     }
6644 
6645     __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
6646 
6647     if (unroll > 1) {
6648       // And restore state
6649       __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
6650       __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
6651     }
6652 
6653     __ cmp(blocks, (unsigned char)0);
6654     __ br(__ GT, small);
6655 
6656     __ ret(lr);
6657 
6658     return start;
6659   }
6660 
6661   void generate_base64_encode_simdround(Register src, Register dst,
6662         FloatRegister codec, u8 size) {
6663 
6664     FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
6665     FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
6666     FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
6667 
6668     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6669 
6670     __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
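    // ld3 de-interleaves the input, so each lane of in0/in1/in2 holds byte
    // b0/b1/b2 of one 3-byte group. The shifts below build the four 6-bit
    // codec indices using only 8-bit lane shifts and ORs:
    //   ind0 = b0 >> 2
    //   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4)
    //   ind2 = ((b1 & 0xf) << 2) | (b2 >> 6)
    //   ind3 = b2 & 0x3f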
6671 
6672     __ ushr(ind0, arrangement, in0,  2);
6673 
6674     __ ushr(ind1, arrangement, in1,  2);
6675     __ shl(in0,   arrangement, in0,  6);
6676     __ orr(ind1,  arrangement, ind1, in0);
6677     __ ushr(ind1, arrangement, ind1, 2);
6678 
6679     __ ushr(ind2, arrangement, in2,  4);
6680     __ shl(in1,   arrangement, in1,  4);
6681     __ orr(ind2,  arrangement, in1,  ind2);
6682     __ ushr(ind2, arrangement, ind2, 2);
6683 
6684     __ shl(ind3,  arrangement, in2,  2);
6685     __ ushr(ind3, arrangement, ind3, 2);
6686 
6687     __ tbl(out0,  arrangement, codec,  4, ind0);
6688     __ tbl(out1,  arrangement, codec,  4, ind1);
6689     __ tbl(out2,  arrangement, codec,  4, ind2);
6690     __ tbl(out3,  arrangement, codec,  4, ind3);
6691 
6692     __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
6693   }
6694 
6695    /**
6696    *  Arguments:
6697    *
6698    *  Input:
6699    *  c_rarg0   - src_start
6700    *  c_rarg1   - src_offset
6701    *  c_rarg2   - src_length
6702    *  c_rarg3   - dest_start
6703    *  c_rarg4   - dest_offset
6704    *  c_rarg5   - isURL
6705    *
6706    */
6707   address generate_base64_encodeBlock() {
6708 
6709     static const char toBase64[64] = {
6710       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6711       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6712       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6713       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6714       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6715     };
6716 
6717     static const char toBase64URL[64] = {
6718       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6719       'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6720       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6721       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6722       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6723     };
6724 
6725     __ align(CodeEntryAlignment);
6726     StubCodeMark mark(this, "StubRoutines", "encodeBlock");
6727     address start = __ pc();
6728 
6729     Register src   = c_rarg0;  // source array
6730     Register soff  = c_rarg1;  // source start offset
6731     Register send  = c_rarg2;  // source end offset
6732     Register dst   = c_rarg3;  // dest array
6733     Register doff  = c_rarg4;  // position for writing to dest array
6734     Register isURL = c_rarg5;  // Base64 or URL character set
6735 
6736     // c_rarg6 and c_rarg7 are free to use as temps
6737     Register codec  = c_rarg6;
6738     Register length = c_rarg7;
6739 
6740     Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
6741 
6742     __ add(src, src, soff);
6743     __ add(dst, dst, doff);
6744     __ sub(length, send, soff);
6745 
6746     // load the codec base address
6747     __ lea(codec, ExternalAddress((address) toBase64));
6748     __ cbz(isURL, ProcessData);
6749     __ lea(codec, ExternalAddress((address) toBase64URL));
6750 
6751     __ BIND(ProcessData);
6752 
6753     // too short to form a SIMD loop; fall back to the scalar 3-byte loop
6754     __ cmp(length, (u1)24);
6755     __ br(Assembler::LT, Process3B);
6756 
6757     __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
6758 
6759     __ BIND(Process48B);
6760     __ cmp(length, (u1)48);
6761     __ br(Assembler::LT, Process24B);
6762     generate_base64_encode_simdround(src, dst, v0, 16);
6763     __ sub(length, length, 48);
6764     __ b(Process48B);
6765 
6766     __ BIND(Process24B);
6767     __ cmp(length, (u1)24);
6768     __ br(Assembler::LT, SIMDExit);
6769     generate_base64_encode_simdround(src, dst, v0, 8);
6770     __ sub(length, length, 24);
6771 
6772     __ BIND(SIMDExit);
6773     __ cbz(length, Exit);
6774 
6775     __ BIND(Process3B);
6776     //  3 src bytes, 24 bits
6777     __ ldrb(r10, __ post(src, 1));
6778     __ ldrb(r11, __ post(src, 1));
6779     __ ldrb(r12, __ post(src, 1));
6780     __ orrw(r11, r11, r10, Assembler::LSL, 8);
6781     __ orrw(r12, r12, r11, Assembler::LSL, 8);
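    // r12 now holds the 3 source bytes as (b0 << 16) | (b1 << 8) | b2; the
    // ubfm/and instructions below extract the bit ranges [23:18], [17:12],
    // [11:6] and [5:0], i.e. the four 6-bit codec indices for these 3 bytes.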
6782     // codec index
6783     __ ubfmw(r15, r12, 18, 23);
6784     __ ubfmw(r14, r12, 12, 17);
6785     __ ubfmw(r13, r12, 6,  11);
6786     __ andw(r12,  r12, 63);
6787     // get the code based on the codec
6788     __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
6789     __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
6790     __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
6791     __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
6792     __ strb(r15, __ post(dst, 1));
6793     __ strb(r14, __ post(dst, 1));
6794     __ strb(r13, __ post(dst, 1));
6795     __ strb(r12, __ post(dst, 1));
6796     __ sub(length, length, 3);
6797     __ cbnz(length, Process3B);
6798 
6799     __ BIND(Exit);
6800     __ ret(lr);
6801 
6802     return start;
6803   }
6804 
6805   void generate_base64_decode_simdround(Register src, Register dst,
6806         FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
6807 
6808     FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
6809     FloatRegister out0 = v20, out1 = v21, out2 = v22;
6810 
6811     FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
6812     FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
6813 
6814     Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
6815 
6816     Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
6817 
6818     __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
6819 
6820     // we need an unsigned saturating subtract to make sure all input values
6821     // in the range [0, 63] map to index 0 in the higher-half lookup
6822     __ uqsubv(decH0, __ T16B, in0, v27);
6823     __ uqsubv(decH1, __ T16B, in1, v27);
6824     __ uqsubv(decH2, __ T16B, in2, v27);
6825     __ uqsubv(decH3, __ T16B, in3, v27);
6826 
6827     // lower half lookup
6828     __ tbl(decL0, arrangement, codecL, 4, in0);
6829     __ tbl(decL1, arrangement, codecL, 4, in1);
6830     __ tbl(decL2, arrangement, codecL, 4, in2);
6831     __ tbl(decL3, arrangement, codecL, 4, in3);
6832 
6833     // higher half lookup
6834     __ tbx(decH0, arrangement, codecH, 4, decH0);
6835     __ tbx(decH1, arrangement, codecH, 4, decH1);
6836     __ tbx(decH2, arrangement, codecH, 4, decH2);
6837     __ tbx(decH3, arrangement, codecH, 4, decH3);
6838 
6839     // combine lower and higher
6840     __ orr(decL0, arrangement, decL0, decH0);
6841     __ orr(decL1, arrangement, decL1, decH1);
6842     __ orr(decL2, arrangement, decL2, decH2);
6843     __ orr(decL3, arrangement, decL3, decH3);
6844 
6845     // check for illegal inputs, i.e. values larger than 63 (the maximum of 6 bits)
6846     __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
6847     __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
6848     __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
6849     __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
6850     __ orr(in0, arrangement, decH0, decH1);
6851     __ orr(in1, arrangement, decH2, decH3);
6852     __ orr(in2, arrangement, in0,   in1);
6853     __ umaxv(in3, arrangement, in2);
6854     __ umov(rscratch2, in3, __ B, 0);
6855 
6856     // get the data to output
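    // Each group of four 6-bit values is packed back into 3 output bytes:
    //   out0 = (decL0 << 2) | (decL1 >> 4)
    //   out1 = (decL1 << 4) | (decL2 >> 2)
    //   out2 = (decL2 << 6) | decL3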
6857     __ shl(out0,  arrangement, decL0, 2);
6858     __ ushr(out1, arrangement, decL1, 4);
6859     __ orr(out0,  arrangement, out0,  out1);
6860     __ shl(out1,  arrangement, decL1, 4);
6861     __ ushr(out2, arrangement, decL2, 2);
6862     __ orr(out1,  arrangement, out1,  out2);
6863     __ shl(out2,  arrangement, decL2, 6);
6864     __ orr(out2,  arrangement, out2,  decL3);
6865 
6866     __ cbz(rscratch2, NoIllegalData);
6867 
6868     // handle illegal input
6869     __ umov(r10, in2, __ D, 0);
6870     if (size == 16) {
6871       __ cbnz(r10, ErrorInLowerHalf);
6872 
6873       // illegal input is in higher half, store the lower half now.
6874       __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
6875 
6876       __ umov(r10, in2,  __ D, 1);
6877       __ umov(r11, out0, __ D, 1);
6878       __ umov(r12, out1, __ D, 1);
6879       __ umov(r13, out2, __ D, 1);
6880       __ b(StoreLegalData);
6881 
6882       __ BIND(ErrorInLowerHalf);
6883     }
6884     __ umov(r11, out0, __ D, 0);
6885     __ umov(r12, out1, __ D, 0);
6886     __ umov(r13, out2, __ D, 0);
6887 
6888     __ BIND(StoreLegalData);
6889     __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
6890     __ strb(r11, __ post(dst, 1));
6891     __ strb(r12, __ post(dst, 1));
6892     __ strb(r13, __ post(dst, 1));
6893     __ lsr(r10, r10, 8);
6894     __ lsr(r11, r11, 8);
6895     __ lsr(r12, r12, 8);
6896     __ lsr(r13, r13, 8);
6897     __ b(StoreLegalData);
6898 
6899     __ BIND(NoIllegalData);
6900     __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
6901   }
6902 
6903 
6904    /**
6905    *  Arguments:
6906    *
6907    *  Input:
6908    *  c_rarg0   - src_start
6909    *  c_rarg1   - src_offset
6910    *  c_rarg2   - src_length
6911    *  c_rarg3   - dest_start
6912    *  c_rarg4   - dest_offset
6913    *  c_rarg5   - isURL
6914    *  c_rarg6   - isMIME
6915    *
6916    */
6917   address generate_base64_decodeBlock() {
6918 
6919     // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
6920     // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
6921     // titled "Base64 decoding".
6922 
6923     // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
6924     // except that the trailing character '=' is also treated as an illegal value in this intrinsic.
6925     // That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
6926     static const uint8_t fromBase64ForNoSIMD[256] = {
6927       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6928       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6929       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6930        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6931       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6932        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u, 255u,
6933       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6934        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6935       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6936       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6937       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6938       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6939       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6940       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6941       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6942       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6943     };
6944 
6945     static const uint8_t fromBase64URLForNoSIMD[256] = {
6946       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6947       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6948       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6949        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6950       255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,  14u,
6951        15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,  63u,
6952       255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,  40u,
6953        41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u, 255u,
6954       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6955       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6956       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6957       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6958       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6959       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6960       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6961       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6962     };
6963 
6964     // A legal Base64 code value is in the range [0, 127].  We need two lookups
6965     // with tbl/tbx and combine them to get the decoded data. The 1st table vector
6966     // lookup uses tbl, so out-of-range indices are set to 0 in the destination. The
6967     // 2nd table vector lookup uses tbx, so out-of-range indices leave the destination
6968     // unchanged. Input [64..126] is mapped to index [65..127] in the second lookup.
6969     // The value at index 64 is set to 0, so that we know that we already got the
6970     // decoded data with the 1st lookup.
6971     static const uint8_t fromBase64ForSIMD[128] = {
6972       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6973       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6974       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u, 255u,  63u,
6975        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6976         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6977        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6978       255u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6979        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6980     };
6981 
6982     static const uint8_t fromBase64URLForSIMD[128] = {
6983       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6984       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6985       255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,  62u, 255u, 255u,
6986        52u,  53u,  54u,  55u,  56u,  57u,  58u,  59u,  60u,  61u, 255u, 255u, 255u, 255u, 255u, 255u,
6987         0u, 255u,   0u,   1u,   2u,   3u,   4u,   5u,   6u,   7u,   8u,   9u,  10u,  11u,  12u,  13u,
6988        14u,  15u,  16u,  17u,  18u,  19u,  20u,  21u,  22u,  23u,  24u,  25u, 255u, 255u, 255u, 255u,
6989        63u, 255u,  26u,  27u,  28u,  29u,  30u,  31u,  32u,  33u,  34u,  35u,  36u,  37u,  38u,  39u,
6990        40u,  41u,  42u,  43u,  44u,  45u,  46u,  47u,  48u,  49u,  50u,  51u, 255u, 255u, 255u, 255u,
6991     };
6992 
6993     __ align(CodeEntryAlignment);
6994     StubCodeMark mark(this, "StubRoutines", "decodeBlock");
6995     address start = __ pc();
6996 
6997     Register src    = c_rarg0;  // source array
6998     Register soff   = c_rarg1;  // source start offset
6999     Register send   = c_rarg2;  // source end offset
7000     Register dst    = c_rarg3;  // dest array
7001     Register doff   = c_rarg4;  // position for writing to dest array
7002     Register isURL  = c_rarg5;  // Base64 or URL character set
7003     Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation
7004 
7005     Register length = send;    // reuse send as length of source data to process
7006 
7007     Register simd_codec   = c_rarg6;
7008     Register nosimd_codec = c_rarg7;
7009 
7010     Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
7011 
7012     __ enter();
7013 
7014     __ add(src, src, soff);
7015     __ add(dst, dst, doff);
7016 
7017     __ mov(doff, dst);
7018 
7019     __ sub(length, send, soff);
7020     __ bfm(length, zr, 0, 1);
7021 
7022     __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
7023     __ cbz(isURL, ProcessData);
7024     __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
7025 
7026     __ BIND(ProcessData);
7027     __ mov(rscratch1, length);
7028     __ cmp(length, (u1)144); // 144 = 80 + 64
7029     __ br(Assembler::LT, Process4B);
7030 
7031     // In the MIME case, the line length cannot be more than 76
7032     // bytes (see RFC 2045). This is too short a block for SIMD
7033     // to be worthwhile, so we use non-SIMD here.
7034     __ movw(rscratch1, 79);
7035 
7036     __ BIND(Process4B);
7037     __ ldrw(r14, __ post(src, 4));
7038     __ ubfxw(r10, r14, 0,  8);
7039     __ ubfxw(r11, r14, 8,  8);
7040     __ ubfxw(r12, r14, 16, 8);
7041     __ ubfxw(r13, r14, 24, 8);
7042     // look up the decoded 6-bit values
7043     __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
7044     __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
7045     __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
7046     __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
7047     // error detection, 255u indicates an illegal input
7048     __ orrw(r14, r10, r11);
7049     __ orrw(r15, r12, r13);
7050     __ orrw(r14, r14, r15);
7051     __ tbnz(r14, 7, Exit);
7052     // recover the data
7053     __ lslw(r14, r10, 10);
7054     __ bfiw(r14, r11, 4, 6);
7055     __ bfmw(r14, r12, 2, 5);
7056     __ rev16w(r14, r14);
7057     __ bfiw(r13, r12, 6, 2);
7058     __ strh(r14, __ post(dst, 2));
7059     __ strb(r13, __ post(dst, 1));
7060     // non-simd loop
7061     __ subsw(rscratch1, rscratch1, 4);
7062     __ br(Assembler::GT, Process4B);
7063 
7064     // if we exited from the 80-byte pre-processing above (rscratch1 was set to 79),
7065     // rscratch1 == -1; otherwise, rscratch1 == 0.
7066     __ cbzw(rscratch1, Exit);
7067     __ sub(length, length, 80);
7068 
7069     __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
7070     __ cbz(isURL, SIMDEnter);
7071     __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
7072 
7073     __ BIND(SIMDEnter);
7074     __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
7075     __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
7076     __ mov(rscratch1, 63);
7077     __ dup(v27, __ T16B, rscratch1);
7078 
7079     __ BIND(Process64B);
7080     __ cmp(length, (u1)64);
7081     __ br(Assembler::LT, Process32B);
7082     generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
7083     __ sub(length, length, 64);
7084     __ b(Process64B);
7085 
7086     __ BIND(Process32B);
7087     __ cmp(length, (u1)32);
7088     __ br(Assembler::LT, SIMDExit);
7089     generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
7090     __ sub(length, length, 32);
7091     __ b(Process32B);
7092 
7093     __ BIND(SIMDExit);
7094     __ cbz(length, Exit);
7095     __ movw(rscratch1, length);
7096     __ b(Process4B);
7097 
7098     __ BIND(Exit);
7099     __ sub(c_rarg0, dst, doff);
7100 
7101     __ leave();
7102     __ ret(lr);
7103 
7104     return start;
7105   }
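
  // In C, approximately, the scalar (non-SIMD) part of the decode loop above
  // handles each group of four input characters as follows (a rough sketch;
  // the helper name and signature are illustrative only):
  //
  //   int decode4(const uint8_t *src, uint8_t *dst, const uint8_t table[256]) {
  //     uint8_t d0 = table[src[0]], d1 = table[src[1]],
  //             d2 = table[src[2]], d3 = table[src[3]];
  //     if ((d0 | d1 | d2 | d3) & 0x80)   // 255u marks an illegal character
  //       return -1;
  //     dst[0] = (uint8_t)((d0 << 2) | (d1 >> 4));
  //     dst[1] = (uint8_t)((d1 << 4) | (d2 >> 2));
  //     dst[2] = (uint8_t)((d2 << 6) | d3);
  //     return 0;
  //   }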
7106 
7107   // Support for spin waits.
7108   address generate_spin_wait() {
7109     __ align(CodeEntryAlignment);
7110     StubCodeMark mark(this, "StubRoutines", "spin_wait");
7111     address start = __ pc();
7112 
7113     __ spin_wait();
7114     __ ret(lr);
7115 
7116     return start;
7117   }
7118 
7119   address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
7120     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table");
7121 
7122     address start = __ pc();
7123     const Register
7124       r_super_klass  = r0,
7125       r_array_base   = r1,
7126       r_array_length = r2,
7127       r_array_index  = r3,
7128       r_sub_klass    = r4,
7129       r_bitmap       = rscratch2,
7130       result         = r5;
7131     const FloatRegister
7132       vtemp          = v0;
7133 
7134     Label L_success;
7135     __ enter();
7136     __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
7137                                            r_array_base, r_array_length, r_array_index,
7138                                            vtemp, result, super_klass_index,
7139                                            /*stub_is_near*/true);
7140     __ leave();
7141     __ ret(lr);
7142 
7143     return start;
7144   }
7145 
7146   // Slow path implementation for UseSecondarySupersTable.
7147   address generate_lookup_secondary_supers_table_slow_path_stub() {
7148     StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path");
7149 
7150     address start = __ pc();
7151     const Register
7152       r_super_klass  = r0,        // argument
7153       r_array_base   = r1,        // argument
7154       temp1          = r2,        // temp
7155       r_array_index  = r3,        // argument
7156       r_bitmap       = rscratch2, // argument
7157       result         = r5;        // argument
7158 
7159     __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
7160     __ ret(lr);
7161 
7162     return start;
7163   }
7164 
7165 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
7166 
7167   // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
7168   //
7169   // If LSE is in use, generate LSE versions of all the stubs. The
7170   // non-LSE versions are in atomic_aarch64.S.
7171 
7172   // class AtomicStubMark records the entry point of a stub and the
7173   // stub pointer which will point to it. The stub pointer is set to
7174   // the entry point when ~AtomicStubMark() is called, which must be
7175   // after ICache::invalidate_range. This ensures safe publication of
7176   // the generated code.
7177   class AtomicStubMark {
7178     address _entry_point;
7179     aarch64_atomic_stub_t *_stub;
7180     MacroAssembler *_masm;
7181   public:
7182     AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
7183       _masm = masm;
7184       __ align(32);
7185       _entry_point = __ pc();
7186       _stub = stub;
7187     }
7188     ~AtomicStubMark() {
7189       *_stub = (aarch64_atomic_stub_t)_entry_point;
7190     }
7191   };
7192 
7193   // NB: For memory_order_conservative we need a trailing membar after
7194   // LSE atomic operations but not a leading membar.
7195   //
7196   // We don't need a leading membar because a clause in the Arm ARM
7197   // says:
7198   //
7199   //   Barrier-ordered-before
7200   //
7201   //   Barrier instructions order prior Memory effects before subsequent
7202   //   Memory effects generated by the same Observer. A read or a write
7203   //   RW1 is Barrier-ordered-before a read or a write RW2 from the same
7204   //   Observer if and only if RW1 appears in program order before RW2
7205   //   and [ ... ] at least one of RW1 and RW2 is generated by an atomic
7206   //   instruction with both Acquire and Release semantics.
7207   //
7208   // All the atomic instructions {ldaddal, swapal, casal} have Acquire
7209   // and Release semantics, therefore we don't need a leading
7210   // barrier. However, there is no corresponding Barrier-ordered-after
7211   // relationship, therefore we need a trailing membar to prevent a
7212   // later store or load from being reordered with the store in an
7213   // atomic instruction.
7214   //
7215   // This was checked by using the herd7 consistency model simulator
7216   // (http://diy.inria.fr/) with this test case:
7217   //
7218   // AArch64 LseCas
7219   // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
7220   // P0 | P1;
7221   // LDR W4, [X2] | MOV W3, #0;
7222   // DMB LD       | MOV W4, #1;
7223   // LDR W3, [X1] | CASAL W3, W4, [X1];
7224   //              | DMB ISH;
7225   //              | STR W4, [X2];
7226   // exists
7227   // (0:X3=0 /\ 0:X4=1)
7228   //
7229   // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
7230   // with the store to x in P1. Without the DMB in P1 this may happen.
7231   //
7232   // At the time of writing we don't know of any AArch64 hardware that
7233   // reorders stores in this way, but the Reference Manual permits it.
7234 
7235   void gen_cas_entry(Assembler::operand_size size,
7236                      atomic_memory_order order) {
7237     Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
7238       exchange_val = c_rarg2;
7239     bool acquire, release;
7240     switch (order) {
7241       case memory_order_relaxed:
7242         acquire = false;
7243         release = false;
7244         break;
7245       case memory_order_release:
7246         acquire = false;
7247         release = true;
7248         break;
7249       default:
7250         acquire = true;
7251         release = true;
7252         break;
7253     }
7254     __ mov(prev, compare_val);
7255     __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
7256     if (order == memory_order_conservative) {
7257       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7258     }
7259     if (size == Assembler::xword) {
7260       __ mov(r0, prev);
7261     } else {
7262       __ movw(r0, prev);
7263     }
7264     __ ret(lr);
7265   }
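
  // In C, approximately, the conservative flavour generated above has the
  // semantics sketched below (illustrative only, using GCC/Clang builtins; the
  // trailing StoreStore|StoreLoad membar is approximated by the fence):
  //
  //   uint64_t cmpxchg_8_conservative(uint64_t *ptr, uint64_t compare_val,
  //                                   uint64_t exchange_val) {
  //     uint64_t prev = compare_val;
  //     __atomic_compare_exchange_n(ptr, &prev, exchange_val, /*weak*/false,
  //                                 __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
  //     __atomic_thread_fence(__ATOMIC_SEQ_CST);
  //     return prev;   // previous value at *ptr
  //   }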
7266 
7267   void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
7268     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
7269     // If not relaxed, then default to conservative.  Relaxed is the only
7270     // case we use enough to be worth specializing.
7271     if (order == memory_order_relaxed) {
7272       __ ldadd(size, incr, prev, addr);
7273     } else {
7274       __ ldaddal(size, incr, prev, addr);
7275       __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7276     }
7277     if (size == Assembler::xword) {
7278       __ mov(r0, prev);
7279     } else {
7280       __ movw(r0, prev);
7281     }
7282     __ ret(lr);
7283   }
7284 
7285   void gen_swpal_entry(Assembler::operand_size size) {
7286     Register prev = r2, addr = c_rarg0, incr = c_rarg1;
7287     __ swpal(size, incr, prev, addr);
7288     __ membar(Assembler::StoreStore|Assembler::StoreLoad);
7289     if (size == Assembler::xword) {
7290       __ mov(r0, prev);
7291     } else {
7292       __ movw(r0, prev);
7293     }
7294     __ ret(lr);
7295   }
7296 
7297   void generate_atomic_entry_points() {
7298     if (! UseLSE) {
7299       return;
7300     }
7301 
7302     __ align(CodeEntryAlignment);
7303     StubCodeMark mark(this, "StubRoutines", "atomic entry points");
7304     address first_entry = __ pc();
7305 
7306     // ADD, memory_order_conservative
7307     AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
7308     gen_ldadd_entry(Assembler::word, memory_order_conservative);
7309     AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
7310     gen_ldadd_entry(Assembler::xword, memory_order_conservative);
7311 
7312     // ADD, memory_order_relaxed
7313     AtomicStubMark mark_fetch_add_4_relaxed
7314       (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
7315     gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
7316     AtomicStubMark mark_fetch_add_8_relaxed
7317       (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
7318     gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
7319 
7320     // XCHG, memory_order_conservative
7321     AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
7322     gen_swpal_entry(Assembler::word);
7323     AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
7324     gen_swpal_entry(Assembler::xword);
7325 
7326     // CAS, memory_order_conservative
7327     AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
7328     gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
7329     AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
7330     gen_cas_entry(MacroAssembler::word, memory_order_conservative);
7331     AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
7332     gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
7333 
7334     // CAS, memory_order_relaxed
7335     AtomicStubMark mark_cmpxchg_1_relaxed
7336       (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
7337     gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
7338     AtomicStubMark mark_cmpxchg_4_relaxed
7339       (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
7340     gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
7341     AtomicStubMark mark_cmpxchg_8_relaxed
7342       (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
7343     gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
7344 
7345     AtomicStubMark mark_cmpxchg_4_release
7346       (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
7347     gen_cas_entry(MacroAssembler::word, memory_order_release);
7348     AtomicStubMark mark_cmpxchg_8_release
7349       (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
7350     gen_cas_entry(MacroAssembler::xword, memory_order_release);
7351 
7352     AtomicStubMark mark_cmpxchg_4_seq_cst
7353       (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
7354     gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
7355     AtomicStubMark mark_cmpxchg_8_seq_cst
7356       (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
7357     gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
7358 
7359     ICache::invalidate_range(first_entry, __ pc() - first_entry);
7360   }
7361 #endif // LINUX
7362 
7363   address generate_cont_thaw(Continuation::thaw_kind kind) {
7364     bool return_barrier = Continuation::is_thaw_return_barrier(kind);
7365     bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
7366 
7367     address start = __ pc();
7368 
7369     if (return_barrier) {
7370       __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
7371       __ mov(sp, rscratch1);
7372     }
7373     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7374 
7375     if (return_barrier) {
7376       // preserve possible return value from a method returning to the return barrier
7377       __ fmovd(rscratch1, v0);
7378       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7379     }
7380 
7381     __ movw(c_rarg1, (return_barrier ? 1 : 0));
7382     __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
7383     __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
7384 
7385     if (return_barrier) {
7386       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7387       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7388       __ fmovd(v0, rscratch1);
7389     }
7390     assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
7391 
7392 
7393     Label thaw_success;
7394     // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
7395     __ cbnz(rscratch2, thaw_success);
7396     __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
7397     __ br(rscratch1);
7398     __ bind(thaw_success);
7399 
7400     // make room for the thawed frames
7401     __ sub(rscratch1, sp, rscratch2);
7402     __ andr(rscratch1, rscratch1, -16); // align
7403     __ mov(sp, rscratch1);
7404 
7405     if (return_barrier) {
7406       // save original return value -- again
7407       __ fmovd(rscratch1, v0);
7408       __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
7409     }
7410 
7411     // If we want, we can templatize thaw by kind, and have three different entries
7412     __ movw(c_rarg1, (uint32_t)kind);
7413 
7414     __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
7415     __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
7416 
7417     if (return_barrier) {
7418       // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
7419       __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
7420       __ fmovd(v0, rscratch1);
7421     } else {
7422       __ mov(r0, zr); // return 0 (success) from doYield
7423     }
7424 
7425     // we're now on the yield frame (which is at a higher address than us because sp has been moved down)
7426     __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
7427     __ mov(rfp, sp);
7428 
7429     if (return_barrier_exception) {
7430       __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
7431       __ authenticate_return_address(c_rarg1);
7432       __ verify_oop(r0);
7433       // save return value containing the exception oop in callee-saved R19
7434       __ mov(r19, r0);
7435 
7436       __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
7437 
7438       // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
7439       // __ reinitialize_ptrue();
7440 
7441       // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
7442 
7443       __ mov(r1, r0); // the exception handler
7444       __ mov(r0, r19); // restore return value containing the exception oop
7445       __ verify_oop(r0);
7446 
7447       __ leave();
7448       __ mov(r3, lr);
7449       __ br(r1); // the exception handler
7450     } else {
7451       // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
7452       __ leave();
7453       __ ret(lr);
7454     }
7455 
7456     return start;
7457   }
7458 
7459   address generate_cont_thaw() {
7460     if (!Continuations::enabled()) return nullptr;
7461 
7462     StubCodeMark mark(this, "StubRoutines", "Cont thaw");
7463     address start = __ pc();
7464     generate_cont_thaw(Continuation::thaw_top);
7465     return start;
7466   }
7467 
7468   address generate_cont_returnBarrier() {
7469     if (!Continuations::enabled()) return nullptr;
7470 
7471     // TODO: will probably need multiple return barriers depending on return type
7472     StubCodeMark mark(this, "StubRoutines", "cont return barrier");
7473     address start = __ pc();
7474 
7475     generate_cont_thaw(Continuation::thaw_return_barrier);
7476 
7477     return start;
7478   }
7479 
7480   address generate_cont_returnBarrier_exception() {
7481     if (!Continuations::enabled()) return nullptr;
7482 
7483     StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
7484     address start = __ pc();
7485 
7486     generate_cont_thaw(Continuation::thaw_return_barrier_exception);
7487 
7488     return start;
7489   }
7490 
7491   // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
7492   // are represented as long[5], with BITS_PER_LIMB = 26.
7493   // Pack five 26-bit limbs into three 64-bit registers.
7494   void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
7495     __ ldp(dest0, rscratch1, Address(src, 0));     // 26 bits
7496     __ add(dest0, dest0, rscratch1, Assembler::LSL, 26);  // 26 bits
7497     __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
7498     __ add(dest0, dest0, rscratch1, Assembler::LSL, 52);  // 12 bits
7499 
7500     __ add(dest1, zr, rscratch1, Assembler::LSR, 12);     // 14 bits
7501     __ add(dest1, dest1, rscratch2, Assembler::LSL, 14);  // 26 bits
7502     __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
7503     __ add(dest1, dest1, rscratch1, Assembler::LSL, 40);  // 24 bits
7504 
7505     if (dest2->is_valid()) {
7506       __ add(dest2, zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7507     } else {
7508 #ifdef ASSERT
7509       Label OK;
7510       __ cmp(zr, rscratch1, Assembler::LSR, 24);     // 2 bits
7511       __ br(__ EQ, OK);
7512       __ stop("high bits of Poly1305 integer should be zero");
7513       __ should_not_reach_here();
7514       __ bind(OK);
7515 #endif
7516     }
7517   }
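
  // In C, approximately (a rough sketch; limbs[] holds the five 26-bit limbs,
  // each stored in its own 64-bit word):
  //
  //   void pack_26(uint64_t *dest0, uint64_t *dest1, uint64_t *dest2,
  //                const uint64_t limbs[5]) {
  //     *dest0 = limbs[0] + (limbs[1] << 26) + (limbs[2] << 52);
  //     *dest1 = (limbs[2] >> 12) + (limbs[3] << 14) + (limbs[4] << 40);
  //     if (dest2 != NULL)
  //       *dest2 = limbs[4] >> 24;   // at most 2 bits
  //   }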
7518 
7519   // As above, but return only a 128-bit integer, packed into two
7520   // 64-bit registers.
7521   void pack_26(Register dest0, Register dest1, Register src) {
7522     pack_26(dest0, dest1, noreg, src);
7523   }
7524 
7525   // Multiply and multiply-accumulate unsigned 64-bit registers.
7526   void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
7527     __ mul(prod_lo, n, m);
7528     __ umulh(prod_hi, n, m);
7529   }
7530   void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
7531     wide_mul(rscratch1, rscratch2, n, m);
7532     __ adds(sum_lo, sum_lo, rscratch1);
7533     __ adc(sum_hi, sum_hi, rscratch2);
7534   }
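
  // In C, approximately (a sketch assuming a compiler with unsigned __int128):
  //
  //   void wide_madd(uint64_t *sum_lo, uint64_t *sum_hi, uint64_t n, uint64_t m) {
  //     unsigned __int128 sum = ((unsigned __int128)*sum_hi << 64) | *sum_lo;
  //     sum += (unsigned __int128)n * m;
  //     *sum_lo = (uint64_t)sum;
  //     *sum_hi = (uint64_t)(sum >> 64);
  //   }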
7535 
7536   // Poly1305, RFC 7539
7537 
7538   // See https://loup-vaillant.fr/tutorials/poly1305-design for a
7539   // description of the tricks used to simplify and accelerate this
7540   // computation.
7541 
7542   address generate_poly1305_processBlocks() {
7543     __ align(CodeEntryAlignment);
7544     StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
7545     address start = __ pc();
7546     Label here;
7547     __ enter();
7548     RegSet callee_saved = RegSet::range(r19, r28);
7549     __ push(callee_saved, sp);
7550 
7551     RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
7552 
7553     // Arguments
7554     const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
7555 
7556     // R_n is the 128-bit randomly-generated key, packed into two
7557     // registers.  The caller passes this key to us as long[5], with
7558     // BITS_PER_LIMB = 26.
7559     const Register R_0 = *++regs, R_1 = *++regs;
7560     pack_26(R_0, R_1, r_start);
7561 
7562     // RR_n is (R_n >> 2) * 5
7563     const Register RR_0 = *++regs, RR_1 = *++regs;
7564     __ lsr(RR_0, R_0, 2);
7565     __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
7566     __ lsr(RR_1, R_1, 2);
7567     __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
7568 
7569     // U_n is the current checksum
7570     const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
7571     pack_26(U_0, U_1, U_2, acc_start);
7572 
7573     static constexpr int BLOCK_LENGTH = 16;
7574     Label DONE, LOOP;
7575 
7576     __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7577     __ br(Assembler::LT, DONE); {
7578       __ bind(LOOP);
7579 
7580       // S_n is to be the sum of U_n and the next block of data
7581       const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7582       __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
7583       __ adds(S_0, U_0, S_0);
7584       __ adcs(S_1, U_1, S_1);
7585       __ adc(S_2, U_2, zr);
7586       __ add(S_2, S_2, 1);
7587 
7588       const Register U_0HI = *++regs, U_1HI = *++regs;
7589 
7590       // NB: this logic depends on some of the special properties of
7591       // Poly1305 keys. In particular, because we know that the top
7592       // four bits of R_0 and R_1 are zero, we can add together
7593       // partial products without any risk of needing to propagate a
7594       // carry out.
7595       wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
7596       wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
7597       __ andr(U_2, R_0, 3);
7598       __ mul(U_2, S_2, U_2);
7599 
7600       // Recycle registers S_0, S_1, S_2
7601       regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
7602 
7603       // Partial reduction mod 2**130 - 5
7604       __ adds(U_1, U_0HI, U_1);
7605       __ adc(U_2, U_1HI, U_2);
7606       // Sum now in U_2:U_1:U_0.
7607       // Dead: U_0HI, U_1HI.
7608       regs = (regs.remaining() + U_0HI + U_1HI).begin();
7609 
7610       // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
7611 
7612       // First, U_2:U_1:U_0 += (U_2 >> 2)
7613       __ lsr(rscratch1, U_2, 2);
7614       __ andr(U_2, U_2, (u8)3);
7615       __ adds(U_0, U_0, rscratch1);
7616       __ adcs(U_1, U_1, zr);
7617       __ adc(U_2, U_2, zr);
7618       // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
7619       __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
7620       __ adcs(U_1, U_1, zr);
7621       __ adc(U_2, U_2, zr);
7622 
7623       __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
7624       __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
7625       __ br(~ Assembler::LT, LOOP);
7626     }
7627 
7628     // Further reduce modulo 2^130 - 5
7629     __ lsr(rscratch1, U_2, 2);
7630     __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
7631     __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
7632     __ adcs(U_1, U_1, zr);
7633     __ andr(U_2, U_2, (u1)3);
7634     __ adc(U_2, U_2, zr);
7635 
7636     // Unpack the sum into five 26-bit limbs and write to memory.
7637     __ ubfiz(rscratch1, U_0, 0, 26);
7638     __ ubfx(rscratch2, U_0, 26, 26);
7639     __ stp(rscratch1, rscratch2, Address(acc_start));
7640     __ ubfx(rscratch1, U_0, 52, 12);
7641     __ bfi(rscratch1, U_1, 12, 14);
7642     __ ubfx(rscratch2, U_1, 14, 26);
7643     __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
7644     __ ubfx(rscratch1, U_1, 40, 24);
7645     __ bfi(rscratch1, U_2, 24, 3);
7646     __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
7647 
7648     __ bind(DONE);
7649     __ pop(callee_saved, sp);
7650     __ leave();
7651     __ ret(lr);
7652 
7653     return start;
7654   }
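
  // In pseudocode, each 16-byte block is processed above approximately as
  // follows (a rough sketch; the 130-bit accumulator lives in U_2:U_1:U_0 and
  // is only partially reduced inside the loop):
  //
  //   while (length >= BLOCK_LENGTH) {
  //     S = U + load_le128(input) + (1 << 128);   // pad the block with a high bit
  //     U = (S * R) mod (2^130 - 5);              // partial reduction suffices here
  //     input += BLOCK_LENGTH; length -= BLOCK_LENGTH;
  //   }
  //   // write U back to acc_start as five 26-bit limbs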
7655 
7656   // exception handler for upcall stubs
7657   address generate_upcall_stub_exception_handler() {
7658     StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler");
7659     address start = __ pc();
7660 
7661     // Native caller has no idea how to handle exceptions,
7662     // so we just crash here. Up to callee to catch exceptions.
7663     __ verify_oop(r0);
7664     __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
7665     __ blr(rscratch1);
7666     __ should_not_reach_here();
7667 
7668     return start;
7669   }
7670 
7671   // load Method* target of MethodHandle
7672   // j_rarg0 = jobject receiver
7673   // rmethod = result
7674   address generate_upcall_stub_load_target() {
7675     StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target");
7676     address start = __ pc();
7677 
7678     __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
7679     // Load target method from receiver
7680     __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
7681     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
7682     __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
7683     __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
7684                       Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
7685                       noreg, noreg);
7686     __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
7687 
7688     __ ret(lr);
7689 
7690     return start;
7691   }
7692 
7693 #undef __
7694 #define __ masm->
7695 
7696   class MontgomeryMultiplyGenerator : public MacroAssembler {
7697 
7698     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
7699       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
7700 
7701     RegSet _toSave;
7702     bool _squaring;
7703 
7704   public:
7705     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
7706       : MacroAssembler(as->code()), _squaring(squaring) {
7707 
7708       // Register allocation
7709 
7710       RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
7711       Pa_base = *regs;       // Argument registers
7712       if (squaring)
7713         Pb_base = Pa_base;
7714       else
7715         Pb_base = *++regs;
7716       Pn_base = *++regs;
7717       Rlen = *++regs;
7718       inv = *++regs;
7719       Pm_base = *++regs;
7720 
7721                           // Working registers:
7722       Ra =  *++regs;        // The current digit of a, b, n, and m.
7723       Rb =  *++regs;
7724       Rm =  *++regs;
7725       Rn =  *++regs;
7726 
7727       Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
7728       Pb =  *++regs;
7729       Pm =  *++regs;
7730       Pn =  *++regs;
7731 
7732       t0 =  *++regs;        // Three registers which form a
7733       t1 =  *++regs;        // triple-precision accumulator.
7734       t2 =  *++regs;
7735 
7736       Ri =  *++regs;        // Inner and outer loop indexes.
7737       Rj =  *++regs;
7738 
7739       Rhi_ab = *++regs;     // Product registers: low and high parts
7740       Rlo_ab = *++regs;     // of a*b and m*n.
7741       Rhi_mn = *++regs;
7742       Rlo_mn = *++regs;
7743 
7744       // r19 and up are callee-saved.
7745       _toSave = RegSet::range(r19, *regs) + Pm_base;
7746     }
7747 
7748   private:
7749     void save_regs() {
7750       push(_toSave, sp);
7751     }
7752 
7753     void restore_regs() {
7754       pop(_toSave, sp);
7755     }
7756 
7757     template <typename T>
7758     void unroll_2(Register count, T block) {
7759       Label loop, end, odd;
7760       tbnz(count, 0, odd);
7761       cbz(count, end);
7762       align(16);
7763       bind(loop);
7764       (this->*block)();
7765       bind(odd);
7766       (this->*block)();
7767       subs(count, count, 2);
7768       br(Assembler::GT, loop);
7769       bind(end);
7770     }
7771 
7772     template <typename T>
7773     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
7774       Label loop, end, odd;
7775       tbnz(count, 0, odd);
7776       cbz(count, end);
7777       align(16);
7778       bind(loop);
7779       (this->*block)(d, s, tmp);
7780       bind(odd);
7781       (this->*block)(d, s, tmp);
7782       subs(count, count, 2);
7783       br(Assembler::GT, loop);
7784       bind(end);
7785     }
7786 
7787     void pre1(RegisterOrConstant i) {
7788       block_comment("pre1");
7789       // Pa = Pa_base;
7790       // Pb = Pb_base + i;
7791       // Pm = Pm_base;
7792       // Pn = Pn_base + i;
7793       // Ra = *Pa;
7794       // Rb = *Pb;
7795       // Rm = *Pm;
7796       // Rn = *Pn;
7797       ldr(Ra, Address(Pa_base));
7798       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7799       ldr(Rm, Address(Pm_base));
7800       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7801       lea(Pa, Address(Pa_base));
7802       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
7803       lea(Pm, Address(Pm_base));
7804       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7805 
7806       // Zero the m*n result.
7807       mov(Rhi_mn, zr);
7808       mov(Rlo_mn, zr);
7809     }
7810 
7811     // The core multiply-accumulate step of a Montgomery
7812     // multiplication.  The idea is to schedule operations as a
7813     // pipeline so that instructions with long latencies (loads and
7814     // multiplies) have time to complete before their results are
7815     // used.  This most benefits in-order implementations of the
7816     // architecture but out-of-order ones also benefit.
7817     void step() {
7818       block_comment("step");
7819       // MACC(Ra, Rb, t0, t1, t2);
7820       // Ra = *++Pa;
7821       // Rb = *--Pb;
7822       umulh(Rhi_ab, Ra, Rb);
7823       mul(Rlo_ab, Ra, Rb);
7824       ldr(Ra, pre(Pa, wordSize));
7825       ldr(Rb, pre(Pb, -wordSize));
7826       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
7827                                        // previous iteration.
7828       // MACC(Rm, Rn, t0, t1, t2);
7829       // Rm = *++Pm;
7830       // Rn = *--Pn;
7831       umulh(Rhi_mn, Rm, Rn);
7832       mul(Rlo_mn, Rm, Rn);
7833       ldr(Rm, pre(Pm, wordSize));
7834       ldr(Rn, pre(Pn, -wordSize));
7835       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7836     }
7837 
7838     void post1() {
7839       block_comment("post1");
7840 
7841       // MACC(Ra, Rb, t0, t1, t2);
7842       // Ra = *++Pa;
7843       // Rb = *--Pb;
7844       umulh(Rhi_ab, Ra, Rb);
7845       mul(Rlo_ab, Ra, Rb);
7846       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
7847       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7848 
7849       // *Pm = Rm = t0 * inv;
7850       mul(Rm, t0, inv);
7851       str(Rm, Address(Pm));
7852 
7853       // MACC(Rm, Rn, t0, t1, t2);
7854       // t0 = t1; t1 = t2; t2 = 0;
7855       umulh(Rhi_mn, Rm, Rn);
7856 
7857 #ifndef PRODUCT
7858       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
7859       {
7860         mul(Rlo_mn, Rm, Rn);
7861         add(Rlo_mn, t0, Rlo_mn);
7862         Label ok;
7863         cbz(Rlo_mn, ok); {
7864           stop("broken Montgomery multiply");
7865         } bind(ok);
7866       }
7867 #endif
7868       // We have very carefully set things up so that
7869       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
7870       // the lower half of Rm * Rn because we know the result already:
7871       // it must be -t0.  t0 + (-t0) must generate a carry iff
7872       // t0 != 0.  So, rather than do a mul and an adds we just set
7873       // the carry flag iff t0 is nonzero.
7874       //
7875       // mul(Rlo_mn, Rm, Rn);
7876       // adds(zr, t0, Rlo_mn);
7877       subs(zr, t0, 1); // Set carry iff t0 is nonzero
7878       adcs(t0, t1, Rhi_mn);
7879       adc(t1, t2, zr);
7880       mov(t2, zr);
7881     }
7882 
7883     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
7884       block_comment("pre2");
7885       // Pa = Pa_base + i-len;
7886       // Pb = Pb_base + len;
7887       // Pm = Pm_base + i-len;
7888       // Pn = Pn_base + len;
7889 
7890       if (i.is_register()) {
7891         sub(Rj, i.as_register(), len);
7892       } else {
7893         mov(Rj, i.as_constant());
7894         sub(Rj, Rj, len);
7895       }
7896       // Rj == i-len
7897 
7898       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
7899       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
7900       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7901       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
7902 
7903       // Ra = *++Pa;
7904       // Rb = *--Pb;
7905       // Rm = *++Pm;
7906       // Rn = *--Pn;
7907       ldr(Ra, pre(Pa, wordSize));
7908       ldr(Rb, pre(Pb, -wordSize));
7909       ldr(Rm, pre(Pm, wordSize));
7910       ldr(Rn, pre(Pn, -wordSize));
7911 
7912       mov(Rhi_mn, zr);
7913       mov(Rlo_mn, zr);
7914     }
7915 
7916     void post2(RegisterOrConstant i, RegisterOrConstant len) {
7917       block_comment("post2");
7918       if (i.is_constant()) {
7919         mov(Rj, i.as_constant()-len.as_constant());
7920       } else {
7921         sub(Rj, i.as_register(), len);
7922       }
7923 
7924       adds(t0, t0, Rlo_mn); // The pending m*n, low part
7925 
7926       // As soon as we know the least significant digit of our result,
7927       // store it.
7928       // Pm_base[i-len] = t0;
7929       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
7930 
7931       // t0 = t1; t1 = t2; t2 = 0;
7932       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
7933       adc(t1, t2, zr);
7934       mov(t2, zr);
7935     }
7936 
7937     // A carry in t0 after Montgomery multiplication means that we
7938     // should subtract multiples of n from our result in m.  We'll
7939     // keep doing that until there is no carry.
7940     void normalize(RegisterOrConstant len) {
7941       block_comment("normalize");
7942       // while (t0)
7943       //   t0 = sub(Pm_base, Pn_base, t0, len);
7944       Label loop, post, again;
7945       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
7946       cbz(t0, post); {
7947         bind(again); {
7948           mov(i, zr);
7949           mov(cnt, len);
7950           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7951           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7952           subs(zr, zr, zr); // set carry flag, i.e. no borrow
7953           align(16);
7954           bind(loop); {
7955             sbcs(Rm, Rm, Rn);
7956             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7957             add(i, i, 1);
7958             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
7959             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
7960             sub(cnt, cnt, 1);
7961           } cbnz(cnt, loop);
7962           sbc(t0, t0, zr);
7963         } cbnz(t0, again);
7964       } bind(post);
7965     }
7966 
7967     // Move memory at s to d, reversing words.
7968     //    Increments d to end of copied memory
7969     //    Destroys tmp1, tmp2
7970     //    Preserves len
7971     //    Leaves s pointing to the address which was in d at start
7972     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
7973       assert(tmp1->encoding() < r19->encoding(), "register corruption");
7974       assert(tmp2->encoding() < r19->encoding(), "register corruption");
7975 
7976       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
7977       mov(tmp1, len);
7978       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
7979       sub(s, d, len, ext::uxtw, LogBytesPerWord);
7980     }
7981     // where
7982     void reverse1(Register d, Register s, Register tmp) {
7983       ldr(tmp, pre(s, -wordSize));
7984       ror(tmp, tmp, 32);
7985       str(tmp, post(d, wordSize));
7986     }
7987 
7988     void step_squaring() {
7989       // An extra ACC
7990       step();
7991       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
7992     }
7993 
7994     void last_squaring(RegisterOrConstant i) {
7995       Label dont;
7996       // if ((i & 1) == 0) {
7997       tbnz(i.as_register(), 0, dont); {
7998         // MACC(Ra, Rb, t0, t1, t2);
7999         // Ra = *++Pa;
8000         // Rb = *--Pb;
8001         umulh(Rhi_ab, Ra, Rb);
8002         mul(Rlo_ab, Ra, Rb);
8003         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
8004       } bind(dont);
8005     }
8006 
8007     void extra_step_squaring() {
8008       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8009 
8010       // MACC(Rm, Rn, t0, t1, t2);
8011       // Rm = *++Pm;
8012       // Rn = *--Pn;
8013       umulh(Rhi_mn, Rm, Rn);
8014       mul(Rlo_mn, Rm, Rn);
8015       ldr(Rm, pre(Pm, wordSize));
8016       ldr(Rn, pre(Pn, -wordSize));
8017     }
8018 
8019     void post1_squaring() {
8020       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
8021 
8022       // *Pm = Rm = t0 * inv;
8023       mul(Rm, t0, inv);
8024       str(Rm, Address(Pm));
8025 
8026       // MACC(Rm, Rn, t0, t1, t2);
8027       // t0 = t1; t1 = t2; t2 = 0;
8028       umulh(Rhi_mn, Rm, Rn);
8029 
8030 #ifndef PRODUCT
8031       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
8032       {
8033         mul(Rlo_mn, Rm, Rn);
8034         add(Rlo_mn, t0, Rlo_mn);
8035         Label ok;
8036         cbz(Rlo_mn, ok); {
8037           stop("broken Montgomery multiply");
8038         } bind(ok);
8039       }
8040 #endif
8041       // We have very carefully set things up so that
8042       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
8043       // the lower half of Rm * Rn because we know the result already:
8044       // it must be -t0.  t0 + (-t0) must generate a carry iff
8045       // t0 != 0.  So, rather than do a mul and an adds we just set
8046       // the carry flag iff t0 is nonzero.
8047       //
8048       // mul(Rlo_mn, Rm, Rn);
8049       // adds(zr, t0, Rlo_mn);
8050       subs(zr, t0, 1); // Set carry iff t0 is nonzero
8051       adcs(t0, t1, Rhi_mn);
8052       adc(t1, t2, zr);
8053       mov(t2, zr);
8054     }
8055 
8056     void acc(Register Rhi, Register Rlo,
8057              Register t0, Register t1, Register t2) {
8058       adds(t0, t0, Rlo);
8059       adcs(t1, t1, Rhi);
8060       adc(t2, t2, zr);
8061     }
8062 
8063   public:
8064     /**
8065      * Fast Montgomery multiplication.  The derivation of the
8066      * algorithm is in A Cryptographic Library for the Motorola
8067      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
8068      *
8069      * Arguments:
8070      *
8071      * Inputs for multiplication:
8072      *   c_rarg0   - int array elements a
8073      *   c_rarg1   - int array elements b
8074      *   c_rarg2   - int array elements n (the modulus)
8075      *   c_rarg3   - int length
8076      *   c_rarg4   - int inv
8077      *   c_rarg5   - int array elements m (the result)
8078      *
8079      * Inputs for squaring:
8080      *   c_rarg0   - int array elements a
8081      *   c_rarg1   - int array elements n (the modulus)
8082      *   c_rarg2   - int length
8083      *   c_rarg3   - int inv
8084      *   c_rarg4   - int array elements m (the result)
8085      *
8086      */
8087     address generate_multiply() {
8088       Label argh, nothing;
8089       bind(argh);
8090       stop("MontgomeryMultiply total_allocation must be <= 8192");
8091 
8092       align(CodeEntryAlignment);
8093       address entry = pc();
8094 
8095       cbzw(Rlen, nothing);
8096 
8097       enter();
8098 
8099       // Make room.
8100       cmpw(Rlen, 512);
8101       br(Assembler::HI, argh);
8102       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8103       andr(sp, Ra, -2 * wordSize);
8104 
8105       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8106 
8107       {
8108         // Copy input args, reversing as we go.  We use Ra as a
8109         // temporary variable.
8110         reverse(Ra, Pa_base, Rlen, t0, t1);
8111         if (!_squaring)
8112           reverse(Ra, Pb_base, Rlen, t0, t1);
8113         reverse(Ra, Pn_base, Rlen, t0, t1);
8114       }
8115 
8116       // Push all call-saved registers and also Pm_base which we'll need
8117       // at the end.
8118       save_regs();
8119 
8120 #ifndef PRODUCT
8121       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
8122       {
8123         ldr(Rn, Address(Pn_base, 0));
8124         mul(Rlo_mn, Rn, inv);
8125         subs(zr, Rlo_mn, -1);
8126         Label ok;
8127         br(EQ, ok); {
8128           stop("broken inverse in Montgomery multiply");
8129         } bind(ok);
8130       }
8131 #endif
8132 
8133       mov(Pm_base, Ra);
8134 
8135       mov(t0, zr);
8136       mov(t1, zr);
8137       mov(t2, zr);
8138 
8139       block_comment("for (int i = 0; i < len; i++) {");
8140       mov(Ri, zr); {
8141         Label loop, end;
8142         cmpw(Ri, Rlen);
8143         br(Assembler::GE, end);
8144 
8145         bind(loop);
8146         pre1(Ri);
8147 
8148         block_comment("  for (j = i; j; j--) {"); {
8149           movw(Rj, Ri);
8150           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
8151         } block_comment("  } // j");
8152 
8153         post1();
8154         addw(Ri, Ri, 1);
8155         cmpw(Ri, Rlen);
8156         br(Assembler::LT, loop);
8157         bind(end);
8158         block_comment("} // i");
8159       }
8160 
8161       block_comment("for (int i = len; i < 2*len; i++) {");
8162       mov(Ri, Rlen); {
8163         Label loop, end;
8164         cmpw(Ri, Rlen, Assembler::LSL, 1);
8165         br(Assembler::GE, end);
8166 
8167         bind(loop);
8168         pre2(Ri, Rlen);
8169 
8170         block_comment("  for (j = len*2-i-1; j; j--) {"); {
8171           lslw(Rj, Rlen, 1);
8172           subw(Rj, Rj, Ri);
8173           subw(Rj, Rj, 1);
8174           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
8175         } block_comment("  } // j");
8176 
8177         post2(Ri, Rlen);
8178         addw(Ri, Ri, 1);
8179         cmpw(Ri, Rlen, Assembler::LSL, 1);
8180         br(Assembler::LT, loop);
8181         bind(end);
8182       }
8183       block_comment("} // i");
8184 
8185       normalize(Rlen);
8186 
8187       mov(Ra, Pm_base);  // Save Pm_base in Ra
8188       restore_regs();  // Restore caller's Pm_base
8189 
8190       // Copy our result into caller's Pm_base
8191       reverse(Pm_base, Ra, Rlen, t0, t1);
8192 
8193       leave();
8194       bind(nothing);
8195       ret(lr);
8196 
8197       return entry;
8198     }
8199     // In C, approximately:
8200 
8201     // void
8202     // montgomery_multiply(julong Pa_base[], julong Pb_base[],
8203     //                     julong Pn_base[], julong Pm_base[],
8204     //                     julong inv, int len) {
8205     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8206     //   julong *Pa, *Pb, *Pn, *Pm;
8207     //   julong Ra, Rb, Rn, Rm;
8208 
8209     //   int i;
8210 
8211     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8212 
8213     //   for (i = 0; i < len; i++) {
8214     //     int j;
8215 
8216     //     Pa = Pa_base;
8217     //     Pb = Pb_base + i;
8218     //     Pm = Pm_base;
8219     //     Pn = Pn_base + i;
8220 
8221     //     Ra = *Pa;
8222     //     Rb = *Pb;
8223     //     Rm = *Pm;
8224     //     Rn = *Pn;
8225 
8226     //     int iters = i;
8227     //     for (j = 0; iters--; j++) {
8228     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8229     //       MACC(Ra, Rb, t0, t1, t2);
8230     //       Ra = *++Pa;
8231     //       Rb = *--Pb;
8232     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8233     //       MACC(Rm, Rn, t0, t1, t2);
8234     //       Rm = *++Pm;
8235     //       Rn = *--Pn;
8236     //     }
8237 
8238     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
8239     //     MACC(Ra, Rb, t0, t1, t2);
8240     //     *Pm = Rm = t0 * inv;
8241     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8242     //     MACC(Rm, Rn, t0, t1, t2);
8243 
8244     //     assert(t0 == 0, "broken Montgomery multiply");
8245 
8246     //     t0 = t1; t1 = t2; t2 = 0;
8247     //   }
8248 
8249     //   for (i = len; i < 2*len; i++) {
8250     //     int j;
8251 
8252     //     Pa = Pa_base + i-len;
8253     //     Pb = Pb_base + len;
8254     //     Pm = Pm_base + i-len;
8255     //     Pn = Pn_base + len;
8256 
8257     //     Ra = *++Pa;
8258     //     Rb = *--Pb;
8259     //     Rm = *++Pm;
8260     //     Rn = *--Pn;
8261 
8262     //     int iters = len*2-i-1;
8263     //     for (j = i-len+1; iters--; j++) {
8264     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
8265     //       MACC(Ra, Rb, t0, t1, t2);
8266     //       Ra = *++Pa;
8267     //       Rb = *--Pb;
8268     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8269     //       MACC(Rm, Rn, t0, t1, t2);
8270     //       Rm = *++Pm;
8271     //       Rn = *--Pn;
8272     //     }
8273 
8274     //     Pm_base[i-len] = t0;
8275     //     t0 = t1; t1 = t2; t2 = 0;
8276     //   }
8277 
8278     //   while (t0)
8279     //     t0 = sub(Pm_base, Pn_base, t0, len);
8280     // }
8281 
8282     /**
8283      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
8284      * multiplies than Montgomery multiplication so it should be up to
8285      * 25% faster.  However, its loop control is more complex and it
8286      * may actually run slower on some machines.
8287      *
8288      * Arguments:
8289      *
8290      * Inputs:
8291      *   c_rarg0   - int array elements a
8292      *   c_rarg1   - int array elements n (the modulus)
8293      *   c_rarg2   - int length
8294      *   c_rarg3   - int inv
8295      *   c_rarg4   - int array elements m (the result)
8296      *
8297      */
8298     address generate_square() {
8299       Label argh;
8300       bind(argh);
8301       stop("MontgomeryMultiply total_allocation must be <= 8192");
8302 
8303       align(CodeEntryAlignment);
8304       address entry = pc();
8305 
8306       enter();
8307 
8308       // Make room.
8309       cmpw(Rlen, 512);
8310       br(Assembler::HI, argh);
8311       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
8312       andr(sp, Ra, -2 * wordSize);
8313 
8314       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
8315 
8316       {
8317         // Copy input args, reversing as we go.  We use Ra as a
8318         // temporary variable.
8319         reverse(Ra, Pa_base, Rlen, t0, t1);
8320         reverse(Ra, Pn_base, Rlen, t0, t1);
8321       }
8322 
8323       // Push all call-saved registers and also Pm_base which we'll need
8324       // at the end.
8325       save_regs();
8326 
8327       mov(Pm_base, Ra);
8328 
8329       mov(t0, zr);
8330       mov(t1, zr);
8331       mov(t2, zr);
8332 
8333       block_comment("for (int i = 0; i < len; i++) {");
8334       mov(Ri, zr); {
8335         Label loop, end;
8336         bind(loop);
8337         cmp(Ri, Rlen);
8338         br(Assembler::GE, end);
8339 
8340         pre1(Ri);
8341 
8342         block_comment("for (j = (i+1)/2; j; j--) {"); {
8343           add(Rj, Ri, 1);
8344           lsr(Rj, Rj, 1);
8345           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8346         } block_comment("  } // j");
8347 
8348         last_squaring(Ri);
8349 
8350         block_comment("  for (j = i/2; j; j--) {"); {
8351           lsr(Rj, Ri, 1);
8352           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8353         } block_comment("  } // j");
8354 
8355         post1_squaring();
8356         add(Ri, Ri, 1);
8357         cmp(Ri, Rlen);
8358         br(Assembler::LT, loop);
8359 
8360         bind(end);
8361         block_comment("} // i");
8362       }
8363 
8364       block_comment("for (int i = len; i < 2*len; i++) {");
8365       mov(Ri, Rlen); {
8366         Label loop, end;
8367         bind(loop);
8368         cmp(Ri, Rlen, Assembler::LSL, 1);
8369         br(Assembler::GE, end);
8370 
8371         pre2(Ri, Rlen);
8372 
8373         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
8374           lsl(Rj, Rlen, 1);
8375           sub(Rj, Rj, Ri);
8376           sub(Rj, Rj, 1);
8377           lsr(Rj, Rj, 1);
8378           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
8379         } block_comment("  } // j");
8380 
8381         last_squaring(Ri);
8382 
8383         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
8384           lsl(Rj, Rlen, 1);
8385           sub(Rj, Rj, Ri);
8386           lsr(Rj, Rj, 1);
8387           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
8388         } block_comment("  } // j");
8389 
8390         post2(Ri, Rlen);
8391         add(Ri, Ri, 1);
8392         cmp(Ri, Rlen, Assembler::LSL, 1);
8393 
8394         br(Assembler::LT, loop);
8395         bind(end);
8396         block_comment("} // i");
8397       }
8398 
8399       normalize(Rlen);
8400 
8401       mov(Ra, Pm_base);  // Save Pm_base in Ra
8402       restore_regs();  // Restore caller's Pm_base
8403 
8404       // Copy our result into caller's Pm_base
8405       reverse(Pm_base, Ra, Rlen, t0, t1);
8406 
8407       leave();
8408       ret(lr);
8409 
8410       return entry;
8411     }
8412     // In C, approximately:
8413 
8414     // void
8415     // montgomery_square(julong Pa_base[], julong Pn_base[],
8416     //                   julong Pm_base[], julong inv, int len) {
8417     //   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
8418     //   julong *Pa, *Pb, *Pn, *Pm;
8419     //   julong Ra, Rb, Rn, Rm;
8420 
8421     //   int i;
8422 
8423     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
8424 
8425     //   for (i = 0; i < len; i++) {
8426     //     int j;
8427 
8428     //     Pa = Pa_base;
8429     //     Pb = Pa_base + i;
8430     //     Pm = Pm_base;
8431     //     Pn = Pn_base + i;
8432 
8433     //     Ra = *Pa;
8434     //     Rb = *Pb;
8435     //     Rm = *Pm;
8436     //     Rn = *Pn;
8437 
8438     //     int iters = (i+1)/2;
8439     //     for (j = 0; iters--; j++) {
8440     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8441     //       MACC2(Ra, Rb, t0, t1, t2);
8442     //       Ra = *++Pa;
8443     //       Rb = *--Pb;
8444     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8445     //       MACC(Rm, Rn, t0, t1, t2);
8446     //       Rm = *++Pm;
8447     //       Rn = *--Pn;
8448     //     }
8449     //     if ((i & 1) == 0) {
8450     //       assert(Ra == Pa_base[j], "must be");
8451     //       MACC(Ra, Ra, t0, t1, t2);
8452     //     }
8453     //     iters = i/2;
8454     //     assert(iters == i-j, "must be");
8455     //     for (; iters--; j++) {
8456     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8457     //       MACC(Rm, Rn, t0, t1, t2);
8458     //       Rm = *++Pm;
8459     //       Rn = *--Pn;
8460     //     }
8461 
8462     //     *Pm = Rm = t0 * inv;
8463     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
8464     //     MACC(Rm, Rn, t0, t1, t2);
8465 
8466     //     assert(t0 == 0, "broken Montgomery multiply");
8467 
8468     //     t0 = t1; t1 = t2; t2 = 0;
8469     //   }
8470 
8471     //   for (i = len; i < 2*len; i++) {
8472     //     int start = i-len+1;
8473     //     int end = start + (len - start)/2;
8474     //     int j;
8475 
8476     //     Pa = Pa_base + i-len;
8477     //     Pb = Pa_base + len;
8478     //     Pm = Pm_base + i-len;
8479     //     Pn = Pn_base + len;
8480 
8481     //     Ra = *++Pa;
8482     //     Rb = *--Pb;
8483     //     Rm = *++Pm;
8484     //     Rn = *--Pn;
8485 
8486     //     int iters = (2*len-i-1)/2;
8487     //     assert(iters == end-start, "must be");
8488     //     for (j = start; iters--; j++) {
8489     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
8490     //       MACC2(Ra, Rb, t0, t1, t2);
8491     //       Ra = *++Pa;
8492     //       Rb = *--Pb;
8493     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8494     //       MACC(Rm, Rn, t0, t1, t2);
8495     //       Rm = *++Pm;
8496     //       Rn = *--Pn;
8497     //     }
8498     //     if ((i & 1) == 0) {
8499     //       assert(Ra == Pa_base[j], "must be");
8500     //       MACC(Ra, Ra, t0, t1, t2);
8501     //     }
8502     //     iters = (2*len-i)/2;
8503     //     assert(iters == len-j, "must be");
8504     //     for (; iters--; j++) {
8505     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
8506     //       MACC(Rm, Rn, t0, t1, t2);
8507     //       Rm = *++Pm;
8508     //       Rn = *--Pn;
8509     //     }
8510     //     Pm_base[i-len] = t0;
8511     //     t0 = t1; t1 = t2; t2 = 0;
8512     //   }
8513 
8514     //   while (t0)
8515     //     t0 = sub(Pm_base, Pn_base, t0, len);
8516     // }
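
    // Where the squaring savings come from (an illustrative note, not
    // generated code): MACC2 adds 2*Ra*Rb to the accumulator, so each
    // distinct cross product a[j]*a[i-j] above is multiplied only once.
    // For example, squaring a 2-limb number
    //   (a1*B + a0)^2 = a1^2*B^2 + 2*a1*a0*B + a0^2
    // takes 3 distinct limb multiplies (a1*a1, a1*a0, a0*a0), where a
    // general 2-limb multiply takes 4.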
8517   };
8518 
8519   void generate_vector_math_stubs() {
8520     // Get native vector math stub routine addresses
8521     void* libsleef = nullptr;
8522     char ebuf[1024];
8523     char dll_name[JVM_MAXPATHLEN];
8524     if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
8525       libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
8526     }
8527     if (libsleef == nullptr) {
8528       log_info(library)("Failed to load native vector math library, %s!", ebuf);
8529       return;
8530     }
8531     // Method naming convention
8532     //   All the methods are named as <OP><T><N>_<U><suffix>
8533     //   Where:
8534     //     <OP>     is the operation name, e.g. sin
8535     //     <T>      is optional to indicate float/double
8536     //              "f/d" for vector float/double operation
8537     //     <N>      is the number of elements in the vector
8538     //              "2/4" for neon, and "x" for sve
8539     //     <U>      is the precision level
8540     //              "u10/u05" represents 1.0/0.5 ULP error bounds
8541     //               We use "u10" for all operations by default,
8542     //               but for those functions that do not have u10 support, we use "u05" instead
8543     //     <suffix> indicates neon/sve
8544     //              "sve/advsimd" for sve/neon implementations
8545     //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
8546     //          cosd2_u10advsimd is the method for computing a 2-element vector double cos using NEON instructions
8547     //
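    // For example, assuming "sin" is the VectorSupport::mathname entry for
    // VECTOR_OP_SIN, the loops below assemble and resolve names such as:
    //
    //   snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", "sin", "u10");     // -> "sinfx_u10sve"     (SVE float)
    //   snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", "sin", "u10"); // -> "sind2_u10advsimd" (NEON double)
    //   ... = (address)os::dll_lookup(libsleef, ebuf);
    //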
8548     log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));
8549 
8550     // Math vector stubs implemented with SVE for scalable vector size.
8551     if (UseSVE > 0) {
8552       for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8553         int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8554         // Skip "tanh" because there is a performance regression
8555         if (vop == VectorSupport::VECTOR_OP_TANH) {
8556           continue;
8557         }
8558 
8559         // The native library does not support u10 level of "hypot".
8560         const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8561 
8562         snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
8563         StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8564 
8565         snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
8566         StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
8567       }
8568     }
8569 
8570     // Math vector stubs implemented with NEON for 64/128-bit vector sizes.
8571     for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
8572       int vop = VectorSupport::VECTOR_OP_MATH_START + op;
8573       // Skip "tanh" because there is a performance regression
8574       if (vop == VectorSupport::VECTOR_OP_TANH) {
8575         continue;
8576       }
8577 
8578       // The native library does not support u10 level of "hypot".
8579       const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";
8580 
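      // Note: sleef's AdvSIMD single-precision routines appear to operate on
      // four float lanes ("<op>f4"), so the same entry point is looked up for
      // both the 64-bit (2-float) and 128-bit (4-float) vector sizes below.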
8581       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8582       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);
8583 
8584       snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
8585       StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8586 
8587       snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
8588       StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
8589     }
8590   }
8591 
8592   // Called from the interpreter or compiled code either to load the
8593   // multiple returned values from the inline type instance being
8594   // returned into registers, or to store the returned values into a
8595   // newly allocated inline type instance.
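  // In outline (see the code below), the generated stub saves all Java
  // argument registers, calls destination(current_thread, r0), and, when
  // has_res is true, reloads the result from the thread's vm_result
  // before returning; a pending exception is forwarded instead.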
8596   address generate_return_value_stub(address destination, const char* name, bool has_res) {
8597     // We need to save all registers the calling convention may use so
8598     // that the runtime call can read or update those registers. This
8599     // needs to be in sync with SharedRuntime::java_return_convention().
8600     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
8601     enum layout {
8602       j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
8603       j_rarg6_off, j_rarg6_2,
8604       j_rarg5_off, j_rarg5_2,
8605       j_rarg4_off, j_rarg4_2,
8606       j_rarg3_off, j_rarg3_2,
8607       j_rarg2_off, j_rarg2_2,
8608       j_rarg1_off, j_rarg1_2,
8609       j_rarg0_off, j_rarg0_2,
8610 
8611       j_farg7_off, j_farg7_2,
8612       j_farg6_off, j_farg6_2,
8613       j_farg5_off, j_farg5_2,
8614       j_farg4_off, j_farg4_2,
8615       j_farg3_off, j_farg3_2,
8616       j_farg2_off, j_farg2_2,
8617       j_farg1_off, j_farg1_2,
8618       j_farg0_off, j_farg0_2,
8619 
8620       rfp_off, rfp_off2,
8621       return_off, return_off2,
8622 
8623       framesize // inclusive of return address
8624     };
8625 
8626     CodeBuffer code(name, 512, 64);
8627     MacroAssembler* masm = new MacroAssembler(&code);
8628 
8629     int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
8630     assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
8631     int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
8632     int frame_size_in_words = frame_size_in_bytes / wordSize;
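    // With the layout enum above, framesize == 36 VMReg slots (each
    // *_off/_2 pair is two 32-bit slots, i.e. one 64-bit register),
    // so frame_size_in_bytes == 144 (already 16-byte aligned),
    // frame_size_in_slots == 36 and frame_size_in_words == 18.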
8633 
8634     OopMapSet* oop_maps = new OopMapSet();
8635     OopMap* map = new OopMap(frame_size_in_slots, 0);
8636 
8637     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
8638     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
8639     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
8640     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
8641     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
8642     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
8643     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
8644     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
8645 
8646     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
8647     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
8648     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
8649     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
8650     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
8651     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
8652     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
8653     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
8654 
8655     address start = __ pc();
8656 
8657     __ enter(); // Save FP and LR before call
8658 
8659     __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
8660     __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
8661     __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
8662     __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
8663 
8664     __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
8665     __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
8666     __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
8667     __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
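
    // The pushes above lay out the save area to match the layout enum:
    // the j_rarg7 pair ends up at the lowest address (sp + 0) and the
    // j_farg1/j_farg0 pair sits just below the rfp/lr words pushed by enter().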
8668 
8669     int frame_complete = __ offset();
8670 
8671     // Set up last_Java_sp and last_Java_fp
8672     address the_pc = __ pc();
8673     __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
8674 
8675     // Call runtime
8676     __ mov(c_rarg1, r0);
8677     __ mov(c_rarg0, rthread);
8678 
8679     __ mov(rscratch1, destination);
8680     __ blr(rscratch1);
8681 
8682     oop_maps->add_gc_map(the_pc - start, map);
8683 
8684     __ reset_last_Java_frame(false);
8685 
8686     __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
8687     __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
8688     __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
8689     __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
8690 
8691     __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
8692     __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
8693     __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
8694     __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
8695 
8696     __ leave();
8697 
8698     // check for pending exceptions
8699     Label pending;
8700     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
8701     __ cbnz(rscratch1, pending);
8702 
8703     if (has_res) {
8704       __ get_vm_result(r0, rthread);
8705     }
8706 
8707     __ ret(lr);
8708 
8709     __ bind(pending);
8710     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
8711 
8712     // -------------
8713     // make sure all code is generated
8714     masm->flush();
8715 
8716     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
8717     return stub->entry_point();
8718   }
8719 
8720   // Initialization
8721   void generate_initial_stubs() {
8722     // Generates initial stubs and initializes the entry points
8723 
8724     // Entry points that exist on all platforms. Note: This is code
8725     // that could be shared among different platforms - however the
8726     // benefit seems to be smaller than the disadvantage of having a
8727     // much more complicated generator structure. See also the comment
8728     // in stubRoutines.hpp.
8729 
8730     StubRoutines::_forward_exception_entry = generate_forward_exception();
8731 
8732     StubRoutines::_call_stub_entry =
8733       generate_call_stub(StubRoutines::_call_stub_return_address);
8734 
8735     // is referenced by megamorphic call
8736     StubRoutines::_catch_exception_entry = generate_catch_exception();
8737 
8738     // Initialize table for copy memory (arraycopy) check.
8739     if (UnsafeMemoryAccess::_table == nullptr) {
8740       UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
8741     }
8742 
8743     if (UseCRC32Intrinsics) {
8744       // set table address before stub generation which uses it
8745       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
8746       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
8747     }
8748 
8749     if (UseCRC32CIntrinsics) {
8750       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
8751     }
8752 
8753     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
8754       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
8755     }
8756 
8757     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
8758       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
8759     }
8760 
8761     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
8762         vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
8763       StubRoutines::_hf2f = generate_float16ToFloat();
8764       StubRoutines::_f2hf = generate_floatToFloat16();
8765     }
8766 
8767     if (InlineTypeReturnedAsFields) {
8768       StubRoutines::_load_inline_type_fields_in_regs =
8769          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
8770       StubRoutines::_store_inline_type_fields_to_buf =
8771          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
8772     }
8773 
8774   }
8775 
8776   void generate_continuation_stubs() {
8777     // Continuation stubs:
8778     StubRoutines::_cont_thaw          = generate_cont_thaw();
8779     StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
8780     StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
8781   }
8782 
8783   void generate_final_stubs() {
8784     // support for verify_oop (must happen after universe_init)
8785     if (VerifyOops) {
8786       StubRoutines::_verify_oop_subroutine_entry   = generate_verify_oop();
8787     }
8788 
8789     // arraycopy stubs used by compilers
8790     generate_arraycopy_stubs();
8791 
8792     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8793     if (bs_nm != nullptr) {
8794       StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
8795     }
8796 
8797     StubRoutines::aarch64::_spin_wait = generate_spin_wait();
8798 
8799     if (UsePoly1305Intrinsics) {
8800       StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
8801     }
8802 
8803 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
8804 
8805     generate_atomic_entry_points();
8806 
8807 #endif // LINUX
8808 
8809 #ifdef COMPILER2
8810     if (UseSecondarySupersTable) {
8811       StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
8812       if (! InlineSecondarySupersTest) {
8813         for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
8814           StubRoutines::_lookup_secondary_supers_table_stubs[slot]
8815             = generate_lookup_secondary_supers_table_stub(slot);
8816         }
8817       }
8818     }
8819 #endif
8820 
8821     StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
8822     StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
8823 
8824     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
8825   }
8826 
8827   void generate_compiler_stubs() {
8828 #if COMPILER2_OR_JVMCI
8829 
8830     if (UseSVE == 0) {
8831       StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
8832     }
8833 
8834     // array equals stub for large arrays.
8835     if (!UseSimpleArrayEquals) {
8836       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
8837     }
8838 
8839     // arrays_hashcode stubs for large arrays.
8840     StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
8841     StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
8842     StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
8843     StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
8844     StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
8845 
8846     // byte_array_inflate stub for large arrays.
8847     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
8848 
8849     // countPositives stub for large arrays.
8850     StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
8851 
8852     generate_compare_long_strings();
8853 
8854     generate_string_indexof_stubs();
8855 
8856 #ifdef COMPILER2
8857     if (UseMultiplyToLenIntrinsic) {
8858       StubRoutines::_multiplyToLen = generate_multiplyToLen();
8859     }
8860 
8861     if (UseSquareToLenIntrinsic) {
8862       StubRoutines::_squareToLen = generate_squareToLen();
8863     }
8864 
8865     if (UseMulAddIntrinsic) {
8866       StubRoutines::_mulAdd = generate_mulAdd();
8867     }
8868 
8869     if (UseSIMDForBigIntegerShiftIntrinsics) {
8870       StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
8871       StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
8872     }
8873 
8874     if (UseMontgomeryMultiplyIntrinsic) {
8875       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
8876       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
8877       StubRoutines::_montgomeryMultiply = g.generate_multiply();
8878     }
8879 
8880     if (UseMontgomerySquareIntrinsic) {
8881       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
8882       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
8883       // We use generate_multiply() rather than generate_square()
8884       // because it's faster for the sizes of modulus we care about.
8885       StubRoutines::_montgomerySquare = g.generate_multiply();
8886     }
8887 
8888     generate_vector_math_stubs();
8889 
8890 #endif // COMPILER2
8891 
8892     if (UseChaCha20Intrinsics) {
8893       StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
8894     }
8895 
8896     if (UseBASE64Intrinsics) {
8897         StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
8898         StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
8899     }
8900 
8901     // data cache line writeback
8902     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
8903     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
8904 
8905     if (UseAESIntrinsics) {
8906       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
8907       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
8908       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
8909       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
8910       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
8911     }
8912     if (UseGHASHIntrinsics) {
8913       // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
8914       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
8915     }
8916     if (UseAESIntrinsics && UseGHASHIntrinsics) {
8917       StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
8918     }
8919 
8920     if (UseMD5Intrinsics) {
8921       StubRoutines::_md5_implCompress      = generate_md5_implCompress(false,    "md5_implCompress");
8922       StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(true,     "md5_implCompressMB");
8923     }
8924     if (UseSHA1Intrinsics) {
8925       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
8926       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
8927     }
8928     if (UseSHA256Intrinsics) {
8929       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
8930       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
8931     }
8932     if (UseSHA512Intrinsics) {
8933       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
8934       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
8935     }
8936     if (UseSHA3Intrinsics) {
8937       StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(false,   "sha3_implCompress");
8938       StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(true,    "sha3_implCompressMB");
8939     }
8940 
8941     // generate Adler32 intrinsics code
8942     if (UseAdler32Intrinsics) {
8943       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
8944     }
8945 
8946 #endif // COMPILER2_OR_JVMCI
8947   }
8948 
8949  public:
8950   StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) {
8951     switch(kind) {
8952     case Initial_stubs:
8953       generate_initial_stubs();
8954       break;
8955     case Continuation_stubs:
8956       generate_continuation_stubs();
8957       break;
8958     case Compiler_stubs:
8959       generate_compiler_stubs();
8960       break;
8961     case Final_stubs:
8962       generate_final_stubs();
8963       break;
8964     default:
8965       fatal("unexpected stubs kind: %d", kind);
8966       break;
8967     };
8968   }
8969 }; // end class declaration
8970 
8971 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) {
8972   StubGenerator g(code, kind);
8973 }
8974 
8975 
8976 #if defined (LINUX)
8977 
8978 // Define pointers to atomic stubs and initialize them to point to the
8979 // code in atomic_aarch64.S.
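// On Linux without __ARM_FEATURE_ATOMICS these defaults may later be
// replaced by the stubs emitted in generate_atomic_entry_points() (see
// the call in generate_final_stubs() above).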
8980 
8981 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
8982   extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
8983     (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
8984   aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
8985     = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
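
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;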
8986 
8987 DEFAULT_ATOMIC_OP(fetch_add, 4, )
8988 DEFAULT_ATOMIC_OP(fetch_add, 8, )
8989 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
8990 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
8991 DEFAULT_ATOMIC_OP(xchg, 4, )
8992 DEFAULT_ATOMIC_OP(xchg, 8, )
8993 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
8994 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
8995 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
8996 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
8997 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
8998 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
8999 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
9000 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
9001 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
9002 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
9003 
9004 #undef DEFAULT_ATOMIC_OP
9005 
9006 #endif // LINUX