/*
 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "prims/upcallLinker.hpp"
#include "runtime/arguments.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/checkedCast.hpp"
#include "utilities/debug.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/intpow.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -29 [ argument word 1      ]
  // -28 [ saved Floating-point Control Register ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -28,

    fpcr_off           = sp_after_call_off,
    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off  * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off            * wordSize);
    const Address d13_save      (rfp, d13_off            * wordSize);
    const Address d11_save      (rfp, d11_off            * wordSize);
    const Address d9_save       (rfp, d9_off             * wordSize);

    const Address r28_save      (rfp, r28_off            * wordSize);
    const Address r26_save      (rfp, r26_off            * wordSize);
    const Address r24_save      (rfp, r24_off            * wordSize);
    const Address r22_save      (rfp, r22_off            * wordSize);
    const Address r20_save      (rfp, r20_off            * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
    Register Rresult = r14, Rresult_type = r15;
    __ ldr(Rresult, result);
    Label is_long, is_float, is_double, check_prim, exit;
    __ ldr(Rresult_type, result_type);
    __ cmp(Rresult_type, (u1)T_OBJECT);
    __ br(Assembler::EQ, check_prim);
    __ cmp(Rresult_type, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(Rresult_type, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(Rresult_type, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(Rresult));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    // restore fpcr
    __ ldr(rscratch1,  fpcr_save);
    __ set_fpcr(rscratch1);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT
    __ BIND(check_prim);
    if (InlineTypeReturnedAsFields) {
      // Check for scalarized return value
      __ tbz(r0, 0, is_long);
      // Load pack handler address
      __ andr(rscratch1, r0, -2);
      __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
      __ blr(rscratch1);
      __ b(exit);
    }

    __ BIND(is_long);
    __ str(r0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
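  // Each 16-byte row of the table emitted below holds the lane indices
  // 0, 1, 2, ... for one element size (bytes, halfwords, words and
  // doublewords), followed by the same indices encoded as single- and
  // double-precision floating-point values, for use by vector code that
  // needs a per-lane index ("iota") constant.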
  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
  // for arraycopy stubs.
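  // The gct1..gct3 general-purpose registers and gcvt1..gcvt3 vector registers
  // handed to the constructor are scratch registers that the barrier set
  // assembler is free to clobber while implementing the copy_load_at and
  // copy_store_at operations below.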
  class ArrayCopyBarrierSetHelper : StackObj {
    BarrierSetAssembler* _bs_asm;
    MacroAssembler* _masm;
    DecoratorSet _decorators;
    BasicType _type;
    Register _gct1;
    Register _gct2;
    Register _gct3;
    FloatRegister _gcvt1;
    FloatRegister _gcvt2;
    FloatRegister _gcvt3;

  public:
    ArrayCopyBarrierSetHelper(MacroAssembler* masm,
                              DecoratorSet decorators,
                              BasicType type,
                              Register gct1,
                              Register gct2,
                              Register gct3,
                              FloatRegister gcvt1,
                              FloatRegister gcvt2,
                              FloatRegister gcvt3)
      : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
        _masm(masm),
        _decorators(decorators),
        _type(type),
        _gct1(gct1),
        _gct2(gct2),
        _gct3(gct3),
        _gcvt1(gcvt1),
        _gcvt2(gcvt2),
        _gcvt3(gcvt3) {
    }

    void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
                            dst1, dst2, src,
                            _gct1, _gct2, _gcvt1);
    }

    void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
    }

    void copy_load_at_16(Register dst1, Register dst2, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
                            dst1, dst2, src,
                            _gct1);
    }

    void copy_store_at_16(Address dst, Register src1, Register src2) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
                             dst, src1, src2,
                             _gct1, _gct2, _gct3);
    }

    void copy_load_at_8(Register dst, Address src) {
      _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
                            dst, noreg, src,
                            _gct1);
    }

    void copy_store_at_8(Address dst, Register src) {
      _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
                             dst, src, noreg,
                             _gct1, _gct2, _gct3);
    }
  };

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;
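    // For a forward copy, s and d are pre-biased down by this many bytes
    // before the main loop so that the loads and stores below, which address
    // multiples of the copy unit with pre-indexed updates, start exactly at
    // the original s and d.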

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r11, t6 = r12, t7 = r13;
    const Register stride = r14;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1, rscratch2);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
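
    // Each iteration of this loop stores the eight words loaded on the
    // previous iteration and loads the next eight, keeping the loads one
    // block ahead of the stores.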
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
      bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
      bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
    } else {
      bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
      bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
      bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
      bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
      } else {
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
        bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
      bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
      bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
      bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
        bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        bs.copy_store_at_8(Address(d, 1 * unit), t0);
        bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
        bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
        bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
      } else {
        bs.copy_store_at_8(Address(d, 1 * unit), t1);
        bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
        bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
        bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
        bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
        bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
          bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          bs.copy_store_at_8(Address(d, 1 * unit), t0);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
        } else {
          bs.copy_store_at_8(Address(d, 1 * unit), t1);
          bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
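
    // The tests below peel off a word, an int, a short and a byte as the
    // corresponding bits of count allow. For example, with granularity == 2
    // (jshort elements) and 7 elements remaining, bits 2, 1 and 0 of count
    // select copies of 4, 2 and 1 elements respectively.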

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;
  Label copy_obj_f, copy_obj_b;
  Label copy_obj_uninit_f, copy_obj_uninit_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // can not be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
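      // The tbz below tests bit log2(wordSize) of s: if it is clear, s is
      // already 2-word (16-byte) aligned, given its HeapWord alignment, and
      // no adjustment is needed.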
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      int shift = exact_log2(granularity);
      if (shift > 0) {
        __ lsr(r15, r15, shift);
      }
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words. If the shift is zero
    // perform a move instead to benefit from zero latency moves.
    int shift = exact_log2(wordSize/granularity);
    if (shift > 0) {
      __ lsr(r15, count, shift);
    } else {
      __ mov(r15, count);
    }
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ bl(copy_f);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_f);
      } else {
        __ bl(copy_obj_f);
      }
    } else {
      if (type != T_OBJECT) {
        __ bl(copy_b);
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(copy_obj_uninit_b);
      } else {
        __ bl(copy_obj_b);
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (int size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != nullptr) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
1659 // 1660 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1661 const bool not_oop = false; 1662 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1663 } 1664 1665 // Arguments: 1666 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1667 // ignored 1668 // name - stub name string 1669 // 1670 // Inputs: 1671 // c_rarg0 - source array address 1672 // c_rarg1 - destination array address 1673 // c_rarg2 - element count, treated as ssize_t, can be zero 1674 // 1675 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1676 // we let the hardware handle it. The one to eight bytes within words, 1677 // dwords or qwords that span cache line boundaries will still be loaded 1678 // and stored atomically. 1679 // 1680 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1681 address* entry, const char *name) { 1682 const bool not_oop = false; 1683 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1684 } 1685 1686 // Arguments: 1687 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1688 // ignored 1689 // name - stub name string 1690 // 1691 // Inputs: 1692 // c_rarg0 - source array address 1693 // c_rarg1 - destination array address 1694 // c_rarg2 - element count, treated as ssize_t, can be zero 1695 // 1696 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1697 // let the hardware handle it. The two or four words within dwords 1698 // or qwords that span cache line boundaries will still be loaded 1699 // and stored atomically. 1700 // 1701 // Side Effects: 1702 // disjoint_short_copy_entry is set to the no-overlap entry point 1703 // used by generate_conjoint_short_copy(). 1704 // 1705 address generate_disjoint_short_copy(bool aligned, 1706 address* entry, const char *name) { 1707 const bool not_oop = false; 1708 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1709 } 1710 1711 // Arguments: 1712 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1713 // ignored 1714 // name - stub name string 1715 // 1716 // Inputs: 1717 // c_rarg0 - source array address 1718 // c_rarg1 - destination array address 1719 // c_rarg2 - element count, treated as ssize_t, can be zero 1720 // 1721 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1722 // let the hardware handle it. The two or four words within dwords 1723 // or qwords that span cache line boundaries will still be loaded 1724 // and stored atomically. 1725 // 1726 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1727 address *entry, const char *name) { 1728 const bool not_oop = false; 1729 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1730 1731 } 1732 // Arguments: 1733 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1734 // ignored 1735 // name - stub name string 1736 // 1737 // Inputs: 1738 // c_rarg0 - source array address 1739 // c_rarg1 - destination array address 1740 // c_rarg2 - element count, treated as ssize_t, can be zero 1741 // 1742 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1743 // the hardware handle it. The two dwords within qwords that span 1744 // cache line boundaries will still be loaded and stored atomically. 
1745 // 1746 // Side Effects: 1747 // disjoint_int_copy_entry is set to the no-overlap entry point 1748 // used by generate_conjoint_int_oop_copy(). 1749 // 1750 address generate_disjoint_int_copy(bool aligned, address *entry, 1751 const char *name, bool dest_uninitialized = false) { 1752 const bool not_oop = false; 1753 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1754 } 1755 1756 // Arguments: 1757 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1758 // ignored 1759 // name - stub name string 1760 // 1761 // Inputs: 1762 // c_rarg0 - source array address 1763 // c_rarg1 - destination array address 1764 // c_rarg2 - element count, treated as ssize_t, can be zero 1765 // 1766 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1767 // the hardware handle it. The two dwords within qwords that span 1768 // cache line boundaries will still be loaded and stored atomically. 1769 // 1770 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1771 address *entry, const char *name, 1772 bool dest_uninitialized = false) { 1773 const bool not_oop = false; 1774 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1775 } 1776 1777 1778 // Arguments: 1779 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1780 // ignored 1781 // name - stub name string 1782 // 1783 // Inputs: 1784 // c_rarg0 - source array address 1785 // c_rarg1 - destination array address 1786 // c_rarg2 - element count, treated as size_t, can be zero 1787 // 1788 // Side Effects: 1789 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1790 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1791 // 1792 address generate_disjoint_long_copy(bool aligned, address *entry, 1793 const char *name, bool dest_uninitialized = false) { 1794 const bool not_oop = false; 1795 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1796 } 1797 1798 // Arguments: 1799 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1800 // ignored 1801 // name - stub name string 1802 // 1803 // Inputs: 1804 // c_rarg0 - source array address 1805 // c_rarg1 - destination array address 1806 // c_rarg2 - element count, treated as size_t, can be zero 1807 // 1808 address generate_conjoint_long_copy(bool aligned, 1809 address nooverlap_target, address *entry, 1810 const char *name, bool dest_uninitialized = false) { 1811 const bool not_oop = false; 1812 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1813 } 1814 1815 // Arguments: 1816 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1817 // ignored 1818 // name - stub name string 1819 // 1820 // Inputs: 1821 // c_rarg0 - source array address 1822 // c_rarg1 - destination array address 1823 // c_rarg2 - element count, treated as size_t, can be zero 1824 // 1825 // Side Effects: 1826 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1827 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1828 // 1829 address generate_disjoint_oop_copy(bool aligned, address *entry, 1830 const char *name, bool dest_uninitialized) { 1831 const bool is_oop = true; 1832 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1833 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1834 } 1835 1836 // Arguments: 1837 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1838 // ignored 1839 // name - stub name string 1840 // 1841 // Inputs: 1842 // c_rarg0 - source array address 1843 // c_rarg1 - destination array address 1844 // c_rarg2 - element count, treated as size_t, can be zero 1845 // 1846 address generate_conjoint_oop_copy(bool aligned, 1847 address nooverlap_target, address *entry, 1848 const char *name, bool dest_uninitialized) { 1849 const bool is_oop = true; 1850 const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1851 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1852 name, dest_uninitialized); 1853 } 1854 1855 1856 // Helper for generating a dynamic type check. 1857 // Smashes rscratch1, rscratch2. 1858 void generate_type_check(Register sub_klass, 1859 Register super_check_offset, 1860 Register super_klass, 1861 Register temp1, 1862 Register temp2, 1863 Register result, 1864 Label& L_success) { 1865 assert_different_registers(sub_klass, super_check_offset, super_klass); 1866 1867 BLOCK_COMMENT("type_check:"); 1868 1869 Label L_miss; 1870 1871 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1872 super_check_offset); 1873 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1874 1875 // Fall through on failure! 1876 __ BIND(L_miss); 1877 } 1878 1879 // 1880 // Generate checkcasting array copy stub 1881 // 1882 // Input: 1883 // c_rarg0 - source array address 1884 // c_rarg1 - destination array address 1885 // c_rarg2 - element count, treated as ssize_t, can be zero 1886 // c_rarg3 - size_t ckoff (super_check_offset) 1887 // c_rarg4 - oop ckval (super_klass) 1888 // 1889 // Output: 1890 // r0 == 0 - success 1891 // r0 == -1^K - failure, where K is partial transfer count 1892 // 1893 address generate_checkcast_copy(const char *name, address *entry, 1894 bool dest_uninitialized = false) { 1895 1896 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1897 1898 // Input registers (after setup_arg_regs) 1899 const Register from = c_rarg0; // source array address 1900 const Register to = c_rarg1; // destination array address 1901 const Register count = c_rarg2; // elementscount 1902 const Register ckoff = c_rarg3; // super_check_offset 1903 const Register ckval = c_rarg4; // super_klass 1904 1905 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1906 RegSet wb_post_saved_regs = RegSet::of(count); 1907 1908 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1909 const Register copied_oop = r22; // actual oop copied 1910 const Register count_save = r21; // orig elementscount 1911 const Register start_to = r20; // destination array start address 1912 const Register r19_klass = r19; // oop._klass 1913 1914 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1915 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1916 1917 //--------------------------------------------------------------- 1918 // Assembler stub will be used for this call to arraycopy 1919 // if the two arrays are subtypes of Object[] but the 1920 // destination array type is not equal to or a supertype 1921 // of the source type. Each element must be separately 1922 // checked. 
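// For orientation, the element-wise loop emitted below behaves roughly like
// the following sketch (illustrative pseudocode only, not actual VM code;
// is_subtype_of() stands for the fast/slow subtype check emitted by
// generate_type_check()):
//
//   for (size_t K = 0; K < count; K++) {
//     oop o = load_heap_oop(from + K);
//     if (o != nullptr && !o->klass()->is_subtype_of(ckval, ckoff))
//       return -1 ^ K;             // K elements were already copied
//     store_heap_oop(to + K, o);   // with the usual GC barriers
//   }
//   return 0;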
1923 1924 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1925 copied_oop, r19_klass, count_save); 1926 1927 __ align(CodeEntryAlignment); 1928 StubCodeMark mark(this, "StubRoutines", name); 1929 address start = __ pc(); 1930 1931 __ enter(); // required for proper stackwalking of RuntimeStub frame 1932 1933 #ifdef ASSERT 1934 // caller guarantees that the arrays really are different 1935 // otherwise, we would have to make conjoint checks 1936 { Label L; 1937 __ b(L); // conjoint check not yet implemented 1938 __ stop("checkcast_copy within a single array"); 1939 __ bind(L); 1940 } 1941 #endif //ASSERT 1942 1943 // Caller of this entry point must set up the argument registers. 1944 if (entry != nullptr) { 1945 *entry = __ pc(); 1946 BLOCK_COMMENT("Entry:"); 1947 } 1948 1949 // Empty array: Nothing to do. 1950 __ cbz(count, L_done); 1951 __ push(RegSet::of(r19, r20, r21, r22), sp); 1952 1953 #ifdef ASSERT 1954 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1955 // The ckoff and ckval must be mutually consistent, 1956 // even though caller generates both. 1957 { Label L; 1958 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1959 __ ldrw(start_to, Address(ckval, sco_offset)); 1960 __ cmpw(ckoff, start_to); 1961 __ br(Assembler::EQ, L); 1962 __ stop("super_check_offset inconsistent"); 1963 __ bind(L); 1964 } 1965 #endif //ASSERT 1966 1967 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1968 bool is_oop = true; 1969 int element_size = UseCompressedOops ? 4 : 8; 1970 if (dest_uninitialized) { 1971 decorators |= IS_DEST_UNINITIALIZED; 1972 } 1973 1974 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1975 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1976 1977 // save the original count 1978 __ mov(count_save, count); 1979 1980 // Copy from low to high addresses 1981 __ mov(start_to, to); // Save destination array start address 1982 __ b(L_load_element); 1983 1984 // ======== begin loop ======== 1985 // (Loop is rotated; its entry is L_load_element.) 1986 // Loop control: 1987 // for (; count != 0; count--) { 1988 // copied_oop = load_heap_oop(from++); 1989 // ... generate_type_check ...; 1990 // store_heap_oop(to++, copied_oop); 1991 // } 1992 __ align(OptoLoopAlignment); 1993 1994 __ BIND(L_store_element); 1995 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1996 __ post(to, element_size), copied_oop, noreg, 1997 gct1, gct2, gct3); 1998 __ sub(count, count, 1); 1999 __ cbz(count, L_do_card_marks); 2000 2001 // ======== loop entry is here ======== 2002 __ BIND(L_load_element); 2003 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 2004 copied_oop, noreg, __ post(from, element_size), 2005 gct1); 2006 __ cbz(copied_oop, L_store_element); 2007 2008 __ load_klass(r19_klass, copied_oop);// query the object klass 2009 2010 BLOCK_COMMENT("type_check:"); 2011 generate_type_check(/*sub_klass*/r19_klass, 2012 /*super_check_offset*/ckoff, 2013 /*super_klass*/ckval, 2014 /*r_array_base*/gct1, 2015 /*temp2*/gct2, 2016 /*result*/r10, L_store_element); 2017 2018 // Fall through on failure! 2019 2020 // ======== end loop ======== 2021 2022 // It was a real error; we must depend on the caller to finish the job. 2023 // Register count = remaining oops, count_orig = total oops. 2024 // Emit GC store barriers for the oops we have copied and report 2025 // their number to the caller. 
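// For clarity: at this point 'count' holds the number of oops not yet copied,
// so the subs below leaves K = count_save - count, the number of oops copied
// successfully. eon with zr is a bitwise NOT, so the value reported is
// ~K == -1 ^ K. For example, if 3 oops were copied before the type check
// failed, r0 ends up as -4 and the caller recovers K as ~r0.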
2026 2027 __ subs(count, count_save, count); // K = partially copied oop count 2028 __ eon(count, count, zr); // report (-1^K) to caller 2029 __ br(Assembler::EQ, L_done_pop); 2030 2031 __ BIND(L_do_card_marks); 2032 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2033 2034 __ bind(L_done_pop); 2035 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2036 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2037 2038 __ bind(L_done); 2039 __ mov(r0, count); 2040 __ leave(); 2041 __ ret(lr); 2042 2043 return start; 2044 } 2045 2046 // Perform range checks on the proposed arraycopy. 2047 // Kills temp, but nothing else. 2048 // Also, clean the sign bits of src_pos and dst_pos. 2049 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2050 Register src_pos, // source position (c_rarg1) 2051 Register dst, // destination array oo (c_rarg2) 2052 Register dst_pos, // destination position (c_rarg3) 2053 Register length, 2054 Register temp, 2055 Label& L_failed) { 2056 BLOCK_COMMENT("arraycopy_range_checks:"); 2057 2058 assert_different_registers(rscratch1, temp); 2059 2060 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2061 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 2062 __ addw(temp, length, src_pos); 2063 __ cmpw(temp, rscratch1); 2064 __ br(Assembler::HI, L_failed); 2065 2066 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2067 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2068 __ addw(temp, length, dst_pos); 2069 __ cmpw(temp, rscratch1); 2070 __ br(Assembler::HI, L_failed); 2071 2072 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 2073 __ movw(src_pos, src_pos); 2074 __ movw(dst_pos, dst_pos); 2075 2076 BLOCK_COMMENT("arraycopy_range_checks done"); 2077 } 2078 2079 // These stubs get called from some dumb test routine. 2080 // I'll write them properly when they're called from 2081 // something that's actually doing something. 2082 static void fake_arraycopy_stub(address src, address dst, int count) { 2083 assert(count == 0, "huh?"); 2084 } 2085 2086 2087 // 2088 // Generate 'unsafe' array copy stub 2089 // Though just as safe as the other stubs, it takes an unscaled 2090 // size_t argument instead of an element count. 2091 // 2092 // Input: 2093 // c_rarg0 - source array address 2094 // c_rarg1 - destination array address 2095 // c_rarg2 - byte count, treated as ssize_t, can be zero 2096 // 2097 // Examines the alignment of the operands and dispatches 2098 // to a long, int, short, or byte copy loop. 
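// For example (illustrative numbers): with src = 0x1000, dst = 0x2040 and a
// byte count of 24, the OR of the three values has its low three bits clear,
// so the code branches to the long copy entry with count >> 3 == 3 elements.
// If only bit 2 of the OR were set it would take the int loop, with bit 1 set
// (and bit 0 clear) the short loop, and with bit 0 set the byte loop.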
2099 // 2100 address generate_unsafe_copy(const char *name, 2101 address byte_copy_entry, 2102 address short_copy_entry, 2103 address int_copy_entry, 2104 address long_copy_entry) { 2105 Label L_long_aligned, L_int_aligned, L_short_aligned; 2106 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2107 2108 __ align(CodeEntryAlignment); 2109 StubCodeMark mark(this, "StubRoutines", name); 2110 address start = __ pc(); 2111 __ enter(); // required for proper stackwalking of RuntimeStub frame 2112 2113 // bump this on entry, not on exit: 2114 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2115 2116 __ orr(rscratch1, s, d); 2117 __ orr(rscratch1, rscratch1, count); 2118 2119 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2120 __ cbz(rscratch1, L_long_aligned); 2121 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2122 __ cbz(rscratch1, L_int_aligned); 2123 __ tbz(rscratch1, 0, L_short_aligned); 2124 __ b(RuntimeAddress(byte_copy_entry)); 2125 2126 __ BIND(L_short_aligned); 2127 __ lsr(count, count, LogBytesPerShort); // size => short_count 2128 __ b(RuntimeAddress(short_copy_entry)); 2129 __ BIND(L_int_aligned); 2130 __ lsr(count, count, LogBytesPerInt); // size => int_count 2131 __ b(RuntimeAddress(int_copy_entry)); 2132 __ BIND(L_long_aligned); 2133 __ lsr(count, count, LogBytesPerLong); // size => long_count 2134 __ b(RuntimeAddress(long_copy_entry)); 2135 2136 return start; 2137 } 2138 2139 // 2140 // Generate generic array copy stubs 2141 // 2142 // Input: 2143 // c_rarg0 - src oop 2144 // c_rarg1 - src_pos (32-bits) 2145 // c_rarg2 - dst oop 2146 // c_rarg3 - dst_pos (32-bits) 2147 // c_rarg4 - element count (32-bits) 2148 // 2149 // Output: 2150 // r0 == 0 - success 2151 // r0 == -1^K - failure, where K is partial transfer count 2152 // 2153 address generate_generic_copy(const char *name, 2154 address byte_copy_entry, address short_copy_entry, 2155 address int_copy_entry, address oop_copy_entry, 2156 address long_copy_entry, address checkcast_copy_entry) { 2157 2158 Label L_failed, L_objArray; 2159 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2160 2161 // Input registers 2162 const Register src = c_rarg0; // source array oop 2163 const Register src_pos = c_rarg1; // source position 2164 const Register dst = c_rarg2; // destination array oop 2165 const Register dst_pos = c_rarg3; // destination position 2166 const Register length = c_rarg4; 2167 2168 2169 // Registers used as temps 2170 const Register dst_klass = c_rarg5; 2171 2172 __ align(CodeEntryAlignment); 2173 2174 StubCodeMark mark(this, "StubRoutines", name); 2175 2176 address start = __ pc(); 2177 2178 __ enter(); // required for proper stackwalking of RuntimeStub frame 2179 2180 // bump this on entry, not on exit: 2181 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2182 2183 //----------------------------------------------------------------------- 2184 // Assembler stub will be used for this call to arraycopy 2185 // if the following conditions are met: 2186 // 2187 // (1) src and dst must not be null. 2188 // (2) src_pos must not be negative. 2189 // (3) dst_pos must not be negative. 2190 // (4) length must not be negative. 2191 // (5) src klass and dst klass should be the same and not null. 2192 // (6) src and dst should be arrays. 2193 // (7) src_pos + length must not exceed length of src. 2194 // (8) dst_pos + length must not exceed length of dst. 
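// If any of these checks fails, the code below reports -1 in r0 (that is,
// -1 ^ 0: no elements transferred) and returns, leaving the caller to handle
// the copy by some slower, more general means.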
2195 // 2196 2197 // if (src == nullptr) return -1; 2198 __ cbz(src, L_failed); 2199 2200 // if (src_pos < 0) return -1; 2201 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2202 2203 // if (dst == nullptr) return -1; 2204 __ cbz(dst, L_failed); 2205 2206 // if (dst_pos < 0) return -1; 2207 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2208 2209 // registers used as temp 2210 const Register scratch_length = r16; // elements count to copy 2211 const Register scratch_src_klass = r17; // array klass 2212 const Register lh = r15; // layout helper 2213 2214 // if (length < 0) return -1; 2215 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2216 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2217 2218 __ load_klass(scratch_src_klass, src); 2219 #ifdef ASSERT 2220 // assert(src->klass() != nullptr); 2221 { 2222 BLOCK_COMMENT("assert klasses not null {"); 2223 Label L1, L2; 2224 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2225 __ bind(L1); 2226 __ stop("broken null klass"); 2227 __ bind(L2); 2228 __ load_klass(rscratch1, dst); 2229 __ cbz(rscratch1, L1); // this would be broken also 2230 BLOCK_COMMENT("} assert klasses not null done"); 2231 } 2232 #endif 2233 2234 // Load layout helper (32-bits) 2235 // 2236 // |array_tag| | header_size | element_type | |log2_element_size| 2237 // 32 30 24 16 8 2 0 2238 // 2239 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2240 // 2241 2242 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2243 2244 // Handle objArrays completely differently... 2245 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2246 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2247 __ movw(rscratch1, objArray_lh); 2248 __ eorw(rscratch2, lh, rscratch1); 2249 __ cbzw(rscratch2, L_objArray); 2250 2251 // if (src->klass() != dst->klass()) return -1; 2252 __ load_klass(rscratch2, dst); 2253 __ eor(rscratch2, rscratch2, scratch_src_klass); 2254 __ cbnz(rscratch2, L_failed); 2255 2256 // Check for flat inline type array -> return -1 2257 __ test_flat_array_oop(src, rscratch2, L_failed); 2258 2259 // Check for null-free (non-flat) inline type array -> handle as object array 2260 __ test_null_free_array_oop(src, rscratch2, L_objArray); 2261 2262 // if (!src->is_Array()) return -1; 2263 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2264 2265 // At this point, it is known to be a typeArray (array_tag 0x3). 
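// Note on the layout-helper decoding further down: the array header size in
// bytes is extracted with ubfx from the header_size field pictured in the
// diagram above (bits 16..23 of lh), and the low byte of lh holds
// log2(element size). For a jint[] that low byte is 2, so the bitwise binary
// search over r15_elsize lands in L_copy_ints.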
2266 #ifdef ASSERT 2267 { 2268 BLOCK_COMMENT("assert primitive array {"); 2269 Label L; 2270 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2271 __ cmpw(lh, rscratch2); 2272 __ br(Assembler::GE, L); 2273 __ stop("must be a primitive array"); 2274 __ bind(L); 2275 BLOCK_COMMENT("} assert primitive array done"); 2276 } 2277 #endif 2278 2279 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2280 rscratch2, L_failed); 2281 2282 // TypeArrayKlass 2283 // 2284 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2285 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2286 // 2287 2288 const Register rscratch1_offset = rscratch1; // array offset 2289 const Register r15_elsize = lh; // element size 2290 2291 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2292 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2293 __ add(src, src, rscratch1_offset); // src array offset 2294 __ add(dst, dst, rscratch1_offset); // dst array offset 2295 BLOCK_COMMENT("choose copy loop based on element size"); 2296 2297 // next registers should be set before the jump to corresponding stub 2298 const Register from = c_rarg0; // source array address 2299 const Register to = c_rarg1; // destination array address 2300 const Register count = c_rarg2; // elements count 2301 2302 // 'from', 'to', 'count' registers should be set in such order 2303 // since they are the same as 'src', 'src_pos', 'dst'. 2304 2305 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2306 2307 // The possible values of elsize are 0-3, i.e. exact_log2(element 2308 // size in bytes). We do a simple bitwise binary search. 2309 __ BIND(L_copy_bytes); 2310 __ tbnz(r15_elsize, 1, L_copy_ints); 2311 __ tbnz(r15_elsize, 0, L_copy_shorts); 2312 __ lea(from, Address(src, src_pos));// src_addr 2313 __ lea(to, Address(dst, dst_pos));// dst_addr 2314 __ movw(count, scratch_length); // length 2315 __ b(RuntimeAddress(byte_copy_entry)); 2316 2317 __ BIND(L_copy_shorts); 2318 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2319 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2320 __ movw(count, scratch_length); // length 2321 __ b(RuntimeAddress(short_copy_entry)); 2322 2323 __ BIND(L_copy_ints); 2324 __ tbnz(r15_elsize, 0, L_copy_longs); 2325 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2326 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2327 __ movw(count, scratch_length); // length 2328 __ b(RuntimeAddress(int_copy_entry)); 2329 2330 __ BIND(L_copy_longs); 2331 #ifdef ASSERT 2332 { 2333 BLOCK_COMMENT("assert long copy {"); 2334 Label L; 2335 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2336 __ cmpw(r15_elsize, LogBytesPerLong); 2337 __ br(Assembler::EQ, L); 2338 __ stop("must be long copy, but elsize is wrong"); 2339 __ bind(L); 2340 BLOCK_COMMENT("} assert long copy done"); 2341 } 2342 #endif 2343 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2344 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2345 __ movw(count, scratch_length); // length 2346 __ b(RuntimeAddress(long_copy_entry)); 2347 2348 // ObjArrayKlass 2349 __ BIND(L_objArray); 2350 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2351 2352 Label L_plain_copy, L_checkcast_copy; 2353 // test array classes for subtyping 2354 __ load_klass(r15, dst); 2355 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2356 __ br(Assembler::NE, L_checkcast_copy); 2357 2358 // Identically typed arrays can be copied without element-wise checks. 2359 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2360 rscratch2, L_failed); 2361 2362 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2363 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2364 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2365 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2366 __ movw(count, scratch_length); // length 2367 __ BIND(L_plain_copy); 2368 __ b(RuntimeAddress(oop_copy_entry)); 2369 2370 __ BIND(L_checkcast_copy); 2371 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2372 { 2373 // Before looking at dst.length, make sure dst is also an objArray. 2374 __ ldrw(rscratch1, Address(r15, lh_offset)); 2375 __ movw(rscratch2, objArray_lh); 2376 __ eorw(rscratch1, rscratch1, rscratch2); 2377 __ cbnzw(rscratch1, L_failed); 2378 2379 // It is safe to examine both src.length and dst.length. 2380 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2381 r15, L_failed); 2382 2383 __ load_klass(dst_klass, dst); // reload 2384 2385 // Marshal the base address arguments now, freeing registers. 2386 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2387 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2388 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2389 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2390 __ movw(count, length); // length (reloaded) 2391 Register sco_temp = c_rarg3; // this register is free now 2392 assert_different_registers(from, to, count, sco_temp, 2393 dst_klass, scratch_src_klass); 2394 // assert_clean_int(count, sco_temp); 2395 2396 // Generate the type check. 2397 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2398 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2399 2400 // Smashes rscratch1, rscratch2 2401 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2402 L_plain_copy); 2403 2404 // Fetch destination element klass from the ObjArrayKlass header. 2405 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2406 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2407 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2408 2409 // the checkcast_copy loop needs two extra arguments: 2410 assert(c_rarg3 == sco_temp, "#3 already in place"); 2411 // Set up arguments for checkcast_copy_entry. 2412 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2413 __ b(RuntimeAddress(checkcast_copy_entry)); 2414 } 2415 2416 __ BIND(L_failed); 2417 __ mov(r0, -1); 2418 __ leave(); // required for proper stackwalking of RuntimeStub frame 2419 __ ret(lr); 2420 2421 return start; 2422 } 2423 2424 // 2425 // Generate stub for array fill. If "aligned" is true, the 2426 // "to" address is assumed to be heapword aligned. 
2427 // 2428 // Arguments for generated stub: 2429 // to: c_rarg0 2430 // value: c_rarg1 2431 // count: c_rarg2 treated as signed 2432 // 2433 address generate_fill(BasicType t, bool aligned, const char *name) { 2434 __ align(CodeEntryAlignment); 2435 StubCodeMark mark(this, "StubRoutines", name); 2436 address start = __ pc(); 2437 2438 BLOCK_COMMENT("Entry:"); 2439 2440 const Register to = c_rarg0; // source array address 2441 const Register value = c_rarg1; // value 2442 const Register count = c_rarg2; // elements count 2443 2444 const Register bz_base = r10; // base for block_zero routine 2445 const Register cnt_words = r11; // temp register 2446 2447 __ enter(); 2448 2449 Label L_fill_elements, L_exit1; 2450 2451 int shift = -1; 2452 switch (t) { 2453 case T_BYTE: 2454 shift = 0; 2455 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2456 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2457 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2458 __ br(Assembler::LO, L_fill_elements); 2459 break; 2460 case T_SHORT: 2461 shift = 1; 2462 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2463 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2464 __ br(Assembler::LO, L_fill_elements); 2465 break; 2466 case T_INT: 2467 shift = 2; 2468 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2469 __ br(Assembler::LO, L_fill_elements); 2470 break; 2471 default: ShouldNotReachHere(); 2472 } 2473 2474 // Align source address at 8 bytes address boundary. 2475 Label L_skip_align1, L_skip_align2, L_skip_align4; 2476 if (!aligned) { 2477 switch (t) { 2478 case T_BYTE: 2479 // One byte misalignment happens only for byte arrays. 2480 __ tbz(to, 0, L_skip_align1); 2481 __ strb(value, Address(__ post(to, 1))); 2482 __ subw(count, count, 1); 2483 __ bind(L_skip_align1); 2484 // Fallthrough 2485 case T_SHORT: 2486 // Two bytes misalignment happens only for byte and short (char) arrays. 2487 __ tbz(to, 1, L_skip_align2); 2488 __ strh(value, Address(__ post(to, 2))); 2489 __ subw(count, count, 2 >> shift); 2490 __ bind(L_skip_align2); 2491 // Fallthrough 2492 case T_INT: 2493 // Align to 8 bytes, we know we are 4 byte aligned to start. 2494 __ tbz(to, 2, L_skip_align4); 2495 __ strw(value, Address(__ post(to, 4))); 2496 __ subw(count, count, 4 >> shift); 2497 __ bind(L_skip_align4); 2498 break; 2499 default: ShouldNotReachHere(); 2500 } 2501 } 2502 2503 // 2504 // Fill large chunks 2505 // 2506 __ lsrw(cnt_words, count, 3 - shift); // number of words 2507 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2508 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2509 if (UseBlockZeroing) { 2510 Label non_block_zeroing, rest; 2511 // If the fill value is zero we can use the fast zero_words(). 2512 __ cbnz(value, non_block_zeroing); 2513 __ mov(bz_base, to); 2514 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2515 address tpc = __ zero_words(bz_base, cnt_words); 2516 if (tpc == nullptr) { 2517 fatal("CodeCache is full at generate_fill"); 2518 } 2519 __ b(rest); 2520 __ bind(non_block_zeroing); 2521 __ fill_words(to, cnt_words, value); 2522 __ bind(rest); 2523 } else { 2524 __ fill_words(to, cnt_words, value); 2525 } 2526 2527 // Remaining count is less than 8 bytes. Fill it by a single store. 2528 // Note that the total length is no less than 8 bytes. 
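// A worked example of the tail handling below: the bfi sequence above has
// already replicated the fill value across all 64 bits (a byte value 0xAB
// becomes 0xABABABABABABABAB), so the remaining few bytes can be finished
// with a single 8-byte store that ends exactly at the last element. That
// store may rewrite bytes that were already filled, which is harmless because
// they hold the same replicated value (the total length is >= 8).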
2529 if (t == T_BYTE || t == T_SHORT) { 2530 Label L_exit1; 2531 __ cbzw(count, L_exit1); 2532 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2533 __ str(value, Address(to, -8)); // overwrite some elements 2534 __ bind(L_exit1); 2535 __ leave(); 2536 __ ret(lr); 2537 } 2538 2539 // Handle copies less than 8 bytes. 2540 Label L_fill_2, L_fill_4, L_exit2; 2541 __ bind(L_fill_elements); 2542 switch (t) { 2543 case T_BYTE: 2544 __ tbz(count, 0, L_fill_2); 2545 __ strb(value, Address(__ post(to, 1))); 2546 __ bind(L_fill_2); 2547 __ tbz(count, 1, L_fill_4); 2548 __ strh(value, Address(__ post(to, 2))); 2549 __ bind(L_fill_4); 2550 __ tbz(count, 2, L_exit2); 2551 __ strw(value, Address(to)); 2552 break; 2553 case T_SHORT: 2554 __ tbz(count, 0, L_fill_4); 2555 __ strh(value, Address(__ post(to, 2))); 2556 __ bind(L_fill_4); 2557 __ tbz(count, 1, L_exit2); 2558 __ strw(value, Address(to)); 2559 break; 2560 case T_INT: 2561 __ cbzw(count, L_exit2); 2562 __ strw(value, Address(to)); 2563 break; 2564 default: ShouldNotReachHere(); 2565 } 2566 __ bind(L_exit2); 2567 __ leave(); 2568 __ ret(lr); 2569 return start; 2570 } 2571 2572 address generate_data_cache_writeback() { 2573 const Register line = c_rarg0; // address of line to write back 2574 2575 __ align(CodeEntryAlignment); 2576 2577 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2578 2579 address start = __ pc(); 2580 __ enter(); 2581 __ cache_wb(Address(line, 0)); 2582 __ leave(); 2583 __ ret(lr); 2584 2585 return start; 2586 } 2587 2588 address generate_data_cache_writeback_sync() { 2589 const Register is_pre = c_rarg0; // pre or post sync 2590 2591 __ align(CodeEntryAlignment); 2592 2593 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2594 2595 // pre wbsync is a no-op 2596 // post wbsync translates to an sfence 2597 2598 Label skip; 2599 address start = __ pc(); 2600 __ enter(); 2601 __ cbnz(is_pre, skip); 2602 __ cache_wbsync(false); 2603 __ bind(skip); 2604 __ leave(); 2605 __ ret(lr); 2606 2607 return start; 2608 } 2609 2610 void generate_arraycopy_stubs() { 2611 address entry; 2612 address entry_jbyte_arraycopy; 2613 address entry_jshort_arraycopy; 2614 address entry_jint_arraycopy; 2615 address entry_oop_arraycopy; 2616 address entry_jlong_arraycopy; 2617 address entry_checkcast_arraycopy; 2618 2619 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2620 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2621 2622 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2623 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2624 2625 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2626 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2627 2628 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2629 2630 //*** jbyte 2631 // Always need aligned and unaligned versions 2632 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2633 "jbyte_disjoint_arraycopy"); 2634 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2635 &entry_jbyte_arraycopy, 2636 "jbyte_arraycopy"); 2637 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2638 "arrayof_jbyte_disjoint_arraycopy"); 2639 
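// Note on the entry plumbing used throughout this function: each disjoint
// generator returns its post-setup entry point through '&entry', and the
// matching conjoint stub generated next receives that address as its
// nooverlap_target; the conjoint stub branches there whenever
// (d - s) >= count * size, i.e. whenever a plain forward copy is safe.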
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2640 "arrayof_jbyte_arraycopy"); 2641 2642 //*** jshort 2643 // Always need aligned and unaligned versions 2644 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2645 "jshort_disjoint_arraycopy"); 2646 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2647 &entry_jshort_arraycopy, 2648 "jshort_arraycopy"); 2649 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2650 "arrayof_jshort_disjoint_arraycopy"); 2651 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2652 "arrayof_jshort_arraycopy"); 2653 2654 //*** jint 2655 // Aligned versions 2656 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2657 "arrayof_jint_disjoint_arraycopy"); 2658 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2659 "arrayof_jint_arraycopy"); 2660 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2661 // entry_jint_arraycopy always points to the unaligned version 2662 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2663 "jint_disjoint_arraycopy"); 2664 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2665 &entry_jint_arraycopy, 2666 "jint_arraycopy"); 2667 2668 //*** jlong 2669 // It is always aligned 2670 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2671 "arrayof_jlong_disjoint_arraycopy"); 2672 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2673 "arrayof_jlong_arraycopy"); 2674 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2675 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2676 2677 //*** oops 2678 { 2679 // With compressed oops we need unaligned versions; notice that 2680 // we overwrite entry_oop_arraycopy. 
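// (With compressed oops each element is only 4 bytes wide, and the same
// stubs are also installed below as the plain, non-arrayof entry points,
// so they can only assume 4-byte alignment of the source and destination
// addresses.)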
2681 bool aligned = !UseCompressedOops; 2682 2683 StubRoutines::_arrayof_oop_disjoint_arraycopy 2684 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2685 /*dest_uninitialized*/false); 2686 StubRoutines::_arrayof_oop_arraycopy 2687 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2688 /*dest_uninitialized*/false); 2689 // Aligned versions without pre-barriers 2690 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2691 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2692 /*dest_uninitialized*/true); 2693 StubRoutines::_arrayof_oop_arraycopy_uninit 2694 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2695 /*dest_uninitialized*/true); 2696 } 2697 2698 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2699 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2700 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2701 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2702 2703 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2704 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2705 /*dest_uninitialized*/true); 2706 2707 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2708 entry_jbyte_arraycopy, 2709 entry_jshort_arraycopy, 2710 entry_jint_arraycopy, 2711 entry_jlong_arraycopy); 2712 2713 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2714 entry_jbyte_arraycopy, 2715 entry_jshort_arraycopy, 2716 entry_jint_arraycopy, 2717 entry_oop_arraycopy, 2718 entry_jlong_arraycopy, 2719 entry_checkcast_arraycopy); 2720 2721 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2722 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2723 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2724 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2725 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2726 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2727 } 2728 2729 void generate_math_stubs() { Unimplemented(); } 2730 2731 // Arguments: 2732 // 2733 // Inputs: 2734 // c_rarg0 - source byte array address 2735 // c_rarg1 - destination byte array address 2736 // c_rarg2 - K (key) in little endian int array 2737 // 2738 address generate_aescrypt_encryptBlock() { 2739 __ align(CodeEntryAlignment); 2740 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2741 2742 const Register from = c_rarg0; // source array address 2743 const Register to = c_rarg1; // destination array address 2744 const Register key = c_rarg2; // key array address 2745 const Register keylen = rscratch1; 2746 2747 address start = __ pc(); 2748 __ enter(); 2749 2750 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2751 2752 __ aesenc_loadkeys(key, keylen); 2753 __ aesecb_encrypt(from, to, keylen); 2754 2755 __ mov(r0, 0); 2756 2757 __ leave(); 2758 __ ret(lr); 2759 2760 return start; 2761 } 2762 2763 // Arguments: 2764 // 2765 // Inputs: 2766 // c_rarg0 - source byte array address 2767 // c_rarg1 - destination byte array address 2768 // 
c_rarg2 - K (key) in little endian int array 2769 // 2770 address generate_aescrypt_decryptBlock() { 2771 assert(UseAES, "need AES cryptographic extension support"); 2772 __ align(CodeEntryAlignment); 2773 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2774 Label L_doLast; 2775 2776 const Register from = c_rarg0; // source array address 2777 const Register to = c_rarg1; // destination array address 2778 const Register key = c_rarg2; // key array address 2779 const Register keylen = rscratch1; 2780 2781 address start = __ pc(); 2782 __ enter(); // required for proper stackwalking of RuntimeStub frame 2783 2784 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2785 2786 __ aesecb_decrypt(from, to, key, keylen); 2787 2788 __ mov(r0, 0); 2789 2790 __ leave(); 2791 __ ret(lr); 2792 2793 return start; 2794 } 2795 2796 // Arguments: 2797 // 2798 // Inputs: 2799 // c_rarg0 - source byte array address 2800 // c_rarg1 - destination byte array address 2801 // c_rarg2 - K (key) in little endian int array 2802 // c_rarg3 - r vector byte array address 2803 // c_rarg4 - input length 2804 // 2805 // Output: 2806 // x0 - input length 2807 // 2808 address generate_cipherBlockChaining_encryptAESCrypt() { 2809 assert(UseAES, "need AES cryptographic extension support"); 2810 __ align(CodeEntryAlignment); 2811 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2812 2813 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2814 2815 const Register from = c_rarg0; // source array address 2816 const Register to = c_rarg1; // destination array address 2817 const Register key = c_rarg2; // key array address 2818 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2819 // and left with the results of the last encryption block 2820 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2821 const Register keylen = rscratch1; 2822 2823 address start = __ pc(); 2824 2825 __ enter(); 2826 2827 __ movw(rscratch2, len_reg); 2828 2829 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2830 2831 __ ld1(v0, __ T16B, rvec); 2832 2833 __ cmpw(keylen, 52); 2834 __ br(Assembler::CC, L_loadkeys_44); 2835 __ br(Assembler::EQ, L_loadkeys_52); 2836 2837 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2838 __ rev32(v17, __ T16B, v17); 2839 __ rev32(v18, __ T16B, v18); 2840 __ BIND(L_loadkeys_52); 2841 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2842 __ rev32(v19, __ T16B, v19); 2843 __ rev32(v20, __ T16B, v20); 2844 __ BIND(L_loadkeys_44); 2845 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2846 __ rev32(v21, __ T16B, v21); 2847 __ rev32(v22, __ T16B, v22); 2848 __ rev32(v23, __ T16B, v23); 2849 __ rev32(v24, __ T16B, v24); 2850 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2851 __ rev32(v25, __ T16B, v25); 2852 __ rev32(v26, __ T16B, v26); 2853 __ rev32(v27, __ T16B, v27); 2854 __ rev32(v28, __ T16B, v28); 2855 __ ld1(v29, v30, v31, __ T16B, key); 2856 __ rev32(v29, __ T16B, v29); 2857 __ rev32(v30, __ T16B, v30); 2858 __ rev32(v31, __ T16B, v31); 2859 2860 __ BIND(L_aes_loop); 2861 __ ld1(v1, __ T16B, __ post(from, 16)); 2862 __ eor(v0, __ T16B, v0, v1); 2863 2864 __ br(Assembler::CC, L_rounds_44); 2865 __ br(Assembler::EQ, L_rounds_52); 2866 2867 __ aese(v0, v17); __ aesmc(v0, v0); 2868 __ aese(v0, v18); __ aesmc(v0, v0); 2869 __ BIND(L_rounds_52); 2870 __ 
aese(v0, v19); __ aesmc(v0, v0); 2871 __ aese(v0, v20); __ aesmc(v0, v0); 2872 __ BIND(L_rounds_44); 2873 __ aese(v0, v21); __ aesmc(v0, v0); 2874 __ aese(v0, v22); __ aesmc(v0, v0); 2875 __ aese(v0, v23); __ aesmc(v0, v0); 2876 __ aese(v0, v24); __ aesmc(v0, v0); 2877 __ aese(v0, v25); __ aesmc(v0, v0); 2878 __ aese(v0, v26); __ aesmc(v0, v0); 2879 __ aese(v0, v27); __ aesmc(v0, v0); 2880 __ aese(v0, v28); __ aesmc(v0, v0); 2881 __ aese(v0, v29); __ aesmc(v0, v0); 2882 __ aese(v0, v30); 2883 __ eor(v0, __ T16B, v0, v31); 2884 2885 __ st1(v0, __ T16B, __ post(to, 16)); 2886 2887 __ subw(len_reg, len_reg, 16); 2888 __ cbnzw(len_reg, L_aes_loop); 2889 2890 __ st1(v0, __ T16B, rvec); 2891 2892 __ mov(r0, rscratch2); 2893 2894 __ leave(); 2895 __ ret(lr); 2896 2897 return start; 2898 } 2899 2900 // Arguments: 2901 // 2902 // Inputs: 2903 // c_rarg0 - source byte array address 2904 // c_rarg1 - destination byte array address 2905 // c_rarg2 - K (key) in little endian int array 2906 // c_rarg3 - r vector byte array address 2907 // c_rarg4 - input length 2908 // 2909 // Output: 2910 // r0 - input length 2911 // 2912 address generate_cipherBlockChaining_decryptAESCrypt() { 2913 assert(UseAES, "need AES cryptographic extension support"); 2914 __ align(CodeEntryAlignment); 2915 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2916 2917 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2918 2919 const Register from = c_rarg0; // source array address 2920 const Register to = c_rarg1; // destination array address 2921 const Register key = c_rarg2; // key array address 2922 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2923 // and left with the results of the last encryption block 2924 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2925 const Register keylen = rscratch1; 2926 2927 address start = __ pc(); 2928 2929 __ enter(); 2930 2931 __ movw(rscratch2, len_reg); 2932 2933 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2934 2935 __ ld1(v2, __ T16B, rvec); 2936 2937 __ ld1(v31, __ T16B, __ post(key, 16)); 2938 __ rev32(v31, __ T16B, v31); 2939 2940 __ cmpw(keylen, 52); 2941 __ br(Assembler::CC, L_loadkeys_44); 2942 __ br(Assembler::EQ, L_loadkeys_52); 2943 2944 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2945 __ rev32(v17, __ T16B, v17); 2946 __ rev32(v18, __ T16B, v18); 2947 __ BIND(L_loadkeys_52); 2948 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2949 __ rev32(v19, __ T16B, v19); 2950 __ rev32(v20, __ T16B, v20); 2951 __ BIND(L_loadkeys_44); 2952 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2953 __ rev32(v21, __ T16B, v21); 2954 __ rev32(v22, __ T16B, v22); 2955 __ rev32(v23, __ T16B, v23); 2956 __ rev32(v24, __ T16B, v24); 2957 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2958 __ rev32(v25, __ T16B, v25); 2959 __ rev32(v26, __ T16B, v26); 2960 __ rev32(v27, __ T16B, v27); 2961 __ rev32(v28, __ T16B, v28); 2962 __ ld1(v29, v30, __ T16B, key); 2963 __ rev32(v29, __ T16B, v29); 2964 __ rev32(v30, __ T16B, v30); 2965 2966 __ BIND(L_aes_loop); 2967 __ ld1(v0, __ T16B, __ post(from, 16)); 2968 __ orr(v1, __ T16B, v0, v0); 2969 2970 __ br(Assembler::CC, L_rounds_44); 2971 __ br(Assembler::EQ, L_rounds_52); 2972 2973 __ aesd(v0, v17); __ aesimc(v0, v0); 2974 __ aesd(v0, v18); __ aesimc(v0, v0); 2975 __ BIND(L_rounds_52); 2976 __ aesd(v0, v19); __ aesimc(v0, v0); 2977 __ aesd(v0, v20); __ 
aesimc(v0, v0); 2978 __ BIND(L_rounds_44); 2979 __ aesd(v0, v21); __ aesimc(v0, v0); 2980 __ aesd(v0, v22); __ aesimc(v0, v0); 2981 __ aesd(v0, v23); __ aesimc(v0, v0); 2982 __ aesd(v0, v24); __ aesimc(v0, v0); 2983 __ aesd(v0, v25); __ aesimc(v0, v0); 2984 __ aesd(v0, v26); __ aesimc(v0, v0); 2985 __ aesd(v0, v27); __ aesimc(v0, v0); 2986 __ aesd(v0, v28); __ aesimc(v0, v0); 2987 __ aesd(v0, v29); __ aesimc(v0, v0); 2988 __ aesd(v0, v30); 2989 __ eor(v0, __ T16B, v0, v31); 2990 __ eor(v0, __ T16B, v0, v2); 2991 2992 __ st1(v0, __ T16B, __ post(to, 16)); 2993 __ orr(v2, __ T16B, v1, v1); 2994 2995 __ subw(len_reg, len_reg, 16); 2996 __ cbnzw(len_reg, L_aes_loop); 2997 2998 __ st1(v2, __ T16B, rvec); 2999 3000 __ mov(r0, rscratch2); 3001 3002 __ leave(); 3003 __ ret(lr); 3004 3005 return start; 3006 } 3007 3008 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3009 // Inputs: 128-bits. in is preserved. 3010 // The least-significant 64-bit word is in the upper dword of each vector. 3011 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3012 // Output: result 3013 void be_add_128_64(FloatRegister result, FloatRegister in, 3014 FloatRegister inc, FloatRegister tmp) { 3015 assert_different_registers(result, tmp, inc); 3016 3017 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3018 // input 3019 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3020 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3021 // MSD == 0 (must be!) to LSD 3022 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3023 } 3024 3025 // CTR AES crypt. 3026 // Arguments: 3027 // 3028 // Inputs: 3029 // c_rarg0 - source byte array address 3030 // c_rarg1 - destination byte array address 3031 // c_rarg2 - K (key) in little endian int array 3032 // c_rarg3 - counter vector byte array address 3033 // c_rarg4 - input length 3034 // c_rarg5 - saved encryptedCounter start 3035 // c_rarg6 - saved used length 3036 // 3037 // Output: 3038 // r0 - input length 3039 // 3040 address generate_counterMode_AESCrypt() { 3041 const Register in = c_rarg0; 3042 const Register out = c_rarg1; 3043 const Register key = c_rarg2; 3044 const Register counter = c_rarg3; 3045 const Register saved_len = c_rarg4, len = r10; 3046 const Register saved_encrypted_ctr = c_rarg5; 3047 const Register used_ptr = c_rarg6, used = r12; 3048 3049 const Register offset = r7; 3050 const Register keylen = r11; 3051 3052 const unsigned char block_size = 16; 3053 const int bulk_width = 4; 3054 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3055 // performance with larger data sizes, but it also means that the 3056 // fast path isn't used until you have at least 8 blocks, and up 3057 // to 127 bytes of data will be executed on the slow path. For 3058 // that reason, and also so as not to blow away too much icache, 4 3059 // blocks seems like a sensible compromise. 
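// A note on the counter arithmetic used below: the 16-byte CTR block is kept
// big-endian in memory, so the code byte-reverses each 64-bit half with rev64,
// lets be_add_128_64() above perform the 128-bit + 64-bit add (detecting
// carry-out of the low half with an unsigned compare and propagating it into
// the high half), and then reverses back before storing. For example, a
// counter whose low 64 bits are all ones wraps to zero and carries one into
// the upper 64 bits.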
3060 3061 // Algorithm: 3062 // 3063 // if (len == 0) { 3064 // goto DONE; 3065 // } 3066 // int result = len; 3067 // do { 3068 // if (used >= blockSize) { 3069 // if (len >= bulk_width * blockSize) { 3070 // CTR_large_block(); 3071 // if (len == 0) 3072 // goto DONE; 3073 // } 3074 // for (;;) { 3075 // 16ByteVector v0 = counter; 3076 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3077 // used = 0; 3078 // if (len < blockSize) 3079 // break; /* goto NEXT */ 3080 // 16ByteVector v1 = load16Bytes(in, offset); 3081 // v1 = v1 ^ encryptedCounter; 3082 // store16Bytes(out, offset); 3083 // used = blockSize; 3084 // offset += blockSize; 3085 // len -= blockSize; 3086 // if (len == 0) 3087 // goto DONE; 3088 // } 3089 // } 3090 // NEXT: 3091 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3092 // len--; 3093 // } while (len != 0); 3094 // DONE: 3095 // return result; 3096 // 3097 // CTR_large_block() 3098 // Wide bulk encryption of whole blocks. 3099 3100 __ align(CodeEntryAlignment); 3101 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 3102 const address start = __ pc(); 3103 __ enter(); 3104 3105 Label DONE, CTR_large_block, large_block_return; 3106 __ ldrw(used, Address(used_ptr)); 3107 __ cbzw(saved_len, DONE); 3108 3109 __ mov(len, saved_len); 3110 __ mov(offset, 0); 3111 3112 // Compute #rounds for AES based on the length of the key array 3113 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3114 3115 __ aesenc_loadkeys(key, keylen); 3116 3117 { 3118 Label L_CTR_loop, NEXT; 3119 3120 __ bind(L_CTR_loop); 3121 3122 __ cmp(used, block_size); 3123 __ br(__ LO, NEXT); 3124 3125 // Maybe we have a lot of data 3126 __ subsw(rscratch1, len, bulk_width * block_size); 3127 __ br(__ HS, CTR_large_block); 3128 __ BIND(large_block_return); 3129 __ cbzw(len, DONE); 3130 3131 // Setup the counter 3132 __ movi(v4, __ T4S, 0); 3133 __ movi(v5, __ T4S, 1); 3134 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3135 3136 // 128-bit big-endian increment 3137 __ ld1(v0, __ T16B, counter); 3138 __ rev64(v16, __ T16B, v0); 3139 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3140 __ rev64(v16, __ T16B, v16); 3141 __ st1(v16, __ T16B, counter); 3142 // Previous counter value is in v0 3143 // v4 contains { 0, 1 } 3144 3145 { 3146 // We have fewer than bulk_width blocks of data left. Encrypt 3147 // them one by one until there is less than a full block 3148 // remaining, being careful to save both the encrypted counter 3149 // and the counter. 3150 3151 Label inner_loop; 3152 __ bind(inner_loop); 3153 // Counter to encrypt is in v0 3154 __ aesecb_encrypt(noreg, noreg, keylen); 3155 __ st1(v0, __ T16B, saved_encrypted_ctr); 3156 3157 // Do we have a remaining full block? 3158 3159 __ mov(used, 0); 3160 __ cmp(len, block_size); 3161 __ br(__ LO, NEXT); 3162 3163 // Yes, we have a full block 3164 __ ldrq(v1, Address(in, offset)); 3165 __ eor(v1, __ T16B, v1, v0); 3166 __ strq(v1, Address(out, offset)); 3167 __ mov(used, block_size); 3168 __ add(offset, offset, block_size); 3169 3170 __ subw(len, len, block_size); 3171 __ cbzw(len, DONE); 3172 3173 // Increment the counter, store it back 3174 __ orr(v0, __ T16B, v16, v16); 3175 __ rev64(v16, __ T16B, v16); 3176 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3177 __ rev64(v16, __ T16B, v16); 3178 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3179 3180 __ b(inner_loop); 3181 } 3182 3183 __ BIND(NEXT); 3184 3185 // Encrypt a single byte, and loop. 
3186 // We expect this to be a rare event. 3187 __ ldrb(rscratch1, Address(in, offset)); 3188 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3189 __ eor(rscratch1, rscratch1, rscratch2); 3190 __ strb(rscratch1, Address(out, offset)); 3191 __ add(offset, offset, 1); 3192 __ add(used, used, 1); 3193 __ subw(len, len,1); 3194 __ cbnzw(len, L_CTR_loop); 3195 } 3196 3197 __ bind(DONE); 3198 __ strw(used, Address(used_ptr)); 3199 __ mov(r0, saved_len); 3200 3201 __ leave(); // required for proper stackwalking of RuntimeStub frame 3202 __ ret(lr); 3203 3204 // Bulk encryption 3205 3206 __ BIND (CTR_large_block); 3207 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3208 3209 if (bulk_width == 8) { 3210 __ sub(sp, sp, 4 * 16); 3211 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3212 } 3213 __ sub(sp, sp, 4 * 16); 3214 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3215 RegSet saved_regs = (RegSet::of(in, out, offset) 3216 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3217 __ push(saved_regs, sp); 3218 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3219 __ add(in, in, offset); 3220 __ add(out, out, offset); 3221 3222 // Keys should already be loaded into the correct registers 3223 3224 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3225 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3226 3227 // AES/CTR loop 3228 { 3229 Label L_CTR_loop; 3230 __ BIND(L_CTR_loop); 3231 3232 // Setup the counters 3233 __ movi(v8, __ T4S, 0); 3234 __ movi(v9, __ T4S, 1); 3235 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3236 3237 for (int i = 0; i < bulk_width; i++) { 3238 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3239 __ rev64(v0_ofs, __ T16B, v16); 3240 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3241 } 3242 3243 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3244 3245 // Encrypt the counters 3246 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3247 3248 if (bulk_width == 8) { 3249 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3250 } 3251 3252 // XOR the encrypted counters with the inputs 3253 for (int i = 0; i < bulk_width; i++) { 3254 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3255 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3256 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3257 } 3258 3259 // Write the encrypted data 3260 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3261 if (bulk_width == 8) { 3262 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3263 } 3264 3265 __ subw(len, len, 16 * bulk_width); 3266 __ cbnzw(len, L_CTR_loop); 3267 } 3268 3269 // Save the counter back where it goes 3270 __ rev64(v16, __ T16B, v16); 3271 __ st1(v16, __ T16B, counter); 3272 3273 __ pop(saved_regs, sp); 3274 3275 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3276 if (bulk_width == 8) { 3277 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3278 } 3279 3280 __ andr(rscratch1, len, -16 * bulk_width); 3281 __ sub(len, len, rscratch1); 3282 __ add(offset, offset, rscratch1); 3283 __ mov(used, 16); 3284 __ strw(used, Address(used_ptr)); 3285 __ b(large_block_return); 3286 3287 return start; 3288 } 3289 3290 // Vector AES Galois Counter Mode implementation. 
Parameters: 3291 // 3292 // in = c_rarg0 3293 // len = c_rarg1 3294 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3295 // out = c_rarg3 3296 // key = c_rarg4 3297 // state = c_rarg5 - GHASH.state 3298 // subkeyHtbl = c_rarg6 - powers of H 3299 // counter = c_rarg7 - 16 bytes of CTR 3300 // return - number of processed bytes 3301 address generate_galoisCounterMode_AESCrypt() { 3302 address ghash_polynomial = __ pc(); 3303 __ emit_int64(0x87); // The low-order bits of the field 3304 // polynomial (i.e. p = z^7+z^2+z+1) 3305 // repeated in the low and high parts of a 3306 // 128-bit vector 3307 __ emit_int64(0x87); 3308 3309 __ align(CodeEntryAlignment); 3310 StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt"); 3311 address start = __ pc(); 3312 __ enter(); 3313 3314 const Register in = c_rarg0; 3315 const Register len = c_rarg1; 3316 const Register ct = c_rarg2; 3317 const Register out = c_rarg3; 3318 // and updated with the incremented counter in the end 3319 3320 const Register key = c_rarg4; 3321 const Register state = c_rarg5; 3322 3323 const Register subkeyHtbl = c_rarg6; 3324 3325 const Register counter = c_rarg7; 3326 3327 const Register keylen = r10; 3328 // Save state before entering routine 3329 __ sub(sp, sp, 4 * 16); 3330 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3331 __ sub(sp, sp, 4 * 16); 3332 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3333 3334 // __ andr(len, len, -512); 3335 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3336 __ str(len, __ pre(sp, -2 * wordSize)); 3337 3338 Label DONE; 3339 __ cbz(len, DONE); 3340 3341 // Compute #rounds for AES based on the length of the key array 3342 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3343 3344 __ aesenc_loadkeys(key, keylen); 3345 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3346 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3347 3348 // AES/CTR loop 3349 { 3350 Label L_CTR_loop; 3351 __ BIND(L_CTR_loop); 3352 3353 // Setup the counters 3354 __ movi(v8, __ T4S, 0); 3355 __ movi(v9, __ T4S, 1); 3356 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3357 3358 assert(v0->encoding() < v8->encoding(), ""); 3359 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3360 FloatRegister f = as_FloatRegister(i); 3361 __ rev32(f, __ T16B, v16); 3362 __ addv(v16, __ T4S, v16, v8); 3363 } 3364 3365 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3366 3367 // Encrypt the counters 3368 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3369 3370 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3371 3372 // XOR the encrypted counters with the inputs 3373 for (int i = 0; i < 8; i++) { 3374 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3375 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3376 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3377 } 3378 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3379 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3380 3381 __ subw(len, len, 16 * 8); 3382 __ cbnzw(len, L_CTR_loop); 3383 } 3384 3385 __ rev32(v16, __ T16B, v16); 3386 __ st1(v16, __ T16B, counter); 3387 3388 __ ldr(len, Address(sp)); 3389 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3390 3391 // GHASH/CTR loop 3392 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3393 len, /*unrolls*/4); 3394 3395 #ifdef ASSERT 3396 { Label L; 3397 __ 
cmp(len, (unsigned char)0); 3398 __ br(Assembler::EQ, L); 3399 __ stop("stubGenerator: abort"); 3400 __ bind(L); 3401 } 3402 #endif 3403 3404 __ bind(DONE); 3405 // Return the number of bytes processed 3406 __ ldr(r0, __ post(sp, 2 * wordSize)); 3407 3408 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3409 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3410 3411 __ leave(); // required for proper stackwalking of RuntimeStub frame 3412 __ ret(lr); 3413 return start; 3414 } 3415 3416 class Cached64Bytes { 3417 private: 3418 MacroAssembler *_masm; 3419 Register _regs[8]; 3420 3421 public: 3422 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3423 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3424 auto it = rs.begin(); 3425 for (auto &r: _regs) { 3426 r = *it; 3427 ++it; 3428 } 3429 } 3430 3431 void gen_loads(Register base) { 3432 for (int i = 0; i < 8; i += 2) { 3433 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3434 } 3435 } 3436 3437 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3438 void extract_u32(Register dest, int i) { 3439 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3440 } 3441 }; 3442 3443 // Utility routines for md5. 3444 // Clobbers r10 and r11. 3445 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3446 int k, int s, int t) { 3447 Register rscratch3 = r10; 3448 Register rscratch4 = r11; 3449 3450 __ eorw(rscratch3, r3, r4); 3451 __ movw(rscratch2, t); 3452 __ andw(rscratch3, rscratch3, r2); 3453 __ addw(rscratch4, r1, rscratch2); 3454 reg_cache.extract_u32(rscratch1, k); 3455 __ eorw(rscratch3, rscratch3, r4); 3456 __ addw(rscratch4, rscratch4, rscratch1); 3457 __ addw(rscratch3, rscratch3, rscratch4); 3458 __ rorw(rscratch2, rscratch3, 32 - s); 3459 __ addw(r1, rscratch2, r2); 3460 } 3461 3462 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3463 int k, int s, int t) { 3464 Register rscratch3 = r10; 3465 Register rscratch4 = r11; 3466 3467 reg_cache.extract_u32(rscratch1, k); 3468 __ movw(rscratch2, t); 3469 __ addw(rscratch4, r1, rscratch2); 3470 __ addw(rscratch4, rscratch4, rscratch1); 3471 __ bicw(rscratch2, r3, r4); 3472 __ andw(rscratch3, r2, r4); 3473 __ addw(rscratch2, rscratch2, rscratch4); 3474 __ addw(rscratch2, rscratch2, rscratch3); 3475 __ rorw(rscratch2, rscratch2, 32 - s); 3476 __ addw(r1, rscratch2, r2); 3477 } 3478 3479 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3480 int k, int s, int t) { 3481 Register rscratch3 = r10; 3482 Register rscratch4 = r11; 3483 3484 __ eorw(rscratch3, r3, r4); 3485 __ movw(rscratch2, t); 3486 __ addw(rscratch4, r1, rscratch2); 3487 reg_cache.extract_u32(rscratch1, k); 3488 __ eorw(rscratch3, rscratch3, r2); 3489 __ addw(rscratch4, rscratch4, rscratch1); 3490 __ addw(rscratch3, rscratch3, rscratch4); 3491 __ rorw(rscratch2, rscratch3, 32 - s); 3492 __ addw(r1, rscratch2, r2); 3493 } 3494 3495 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3496 int k, int s, int t) { 3497 Register rscratch3 = r10; 3498 Register rscratch4 = r11; 3499 3500 __ movw(rscratch3, t); 3501 __ ornw(rscratch2, r2, r4); 3502 __ addw(rscratch4, r1, rscratch3); 3503 reg_cache.extract_u32(rscratch1, k); 3504 __ eorw(rscratch3, rscratch2, r3); 3505 __ addw(rscratch4, rscratch4, rscratch1); 3506 __ addw(rscratch3, rscratch3, rscratch4); 3507 __ rorw(rscratch2, rscratch3, 32 - s); 3508 __ 
addw(r1, rscratch2, r2); 3509 } 3510 3511 // Arguments: 3512 // 3513 // Inputs: 3514 // c_rarg0 - byte[] source+offset 3515 // c_rarg1 - int[] SHA.state 3516 // c_rarg2 - int offset 3517 // c_rarg3 - int limit 3518 // 3519 address generate_md5_implCompress(bool multi_block, const char *name) { 3520 __ align(CodeEntryAlignment); 3521 StubCodeMark mark(this, "StubRoutines", name); 3522 address start = __ pc(); 3523 3524 Register buf = c_rarg0; 3525 Register state = c_rarg1; 3526 Register ofs = c_rarg2; 3527 Register limit = c_rarg3; 3528 Register a = r4; 3529 Register b = r5; 3530 Register c = r6; 3531 Register d = r7; 3532 Register rscratch3 = r10; 3533 Register rscratch4 = r11; 3534 3535 Register state_regs[2] = { r12, r13 }; 3536 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3537 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3538 3539 __ push(saved_regs, sp); 3540 3541 __ ldp(state_regs[0], state_regs[1], Address(state)); 3542 __ ubfx(a, state_regs[0], 0, 32); 3543 __ ubfx(b, state_regs[0], 32, 32); 3544 __ ubfx(c, state_regs[1], 0, 32); 3545 __ ubfx(d, state_regs[1], 32, 32); 3546 3547 Label md5_loop; 3548 __ BIND(md5_loop); 3549 3550 reg_cache.gen_loads(buf); 3551 3552 // Round 1 3553 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3554 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3555 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3556 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3557 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3558 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3559 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3560 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3561 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3562 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3563 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3564 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3565 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3566 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3567 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3568 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3569 3570 // Round 2 3571 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3572 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3573 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3574 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3575 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3576 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3577 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3578 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3579 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3580 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3581 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3582 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3583 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3584 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3585 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3586 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3587 3588 // Round 3 3589 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3590 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3591 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3592 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3593 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3594 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3595 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3596 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3597 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 
3598 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3599 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3600 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3601 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3602 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3603 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3604 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3605 3606 // Round 4 3607 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3608 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3609 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3610 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3611 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3612 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3613 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3614 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3615 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3616 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3617 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3618 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3619 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3620 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3621 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3622 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3623 3624 __ addw(a, state_regs[0], a); 3625 __ ubfx(rscratch2, state_regs[0], 32, 32); 3626 __ addw(b, rscratch2, b); 3627 __ addw(c, state_regs[1], c); 3628 __ ubfx(rscratch4, state_regs[1], 32, 32); 3629 __ addw(d, rscratch4, d); 3630 3631 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3632 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3633 3634 if (multi_block) { 3635 __ add(buf, buf, 64); 3636 __ add(ofs, ofs, 64); 3637 __ cmp(ofs, limit); 3638 __ br(Assembler::LE, md5_loop); 3639 __ mov(c_rarg0, ofs); // return ofs 3640 } 3641 3642 // write hash values back in the correct order 3643 __ stp(state_regs[0], state_regs[1], Address(state)); 3644 3645 __ pop(saved_regs, sp); 3646 3647 __ ret(lr); 3648 3649 return start; 3650 } 3651 3652 // Arguments: 3653 // 3654 // Inputs: 3655 // c_rarg0 - byte[] source+offset 3656 // c_rarg1 - int[] SHA.state 3657 // c_rarg2 - int offset 3658 // c_rarg3 - int limit 3659 // 3660 address generate_sha1_implCompress(bool multi_block, const char *name) { 3661 __ align(CodeEntryAlignment); 3662 StubCodeMark mark(this, "StubRoutines", name); 3663 address start = __ pc(); 3664 3665 Register buf = c_rarg0; 3666 Register state = c_rarg1; 3667 Register ofs = c_rarg2; 3668 Register limit = c_rarg3; 3669 3670 Label keys; 3671 Label sha1_loop; 3672 3673 // load the keys into v0..v3 3674 __ adr(rscratch1, keys); 3675 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3676 // load 5 words state into v6, v7 3677 __ ldrq(v6, Address(state, 0)); 3678 __ ldrs(v7, Address(state, 16)); 3679 3680 3681 __ BIND(sha1_loop); 3682 // load 64 bytes of data into v16..v19 3683 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3684 __ rev32(v16, __ T16B, v16); 3685 __ rev32(v17, __ T16B, v17); 3686 __ rev32(v18, __ T16B, v18); 3687 __ rev32(v19, __ T16B, v19); 3688 3689 // do the sha1 3690 __ addv(v4, __ T4S, v16, v0); 3691 __ orr(v20, __ T16B, v6, v6); 3692 3693 FloatRegister d0 = v16; 3694 FloatRegister d1 = v17; 3695 FloatRegister d2 = v18; 3696 FloatRegister d3 = v19; 3697 3698 for (int round = 0; round < 20; round++) { 3699 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3700 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3701 FloatRegister tmp3 = round ? ((round & 1) ? 
v22 : v21) : v7; 3702 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3703 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3704 3705 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3706 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3707 __ sha1h(tmp2, __ T4S, v20); 3708 if (round < 5) 3709 __ sha1c(v20, __ T4S, tmp3, tmp4); 3710 else if (round < 10 || round >= 15) 3711 __ sha1p(v20, __ T4S, tmp3, tmp4); 3712 else 3713 __ sha1m(v20, __ T4S, tmp3, tmp4); 3714 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3715 3716 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3717 } 3718 3719 __ addv(v7, __ T2S, v7, v21); 3720 __ addv(v6, __ T4S, v6, v20); 3721 3722 if (multi_block) { 3723 __ add(ofs, ofs, 64); 3724 __ cmp(ofs, limit); 3725 __ br(Assembler::LE, sha1_loop); 3726 __ mov(c_rarg0, ofs); // return ofs 3727 } 3728 3729 __ strq(v6, Address(state, 0)); 3730 __ strs(v7, Address(state, 16)); 3731 3732 __ ret(lr); 3733 3734 __ bind(keys); 3735 __ emit_int32(0x5a827999); 3736 __ emit_int32(0x6ed9eba1); 3737 __ emit_int32(0x8f1bbcdc); 3738 __ emit_int32(0xca62c1d6); 3739 3740 return start; 3741 } 3742 3743 3744 // Arguments: 3745 // 3746 // Inputs: 3747 // c_rarg0 - byte[] source+offset 3748 // c_rarg1 - int[] SHA.state 3749 // c_rarg2 - int offset 3750 // c_rarg3 - int limit 3751 // 3752 address generate_sha256_implCompress(bool multi_block, const char *name) { 3753 static const uint32_t round_consts[64] = { 3754 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3755 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3756 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3757 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3758 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3759 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3760 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3761 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3762 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3763 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3764 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3765 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3766 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3767 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3768 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3769 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3770 }; 3771 __ align(CodeEntryAlignment); 3772 StubCodeMark mark(this, "StubRoutines", name); 3773 address start = __ pc(); 3774 3775 Register buf = c_rarg0; 3776 Register state = c_rarg1; 3777 Register ofs = c_rarg2; 3778 Register limit = c_rarg3; 3779 3780 Label sha1_loop; 3781 3782 __ stpd(v8, v9, __ pre(sp, -32)); 3783 __ stpd(v10, v11, Address(sp, 16)); 3784 3785 // dga == v0 3786 // dgb == v1 3787 // dg0 == v2 3788 // dg1 == v3 3789 // dg2 == v4 3790 // t0 == v6 3791 // t1 == v7 3792 3793 // load 16 keys to v16..v31 3794 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3795 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3796 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3797 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3798 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3799 3800 // load 8 words (256 bits) state 3801 __ ldpq(v0, v1, state); 3802 3803 __ BIND(sha1_loop); 3804 // load 64 bytes of data into v8..v11 3805 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3806 __ rev32(v8, __ T16B, v8); 3807 __ rev32(v9, __ T16B, v9); 3808 __ rev32(v10, __ T16B, v10); 3809 __ rev32(v11, __ T16B, v11); 3810 3811 __ addv(v6, __ T4S, v8, v16); 3812 __ orr(v2, __ T16B, v0, v0); 3813 __ orr(v3, __ T16B, v1, v1); 3814 3815 FloatRegister d0 = v8; 3816 FloatRegister d1 = v9; 3817 FloatRegister d2 = v10; 3818 FloatRegister d3 = v11; 3819 3820 3821 for (int round = 0; round < 16; round++) { 3822 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3823 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3824 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3825 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3826 3827 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3828 __ orr(v4, __ T16B, v2, v2); 3829 if (round < 15) 3830 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3831 __ sha256h(v2, __ T4S, v3, tmp2); 3832 __ sha256h2(v3, __ T4S, v4, tmp2); 3833 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3834 3835 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3836 } 3837 3838 __ addv(v0, __ T4S, v0, v2); 3839 __ addv(v1, __ T4S, v1, v3); 3840 3841 if (multi_block) { 3842 __ add(ofs, ofs, 64); 3843 __ cmp(ofs, limit); 3844 __ br(Assembler::LE, sha1_loop); 3845 __ mov(c_rarg0, ofs); // return ofs 3846 } 3847 3848 __ ldpd(v10, v11, Address(sp, 16)); 3849 __ ldpd(v8, v9, __ post(sp, 32)); 3850 3851 __ stpq(v0, v1, state); 3852 3853 __ ret(lr); 3854 3855 return start; 3856 } 3857 3858 // Double rounds for sha512. 3859 void sha512_dround(int dr, 3860 FloatRegister vi0, FloatRegister vi1, 3861 FloatRegister vi2, FloatRegister vi3, 3862 FloatRegister vi4, FloatRegister vrc0, 3863 FloatRegister vrc1, FloatRegister vin0, 3864 FloatRegister vin1, FloatRegister vin2, 3865 FloatRegister vin3, FloatRegister vin4) { 3866 if (dr < 36) { 3867 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3868 } 3869 __ addv(v5, __ T2D, vrc0, vin0); 3870 __ ext(v6, __ T16B, vi2, vi3, 8); 3871 __ ext(v5, __ T16B, v5, v5, 8); 3872 __ ext(v7, __ T16B, vi1, vi2, 8); 3873 __ addv(vi3, __ T2D, vi3, v5); 3874 if (dr < 32) { 3875 __ ext(v5, __ T16B, vin3, vin4, 8); 3876 __ sha512su0(vin0, __ T2D, vin1); 3877 } 3878 __ sha512h(vi3, __ T2D, v6, v7); 3879 if (dr < 32) { 3880 __ sha512su1(vin0, __ T2D, vin2, v5); 3881 } 3882 __ addv(vi4, __ T2D, vi1, vi3); 3883 __ sha512h2(vi3, __ T2D, vi1, vi0); 3884 } 3885 3886 // Arguments: 3887 // 3888 // Inputs: 3889 // c_rarg0 - byte[] source+offset 3890 // c_rarg1 - int[] SHA.state 3891 // c_rarg2 - int offset 3892 // c_rarg3 - int limit 3893 // 3894 address generate_sha512_implCompress(bool multi_block, const char *name) { 3895 static const uint64_t round_consts[80] = { 3896 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3897 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3898 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3899 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3900 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3901 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3902 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3903 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3904 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3905 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3906 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3907 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3908 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3909 0x92722C851482353BL, 
0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3910 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3911 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3912 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3913 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3914 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3915 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3916 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3917 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3918 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3919 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3920 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3921 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3922 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3923 }; 3924 3925 __ align(CodeEntryAlignment); 3926 StubCodeMark mark(this, "StubRoutines", name); 3927 address start = __ pc(); 3928 3929 Register buf = c_rarg0; 3930 Register state = c_rarg1; 3931 Register ofs = c_rarg2; 3932 Register limit = c_rarg3; 3933 3934 __ stpd(v8, v9, __ pre(sp, -64)); 3935 __ stpd(v10, v11, Address(sp, 16)); 3936 __ stpd(v12, v13, Address(sp, 32)); 3937 __ stpd(v14, v15, Address(sp, 48)); 3938 3939 Label sha512_loop; 3940 3941 // load state 3942 __ ld1(v8, v9, v10, v11, __ T2D, state); 3943 3944 // load first 4 round constants 3945 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3946 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3947 3948 __ BIND(sha512_loop); 3949 // load 128B of data into v12..v19 3950 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3951 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3952 __ rev64(v12, __ T16B, v12); 3953 __ rev64(v13, __ T16B, v13); 3954 __ rev64(v14, __ T16B, v14); 3955 __ rev64(v15, __ T16B, v15); 3956 __ rev64(v16, __ T16B, v16); 3957 __ rev64(v17, __ T16B, v17); 3958 __ rev64(v18, __ T16B, v18); 3959 __ rev64(v19, __ T16B, v19); 3960 3961 __ mov(rscratch2, rscratch1); 3962 3963 __ mov(v0, __ T16B, v8); 3964 __ mov(v1, __ T16B, v9); 3965 __ mov(v2, __ T16B, v10); 3966 __ mov(v3, __ T16B, v11); 3967 3968 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3969 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3970 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3971 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3972 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3973 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3974 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3975 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3976 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3977 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3978 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 3979 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3980 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3981 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3982 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3983 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3984 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, 
v13, v19, v16, v17); 3985 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3986 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3987 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3988 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3989 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3990 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3991 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3992 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3993 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3994 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3995 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3996 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3997 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3998 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3999 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4000 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4001 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4002 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4003 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4004 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4005 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4006 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4007 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4008 4009 __ addv(v8, __ T2D, v8, v0); 4010 __ addv(v9, __ T2D, v9, v1); 4011 __ addv(v10, __ T2D, v10, v2); 4012 __ addv(v11, __ T2D, v11, v3); 4013 4014 if (multi_block) { 4015 __ add(ofs, ofs, 128); 4016 __ cmp(ofs, limit); 4017 __ br(Assembler::LE, sha512_loop); 4018 __ mov(c_rarg0, ofs); // return ofs 4019 } 4020 4021 __ st1(v8, v9, v10, v11, __ T2D, state); 4022 4023 __ ldpd(v14, v15, Address(sp, 48)); 4024 __ ldpd(v12, v13, Address(sp, 32)); 4025 __ ldpd(v10, v11, Address(sp, 16)); 4026 __ ldpd(v8, v9, __ post(sp, 64)); 4027 4028 __ ret(lr); 4029 4030 return start; 4031 } 4032 4033 // Arguments: 4034 // 4035 // Inputs: 4036 // c_rarg0 - byte[] source+offset 4037 // c_rarg1 - byte[] SHA.state 4038 // c_rarg2 - int block_size 4039 // c_rarg3 - int offset 4040 // c_rarg4 - int limit 4041 // 4042 address generate_sha3_implCompress(bool multi_block, const char *name) { 4043 static const uint64_t round_consts[24] = { 4044 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4045 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4046 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4047 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4048 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4049 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4050 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4051 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4052 }; 4053 4054 __ align(CodeEntryAlignment); 4055 StubCodeMark mark(this, "StubRoutines", name); 4056 address start = __ pc(); 4057 4058 Register buf = c_rarg0; 4059 Register state = c_rarg1; 4060 Register block_size = c_rarg2; 4061 Register ofs = c_rarg3; 4062 Register 
limit = c_rarg4; 4063 4064 Label sha3_loop, rounds24_loop; 4065 Label sha3_512_or_sha3_384, shake128; 4066 4067 __ stpd(v8, v9, __ pre(sp, -64)); 4068 __ stpd(v10, v11, Address(sp, 16)); 4069 __ stpd(v12, v13, Address(sp, 32)); 4070 __ stpd(v14, v15, Address(sp, 48)); 4071 4072 // load state 4073 __ add(rscratch1, state, 32); 4074 __ ld1(v0, v1, v2, v3, __ T1D, state); 4075 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4076 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4077 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4078 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4079 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4080 __ ld1(v24, __ T1D, rscratch1); 4081 4082 __ BIND(sha3_loop); 4083 4084 // 24 keccak rounds 4085 __ movw(rscratch2, 24); 4086 4087 // load round_constants base 4088 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4089 4090 // load input 4091 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4092 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4093 __ eor(v0, __ T8B, v0, v25); 4094 __ eor(v1, __ T8B, v1, v26); 4095 __ eor(v2, __ T8B, v2, v27); 4096 __ eor(v3, __ T8B, v3, v28); 4097 __ eor(v4, __ T8B, v4, v29); 4098 __ eor(v5, __ T8B, v5, v30); 4099 __ eor(v6, __ T8B, v6, v31); 4100 4101 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4102 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4103 4104 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4105 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4106 __ eor(v7, __ T8B, v7, v25); 4107 __ eor(v8, __ T8B, v8, v26); 4108 __ eor(v9, __ T8B, v9, v27); 4109 __ eor(v10, __ T8B, v10, v28); 4110 __ eor(v11, __ T8B, v11, v29); 4111 __ eor(v12, __ T8B, v12, v30); 4112 __ eor(v13, __ T8B, v13, v31); 4113 4114 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4115 __ eor(v14, __ T8B, v14, v25); 4116 __ eor(v15, __ T8B, v15, v26); 4117 __ eor(v16, __ T8B, v16, v27); 4118 4119 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4120 __ andw(c_rarg5, block_size, 48); 4121 __ cbzw(c_rarg5, rounds24_loop); 4122 4123 __ tbnz(block_size, 5, shake128); 4124 // block_size == 144, bit5 == 0, SHA3-244 4125 __ ldrd(v28, __ post(buf, 8)); 4126 __ eor(v17, __ T8B, v17, v28); 4127 __ b(rounds24_loop); 4128 4129 __ BIND(shake128); 4130 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4131 __ eor(v17, __ T8B, v17, v28); 4132 __ eor(v18, __ T8B, v18, v29); 4133 __ eor(v19, __ T8B, v19, v30); 4134 __ eor(v20, __ T8B, v20, v31); 4135 __ b(rounds24_loop); // block_size == 168, SHAKE128 4136 4137 __ BIND(sha3_512_or_sha3_384); 4138 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4139 __ eor(v7, __ T8B, v7, v25); 4140 __ eor(v8, __ T8B, v8, v26); 4141 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4142 4143 // SHA3-384 4144 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4145 __ eor(v9, __ T8B, v9, v27); 4146 __ eor(v10, __ T8B, v10, v28); 4147 __ eor(v11, __ T8B, v11, v29); 4148 __ eor(v12, __ T8B, v12, v30); 4149 4150 __ BIND(rounds24_loop); 4151 __ subw(rscratch2, rscratch2, 1); 4152 4153 __ eor3(v29, __ T16B, v4, v9, v14); 4154 __ eor3(v26, __ T16B, v1, v6, v11); 4155 __ eor3(v28, __ T16B, v3, v8, v13); 4156 __ eor3(v25, __ T16B, v0, v5, v10); 4157 __ eor3(v27, __ T16B, v2, v7, v12); 4158 __ eor3(v29, __ T16B, v29, v19, v24); 4159 __ eor3(v26, __ T16B, v26, v16, v21); 4160 __ eor3(v28, __ T16B, v28, v18, v23); 4161 __ eor3(v25, __ T16B, v25, v15, v20); 4162 __ eor3(v27, __ T16B, v27, v17, v22); 4163 4164 __ rax1(v30, __ T2D, v29, v26); 
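// RAX1 computes d = n ^ rol64(m, 1); with the column parities C0..C4 held
// in v25..v29, the rax1 above and the four below produce the Keccak theta
// D values (e.g. v30 = C4 ^ rol(C1, 1) = D0).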
4165 __ rax1(v26, __ T2D, v26, v28); 4166 __ rax1(v28, __ T2D, v28, v25); 4167 __ rax1(v25, __ T2D, v25, v27); 4168 __ rax1(v27, __ T2D, v27, v29); 4169 4170 __ eor(v0, __ T16B, v0, v30); 4171 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 4172 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 4173 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 4174 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 4175 __ xar(v22, __ T2D, v14, v28, (64 - 39)); 4176 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 4177 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 4178 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 4179 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 4180 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 4181 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 4182 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 4183 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 4184 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 4185 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 4186 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 4187 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 4188 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 4189 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 4190 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 4191 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 4192 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 4193 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 4194 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 4195 4196 __ bcax(v20, __ T16B, v31, v22, v8); 4197 __ bcax(v21, __ T16B, v8, v23, v22); 4198 __ bcax(v22, __ T16B, v22, v24, v23); 4199 __ bcax(v23, __ T16B, v23, v31, v24); 4200 __ bcax(v24, __ T16B, v24, v8, v31); 4201 4202 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 4203 4204 __ bcax(v17, __ T16B, v25, v19, v3); 4205 __ bcax(v18, __ T16B, v3, v15, v19); 4206 __ bcax(v19, __ T16B, v19, v16, v15); 4207 __ bcax(v15, __ T16B, v15, v25, v16); 4208 __ bcax(v16, __ T16B, v16, v3, v25); 4209 4210 __ bcax(v10, __ T16B, v29, v12, v26); 4211 __ bcax(v11, __ T16B, v26, v13, v12); 4212 __ bcax(v12, __ T16B, v12, v14, v13); 4213 __ bcax(v13, __ T16B, v13, v29, v14); 4214 __ bcax(v14, __ T16B, v14, v26, v29); 4215 4216 __ bcax(v7, __ T16B, v30, v9, v4); 4217 __ bcax(v8, __ T16B, v4, v5, v9); 4218 __ bcax(v9, __ T16B, v9, v6, v5); 4219 __ bcax(v5, __ T16B, v5, v30, v6); 4220 __ bcax(v6, __ T16B, v6, v4, v30); 4221 4222 __ bcax(v3, __ T16B, v27, v0, v28); 4223 __ bcax(v4, __ T16B, v28, v1, v0); 4224 __ bcax(v0, __ T16B, v0, v2, v1); 4225 __ bcax(v1, __ T16B, v1, v27, v2); 4226 __ bcax(v2, __ T16B, v2, v28, v27); 4227 4228 __ eor(v0, __ T16B, v0, v31); 4229 4230 __ cbnzw(rscratch2, rounds24_loop); 4231 4232 if (multi_block) { 4233 __ add(ofs, ofs, block_size); 4234 __ cmp(ofs, limit); 4235 __ br(Assembler::LE, sha3_loop); 4236 __ mov(c_rarg0, ofs); // return ofs 4237 } 4238 4239 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4240 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4241 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4242 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4243 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4244 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4245 __ st1(v24, __ T1D, state); 4246 4247 __ ldpd(v14, v15, Address(sp, 48)); 4248 __ ldpd(v12, v13, Address(sp, 32)); 4249 __ ldpd(v10, v11, Address(sp, 16)); 4250 __ ldpd(v8, v9, __ post(sp, 64)); 4251 4252 __ ret(lr); 4253 4254 return start; 4255 } 4256 4257 /** 4258 * Arguments: 4259 * 4260 * Inputs: 4261 * c_rarg0 - int crc 4262 * c_rarg1 - byte* buf 4263 * c_rarg2 - int length 4264 * 4265 * Output: 4266 * rax - int crc result 4267 */ 4268 address generate_updateBytesCRC32() { 4269 assert(UseCRC32Intrinsics, 
"what are we doing here?"); 4270 4271 __ align(CodeEntryAlignment); 4272 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4273 4274 address start = __ pc(); 4275 4276 const Register crc = c_rarg0; // crc 4277 const Register buf = c_rarg1; // source java byte array address 4278 const Register len = c_rarg2; // length 4279 const Register table0 = c_rarg3; // crc_table address 4280 const Register table1 = c_rarg4; 4281 const Register table2 = c_rarg5; 4282 const Register table3 = c_rarg6; 4283 const Register tmp3 = c_rarg7; 4284 4285 BLOCK_COMMENT("Entry:"); 4286 __ enter(); // required for proper stackwalking of RuntimeStub frame 4287 4288 __ kernel_crc32(crc, buf, len, 4289 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4290 4291 __ leave(); // required for proper stackwalking of RuntimeStub frame 4292 __ ret(lr); 4293 4294 return start; 4295 } 4296 4297 // ChaCha20 block function. This version parallelizes by loading 4298 // individual 32-bit state elements into vectors for four blocks 4299 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4300 // 4301 // state (int[16]) = c_rarg0 4302 // keystream (byte[1024]) = c_rarg1 4303 // return - number of bytes of keystream (always 256) 4304 address generate_chacha20Block_blockpar() { 4305 Label L_twoRounds, L_cc20_const; 4306 // The constant data is broken into two 128-bit segments to be loaded 4307 // onto FloatRegisters. The first 128 bits are a counter add overlay 4308 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4309 // The second 128-bits is a table constant used for 8-bit left rotations. 4310 __ BIND(L_cc20_const); 4311 __ emit_int64(0x0000000100000000UL); 4312 __ emit_int64(0x0000000300000002UL); 4313 __ emit_int64(0x0605040702010003UL); 4314 __ emit_int64(0x0E0D0C0F0A09080BUL); 4315 4316 __ align(CodeEntryAlignment); 4317 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4318 address start = __ pc(); 4319 __ enter(); 4320 4321 int i, j; 4322 const Register state = c_rarg0; 4323 const Register keystream = c_rarg1; 4324 const Register loopCtr = r10; 4325 const Register tmpAddr = r11; 4326 4327 const FloatRegister stateFirst = v0; 4328 const FloatRegister stateSecond = v1; 4329 const FloatRegister stateThird = v2; 4330 const FloatRegister stateFourth = v3; 4331 const FloatRegister origCtrState = v28; 4332 const FloatRegister scratch = v29; 4333 const FloatRegister lrot8Tbl = v30; 4334 4335 // Organize SIMD registers in an array that facilitates 4336 // putting repetitive opcodes into loop structures. It is 4337 // important that each grouping of 4 registers is monotonically 4338 // increasing to support the requirements of multi-register 4339 // instructions (e.g. ld4r, st4, etc.) 4340 const FloatRegister workSt[16] = { 4341 v4, v5, v6, v7, v16, v17, v18, v19, 4342 v20, v21, v22, v23, v24, v25, v26, v27 4343 }; 4344 4345 // Load from memory and interlace across 16 SIMD registers, 4346 // With each word from memory being broadcast to all lanes of 4347 // each successive SIMD register. 4348 // Addr(0) -> All lanes in workSt[i] 4349 // Addr(4) -> All lanes workSt[i + 1], etc. 4350 __ mov(tmpAddr, state); 4351 for (i = 0; i < 16; i += 4) { 4352 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4353 __ post(tmpAddr, 16)); 4354 } 4355 4356 // Pull in constant data. The first 16 bytes are the add overlay 4357 // which is applied to the vector holding the counter (state[12]). 
4358 // The second 16 bytes is the index register for the 8-bit left 4359 // rotation tbl instruction. 4360 __ adr(tmpAddr, L_cc20_const); 4361 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4362 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4363 4364 // Set up the 10 iteration loop and perform all 8 quarter round ops 4365 __ mov(loopCtr, 10); 4366 __ BIND(L_twoRounds); 4367 4368 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4369 scratch, lrot8Tbl); 4370 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4371 scratch, lrot8Tbl); 4372 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4373 scratch, lrot8Tbl); 4374 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4375 scratch, lrot8Tbl); 4376 4377 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4378 scratch, lrot8Tbl); 4379 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4380 scratch, lrot8Tbl); 4381 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4382 scratch, lrot8Tbl); 4383 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4384 scratch, lrot8Tbl); 4385 4386 // Decrement and iterate 4387 __ sub(loopCtr, loopCtr, 1); 4388 __ cbnz(loopCtr, L_twoRounds); 4389 4390 __ mov(tmpAddr, state); 4391 4392 // Add the starting state back to the post-loop keystream 4393 // state. We read/interlace the state array from memory into 4394 // 4 registers similar to what we did in the beginning. Then 4395 // add the counter overlay onto workSt[12] at the end. 4396 for (i = 0; i < 16; i += 4) { 4397 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4398 __ post(tmpAddr, 16)); 4399 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4400 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4401 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4402 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4403 } 4404 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4405 4406 // Write to key stream, storing the same element out of workSt[0..15] 4407 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4408 // for the next element position. 
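// Roughly, the scalar equivalent of the stores below is (illustrative
// pseudocode only, with hypothetical names):
//   for (int lane = 0; lane < 4; lane++)      // one ChaCha20 block per lane
//     for (int word = 0; word < 16; word++)   // state words 0..15
//       store32(keystream + (lane * 16 + word) * 4, workSt[word][lane]);
// so lane 0 of all sixteen registers forms the first 64-byte block of the
// keystream, lane 1 the second block, and so on.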
4409 for (i = 0; i < 4; i++) { 4410 for (j = 0; j < 16; j += 4) { 4411 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4412 __ post(keystream, 16)); 4413 } 4414 } 4415 4416 __ mov(r0, 256); // Return length of output keystream 4417 __ leave(); 4418 __ ret(lr); 4419 4420 return start; 4421 } 4422 4423 /** 4424 * Arguments: 4425 * 4426 * Inputs: 4427 * c_rarg0 - int crc 4428 * c_rarg1 - byte* buf 4429 * c_rarg2 - int length 4430 * c_rarg3 - int* table 4431 * 4432 * Output: 4433 * r0 - int crc result 4434 */ 4435 address generate_updateBytesCRC32C() { 4436 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4437 4438 __ align(CodeEntryAlignment); 4439 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4440 4441 address start = __ pc(); 4442 4443 const Register crc = c_rarg0; // crc 4444 const Register buf = c_rarg1; // source java byte array address 4445 const Register len = c_rarg2; // length 4446 const Register table0 = c_rarg3; // crc_table address 4447 const Register table1 = c_rarg4; 4448 const Register table2 = c_rarg5; 4449 const Register table3 = c_rarg6; 4450 const Register tmp3 = c_rarg7; 4451 4452 BLOCK_COMMENT("Entry:"); 4453 __ enter(); // required for proper stackwalking of RuntimeStub frame 4454 4455 __ kernel_crc32c(crc, buf, len, 4456 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4457 4458 __ leave(); // required for proper stackwalking of RuntimeStub frame 4459 __ ret(lr); 4460 4461 return start; 4462 } 4463 4464 /*** 4465 * Arguments: 4466 * 4467 * Inputs: 4468 * c_rarg0 - int adler 4469 * c_rarg1 - byte* buff 4470 * c_rarg2 - int len 4471 * 4472 * Output: 4473 * c_rarg0 - int adler result 4474 */ 4475 address generate_updateBytesAdler32() { 4476 __ align(CodeEntryAlignment); 4477 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4478 address start = __ pc(); 4479 4480 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4481 4482 // Aliases 4483 Register adler = c_rarg0; 4484 Register s1 = c_rarg0; 4485 Register s2 = c_rarg3; 4486 Register buff = c_rarg1; 4487 Register len = c_rarg2; 4488 Register nmax = r4; 4489 Register base = r5; 4490 Register count = r6; 4491 Register temp0 = rscratch1; 4492 Register temp1 = rscratch2; 4493 FloatRegister vbytes = v0; 4494 FloatRegister vs1acc = v1; 4495 FloatRegister vs2acc = v2; 4496 FloatRegister vtable = v3; 4497 4498 // Max number of bytes we can process before having to take the mod 4499 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4500 uint64_t BASE = 0xfff1; 4501 uint64_t NMAX = 0x15B0; 4502 4503 __ mov(base, BASE); 4504 __ mov(nmax, NMAX); 4505 4506 // Load accumulation coefficients for the upper 16 bits 4507 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4508 __ ld1(vtable, __ T16B, Address(temp0)); 4509 4510 // s1 is initialized to the lower 16 bits of adler 4511 // s2 is initialized to the upper 16 bits of adler 4512 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4513 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4514 4515 // The pipelined loop needs at least 16 elements for 1 iteration 4516 // It does check this, but it is more effective to skip to the cleanup loop 4517 __ cmp(len, (u1)16); 4518 __ br(Assembler::HS, L_nmax); 4519 __ cbz(len, L_combine); 4520 4521 __ bind(L_simple_by1_loop); 4522 __ ldrb(temp0, Address(__ post(buff, 1))); 4523 __ add(s1, s1, temp0); 4524 __ add(s2, s2, s1); 4525 __ subs(len, len, 
1); 4526 __ br(Assembler::HI, L_simple_by1_loop); 4527 4528 // s1 = s1 % BASE 4529 __ subs(temp0, s1, base); 4530 __ csel(s1, temp0, s1, Assembler::HS); 4531 4532 // s2 = s2 % BASE 4533 __ lsr(temp0, s2, 16); 4534 __ lsl(temp1, temp0, 4); 4535 __ sub(temp1, temp1, temp0); 4536 __ add(s2, temp1, s2, ext::uxth); 4537 4538 __ subs(temp0, s2, base); 4539 __ csel(s2, temp0, s2, Assembler::HS); 4540 4541 __ b(L_combine); 4542 4543 __ bind(L_nmax); 4544 __ subs(len, len, nmax); 4545 __ sub(count, nmax, 16); 4546 __ br(Assembler::LO, L_by16); 4547 4548 __ bind(L_nmax_loop); 4549 4550 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4551 vbytes, vs1acc, vs2acc, vtable); 4552 4553 __ subs(count, count, 16); 4554 __ br(Assembler::HS, L_nmax_loop); 4555 4556 // s1 = s1 % BASE 4557 __ lsr(temp0, s1, 16); 4558 __ lsl(temp1, temp0, 4); 4559 __ sub(temp1, temp1, temp0); 4560 __ add(temp1, temp1, s1, ext::uxth); 4561 4562 __ lsr(temp0, temp1, 16); 4563 __ lsl(s1, temp0, 4); 4564 __ sub(s1, s1, temp0); 4565 __ add(s1, s1, temp1, ext:: uxth); 4566 4567 __ subs(temp0, s1, base); 4568 __ csel(s1, temp0, s1, Assembler::HS); 4569 4570 // s2 = s2 % BASE 4571 __ lsr(temp0, s2, 16); 4572 __ lsl(temp1, temp0, 4); 4573 __ sub(temp1, temp1, temp0); 4574 __ add(temp1, temp1, s2, ext::uxth); 4575 4576 __ lsr(temp0, temp1, 16); 4577 __ lsl(s2, temp0, 4); 4578 __ sub(s2, s2, temp0); 4579 __ add(s2, s2, temp1, ext:: uxth); 4580 4581 __ subs(temp0, s2, base); 4582 __ csel(s2, temp0, s2, Assembler::HS); 4583 4584 __ subs(len, len, nmax); 4585 __ sub(count, nmax, 16); 4586 __ br(Assembler::HS, L_nmax_loop); 4587 4588 __ bind(L_by16); 4589 __ adds(len, len, count); 4590 __ br(Assembler::LO, L_by1); 4591 4592 __ bind(L_by16_loop); 4593 4594 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4595 vbytes, vs1acc, vs2acc, vtable); 4596 4597 __ subs(len, len, 16); 4598 __ br(Assembler::HS, L_by16_loop); 4599 4600 __ bind(L_by1); 4601 __ adds(len, len, 15); 4602 __ br(Assembler::LO, L_do_mod); 4603 4604 __ bind(L_by1_loop); 4605 __ ldrb(temp0, Address(__ post(buff, 1))); 4606 __ add(s1, temp0, s1); 4607 __ add(s2, s2, s1); 4608 __ subs(len, len, 1); 4609 __ br(Assembler::HS, L_by1_loop); 4610 4611 __ bind(L_do_mod); 4612 // s1 = s1 % BASE 4613 __ lsr(temp0, s1, 16); 4614 __ lsl(temp1, temp0, 4); 4615 __ sub(temp1, temp1, temp0); 4616 __ add(temp1, temp1, s1, ext::uxth); 4617 4618 __ lsr(temp0, temp1, 16); 4619 __ lsl(s1, temp0, 4); 4620 __ sub(s1, s1, temp0); 4621 __ add(s1, s1, temp1, ext:: uxth); 4622 4623 __ subs(temp0, s1, base); 4624 __ csel(s1, temp0, s1, Assembler::HS); 4625 4626 // s2 = s2 % BASE 4627 __ lsr(temp0, s2, 16); 4628 __ lsl(temp1, temp0, 4); 4629 __ sub(temp1, temp1, temp0); 4630 __ add(temp1, temp1, s2, ext::uxth); 4631 4632 __ lsr(temp0, temp1, 16); 4633 __ lsl(s2, temp0, 4); 4634 __ sub(s2, s2, temp0); 4635 __ add(s2, s2, temp1, ext:: uxth); 4636 4637 __ subs(temp0, s2, base); 4638 __ csel(s2, temp0, s2, Assembler::HS); 4639 4640 // Combine lower bits and higher bits 4641 __ bind(L_combine); 4642 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 4643 4644 __ ret(lr); 4645 4646 return start; 4647 } 4648 4649 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 4650 Register temp0, Register temp1, FloatRegister vbytes, 4651 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 4652 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 
    // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
    //   ...
    //   s1 <- s1 + b16
    //   s2 <- s2 + s1
    // Putting above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b16
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
    //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
    //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
    __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));

    // s2 = s2 + s1 * 16
    __ add(s2, s2, s1, Assembler::LSL, 4);

    // vs1acc = b1 + b2 + b3 + ... + b16
    // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
    __ umullv(vs2acc, __ T8B, vtable, vbytes);
    __ umlalv(vs2acc, __ T16B, vtable, vbytes);
    __ uaddlv(vs1acc, __ T16B, vbytes);
    __ uaddlv(vs2acc, __ T8H, vs2acc);

    // s1 = s1 + vs1acc, s2 = s2 + vs2acc
    __ fmovd(temp0, vs1acc);
    __ fmovd(temp1, vs2acc);
    __ add(s1, s1, temp0);
    __ add(s2, s2, temp1);
  }

  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - y address
   *    c_rarg3   - y length
   *    c_rarg4   - z address
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x     = r0;
    const Register xlen  = r1;
    const Register y     = r2;
    const Register ylen  = r3;
    const Register z     = r4;

    const Register tmp0  = r5;
    const Register tmp1  = r10;
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  address generate_squareToLen() {
    // squareToLen algorithm for sizes 1..127 described in java code works
    // faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len shows a bit better overall results
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "squareToLen");
    address start = __ pc();

    const Register x     = r0;
    const Register xlen  = r1;
    const Register z     = r2;
    const Register y     = r4; // == x
    const Register ylen  = r5; // == xlen

    const Register tmp0  = r3;
    const Register tmp1  = r10;
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    RegSet spilled_regs = RegSet::of(y, ylen);
    BLOCK_COMMENT("Entry:");
    __ enter();
    __ push(spilled_regs, sp);
    __ mov(y, x);
    __ mov(ylen, xlen);
    __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);
    return start;
  }

  address generate_mulAdd() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this,
"StubRoutines", "mulAdd"); 4764 4765 address start = __ pc(); 4766 4767 const Register out = r0; 4768 const Register in = r1; 4769 const Register offset = r2; 4770 const Register len = r3; 4771 const Register k = r4; 4772 4773 BLOCK_COMMENT("Entry:"); 4774 __ enter(); 4775 __ mul_add(out, in, offset, len, k); 4776 __ leave(); 4777 __ ret(lr); 4778 4779 return start; 4780 } 4781 4782 // Arguments: 4783 // 4784 // Input: 4785 // c_rarg0 - newArr address 4786 // c_rarg1 - oldArr address 4787 // c_rarg2 - newIdx 4788 // c_rarg3 - shiftCount 4789 // c_rarg4 - numIter 4790 // 4791 address generate_bigIntegerRightShift() { 4792 __ align(CodeEntryAlignment); 4793 StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); 4794 address start = __ pc(); 4795 4796 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4797 4798 Register newArr = c_rarg0; 4799 Register oldArr = c_rarg1; 4800 Register newIdx = c_rarg2; 4801 Register shiftCount = c_rarg3; 4802 Register numIter = c_rarg4; 4803 Register idx = numIter; 4804 4805 Register newArrCur = rscratch1; 4806 Register shiftRevCount = rscratch2; 4807 Register oldArrCur = r13; 4808 Register oldArrNext = r14; 4809 4810 FloatRegister oldElem0 = v0; 4811 FloatRegister oldElem1 = v1; 4812 FloatRegister newElem = v2; 4813 FloatRegister shiftVCount = v3; 4814 FloatRegister shiftVRevCount = v4; 4815 4816 __ cbz(idx, Exit); 4817 4818 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4819 4820 // left shift count 4821 __ movw(shiftRevCount, 32); 4822 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4823 4824 // numIter too small to allow a 4-words SIMD loop, rolling back 4825 __ cmp(numIter, (u1)4); 4826 __ br(Assembler::LT, ShiftThree); 4827 4828 __ dup(shiftVCount, __ T4S, shiftCount); 4829 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4830 __ negr(shiftVCount, __ T4S, shiftVCount); 4831 4832 __ BIND(ShiftSIMDLoop); 4833 4834 // Calculate the load addresses 4835 __ sub(idx, idx, 4); 4836 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4837 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4838 __ add(oldArrCur, oldArrNext, 4); 4839 4840 // Load 4 words and process 4841 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 4842 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 4843 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4844 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4845 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4846 __ st1(newElem, __ T4S, Address(newArrCur)); 4847 4848 __ cmp(idx, (u1)4); 4849 __ br(Assembler::LT, ShiftTwoLoop); 4850 __ b(ShiftSIMDLoop); 4851 4852 __ BIND(ShiftTwoLoop); 4853 __ cbz(idx, Exit); 4854 __ cmp(idx, (u1)1); 4855 __ br(Assembler::EQ, ShiftOne); 4856 4857 // Calculate the load addresses 4858 __ sub(idx, idx, 2); 4859 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 4860 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 4861 __ add(oldArrCur, oldArrNext, 4); 4862 4863 // Load 2 words and process 4864 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 4865 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 4866 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4867 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4868 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4869 __ st1(newElem, __ T2S, Address(newArrCur)); 4870 __ b(ShiftTwoLoop); 4871 4872 __ BIND(ShiftThree); 4873 __ tbz(idx, 1, ShiftOne); 4874 __ tbz(idx, 0, ShiftTwo); 4875 __ ldrw(r10, Address(oldArr, 12)); 4876 __ ldrw(r11, Address(oldArr, 8)); 4877 __ lsrvw(r10, r10, shiftCount); 4878 __ lslvw(r11, r11, shiftRevCount); 
4879 __ orrw(r12, r10, r11); 4880 __ strw(r12, Address(newArr, 8)); 4881 4882 __ BIND(ShiftTwo); 4883 __ ldrw(r10, Address(oldArr, 8)); 4884 __ ldrw(r11, Address(oldArr, 4)); 4885 __ lsrvw(r10, r10, shiftCount); 4886 __ lslvw(r11, r11, shiftRevCount); 4887 __ orrw(r12, r10, r11); 4888 __ strw(r12, Address(newArr, 4)); 4889 4890 __ BIND(ShiftOne); 4891 __ ldrw(r10, Address(oldArr, 4)); 4892 __ ldrw(r11, Address(oldArr)); 4893 __ lsrvw(r10, r10, shiftCount); 4894 __ lslvw(r11, r11, shiftRevCount); 4895 __ orrw(r12, r10, r11); 4896 __ strw(r12, Address(newArr)); 4897 4898 __ BIND(Exit); 4899 __ ret(lr); 4900 4901 return start; 4902 } 4903 4904 // Arguments: 4905 // 4906 // Input: 4907 // c_rarg0 - newArr address 4908 // c_rarg1 - oldArr address 4909 // c_rarg2 - newIdx 4910 // c_rarg3 - shiftCount 4911 // c_rarg4 - numIter 4912 // 4913 address generate_bigIntegerLeftShift() { 4914 __ align(CodeEntryAlignment); 4915 StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); 4916 address start = __ pc(); 4917 4918 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 4919 4920 Register newArr = c_rarg0; 4921 Register oldArr = c_rarg1; 4922 Register newIdx = c_rarg2; 4923 Register shiftCount = c_rarg3; 4924 Register numIter = c_rarg4; 4925 4926 Register shiftRevCount = rscratch1; 4927 Register oldArrNext = rscratch2; 4928 4929 FloatRegister oldElem0 = v0; 4930 FloatRegister oldElem1 = v1; 4931 FloatRegister newElem = v2; 4932 FloatRegister shiftVCount = v3; 4933 FloatRegister shiftVRevCount = v4; 4934 4935 __ cbz(numIter, Exit); 4936 4937 __ add(oldArrNext, oldArr, 4); 4938 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 4939 4940 // right shift count 4941 __ movw(shiftRevCount, 32); 4942 __ subw(shiftRevCount, shiftRevCount, shiftCount); 4943 4944 // numIter too small to allow a 4-words SIMD loop, rolling back 4945 __ cmp(numIter, (u1)4); 4946 __ br(Assembler::LT, ShiftThree); 4947 4948 __ dup(shiftVCount, __ T4S, shiftCount); 4949 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 4950 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 4951 4952 __ BIND(ShiftSIMDLoop); 4953 4954 // load 4 words and process 4955 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 4956 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 4957 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 4958 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 4959 __ orr(newElem, __ T16B, oldElem0, oldElem1); 4960 __ st1(newElem, __ T4S, __ post(newArr, 16)); 4961 __ sub(numIter, numIter, 4); 4962 4963 __ cmp(numIter, (u1)4); 4964 __ br(Assembler::LT, ShiftTwoLoop); 4965 __ b(ShiftSIMDLoop); 4966 4967 __ BIND(ShiftTwoLoop); 4968 __ cbz(numIter, Exit); 4969 __ cmp(numIter, (u1)1); 4970 __ br(Assembler::EQ, ShiftOne); 4971 4972 // load 2 words and process 4973 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 4974 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 4975 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 4976 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 4977 __ orr(newElem, __ T8B, oldElem0, oldElem1); 4978 __ st1(newElem, __ T2S, __ post(newArr, 8)); 4979 __ sub(numIter, numIter, 2); 4980 __ b(ShiftTwoLoop); 4981 4982 __ BIND(ShiftThree); 4983 __ ldrw(r10, __ post(oldArr, 4)); 4984 __ ldrw(r11, __ post(oldArrNext, 4)); 4985 __ lslvw(r10, r10, shiftCount); 4986 __ lsrvw(r11, r11, shiftRevCount); 4987 __ orrw(r12, r10, r11); 4988 __ strw(r12, __ post(newArr, 4)); 4989 __ tbz(numIter, 1, Exit); 4990 __ tbz(numIter, 0, ShiftOne); 4991 4992 __ BIND(ShiftTwo); 4993 __ ldrw(r10, __ post(oldArr, 
4)); 4994 __ ldrw(r11, __ post(oldArrNext, 4)); 4995 __ lslvw(r10, r10, shiftCount); 4996 __ lsrvw(r11, r11, shiftRevCount); 4997 __ orrw(r12, r10, r11); 4998 __ strw(r12, __ post(newArr, 4)); 4999 5000 __ BIND(ShiftOne); 5001 __ ldrw(r10, Address(oldArr)); 5002 __ ldrw(r11, Address(oldArrNext)); 5003 __ lslvw(r10, r10, shiftCount); 5004 __ lsrvw(r11, r11, shiftRevCount); 5005 __ orrw(r12, r10, r11); 5006 __ strw(r12, Address(newArr)); 5007 5008 __ BIND(Exit); 5009 __ ret(lr); 5010 5011 return start; 5012 } 5013 5014 address generate_count_positives(address &count_positives_long) { 5015 const u1 large_loop_size = 64; 5016 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 5017 int dcache_line = VM_Version::dcache_line_size(); 5018 5019 Register ary1 = r1, len = r2, result = r0; 5020 5021 __ align(CodeEntryAlignment); 5022 5023 StubCodeMark mark(this, "StubRoutines", "count_positives"); 5024 5025 address entry = __ pc(); 5026 5027 __ enter(); 5028 // precondition: a copy of len is already in result 5029 // __ mov(result, len); 5030 5031 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 5032 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 5033 5034 __ cmp(len, (u1)15); 5035 __ br(Assembler::GT, LEN_OVER_15); 5036 // The only case when execution falls into this code is when pointer is near 5037 // the end of memory page and we have to avoid reading next page 5038 __ add(ary1, ary1, len); 5039 __ subs(len, len, 8); 5040 __ br(Assembler::GT, LEN_OVER_8); 5041 __ ldr(rscratch2, Address(ary1, -8)); 5042 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 5043 __ lsrv(rscratch2, rscratch2, rscratch1); 5044 __ tst(rscratch2, UPPER_BIT_MASK); 5045 __ csel(result, zr, result, Assembler::NE); 5046 __ leave(); 5047 __ ret(lr); 5048 __ bind(LEN_OVER_8); 5049 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 5050 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 5051 __ tst(rscratch2, UPPER_BIT_MASK); 5052 __ br(Assembler::NE, RET_NO_POP); 5053 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 5054 __ lsrv(rscratch1, rscratch1, rscratch2); 5055 __ tst(rscratch1, UPPER_BIT_MASK); 5056 __ bind(RET_NO_POP); 5057 __ csel(result, zr, result, Assembler::NE); 5058 __ leave(); 5059 __ ret(lr); 5060 5061 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 5062 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 5063 5064 count_positives_long = __ pc(); // 2nd entry point 5065 5066 __ enter(); 5067 5068 __ bind(LEN_OVER_15); 5069 __ push(spilled_regs, sp); 5070 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 5071 __ cbz(rscratch2, ALIGNED); 5072 __ ldp(tmp6, tmp1, Address(ary1)); 5073 __ mov(tmp5, 16); 5074 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 5075 __ add(ary1, ary1, rscratch1); 5076 __ orr(tmp6, tmp6, tmp1); 5077 __ tst(tmp6, UPPER_BIT_MASK); 5078 __ br(Assembler::NE, RET_ADJUST); 5079 __ sub(len, len, rscratch1); 5080 5081 __ bind(ALIGNED); 5082 __ cmp(len, large_loop_size); 5083 __ br(Assembler::LT, CHECK_16); 5084 // Perform 16-byte load as early return in pre-loop to handle situation 5085 // when initially aligned large array has negative values at starting bytes, 5086 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 5087 // slower. Cases with negative bytes further ahead won't be affected that 5088 // much. 
In fact, it'll be faster due to early loads, less instructions and 5089 // less branches in LARGE_LOOP. 5090 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 5091 __ sub(len, len, 16); 5092 __ orr(tmp6, tmp6, tmp1); 5093 __ tst(tmp6, UPPER_BIT_MASK); 5094 __ br(Assembler::NE, RET_ADJUST_16); 5095 __ cmp(len, large_loop_size); 5096 __ br(Assembler::LT, CHECK_16); 5097 5098 if (SoftwarePrefetchHintDistance >= 0 5099 && SoftwarePrefetchHintDistance >= dcache_line) { 5100 // initial prefetch 5101 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 5102 } 5103 __ bind(LARGE_LOOP); 5104 if (SoftwarePrefetchHintDistance >= 0) { 5105 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 5106 } 5107 // Issue load instructions first, since it can save few CPU/MEM cycles, also 5108 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 5109 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 5110 // instructions per cycle and have less branches, but this approach disables 5111 // early return, thus, all 64 bytes are loaded and checked every time. 5112 __ ldp(tmp2, tmp3, Address(ary1)); 5113 __ ldp(tmp4, tmp5, Address(ary1, 16)); 5114 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 5115 __ ldp(tmp6, tmp1, Address(ary1, 48)); 5116 __ add(ary1, ary1, large_loop_size); 5117 __ sub(len, len, large_loop_size); 5118 __ orr(tmp2, tmp2, tmp3); 5119 __ orr(tmp4, tmp4, tmp5); 5120 __ orr(rscratch1, rscratch1, rscratch2); 5121 __ orr(tmp6, tmp6, tmp1); 5122 __ orr(tmp2, tmp2, tmp4); 5123 __ orr(rscratch1, rscratch1, tmp6); 5124 __ orr(tmp2, tmp2, rscratch1); 5125 __ tst(tmp2, UPPER_BIT_MASK); 5126 __ br(Assembler::NE, RET_ADJUST_LONG); 5127 __ cmp(len, large_loop_size); 5128 __ br(Assembler::GE, LARGE_LOOP); 5129 5130 __ bind(CHECK_16); // small 16-byte load pre-loop 5131 __ cmp(len, (u1)16); 5132 __ br(Assembler::LT, POST_LOOP16); 5133 5134 __ bind(LOOP16); // small 16-byte load loop 5135 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 5136 __ sub(len, len, 16); 5137 __ orr(tmp2, tmp2, tmp3); 5138 __ tst(tmp2, UPPER_BIT_MASK); 5139 __ br(Assembler::NE, RET_ADJUST_16); 5140 __ cmp(len, (u1)16); 5141 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 5142 5143 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 5144 __ cmp(len, (u1)8); 5145 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 5146 __ ldr(tmp3, Address(__ post(ary1, 8))); 5147 __ tst(tmp3, UPPER_BIT_MASK); 5148 __ br(Assembler::NE, RET_ADJUST); 5149 __ sub(len, len, 8); 5150 5151 __ bind(POST_LOOP16_LOAD_TAIL); 5152 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 5153 __ ldr(tmp1, Address(ary1)); 5154 __ mov(tmp2, 64); 5155 __ sub(tmp4, tmp2, len, __ LSL, 3); 5156 __ lslv(tmp1, tmp1, tmp4); 5157 __ tst(tmp1, UPPER_BIT_MASK); 5158 __ br(Assembler::NE, RET_ADJUST); 5159 // Fallthrough 5160 5161 __ bind(RET_LEN); 5162 __ pop(spilled_regs, sp); 5163 __ leave(); 5164 __ ret(lr); 5165 5166 // difference result - len is the count of guaranteed to be 5167 // positive bytes 5168 5169 __ bind(RET_ADJUST_LONG); 5170 __ add(len, len, (u1)(large_loop_size - 16)); 5171 __ bind(RET_ADJUST_16); 5172 __ add(len, len, 16); 5173 __ bind(RET_ADJUST); 5174 __ pop(spilled_regs, sp); 5175 __ leave(); 5176 __ sub(result, result, len); 5177 __ ret(lr); 5178 5179 return entry; 5180 } 5181 5182 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 5183 bool usePrefetch, Label &NOT_EQUAL) { 5184 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5185 tmp2 = 
rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5186 tmp7 = r12, tmp8 = r13; 5187 Label LOOP; 5188 5189 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5190 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5191 __ bind(LOOP); 5192 if (usePrefetch) { 5193 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5194 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5195 } 5196 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5197 __ eor(tmp1, tmp1, tmp2); 5198 __ eor(tmp3, tmp3, tmp4); 5199 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5200 __ orr(tmp1, tmp1, tmp3); 5201 __ cbnz(tmp1, NOT_EQUAL); 5202 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5203 __ eor(tmp5, tmp5, tmp6); 5204 __ eor(tmp7, tmp7, tmp8); 5205 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5206 __ orr(tmp5, tmp5, tmp7); 5207 __ cbnz(tmp5, NOT_EQUAL); 5208 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 5209 __ eor(tmp1, tmp1, tmp2); 5210 __ eor(tmp3, tmp3, tmp4); 5211 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 5212 __ orr(tmp1, tmp1, tmp3); 5213 __ cbnz(tmp1, NOT_EQUAL); 5214 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 5215 __ eor(tmp5, tmp5, tmp6); 5216 __ sub(cnt1, cnt1, 8 * wordSize); 5217 __ eor(tmp7, tmp7, tmp8); 5218 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 5219 // tmp6 is not used. MacroAssembler::subs is used here (rather than 5220 // cmp) because subs allows an unlimited range of immediate operand. 5221 __ subs(tmp6, cnt1, loopThreshold); 5222 __ orr(tmp5, tmp5, tmp7); 5223 __ cbnz(tmp5, NOT_EQUAL); 5224 __ br(__ GE, LOOP); 5225 // post-loop 5226 __ eor(tmp1, tmp1, tmp2); 5227 __ eor(tmp3, tmp3, tmp4); 5228 __ orr(tmp1, tmp1, tmp3); 5229 __ sub(cnt1, cnt1, 2 * wordSize); 5230 __ cbnz(tmp1, NOT_EQUAL); 5231 } 5232 5233 void generate_large_array_equals_loop_simd(int loopThreshold, 5234 bool usePrefetch, Label &NOT_EQUAL) { 5235 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5236 tmp2 = rscratch2; 5237 Label LOOP; 5238 5239 __ bind(LOOP); 5240 if (usePrefetch) { 5241 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 5242 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 5243 } 5244 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 5245 __ sub(cnt1, cnt1, 8 * wordSize); 5246 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 5247 __ subs(tmp1, cnt1, loopThreshold); 5248 __ eor(v0, __ T16B, v0, v4); 5249 __ eor(v1, __ T16B, v1, v5); 5250 __ eor(v2, __ T16B, v2, v6); 5251 __ eor(v3, __ T16B, v3, v7); 5252 __ orr(v0, __ T16B, v0, v1); 5253 __ orr(v1, __ T16B, v2, v3); 5254 __ orr(v0, __ T16B, v0, v1); 5255 __ umov(tmp1, v0, __ D, 0); 5256 __ umov(tmp2, v0, __ D, 1); 5257 __ orr(tmp1, tmp1, tmp2); 5258 __ cbnz(tmp1, NOT_EQUAL); 5259 __ br(__ GE, LOOP); 5260 } 5261 5262 // a1 = r1 - array1 address 5263 // a2 = r2 - array2 address 5264 // result = r0 - return value. Already contains "false" 5265 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5266 // r3-r5 are reserved temporary registers 5267 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 5268 address generate_large_array_equals() { 5269 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5270 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5271 tmp7 = r12, tmp8 = r13; 5272 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5273 SMALL_LOOP, POST_LOOP; 5274 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16;
5275 // calculate the loop threshold at which at least 32 prefetched bytes are actually used
5276 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
5277 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
5278 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
5279 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
5280 tmp5, tmp6, tmp7, tmp8);
5281
5282 __ align(CodeEntryAlignment);
5283
5284 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
5285
5286 address entry = __ pc();
5287 __ enter();
5288 __ sub(cnt1, cnt1, wordSize); // the first 8 bytes were loaded outside of the stub
5289 // also advance pointers to use post-increment instead of pre-increment
5290 __ add(a1, a1, wordSize);
5291 __ add(a2, a2, wordSize);
5292 if (AvoidUnalignedAccesses) {
5293 // Both implementations (SIMD/non-SIMD) use relatively large load
5294 // instructions (ld1/ldp), which have a large penalty (up to 2x execution
5295 // time) on some CPUs when the address is not at least 16-byte aligned.
5296 // Arrays are currently 8-byte aligned, so, if needed, do an additional
5297 // 8-byte load for the 1st address to make it 16-byte aligned.
5298 Label ALIGNED16;
5299 __ tbz(a1, 3, ALIGNED16);
5300 __ ldr(tmp1, Address(__ post(a1, wordSize)));
5301 __ ldr(tmp2, Address(__ post(a2, wordSize)));
5302 __ sub(cnt1, cnt1, wordSize);
5303 __ eor(tmp1, tmp1, tmp2);
5304 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
5305 __ bind(ALIGNED16);
5306 }
5307 if (UseSIMDForArrayEquals) {
5308 if (SoftwarePrefetchHintDistance >= 0) {
5309 __ subs(tmp1, cnt1, prefetchLoopThreshold);
5310 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5311 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
5312 /* prfm = */ true, NOT_EQUAL);
5313 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5314 __ br(__ LT, TAIL);
5315 }
5316 __ bind(NO_PREFETCH_LARGE_LOOP);
5317 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
5318 /* prfm = */ false, NOT_EQUAL);
5319 } else {
5320 __ push(spilled_regs, sp);
5321 if (SoftwarePrefetchHintDistance >= 0) {
5322 __ subs(tmp1, cnt1, prefetchLoopThreshold);
5323 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
5324 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
5325 /* prfm = */ true, NOT_EQUAL);
5326 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
5327 __ br(__ LT, TAIL);
5328 }
5329 __ bind(NO_PREFETCH_LARGE_LOOP);
5330 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
5331 /* prfm = */ false, NOT_EQUAL);
5332 }
5333 __ bind(TAIL);
5334 __ cbz(cnt1, EQUAL);
5335 __ subs(cnt1, cnt1, wordSize);
5336 __ br(__ LE, POST_LOOP);
5337 __ bind(SMALL_LOOP);
5338 __ ldr(tmp1, Address(__ post(a1, wordSize)));
5339 __ ldr(tmp2, Address(__ post(a2, wordSize)));
5340 __ subs(cnt1, cnt1, wordSize);
5341 __ eor(tmp1, tmp1, tmp2);
5342 __ cbnz(tmp1, NOT_EQUAL);
5343 __ br(__ GT, SMALL_LOOP);
5344 __ bind(POST_LOOP);
5345 __ ldr(tmp1, Address(a1, cnt1));
5346 __ ldr(tmp2, Address(a2, cnt1));
5347 __ eor(tmp1, tmp1, tmp2);
5348 __ cbnz(tmp1, NOT_EQUAL);
5349 __ bind(EQUAL);
5350 __ mov(result, true);
5351 __ bind(NOT_EQUAL);
5352 if (!UseSIMDForArrayEquals) {
5353 __ pop(spilled_regs, sp);
5354 }
5355 __ bind(NOT_EQUAL_NO_POP);
5356 __ leave();
5357 __ ret(lr);
5358 return entry;
5359 }
5360
5361 // result = r0 - return value. Contains initial hashcode value on entry.
5362 // ary = r1 - array address 5363 // cnt = r2 - elements count 5364 // Clobbers: v0-v13, rscratch1, rscratch2 5365 address generate_large_arrays_hashcode(BasicType eltype) { 5366 const Register result = r0, ary = r1, cnt = r2; 5367 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 5368 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 5369 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 5370 const FloatRegister vpowm = v13; 5371 5372 ARRAYS_HASHCODE_REGISTERS; 5373 5374 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 5375 5376 unsigned int vf; // vectorization factor 5377 bool multiply_by_halves; 5378 Assembler::SIMD_Arrangement load_arrangement; 5379 switch (eltype) { 5380 case T_BOOLEAN: 5381 case T_BYTE: 5382 load_arrangement = Assembler::T8B; 5383 multiply_by_halves = true; 5384 vf = 8; 5385 break; 5386 case T_CHAR: 5387 case T_SHORT: 5388 load_arrangement = Assembler::T8H; 5389 multiply_by_halves = true; 5390 vf = 8; 5391 break; 5392 case T_INT: 5393 load_arrangement = Assembler::T4S; 5394 multiply_by_halves = false; 5395 vf = 4; 5396 break; 5397 default: 5398 ShouldNotReachHere(); 5399 } 5400 5401 // Unroll factor 5402 const unsigned uf = 4; 5403 5404 // Effective vectorization factor 5405 const unsigned evf = vf * uf; 5406 5407 __ align(CodeEntryAlignment); 5408 5409 const char *mark_name = ""; 5410 switch (eltype) { 5411 case T_BOOLEAN: 5412 mark_name = "_large_arrays_hashcode_boolean"; 5413 break; 5414 case T_BYTE: 5415 mark_name = "_large_arrays_hashcode_byte"; 5416 break; 5417 case T_CHAR: 5418 mark_name = "_large_arrays_hashcode_char"; 5419 break; 5420 case T_SHORT: 5421 mark_name = "_large_arrays_hashcode_short"; 5422 break; 5423 case T_INT: 5424 mark_name = "_large_arrays_hashcode_int"; 5425 break; 5426 default: 5427 mark_name = "_large_arrays_hashcode_incorrect_type"; 5428 __ should_not_reach_here(); 5429 }; 5430 5431 StubCodeMark mark(this, "StubRoutines", mark_name); 5432 5433 address entry = __ pc(); 5434 __ enter(); 5435 5436 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 5437 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 5438 // value shouldn't change throughout both loops. 5439 __ movw(rscratch1, intpow(31U, 3)); 5440 __ mov(vpow, Assembler::S, 0, rscratch1); 5441 __ movw(rscratch1, intpow(31U, 2)); 5442 __ mov(vpow, Assembler::S, 1, rscratch1); 5443 __ movw(rscratch1, intpow(31U, 1)); 5444 __ mov(vpow, Assembler::S, 2, rscratch1); 5445 __ movw(rscratch1, intpow(31U, 0)); 5446 __ mov(vpow, Assembler::S, 3, rscratch1); 5447 5448 __ mov(vmul0, Assembler::T16B, 0); 5449 __ mov(vmul0, Assembler::S, 3, result); 5450 5451 __ andr(rscratch2, cnt, (uf - 1) * vf); 5452 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 5453 5454 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 5455 __ mov(vpowm, Assembler::S, 0, rscratch1); 5456 5457 // SMALL LOOP 5458 __ bind(SMALL_LOOP); 5459 5460 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 5461 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 5462 __ subsw(rscratch2, rscratch2, vf); 5463 5464 if (load_arrangement == Assembler::T8B) { 5465 // Extend 8B to 8H to be able to use vector multiply 5466 // instructions 5467 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 5468 if (is_signed_subword_type(eltype)) { 5469 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5470 } else { 5471 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5472 } 5473 } 5474 5475 switch (load_arrangement) { 5476 case Assembler::T4S: 5477 __ addv(vmul0, load_arrangement, vmul0, vdata0); 5478 break; 5479 case Assembler::T8B: 5480 case Assembler::T8H: 5481 assert(is_subword_type(eltype), "subword type expected"); 5482 if (is_signed_subword_type(eltype)) { 5483 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5484 } else { 5485 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5486 } 5487 break; 5488 default: 5489 __ should_not_reach_here(); 5490 } 5491 5492 // Process the upper half of a vector 5493 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 5494 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 5495 if (is_signed_subword_type(eltype)) { 5496 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5497 } else { 5498 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5499 } 5500 } 5501 5502 __ br(Assembler::HI, SMALL_LOOP); 5503 5504 // SMALL LOOP'S EPILOQUE 5505 __ lsr(rscratch2, cnt, exact_log2(evf)); 5506 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 5507 5508 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 5509 __ addv(vmul0, Assembler::T4S, vmul0); 5510 __ umov(result, vmul0, Assembler::S, 0); 5511 5512 // TAIL 5513 __ bind(TAIL); 5514 5515 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 5516 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 
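// Illustrative sketch only (not generated code): the computed branch below
// behaves like a Duff's-device style remainder loop, roughly
//
//   for (unsigned k = cnt % vf; k != 0; --k) {
//     result = 31 * result + (int)*ary++;
//   }
//
// Each load + maddw pair occupies 8 bytes of code, so the branch target is
// BR_BASE minus (cnt % vf) scaled by 8 (the "ext::uxtw, 3" subtract below).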
5517 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 5518 __ andr(rscratch2, cnt, vf - 1); 5519 __ bind(TAIL_SHORTCUT); 5520 __ adr(rscratch1, BR_BASE); 5521 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 5522 __ movw(rscratch2, 0x1f); 5523 __ br(rscratch1); 5524 5525 for (size_t i = 0; i < vf - 1; ++i) { 5526 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 5527 eltype); 5528 __ maddw(result, result, rscratch2, rscratch1); 5529 } 5530 __ bind(BR_BASE); 5531 5532 __ leave(); 5533 __ ret(lr); 5534 5535 // LARGE LOOP 5536 __ bind(LARGE_LOOP_PREHEADER); 5537 5538 __ lsr(rscratch2, cnt, exact_log2(evf)); 5539 5540 if (multiply_by_halves) { 5541 // 31^4 - multiplier between lower and upper parts of a register 5542 __ movw(rscratch1, intpow(31U, vf / 2)); 5543 __ mov(vpowm, Assembler::S, 1, rscratch1); 5544 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 5545 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 5546 __ mov(vpowm, Assembler::S, 0, rscratch1); 5547 } else { 5548 // 31^16 5549 __ movw(rscratch1, intpow(31U, evf)); 5550 __ mov(vpowm, Assembler::S, 0, rscratch1); 5551 } 5552 5553 __ mov(vmul3, Assembler::T16B, 0); 5554 __ mov(vmul2, Assembler::T16B, 0); 5555 __ mov(vmul1, Assembler::T16B, 0); 5556 5557 __ bind(LARGE_LOOP); 5558 5559 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 5560 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 5561 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 5562 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 5563 5564 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 5565 Address(__ post(ary, evf * type2aelembytes(eltype)))); 5566 5567 if (load_arrangement == Assembler::T8B) { 5568 // Extend 8B to 8H to be able to use vector multiply 5569 // instructions 5570 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 5571 if (is_signed_subword_type(eltype)) { 5572 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 5573 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 5574 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 5575 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5576 } else { 5577 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 5578 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 5579 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 5580 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5581 } 5582 } 5583 5584 switch (load_arrangement) { 5585 case Assembler::T4S: 5586 __ addv(vmul3, load_arrangement, vmul3, vdata3); 5587 __ addv(vmul2, load_arrangement, vmul2, vdata2); 5588 __ addv(vmul1, load_arrangement, vmul1, vdata1); 5589 __ addv(vmul0, load_arrangement, vmul0, vdata0); 5590 break; 5591 case Assembler::T8B: 5592 case Assembler::T8H: 5593 assert(is_subword_type(eltype), "subword type expected"); 5594 if (is_signed_subword_type(eltype)) { 5595 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 5596 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 5597 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 5598 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5599 } else { 5600 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 5601 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 5602 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 5603 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5604 } 5605 break; 5606 default: 5607 __ should_not_reach_here(); 
5608 } 5609 5610 // Process the upper half of a vector 5611 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 5612 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 5613 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 5614 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 5615 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 5616 if (is_signed_subword_type(eltype)) { 5617 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 5618 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 5619 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 5620 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5621 } else { 5622 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 5623 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 5624 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 5625 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5626 } 5627 } 5628 5629 __ subsw(rscratch2, rscratch2, 1); 5630 __ br(Assembler::HI, LARGE_LOOP); 5631 5632 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 5633 __ addv(vmul3, Assembler::T4S, vmul3); 5634 __ umov(result, vmul3, Assembler::S, 0); 5635 5636 __ mov(rscratch2, intpow(31U, vf)); 5637 5638 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 5639 __ addv(vmul2, Assembler::T4S, vmul2); 5640 __ umov(rscratch1, vmul2, Assembler::S, 0); 5641 __ maddw(result, result, rscratch2, rscratch1); 5642 5643 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 5644 __ addv(vmul1, Assembler::T4S, vmul1); 5645 __ umov(rscratch1, vmul1, Assembler::S, 0); 5646 __ maddw(result, result, rscratch2, rscratch1); 5647 5648 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 5649 __ addv(vmul0, Assembler::T4S, vmul0); 5650 __ umov(rscratch1, vmul0, Assembler::S, 0); 5651 __ maddw(result, result, rscratch2, rscratch1); 5652 5653 __ andr(rscratch2, cnt, vf - 1); 5654 __ cbnz(rscratch2, TAIL_SHORTCUT); 5655 5656 __ leave(); 5657 __ ret(lr); 5658 5659 return entry; 5660 } 5661 5662 address generate_dsin_dcos(bool isCos) { 5663 __ align(CodeEntryAlignment); 5664 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 5665 address start = __ pc(); 5666 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 5667 (address)StubRoutines::aarch64::_two_over_pi, 5668 (address)StubRoutines::aarch64::_pio2, 5669 (address)StubRoutines::aarch64::_dsin_coef, 5670 (address)StubRoutines::aarch64::_dcos_coef); 5671 return start; 5672 } 5673 5674 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 5675 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 5676 Label &DIFF2) { 5677 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 5678 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 5679 5680 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 5681 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5682 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 5683 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 5684 5685 __ fmovd(tmpL, vtmp3); 5686 __ eor(rscratch2, tmp3, tmpL); 5687 __ cbnz(rscratch2, DIFF2); 5688 5689 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5690 __ umov(tmpL, vtmp3, __ D, 1); 5691 __ eor(rscratch2, tmpU, tmpL); 5692 __ cbnz(rscratch2, DIFF1); 5693 5694 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 5695 __ ldr(tmpU, Address(__ post(cnt1, 8))); 5696 __ fmovd(tmpL, vtmp); 5697 __ eor(rscratch2, tmp3, tmpL); 5698 __ cbnz(rscratch2, DIFF2); 5699 5700 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5701 __ umov(tmpL, vtmp, __ D, 1); 5702 __ eor(rscratch2, tmpU, tmpL); 5703 __ cbnz(rscratch2, DIFF1); 5704 } 5705 5706 // r0 = result 5707 // r1 = str1 5708 // r2 = cnt1 5709 // r3 = str2 5710 // r4 = cnt2 5711 // r10 = tmp1 5712 // r11 = tmp2 5713 address generate_compare_long_string_different_encoding(bool isLU) { 5714 __ align(CodeEntryAlignment); 5715 StubCodeMark mark(this, "StubRoutines", isLU 5716 ? "compare_long_string_different_encoding LU" 5717 : "compare_long_string_different_encoding UL"); 5718 address entry = __ pc(); 5719 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 5720 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 5721 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 5722 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5723 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 5724 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 5725 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 5726 5727 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 5728 5729 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 5730 // cnt2 == amount of characters left to compare 5731 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 5732 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5733 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 5734 __ add(str2, str2, isLU ? wordSize : wordSize/2); 5735 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 5736 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 5737 __ eor(rscratch2, tmp1, tmp2); 5738 __ mov(rscratch1, tmp2); 5739 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 5740 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 5741 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 5742 __ push(spilled_regs, sp); 5743 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 5744 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 5745 5746 __ ldr(tmp3, Address(__ post(cnt1, 8))); 5747 5748 if (SoftwarePrefetchHintDistance >= 0) { 5749 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5750 __ br(__ LT, NO_PREFETCH); 5751 __ bind(LARGE_LOOP_PREFETCH); 5752 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 5753 __ mov(tmp4, 2); 5754 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5755 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 5756 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5757 __ subs(tmp4, tmp4, 1); 5758 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 5759 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 5760 __ mov(tmp4, 2); 5761 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 5762 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5763 __ subs(tmp4, tmp4, 1); 5764 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 5765 __ sub(cnt2, cnt2, 64); 5766 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 5767 __ br(__ GE, LARGE_LOOP_PREFETCH); 5768 } 5769 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 5770 __ bind(NO_PREFETCH); 5771 __ subs(cnt2, cnt2, 16); 5772 __ br(__ LT, TAIL); 5773 __ align(OptoLoopAlignment); 5774 __ bind(SMALL_LOOP); // smaller loop 5775 __ subs(cnt2, cnt2, 16); 5776 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 5777 __ br(__ GE, SMALL_LOOP); 5778 __ cmn(cnt2, (u1)16); 5779 __ br(__ EQ, LOAD_LAST); 5780 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 5781 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 5782 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 5783 __ ldr(tmp3, Address(cnt1, -8)); 5784 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 5785 __ b(LOAD_LAST); 5786 __ bind(DIFF2); 5787 __ mov(tmpU, tmp3); 5788 __ bind(DIFF1); 5789 __ pop(spilled_regs, sp); 5790 __ b(CALCULATE_DIFFERENCE); 5791 __ bind(LOAD_LAST); 5792 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 5793 // No need to load it again 5794 __ mov(tmpU, tmp3); 5795 __ pop(spilled_regs, sp); 5796 5797 // tmp2 points to the address of the last 4 Latin1 characters right now 5798 __ ldrs(vtmp, Address(tmp2)); 5799 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 5800 __ fmovd(tmpL, vtmp); 5801 5802 __ eor(rscratch2, tmpU, tmpL); 5803 __ cbz(rscratch2, DONE); 5804 5805 // Find the first different characters in the longwords and 5806 // compute their difference. 
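// Illustrative sketch only (not generated code): with x and y the two widened
// 8-byte chunks being compared (tmp1 and rscratch1) and d = x ^ y already in
// rscratch2, the CALCULATE_DIFFERENCE block effectively computes
//
//   int i = clz(byte_reverse(d)) & ~15;   // bit offset of the first differing char
//   return (int)((x >> i) & 0xffff) - (int)((y >> i) & 0xffff);
//
// i.e. it isolates the lowest differing 16-bit character and returns the
// difference of those two characters.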
5807 __ bind(CALCULATE_DIFFERENCE); 5808 __ rev(rscratch2, rscratch2); 5809 __ clz(rscratch2, rscratch2); 5810 __ andr(rscratch2, rscratch2, -16); 5811 __ lsrv(tmp1, tmp1, rscratch2); 5812 __ uxthw(tmp1, tmp1); 5813 __ lsrv(rscratch1, rscratch1, rscratch2); 5814 __ uxthw(rscratch1, rscratch1); 5815 __ subw(result, tmp1, rscratch1); 5816 __ bind(DONE); 5817 __ ret(lr); 5818 return entry; 5819 } 5820 5821 // r0 = input (float16) 5822 // v0 = result (float) 5823 // v1 = temporary float register 5824 address generate_float16ToFloat() { 5825 __ align(CodeEntryAlignment); 5826 StubCodeMark mark(this, "StubRoutines", "float16ToFloat"); 5827 address entry = __ pc(); 5828 BLOCK_COMMENT("Entry:"); 5829 __ flt16_to_flt(v0, r0, v1); 5830 __ ret(lr); 5831 return entry; 5832 } 5833 5834 // v0 = input (float) 5835 // r0 = result (float16) 5836 // v1 = temporary float register 5837 address generate_floatToFloat16() { 5838 __ align(CodeEntryAlignment); 5839 StubCodeMark mark(this, "StubRoutines", "floatToFloat16"); 5840 address entry = __ pc(); 5841 BLOCK_COMMENT("Entry:"); 5842 __ flt_to_flt16(r0, v0, v1); 5843 __ ret(lr); 5844 return entry; 5845 } 5846 5847 address generate_method_entry_barrier() { 5848 __ align(CodeEntryAlignment); 5849 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5850 5851 Label deoptimize_label; 5852 5853 address start = __ pc(); 5854 5855 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 5856 5857 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 5858 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5859 // We can get here despite the nmethod being good, if we have not 5860 // yet applied our cross modification fence (or data fence). 5861 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 5862 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 5863 __ ldrw(rscratch2, rscratch2); 5864 __ strw(rscratch2, thread_epoch_addr); 5865 __ isb(); 5866 __ membar(__ LoadLoad); 5867 } 5868 5869 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 5870 5871 __ enter(); 5872 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 5873 5874 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 5875 5876 __ push_call_clobbered_registers(); 5877 5878 __ mov(c_rarg0, rscratch2); 5879 __ call_VM_leaf 5880 (CAST_FROM_FN_PTR 5881 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 5882 5883 __ reset_last_Java_frame(true); 5884 5885 __ mov(rscratch1, r0); 5886 5887 __ pop_call_clobbered_registers(); 5888 5889 __ cbnz(rscratch1, deoptimize_label); 5890 5891 __ leave(); 5892 __ ret(lr); 5893 5894 __ BIND(deoptimize_label); 5895 5896 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 5897 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 5898 5899 __ mov(sp, rscratch1); 5900 __ br(rscratch2); 5901 5902 return start; 5903 } 5904 5905 // r0 = result 5906 // r1 = str1 5907 // r2 = cnt1 5908 // r3 = str2 5909 // r4 = cnt2 5910 // r10 = tmp1 5911 // r11 = tmp2 5912 address generate_compare_long_string_same_encoding(bool isLL) { 5913 __ align(CodeEntryAlignment); 5914 StubCodeMark mark(this, "StubRoutines", isLL 5915 ? 
"compare_long_string_same_encoding LL" 5916 : "compare_long_string_same_encoding UU"); 5917 address entry = __ pc(); 5918 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 5919 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 5920 5921 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 5922 5923 // exit from large loop when less than 64 bytes left to read or we're about 5924 // to prefetch memory behind array border 5925 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 5926 5927 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 5928 __ eor(rscratch2, tmp1, tmp2); 5929 __ cbnz(rscratch2, CAL_DIFFERENCE); 5930 5931 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 5932 // update pointers, because of previous read 5933 __ add(str1, str1, wordSize); 5934 __ add(str2, str2, wordSize); 5935 if (SoftwarePrefetchHintDistance >= 0) { 5936 __ align(OptoLoopAlignment); 5937 __ bind(LARGE_LOOP_PREFETCH); 5938 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 5939 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 5940 5941 for (int i = 0; i < 4; i++) { 5942 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 5943 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 5944 __ cmp(tmp1, tmp2); 5945 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5946 __ br(Assembler::NE, DIFF); 5947 } 5948 __ sub(cnt2, cnt2, isLL ? 64 : 32); 5949 __ add(str1, str1, 64); 5950 __ add(str2, str2, 64); 5951 __ subs(rscratch2, cnt2, largeLoopExitCondition); 5952 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 5953 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 5954 } 5955 5956 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 5957 __ br(Assembler::LE, LESS16); 5958 __ align(OptoLoopAlignment); 5959 __ bind(LOOP_COMPARE16); 5960 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5961 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5962 __ cmp(tmp1, tmp2); 5963 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5964 __ br(Assembler::NE, DIFF); 5965 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5966 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5967 __ br(Assembler::LT, LESS16); 5968 5969 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 5970 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 5971 __ cmp(tmp1, tmp2); 5972 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 5973 __ br(Assembler::NE, DIFF); 5974 __ sub(cnt2, cnt2, isLL ? 16 : 8); 5975 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 5976 __ br(Assembler::GE, LOOP_COMPARE16); 5977 __ cbz(cnt2, LENGTH_DIFF); 5978 5979 __ bind(LESS16); 5980 // each 8 compare 5981 __ subs(cnt2, cnt2, isLL ? 8 : 4); 5982 __ br(Assembler::LE, LESS8); 5983 __ ldr(tmp1, Address(__ post(str1, 8))); 5984 __ ldr(tmp2, Address(__ post(str2, 8))); 5985 __ eor(rscratch2, tmp1, tmp2); 5986 __ cbnz(rscratch2, CAL_DIFFERENCE); 5987 __ sub(cnt2, cnt2, isLL ? 8 : 4); 5988 5989 __ bind(LESS8); // directly load last 8 bytes 5990 if (!isLL) { 5991 __ add(cnt2, cnt2, cnt2); 5992 } 5993 __ ldr(tmp1, Address(str1, cnt2)); 5994 __ ldr(tmp2, Address(str2, cnt2)); 5995 __ eor(rscratch2, tmp1, tmp2); 5996 __ cbz(rscratch2, LENGTH_DIFF); 5997 __ b(CAL_DIFFERENCE); 5998 5999 __ bind(DIFF); 6000 __ cmp(tmp1, tmp2); 6001 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 6002 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 6003 // reuse rscratch2 register for the result of eor instruction 6004 __ eor(rscratch2, tmp1, tmp2); 6005 6006 __ bind(CAL_DIFFERENCE); 6007 __ rev(rscratch2, rscratch2); 6008 __ clz(rscratch2, rscratch2); 6009 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 6010 __ lsrv(tmp1, tmp1, rscratch2); 6011 __ lsrv(tmp2, tmp2, rscratch2); 6012 if (isLL) { 6013 __ uxtbw(tmp1, tmp1); 6014 __ uxtbw(tmp2, tmp2); 6015 } else { 6016 __ uxthw(tmp1, tmp1); 6017 __ uxthw(tmp2, tmp2); 6018 } 6019 __ subw(result, tmp1, tmp2); 6020 6021 __ bind(LENGTH_DIFF); 6022 __ ret(lr); 6023 return entry; 6024 } 6025 6026 enum string_compare_mode { 6027 LL, 6028 LU, 6029 UL, 6030 UU, 6031 }; 6032 6033 // The following registers are declared in aarch64.ad 6034 // r0 = result 6035 // r1 = str1 6036 // r2 = cnt1 6037 // r3 = str2 6038 // r4 = cnt2 6039 // r10 = tmp1 6040 // r11 = tmp2 6041 // z0 = ztmp1 6042 // z1 = ztmp2 6043 // p0 = pgtmp1 6044 // p1 = pgtmp2 6045 address generate_compare_long_string_sve(string_compare_mode mode) { 6046 __ align(CodeEntryAlignment); 6047 address entry = __ pc(); 6048 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6049 tmp1 = r10, tmp2 = r11; 6050 6051 Label LOOP, DONE, MISMATCH; 6052 Register vec_len = tmp1; 6053 Register idx = tmp2; 6054 // The minimum of the string lengths has been stored in cnt2. 6055 Register cnt = cnt2; 6056 FloatRegister ztmp1 = z0, ztmp2 = z1; 6057 PRegister pgtmp1 = p0, pgtmp2 = p1; 6058 6059 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 6060 switch (mode) { \ 6061 case LL: \ 6062 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 6063 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 6064 break; \ 6065 case LU: \ 6066 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 6067 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 6068 break; \ 6069 case UL: \ 6070 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 6071 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 6072 break; \ 6073 case UU: \ 6074 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 6075 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 6076 break; \ 6077 default: \ 6078 ShouldNotReachHere(); \ 6079 } 6080 6081 const char* stubname; 6082 switch (mode) { 6083 case LL: stubname = "compare_long_string_same_encoding LL"; break; 6084 case LU: stubname = "compare_long_string_different_encoding LU"; break; 6085 case UL: stubname = "compare_long_string_different_encoding UL"; break; 6086 case UU: stubname = "compare_long_string_same_encoding UU"; break; 6087 default: ShouldNotReachHere(); 6088 } 6089 6090 StubCodeMark mark(this, "StubRoutines", stubname); 6091 6092 __ mov(idx, 0); 6093 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 6094 6095 if (mode == LL) { 6096 __ sve_cntb(vec_len); 6097 } else { 6098 __ sve_cnth(vec_len); 6099 } 6100 6101 __ sub(rscratch1, cnt, vec_len); 6102 6103 __ bind(LOOP); 6104 6105 // main loop 6106 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 6107 __ add(idx, idx, vec_len); 6108 // Compare strings. 6109 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 6110 __ br(__ NE, MISMATCH); 6111 __ cmp(idx, rscratch1); 6112 __ br(__ LT, LOOP); 6113 6114 // post loop, last iteration 6115 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 6116 6117 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 6118 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 6119 __ br(__ EQ, DONE); 6120 6121 __ bind(MISMATCH); 6122 6123 // Crop the vector to find its location. 6124 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 6125 // Extract the first different characters of each string. 
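// Note (illustrative, not generated code): brkb leaves pgtmp2 covering exactly
// the lanes before the first mismatch, and lasta then reads the element just
// past the last active lane, i.e. the first differing character of each
// string. Conceptually, the mismatch handling is equivalent to
//
//   for (int i = 0; i < min_len; i++)
//     if (s1[i] != s2[i]) return s1[i] - s2[i];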
6126 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
6127 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
6128
6129 // Compute the difference of the first different characters.
6130 __ sub(result, rscratch1, rscratch2);
6131
6132 __ bind(DONE);
6133 __ ret(lr);
6134 #undef LOAD_PAIR
6135 return entry;
6136 }
6137
6138 void generate_compare_long_strings() {
6139 if (UseSVE == 0) {
6140 StubRoutines::aarch64::_compare_long_string_LL
6141 = generate_compare_long_string_same_encoding(true);
6142 StubRoutines::aarch64::_compare_long_string_UU
6143 = generate_compare_long_string_same_encoding(false);
6144 StubRoutines::aarch64::_compare_long_string_LU
6145 = generate_compare_long_string_different_encoding(true);
6146 StubRoutines::aarch64::_compare_long_string_UL
6147 = generate_compare_long_string_different_encoding(false);
6148 } else {
6149 StubRoutines::aarch64::_compare_long_string_LL
6150 = generate_compare_long_string_sve(LL);
6151 StubRoutines::aarch64::_compare_long_string_UU
6152 = generate_compare_long_string_sve(UU);
6153 StubRoutines::aarch64::_compare_long_string_LU
6154 = generate_compare_long_string_sve(LU);
6155 StubRoutines::aarch64::_compare_long_string_UL
6156 = generate_compare_long_string_sve(UL);
6157 }
6158 }
6159
6160 // R0 = result
6161 // R1 = str2
6162 // R2 = cnt1
6163 // R3 = str1
6164 // R4 = cnt2
6165 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
6166 //
6167 // This generic linear code uses a few additional ideas that make it faster:
6168 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
6169 // in order to skip the initial load (helps on systems with a single load pipeline)
6170 // 2) we can use a "fast" algorithm for finding a single character, searching for the
6171 // first symbol with fewer branches (1 branch per loaded register instead of
6172 // a branch per symbol); this is where constants like
6173 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
6174 // 3) after loading and analyzing the 1st register of the source string, it can be
6175 // used to search for every occurrence of the 1st character, saving a few loads
6176 // compared with a "simpler-but-slower" implementation
6177 // 4) in order to avoid lots of push/pop operations, the code below heavily
6178 // re-uses/re-initializes/compresses register values, which makes the code
6179 // larger and a bit less readable; however, most of the extra operations are
6180 // issued during loads or branches, so the penalty is minimal
6181 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
6182 const char* stubName = str1_isL
6183 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
6184 : "indexof_linear_uu";
6185 __ align(CodeEntryAlignment);
6186 StubCodeMark mark(this, "StubRoutines", stubName);
6187 address entry = __ pc();
6188
6189 int str1_chr_size = str1_isL ? 1 : 2;
6190 int str2_chr_size = str2_isL ? 1 : 2;
6191 int str1_chr_shift = str1_isL ? 0 : 1;
6192 int str2_chr_shift = str2_isL ?
0 : 1; 6193 bool isL = str1_isL && str2_isL; 6194 // parameters 6195 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 6196 // temporary registers 6197 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 6198 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 6199 // redefinitions 6200 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 6201 6202 __ push(spilled_regs, sp); 6203 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 6204 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 6205 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 6206 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 6207 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 6208 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 6209 // Read whole register from str1. It is safe, because length >=8 here 6210 __ ldr(ch1, Address(str1)); 6211 // Read whole register from str2. It is safe, because length >=8 here 6212 __ ldr(ch2, Address(str2)); 6213 __ sub(cnt2, cnt2, cnt1); 6214 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 6215 if (str1_isL != str2_isL) { 6216 __ eor(v0, __ T16B, v0, v0); 6217 } 6218 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 6219 __ mul(first, first, tmp1); 6220 // check if we have less than 1 register to check 6221 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 6222 if (str1_isL != str2_isL) { 6223 __ fmovd(v1, ch1); 6224 } 6225 __ br(__ LE, L_SMALL); 6226 __ eor(ch2, first, ch2); 6227 if (str1_isL != str2_isL) { 6228 __ zip1(v1, __ T16B, v1, v0); 6229 } 6230 __ sub(tmp2, ch2, tmp1); 6231 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6232 __ bics(tmp2, tmp2, ch2); 6233 if (str1_isL != str2_isL) { 6234 __ fmovd(ch1, v1); 6235 } 6236 __ br(__ NE, L_HAS_ZERO); 6237 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 6238 __ add(result, result, wordSize/str2_chr_size); 6239 __ add(str2, str2, wordSize); 6240 __ br(__ LT, L_POST_LOOP); 6241 __ BIND(L_LOOP); 6242 __ ldr(ch2, Address(str2)); 6243 __ eor(ch2, first, ch2); 6244 __ sub(tmp2, ch2, tmp1); 6245 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6246 __ bics(tmp2, tmp2, ch2); 6247 __ br(__ NE, L_HAS_ZERO); 6248 __ BIND(L_LOOP_PROCEED); 6249 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 6250 __ add(str2, str2, wordSize); 6251 __ add(result, result, wordSize/str2_chr_size); 6252 __ br(__ GE, L_LOOP); 6253 __ BIND(L_POST_LOOP); 6254 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 6255 __ br(__ LE, NOMATCH); 6256 __ ldr(ch2, Address(str2)); 6257 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 6258 __ eor(ch2, first, ch2); 6259 __ sub(tmp2, ch2, tmp1); 6260 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6261 __ mov(tmp4, -1); // all bits set 6262 __ b(L_SMALL_PROCEED); 6263 __ align(OptoLoopAlignment); 6264 __ BIND(L_SMALL); 6265 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 6266 __ eor(ch2, first, ch2); 6267 if (str1_isL != str2_isL) { 6268 __ zip1(v1, __ T16B, v1, v0); 6269 } 6270 __ sub(tmp2, ch2, tmp1); 6271 __ mov(tmp4, -1); // all bits set 6272 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6273 if (str1_isL != str2_isL) { 6274 __ fmovd(ch1, v1); // move converted 4 symbols 6275 } 6276 __ BIND(L_SMALL_PROCEED); 6277 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
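// Illustrative note (not generated code): together with the earlier
//   tmp2 = ch2 - 0x0101...01;  ch2 |= 0x7f7f...7f;
// the bic/ands below complete the classic SWAR zero-byte test
//
//   matches = (v - 0x0101...01) & ~(v | 0x7f7f...7f) & lane_mask;
//
// where v is the str2 chunk XOR'ed with the replicated first pattern character,
// so a lane's top bit is set exactly where str2 matches the first pattern
// character. (For UTF-16, the 16-bit constants 0x0001... / 0x7fff... are used.)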
6278 __ bic(tmp2, tmp2, ch2); 6279 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 6280 __ rbit(tmp2, tmp2); 6281 __ br(__ EQ, NOMATCH); 6282 __ BIND(L_SMALL_HAS_ZERO_LOOP); 6283 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 6284 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 6285 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 6286 if (str2_isL) { // LL 6287 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 6288 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 6289 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 6290 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6291 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6292 } else { 6293 __ mov(ch2, 0xE); // all bits in byte set except last one 6294 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6295 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6296 __ lslv(tmp2, tmp2, tmp4); 6297 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6298 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6299 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6300 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6301 } 6302 __ cmp(ch1, ch2); 6303 __ mov(tmp4, wordSize/str2_chr_size); 6304 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6305 __ BIND(L_SMALL_CMP_LOOP); 6306 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6307 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6308 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6309 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6310 __ add(tmp4, tmp4, 1); 6311 __ cmp(tmp4, cnt1); 6312 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 6313 __ cmp(first, ch2); 6314 __ br(__ EQ, L_SMALL_CMP_LOOP); 6315 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 6316 __ cbz(tmp2, NOMATCH); // no more matches. exit 6317 __ clz(tmp4, tmp2); 6318 __ add(result, result, 1); // advance index 6319 __ add(str2, str2, str2_chr_size); // advance pointer 6320 __ b(L_SMALL_HAS_ZERO_LOOP); 6321 __ align(OptoLoopAlignment); 6322 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 6323 __ cmp(first, ch2); 6324 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6325 __ b(DONE); 6326 __ align(OptoLoopAlignment); 6327 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 6328 if (str2_isL) { // LL 6329 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 6330 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 6331 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 6332 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6333 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6334 } else { 6335 __ mov(ch2, 0xE); // all bits in byte set except last one 6336 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6337 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6338 __ lslv(tmp2, tmp2, tmp4); 6339 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6340 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6341 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6342 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6343 } 6344 __ cmp(ch1, ch2); 6345 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6346 __ b(DONE); 6347 __ align(OptoLoopAlignment); 6348 __ BIND(L_HAS_ZERO); 6349 __ rbit(tmp2, tmp2); 6350 __ clz(tmp4, tmp2); // potentially long. 
Up to 4 cycles on some CPU's 6351 // Now, perform compression of counters(cnt2 and cnt1) into one register. 6352 // It's fine because both counters are 32bit and are not changed in this 6353 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 6354 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 6355 __ sub(result, result, 1); 6356 __ BIND(L_HAS_ZERO_LOOP); 6357 __ mov(cnt1, wordSize/str2_chr_size); 6358 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6359 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 6360 if (str2_isL) { 6361 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6362 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6363 __ lslv(tmp2, tmp2, tmp4); 6364 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6365 __ add(tmp4, tmp4, 1); 6366 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6367 __ lsl(tmp2, tmp2, 1); 6368 __ mov(tmp4, wordSize/str2_chr_size); 6369 } else { 6370 __ mov(ch2, 0xE); 6371 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6372 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6373 __ lslv(tmp2, tmp2, tmp4); 6374 __ add(tmp4, tmp4, 1); 6375 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6376 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6377 __ lsl(tmp2, tmp2, 1); 6378 __ mov(tmp4, wordSize/str2_chr_size); 6379 __ sub(str2, str2, str2_chr_size); 6380 } 6381 __ cmp(ch1, ch2); 6382 __ mov(tmp4, wordSize/str2_chr_size); 6383 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6384 __ BIND(L_CMP_LOOP); 6385 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6386 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6387 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6388 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6389 __ add(tmp4, tmp4, 1); 6390 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6391 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 6392 __ cmp(cnt1, ch2); 6393 __ br(__ EQ, L_CMP_LOOP); 6394 __ BIND(L_CMP_LOOP_NOMATCH); 6395 // here we're not matched 6396 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 6397 __ clz(tmp4, tmp2); 6398 __ add(str2, str2, str2_chr_size); // advance pointer 6399 __ b(L_HAS_ZERO_LOOP); 6400 __ align(OptoLoopAlignment); 6401 __ BIND(L_CMP_LOOP_LAST_CMP); 6402 __ cmp(cnt1, ch2); 6403 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6404 __ b(DONE); 6405 __ align(OptoLoopAlignment); 6406 __ BIND(L_CMP_LOOP_LAST_CMP2); 6407 if (str2_isL) { 6408 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6409 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6410 __ lslv(tmp2, tmp2, tmp4); 6411 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6412 __ add(tmp4, tmp4, 1); 6413 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6414 __ lsl(tmp2, tmp2, 1); 6415 } else { 6416 __ mov(ch2, 0xE); 6417 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6418 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
6419 __ lslv(tmp2, tmp2, tmp4); 6420 __ add(tmp4, tmp4, 1); 6421 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6422 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6423 __ lsl(tmp2, tmp2, 1); 6424 __ sub(str2, str2, str2_chr_size); 6425 } 6426 __ cmp(ch1, ch2); 6427 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6428 __ b(DONE); 6429 __ align(OptoLoopAlignment); 6430 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 6431 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 6432 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 6433 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 6434 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 6435 // result by analyzed characters value, so, we can just reset lower bits 6436 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 6437 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 6438 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 6439 // index of last analyzed substring inside current octet. So, str2 in at 6440 // respective start address. We need to advance it to next octet 6441 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 6442 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 6443 __ bfm(result, zr, 0, 2 - str2_chr_shift); 6444 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 6445 __ movw(cnt2, cnt2); 6446 __ b(L_LOOP_PROCEED); 6447 __ align(OptoLoopAlignment); 6448 __ BIND(NOMATCH); 6449 __ mov(result, -1); 6450 __ BIND(DONE); 6451 __ pop(spilled_regs, sp); 6452 __ ret(lr); 6453 return entry; 6454 } 6455 6456 void generate_string_indexof_stubs() { 6457 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 6458 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 6459 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 6460 } 6461 6462 void inflate_and_store_2_fp_registers(bool generatePrfm, 6463 FloatRegister src1, FloatRegister src2) { 6464 Register dst = r1; 6465 __ zip1(v1, __ T16B, src1, v0); 6466 __ zip2(v2, __ T16B, src1, v0); 6467 if (generatePrfm) { 6468 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 6469 } 6470 __ zip1(v3, __ T16B, src2, v0); 6471 __ zip2(v4, __ T16B, src2, v0); 6472 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 6473 } 6474 6475 // R0 = src 6476 // R1 = dst 6477 // R2 = len 6478 // R3 = len >> 3 6479 // V0 = 0 6480 // v1 = loaded 8 bytes 6481 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 6482 address generate_large_byte_array_inflate() { 6483 __ align(CodeEntryAlignment); 6484 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 6485 address entry = __ pc(); 6486 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 6487 Register src = r0, dst = r1, len = r2, octetCounter = r3; 6488 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 6489 6490 // do one more 8-byte read to have address 16-byte aligned in most cases 6491 // also use single store instruction 6492 __ ldrd(v2, __ post(src, 8)); 6493 __ sub(octetCounter, octetCounter, 2); 6494 __ zip1(v1, __ T16B, v1, v0); 6495 __ zip1(v2, __ T16B, v2, v0); 6496 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 6497 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6498 __ subs(rscratch1, octetCounter, large_loop_threshold); 6499 __ br(__ LE, LOOP_START); 6500 __ b(LOOP_PRFM_START); 6501 __ bind(LOOP_PRFM); 
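// Illustrative note (not generated code): each iteration of this prefetching
// loop inflates 64 Latin-1 bytes into 64 UTF-16 chars, roughly
//
//   for (int i = 0; i < 64; i++) { dst[i] = (jchar)(src[i] & 0xff); }
//
// by interleaving the source bytes with the all-zero v0 (zip1/zip2) and
// prefetching SoftwarePrefetchHintDistance bytes ahead.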
6502 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6503 __ bind(LOOP_PRFM_START); 6504 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 6505 __ sub(octetCounter, octetCounter, 8); 6506 __ subs(rscratch1, octetCounter, large_loop_threshold); 6507 inflate_and_store_2_fp_registers(true, v3, v4); 6508 inflate_and_store_2_fp_registers(true, v5, v6); 6509 __ br(__ GT, LOOP_PRFM); 6510 __ cmp(octetCounter, (u1)8); 6511 __ br(__ LT, DONE); 6512 __ bind(LOOP); 6513 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 6514 __ bind(LOOP_START); 6515 __ sub(octetCounter, octetCounter, 8); 6516 __ cmp(octetCounter, (u1)8); 6517 inflate_and_store_2_fp_registers(false, v3, v4); 6518 inflate_and_store_2_fp_registers(false, v5, v6); 6519 __ br(__ GE, LOOP); 6520 __ bind(DONE); 6521 __ ret(lr); 6522 return entry; 6523 } 6524 6525 /** 6526 * Arguments: 6527 * 6528 * Input: 6529 * c_rarg0 - current state address 6530 * c_rarg1 - H key address 6531 * c_rarg2 - data address 6532 * c_rarg3 - number of blocks 6533 * 6534 * Output: 6535 * Updated state at c_rarg0 6536 */ 6537 address generate_ghash_processBlocks() { 6538 // Bafflingly, GCM uses little-endian for the byte order, but 6539 // big-endian for the bit order. For example, the polynomial 1 is 6540 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 6541 // 6542 // So, we must either reverse the bytes in each word and do 6543 // everything big-endian or reverse the bits in each byte and do 6544 // it little-endian. On AArch64 it's more idiomatic to reverse 6545 // the bits in each byte (we have an instruction, RBIT, to do 6546 // that) and keep the data in little-endian bit order through the 6547 // calculation, bit-reversing the inputs and outputs. 6548 6549 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 6550 __ align(wordSize * 2); 6551 address p = __ pc(); 6552 __ emit_int64(0x87); // The low-order bits of the field 6553 // polynomial (i.e. 
p = z^7+z^2+z+1) 6554 // repeated in the low and high parts of a 6555 // 128-bit vector 6556 __ emit_int64(0x87); 6557 6558 __ align(CodeEntryAlignment); 6559 address start = __ pc(); 6560 6561 Register state = c_rarg0; 6562 Register subkeyH = c_rarg1; 6563 Register data = c_rarg2; 6564 Register blocks = c_rarg3; 6565 6566 FloatRegister vzr = v30; 6567 __ eor(vzr, __ T16B, vzr, vzr); // zero register 6568 6569 __ ldrq(v24, p); // The field polynomial 6570 6571 __ ldrq(v0, Address(state)); 6572 __ ldrq(v1, Address(subkeyH)); 6573 6574 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 6575 __ rbit(v0, __ T16B, v0); 6576 __ rev64(v1, __ T16B, v1); 6577 __ rbit(v1, __ T16B, v1); 6578 6579 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 6580 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 6581 6582 { 6583 Label L_ghash_loop; 6584 __ bind(L_ghash_loop); 6585 6586 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 6587 // reversing each byte 6588 __ rbit(v2, __ T16B, v2); 6589 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 6590 6591 // Multiply state in v2 by subkey in v1 6592 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 6593 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 6594 /*temps*/v6, v3, /*reuse/clobber b*/v2); 6595 // Reduce v7:v5 by the field polynomial 6596 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 6597 6598 __ sub(blocks, blocks, 1); 6599 __ cbnz(blocks, L_ghash_loop); 6600 } 6601 6602 // The bit-reversed result is at this point in v0 6603 __ rev64(v0, __ T16B, v0); 6604 __ rbit(v0, __ T16B, v0); 6605 6606 __ st1(v0, __ T16B, state); 6607 __ ret(lr); 6608 6609 return start; 6610 } 6611 6612 address generate_ghash_processBlocks_wide() { 6613 address small = generate_ghash_processBlocks(); 6614 6615 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); 6616 __ align(wordSize * 2); 6617 address p = __ pc(); 6618 __ emit_int64(0x87); // The low-order bits of the field 6619 // polynomial (i.e. p = z^7+z^2+z+1) 6620 // repeated in the low and high parts of a 6621 // 128-bit vector 6622 __ emit_int64(0x87); 6623 6624 __ align(CodeEntryAlignment); 6625 address start = __ pc(); 6626 6627 Register state = c_rarg0; 6628 Register subkeyH = c_rarg1; 6629 Register data = c_rarg2; 6630 Register blocks = c_rarg3; 6631 6632 const int unroll = 4; 6633 6634 __ cmp(blocks, (unsigned char)(unroll * 2)); 6635 __ br(__ LT, small); 6636 6637 if (unroll > 1) { 6638 // Save state before entering routine 6639 __ sub(sp, sp, 4 * 16); 6640 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 6641 __ sub(sp, sp, 4 * 16); 6642 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 6643 } 6644 6645 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 6646 6647 if (unroll > 1) { 6648 // And restore state 6649 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 6650 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 6651 } 6652 6653 __ cmp(blocks, (unsigned char)0); 6654 __ br(__ GT, small); 6655 6656 __ ret(lr); 6657 6658 return start; 6659 } 6660 6661 void generate_base64_encode_simdround(Register src, Register dst, 6662 FloatRegister codec, u8 size) { 6663 6664 FloatRegister in0 = v4, in1 = v5, in2 = v6; 6665 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 6666 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 6667 6668 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 6669 6670 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 6671 6672 __ ushr(ind0, arrangement, in0, 2); 6673 6674 __ ushr(ind1, arrangement, in1, 2); 6675 __ shl(in0, arrangement, in0, 6); 6676 __ orr(ind1, arrangement, ind1, in0); 6677 __ ushr(ind1, arrangement, ind1, 2); 6678 6679 __ ushr(ind2, arrangement, in2, 4); 6680 __ shl(in1, arrangement, in1, 4); 6681 __ orr(ind2, arrangement, in1, ind2); 6682 __ ushr(ind2, arrangement, ind2, 2); 6683 6684 __ shl(ind3, arrangement, in2, 2); 6685 __ ushr(ind3, arrangement, ind3, 2); 6686 6687 __ tbl(out0, arrangement, codec, 4, ind0); 6688 __ tbl(out1, arrangement, codec, 4, ind1); 6689 __ tbl(out2, arrangement, codec, 4, ind2); 6690 __ tbl(out3, arrangement, codec, 4, ind3); 6691 6692 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 6693 } 6694 6695 /** 6696 * Arguments: 6697 * 6698 * Input: 6699 * c_rarg0 - src_start 6700 * c_rarg1 - src_offset 6701 * c_rarg2 - src_length 6702 * c_rarg3 - dest_start 6703 * c_rarg4 - dest_offset 6704 * c_rarg5 - isURL 6705 * 6706 */ 6707 address generate_base64_encodeBlock() { 6708 6709 static const char toBase64[64] = { 6710 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6711 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6712 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6713 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6714 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 6715 }; 6716 6717 static const char toBase64URL[64] = { 6718 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 6719 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 6720 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 6721 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 6722 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 6723 }; 6724 6725 __ align(CodeEntryAlignment); 6726 StubCodeMark mark(this, "StubRoutines", "encodeBlock"); 6727 address start = __ pc(); 6728 6729 Register src = c_rarg0; // source array 6730 Register soff = c_rarg1; // source start offset 6731 Register send = c_rarg2; // source end offset 6732 Register dst = c_rarg3; // dest array 6733 Register doff = c_rarg4; // position for writing to dest array 6734 Register isURL = c_rarg5; // Base64 or URL character set 6735 6736 // c_rarg6 and c_rarg7 are free to use as temps 6737 Register codec = c_rarg6; 6738 Register length = c_rarg7; 6739 6740 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 6741 6742 __ add(src, src, soff); 6743 __ add(dst, dst, doff); 6744 __ sub(length, send, soff); 6745 6746 // load the codec base address 6747 __ lea(codec, ExternalAddress((address) toBase64)); 6748 __ cbz(isURL, ProcessData); 6749 __ lea(codec, ExternalAddress((address) toBase64URL)); 6750 6751 __ BIND(ProcessData); 6752 6753 // too short to formup a SIMD loop, roll back 6754 __ cmp(length, (u1)24); 6755 __ br(Assembler::LT, Process3B); 6756 6757 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 6758 6759 __ BIND(Process48B); 6760 __ cmp(length, (u1)48); 6761 __ br(Assembler::LT, Process24B); 6762 generate_base64_encode_simdround(src, dst, v0, 16); 6763 __ sub(length, length, 48); 6764 __ b(Process48B); 6765 6766 __ BIND(Process24B); 6767 __ cmp(length, (u1)24); 6768 __ br(Assembler::LT, SIMDExit); 6769 generate_base64_encode_simdround(src, dst, v0, 8); 6770 __ sub(length, length, 24); 6771 6772 __ BIND(SIMDExit); 6773 __ cbz(length, Exit); 6774 6775 __ 
BIND(Process3B); 6776 // 3 src bytes, 24 bits 6777 __ ldrb(r10, __ post(src, 1)); 6778 __ ldrb(r11, __ post(src, 1)); 6779 __ ldrb(r12, __ post(src, 1)); 6780 __ orrw(r11, r11, r10, Assembler::LSL, 8); 6781 __ orrw(r12, r12, r11, Assembler::LSL, 8); 6782 // codec index 6783 __ ubfmw(r15, r12, 18, 23); 6784 __ ubfmw(r14, r12, 12, 17); 6785 __ ubfmw(r13, r12, 6, 11); 6786 __ andw(r12, r12, 63); 6787 // get the code based on the codec 6788 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 6789 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 6790 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 6791 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 6792 __ strb(r15, __ post(dst, 1)); 6793 __ strb(r14, __ post(dst, 1)); 6794 __ strb(r13, __ post(dst, 1)); 6795 __ strb(r12, __ post(dst, 1)); 6796 __ sub(length, length, 3); 6797 __ cbnz(length, Process3B); 6798 6799 __ BIND(Exit); 6800 __ ret(lr); 6801 6802 return start; 6803 } 6804 6805 void generate_base64_decode_simdround(Register src, Register dst, 6806 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 6807 6808 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 6809 FloatRegister out0 = v20, out1 = v21, out2 = v22; 6810 6811 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 6812 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 6813 6814 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 6815 6816 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 6817 6818 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 6819 6820 // we need unsigned saturating subtract, to make sure all input values 6821 // in range [0, 63] will have 0U value in the higher half lookup 6822 __ uqsubv(decH0, __ T16B, in0, v27); 6823 __ uqsubv(decH1, __ T16B, in1, v27); 6824 __ uqsubv(decH2, __ T16B, in2, v27); 6825 __ uqsubv(decH3, __ T16B, in3, v27); 6826 6827 // lower half lookup 6828 __ tbl(decL0, arrangement, codecL, 4, in0); 6829 __ tbl(decL1, arrangement, codecL, 4, in1); 6830 __ tbl(decL2, arrangement, codecL, 4, in2); 6831 __ tbl(decL3, arrangement, codecL, 4, in3); 6832 6833 // higher half lookup 6834 __ tbx(decH0, arrangement, codecH, 4, decH0); 6835 __ tbx(decH1, arrangement, codecH, 4, decH1); 6836 __ tbx(decH2, arrangement, codecH, 4, decH2); 6837 __ tbx(decH3, arrangement, codecH, 4, decH3); 6838 6839 // combine lower and higher 6840 __ orr(decL0, arrangement, decL0, decH0); 6841 __ orr(decL1, arrangement, decL1, decH1); 6842 __ orr(decL2, arrangement, decL2, decH2); 6843 __ orr(decL3, arrangement, decL3, decH3); 6844 6845 // check illegal inputs, value larger than 63 (maximum of 6 bits) 6846 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 6847 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 6848 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 6849 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 6850 __ orr(in0, arrangement, decH0, decH1); 6851 __ orr(in1, arrangement, decH2, decH3); 6852 __ orr(in2, arrangement, in0, in1); 6853 __ umaxv(in3, arrangement, in2); 6854 __ umov(rscratch2, in3, __ B, 0); 6855 6856 // get the data to output 6857 __ shl(out0, arrangement, decL0, 2); 6858 __ ushr(out1, arrangement, decL1, 4); 6859 __ orr(out0, arrangement, out0, out1); 6860 __ shl(out1, arrangement, decL1, 4); 6861 __ ushr(out2, arrangement, decL2, 2); 6862 __ orr(out1, arrangement, out1, out2); 6863 __ shl(out2, arrangement, decL2, 6); 6864 __ orr(out2, arrangement, out2, decL3); 6865 6866 __ cbz(rscratch2, NoIllegalData); 6867 
6868 // handle illegal input 6869 __ umov(r10, in2, __ D, 0); 6870 if (size == 16) { 6871 __ cbnz(r10, ErrorInLowerHalf); 6872 6873 // illegal input is in higher half, store the lower half now. 6874 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 6875 6876 __ umov(r10, in2, __ D, 1); 6877 __ umov(r11, out0, __ D, 1); 6878 __ umov(r12, out1, __ D, 1); 6879 __ umov(r13, out2, __ D, 1); 6880 __ b(StoreLegalData); 6881 6882 __ BIND(ErrorInLowerHalf); 6883 } 6884 __ umov(r11, out0, __ D, 0); 6885 __ umov(r12, out1, __ D, 0); 6886 __ umov(r13, out2, __ D, 0); 6887 6888 __ BIND(StoreLegalData); 6889 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 6890 __ strb(r11, __ post(dst, 1)); 6891 __ strb(r12, __ post(dst, 1)); 6892 __ strb(r13, __ post(dst, 1)); 6893 __ lsr(r10, r10, 8); 6894 __ lsr(r11, r11, 8); 6895 __ lsr(r12, r12, 8); 6896 __ lsr(r13, r13, 8); 6897 __ b(StoreLegalData); 6898 6899 __ BIND(NoIllegalData); 6900 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 6901 } 6902 6903 6904 /** 6905 * Arguments: 6906 * 6907 * Input: 6908 * c_rarg0 - src_start 6909 * c_rarg1 - src_offset 6910 * c_rarg2 - src_length 6911 * c_rarg3 - dest_start 6912 * c_rarg4 - dest_offset 6913 * c_rarg5 - isURL 6914 * c_rarg6 - isMIME 6915 * 6916 */ 6917 address generate_base64_decodeBlock() { 6918 6919 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 6920 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 6921 // titled "Base64 decoding". 6922 6923 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 6924 // except the trailing character '=' is also treated illegal value in this intrinsic. That 6925 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
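// For illustration only (not part of the stub, and not how these
// arrays were produced): a sketch of how a table of this shape can be
// derived from the 64-character encode alphabet. Every index outside
// the alphabet, including '=', maps to the illegal marker 255u.
//
//   static void make_decode_table(const char alphabet[64], uint8_t table[256]) {
//     for (int i = 0; i < 256; i++) table[i] = 255u;               // illegal by default
//     for (int i = 0; i < 64; i++)  table[(uint8_t)alphabet[i]] = (uint8_t)i;
//   }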
6926 static const uint8_t fromBase64ForNoSIMD[256] = { 6927 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6928 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6929 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6930 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6931 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6932 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 6933 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6934 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6935 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6936 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6937 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6938 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6939 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6940 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6941 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6942 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6943 }; 6944 6945 static const uint8_t fromBase64URLForNoSIMD[256] = { 6946 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6947 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6948 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6949 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6950 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 6951 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 6952 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 6953 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 6954 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6955 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6956 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6957 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6958 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6959 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6960 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6961 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6962 }; 6963 6964 // A legal value of base64 code is in range [0, 127]. We need two lookups 6965 // with tbl/tbx and combine them to get the decode data. The 1st table vector 6966 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 6967 // table vector lookup use tbx, out of range indices are unchanged in 6968 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 6969 // The value of index 64 is set to 0, so that we know that we already get the 6970 // decoded data with the 1st lookup. 6971 static const uint8_t fromBase64ForSIMD[128] = { 6972 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6973 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6974 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 6975 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6976 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6977 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6978 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6979 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6980 }; 6981 6982 static const uint8_t fromBase64URLForSIMD[128] = { 6983 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6984 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 6985 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 6986 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 6987 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 6988 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 6989 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 6990 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 6991 }; 6992 6993 __ align(CodeEntryAlignment); 6994 StubCodeMark mark(this, "StubRoutines", "decodeBlock"); 6995 address start = __ pc(); 6996 6997 Register src = c_rarg0; // source array 6998 Register soff = c_rarg1; // source start offset 6999 Register send = c_rarg2; // source end offset 7000 Register dst = c_rarg3; // dest array 7001 Register doff = c_rarg4; // position for writing to dest array 7002 Register isURL = c_rarg5; // Base64 or URL character set 7003 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 7004 7005 Register length = send; // reuse send as length of source data to process 7006 7007 Register simd_codec = c_rarg6; 7008 Register nosimd_codec = c_rarg7; 7009 7010 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 7011 7012 __ enter(); 7013 7014 __ add(src, src, soff); 7015 __ add(dst, dst, doff); 7016 7017 __ mov(doff, dst); 7018 7019 __ sub(length, send, soff); 7020 __ bfm(length, zr, 0, 1); 7021 7022 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 7023 __ cbz(isURL, ProcessData); 7024 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 7025 7026 __ BIND(ProcessData); 7027 __ mov(rscratch1, length); 7028 __ cmp(length, (u1)144); // 144 = 80 + 64 7029 __ br(Assembler::LT, Process4B); 7030 7031 // In the MIME case, the line length cannot be more than 76 7032 // bytes (see RFC 2045). This is too short a block for SIMD 7033 // to be worthwhile, so we use non-SIMD here. 
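// A sketch of the counter trick used just below (my reading of the
// loop, not authoritative): seeding rscratch1 with 79 makes the
// 4-byte Process4B loop run 20 times (79, 75, ..., 3, -1), consuming
// 80 bytes and leaving rscratch1 == -1 rather than 0, which is how
// this pre-processing exit is distinguished further down.
//
//   // int c = 79; do { c -= 4; } while (c > 0);   // ends with c == -1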
7034 __ movw(rscratch1, 79); 7035 7036 __ BIND(Process4B); 7037 __ ldrw(r14, __ post(src, 4)); 7038 __ ubfxw(r10, r14, 0, 8); 7039 __ ubfxw(r11, r14, 8, 8); 7040 __ ubfxw(r12, r14, 16, 8); 7041 __ ubfxw(r13, r14, 24, 8); 7042 // get the de-code 7043 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 7044 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 7045 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 7046 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 7047 // error detection, 255u indicates an illegal input 7048 __ orrw(r14, r10, r11); 7049 __ orrw(r15, r12, r13); 7050 __ orrw(r14, r14, r15); 7051 __ tbnz(r14, 7, Exit); 7052 // recover the data 7053 __ lslw(r14, r10, 10); 7054 __ bfiw(r14, r11, 4, 6); 7055 __ bfmw(r14, r12, 2, 5); 7056 __ rev16w(r14, r14); 7057 __ bfiw(r13, r12, 6, 2); 7058 __ strh(r14, __ post(dst, 2)); 7059 __ strb(r13, __ post(dst, 1)); 7060 // non-simd loop 7061 __ subsw(rscratch1, rscratch1, 4); 7062 __ br(Assembler::GT, Process4B); 7063 7064 // if exiting from PreProcess80B, rscratch1 == -1; 7065 // otherwise, rscratch1 == 0. 7066 __ cbzw(rscratch1, Exit); 7067 __ sub(length, length, 80); 7068 7069 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 7070 __ cbz(isURL, SIMDEnter); 7071 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 7072 7073 __ BIND(SIMDEnter); 7074 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 7075 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 7076 __ mov(rscratch1, 63); 7077 __ dup(v27, __ T16B, rscratch1); 7078 7079 __ BIND(Process64B); 7080 __ cmp(length, (u1)64); 7081 __ br(Assembler::LT, Process32B); 7082 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 7083 __ sub(length, length, 64); 7084 __ b(Process64B); 7085 7086 __ BIND(Process32B); 7087 __ cmp(length, (u1)32); 7088 __ br(Assembler::LT, SIMDExit); 7089 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 7090 __ sub(length, length, 32); 7091 __ b(Process32B); 7092 7093 __ BIND(SIMDExit); 7094 __ cbz(length, Exit); 7095 __ movw(rscratch1, length); 7096 __ b(Process4B); 7097 7098 __ BIND(Exit); 7099 __ sub(c_rarg0, dst, doff); 7100 7101 __ leave(); 7102 __ ret(lr); 7103 7104 return start; 7105 } 7106 7107 // Support for spin waits. 7108 address generate_spin_wait() { 7109 __ align(CodeEntryAlignment); 7110 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 7111 address start = __ pc(); 7112 7113 __ spin_wait(); 7114 __ ret(lr); 7115 7116 return start; 7117 } 7118 7119 address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) { 7120 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table"); 7121 7122 address start = __ pc(); 7123 const Register 7124 r_super_klass = r0, 7125 r_array_base = r1, 7126 r_array_length = r2, 7127 r_array_index = r3, 7128 r_sub_klass = r4, 7129 r_bitmap = rscratch2, 7130 result = r5; 7131 const FloatRegister 7132 vtemp = v0; 7133 7134 Label L_success; 7135 __ enter(); 7136 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 7137 r_array_base, r_array_length, r_array_index, 7138 vtemp, result, super_klass_index, 7139 /*stub_is_near*/true); 7140 __ leave(); 7141 __ ret(lr); 7142 7143 return start; 7144 } 7145 7146 // Slow path implementation for UseSecondarySupersTable. 
7147 address generate_lookup_secondary_supers_table_slow_path_stub() { 7148 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path"); 7149 7150 address start = __ pc(); 7151 const Register 7152 r_super_klass = r0, // argument 7153 r_array_base = r1, // argument 7154 temp1 = r2, // temp 7155 r_array_index = r3, // argument 7156 r_bitmap = rscratch2, // argument 7157 result = r5; // argument 7158 7159 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 7160 __ ret(lr); 7161 7162 return start; 7163 } 7164 7165 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 7166 7167 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 7168 // 7169 // If LSE is in use, generate LSE versions of all the stubs. The 7170 // non-LSE versions are in atomic_aarch64.S. 7171 7172 // class AtomicStubMark records the entry point of a stub and the 7173 // stub pointer which will point to it. The stub pointer is set to 7174 // the entry point when ~AtomicStubMark() is called, which must be 7175 // after ICache::invalidate_range. This ensures safe publication of 7176 // the generated code. 7177 class AtomicStubMark { 7178 address _entry_point; 7179 aarch64_atomic_stub_t *_stub; 7180 MacroAssembler *_masm; 7181 public: 7182 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 7183 _masm = masm; 7184 __ align(32); 7185 _entry_point = __ pc(); 7186 _stub = stub; 7187 } 7188 ~AtomicStubMark() { 7189 *_stub = (aarch64_atomic_stub_t)_entry_point; 7190 } 7191 }; 7192 7193 // NB: For memory_order_conservative we need a trailing membar after 7194 // LSE atomic operations but not a leading membar. 7195 // 7196 // We don't need a leading membar because a clause in the Arm ARM 7197 // says: 7198 // 7199 // Barrier-ordered-before 7200 // 7201 // Barrier instructions order prior Memory effects before subsequent 7202 // Memory effects generated by the same Observer. A read or a write 7203 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 7204 // Observer if and only if RW1 appears in program order before RW 2 7205 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 7206 // instruction with both Acquire and Release semantics. 7207 // 7208 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 7209 // and Release semantics, therefore we don't need a leading 7210 // barrier. However, there is no corresponding Barrier-ordered-after 7211 // relationship, therefore we need a trailing membar to prevent a 7212 // later store or load from being reordered with the store in an 7213 // atomic instruction. 7214 // 7215 // This was checked by using the herd7 consistency model simulator 7216 // (http://diy.inria.fr/) with this test case: 7217 // 7218 // AArch64 LseCas 7219 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 7220 // P0 | P1; 7221 // LDR W4, [X2] | MOV W3, #0; 7222 // DMB LD | MOV W4, #1; 7223 // LDR W3, [X1] | CASAL W3, W4, [X1]; 7224 // | DMB ISH; 7225 // | STR W4, [X2]; 7226 // exists 7227 // (0:X3=0 /\ 0:X4=1) 7228 // 7229 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 7230 // with the store to x in P1. Without the DMB in P1 this may happen. 7231 // 7232 // At the time of writing we don't know of any AArch64 hardware that 7233 // reorders stores in this way, but the Reference Manual permits it. 
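// For reference, the conservative CAS entry generated by gen_cas_entry
// below reduces to roughly this sequence (a sketch, not a literal
// disassembly):
//
//   mov   prev, compare_val
//   casal prev, exchange_val, [ptr]    // LSE atomic with Acquire+Release
//   dmb   ish                          // trailing barrier discussed above
//   mov   r0, prev                     // return the previous value
//   ret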
7234 7235 void gen_cas_entry(Assembler::operand_size size, 7236 atomic_memory_order order) { 7237 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 7238 exchange_val = c_rarg2; 7239 bool acquire, release; 7240 switch (order) { 7241 case memory_order_relaxed: 7242 acquire = false; 7243 release = false; 7244 break; 7245 case memory_order_release: 7246 acquire = false; 7247 release = true; 7248 break; 7249 default: 7250 acquire = true; 7251 release = true; 7252 break; 7253 } 7254 __ mov(prev, compare_val); 7255 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 7256 if (order == memory_order_conservative) { 7257 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7258 } 7259 if (size == Assembler::xword) { 7260 __ mov(r0, prev); 7261 } else { 7262 __ movw(r0, prev); 7263 } 7264 __ ret(lr); 7265 } 7266 7267 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 7268 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 7269 // If not relaxed, then default to conservative. Relaxed is the only 7270 // case we use enough to be worth specializing. 7271 if (order == memory_order_relaxed) { 7272 __ ldadd(size, incr, prev, addr); 7273 } else { 7274 __ ldaddal(size, incr, prev, addr); 7275 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7276 } 7277 if (size == Assembler::xword) { 7278 __ mov(r0, prev); 7279 } else { 7280 __ movw(r0, prev); 7281 } 7282 __ ret(lr); 7283 } 7284 7285 void gen_swpal_entry(Assembler::operand_size size) { 7286 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 7287 __ swpal(size, incr, prev, addr); 7288 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 7289 if (size == Assembler::xword) { 7290 __ mov(r0, prev); 7291 } else { 7292 __ movw(r0, prev); 7293 } 7294 __ ret(lr); 7295 } 7296 7297 void generate_atomic_entry_points() { 7298 if (! 
UseLSE) { 7299 return; 7300 } 7301 7302 __ align(CodeEntryAlignment); 7303 StubCodeMark mark(this, "StubRoutines", "atomic entry points"); 7304 address first_entry = __ pc(); 7305 7306 // ADD, memory_order_conservative 7307 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 7308 gen_ldadd_entry(Assembler::word, memory_order_conservative); 7309 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 7310 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 7311 7312 // ADD, memory_order_relaxed 7313 AtomicStubMark mark_fetch_add_4_relaxed 7314 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 7315 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 7316 AtomicStubMark mark_fetch_add_8_relaxed 7317 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 7318 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 7319 7320 // XCHG, memory_order_conservative 7321 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 7322 gen_swpal_entry(Assembler::word); 7323 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 7324 gen_swpal_entry(Assembler::xword); 7325 7326 // CAS, memory_order_conservative 7327 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 7328 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 7329 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 7330 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 7331 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 7332 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 7333 7334 // CAS, memory_order_relaxed 7335 AtomicStubMark mark_cmpxchg_1_relaxed 7336 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 7337 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 7338 AtomicStubMark mark_cmpxchg_4_relaxed 7339 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 7340 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 7341 AtomicStubMark mark_cmpxchg_8_relaxed 7342 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 7343 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 7344 7345 AtomicStubMark mark_cmpxchg_4_release 7346 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 7347 gen_cas_entry(MacroAssembler::word, memory_order_release); 7348 AtomicStubMark mark_cmpxchg_8_release 7349 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 7350 gen_cas_entry(MacroAssembler::xword, memory_order_release); 7351 7352 AtomicStubMark mark_cmpxchg_4_seq_cst 7353 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 7354 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 7355 AtomicStubMark mark_cmpxchg_8_seq_cst 7356 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 7357 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 7358 7359 ICache::invalidate_range(first_entry, __ pc() - first_entry); 7360 } 7361 #endif // LINUX 7362 7363 address generate_cont_thaw(Continuation::thaw_kind kind) { 7364 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 7365 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 7366 7367 address start = __ pc(); 7368 7369 if (return_barrier) { 7370 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 7371 __ mov(sp, rscratch1); 7372 } 7373 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7374 7375 if (return_barrier) { 7376 // preserve possible return value from a method 
returning to the return barrier 7377 __ fmovd(rscratch1, v0); 7378 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7379 } 7380 7381 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 7382 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 7383 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 7384 7385 if (return_barrier) { 7386 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7387 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7388 __ fmovd(v0, rscratch1); 7389 } 7390 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 7391 7392 7393 Label thaw_success; 7394 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 7395 __ cbnz(rscratch2, thaw_success); 7396 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 7397 __ br(rscratch1); 7398 __ bind(thaw_success); 7399 7400 // make room for the thawed frames 7401 __ sub(rscratch1, sp, rscratch2); 7402 __ andr(rscratch1, rscratch1, -16); // align 7403 __ mov(sp, rscratch1); 7404 7405 if (return_barrier) { 7406 // save original return value -- again 7407 __ fmovd(rscratch1, v0); 7408 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 7409 } 7410 7411 // If we want, we can templatize thaw by kind, and have three different entries 7412 __ movw(c_rarg1, (uint32_t)kind); 7413 7414 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 7415 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 7416 7417 if (return_barrier) { 7418 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 7419 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 7420 __ fmovd(v0, rscratch1); 7421 } else { 7422 __ mov(r0, zr); // return 0 (success) from doYield 7423 } 7424 7425 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 7426 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 7427 __ mov(rfp, sp); 7428 7429 if (return_barrier_exception) { 7430 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 7431 __ authenticate_return_address(c_rarg1); 7432 __ verify_oop(r0); 7433 // save return value containing the exception oop in callee-saved R19 7434 __ mov(r19, r0); 7435 7436 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 7437 7438 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
7439 // __ reinitialize_ptrue(); 7440 7441 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 7442 7443 __ mov(r1, r0); // the exception handler 7444 __ mov(r0, r19); // restore return value containing the exception oop 7445 __ verify_oop(r0); 7446 7447 __ leave(); 7448 __ mov(r3, lr); 7449 __ br(r1); // the exception handler 7450 } else { 7451 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 7452 __ leave(); 7453 __ ret(lr); 7454 } 7455 7456 return start; 7457 } 7458 7459 address generate_cont_thaw() { 7460 if (!Continuations::enabled()) return nullptr; 7461 7462 StubCodeMark mark(this, "StubRoutines", "Cont thaw"); 7463 address start = __ pc(); 7464 generate_cont_thaw(Continuation::thaw_top); 7465 return start; 7466 } 7467 7468 address generate_cont_returnBarrier() { 7469 if (!Continuations::enabled()) return nullptr; 7470 7471 // TODO: will probably need multiple return barriers depending on return type 7472 StubCodeMark mark(this, "StubRoutines", "cont return barrier"); 7473 address start = __ pc(); 7474 7475 generate_cont_thaw(Continuation::thaw_return_barrier); 7476 7477 return start; 7478 } 7479 7480 address generate_cont_returnBarrier_exception() { 7481 if (!Continuations::enabled()) return nullptr; 7482 7483 StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler"); 7484 address start = __ pc(); 7485 7486 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 7487 7488 return start; 7489 } 7490 7491 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 7492 // are represented as long[5], with BITS_PER_LIMB = 26. 7493 // Pack five 26-bit limbs into three 64-bit registers. 7494 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 7495 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 7496 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 7497 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 7498 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 7499 7500 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 7501 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 7502 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 7503 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 7504 7505 if (dest2->is_valid()) { 7506 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 7507 } else { 7508 #ifdef ASSERT 7509 Label OK; 7510 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 7511 __ br(__ EQ, OK); 7512 __ stop("high bits of Poly1305 integer should be zero"); 7513 __ should_not_reach_here(); 7514 __ bind(OK); 7515 #endif 7516 } 7517 } 7518 7519 // As above, but return only a 128-bit integer, packed into two 7520 // 64-bit registers. 7521 void pack_26(Register dest0, Register dest1, Register src) { 7522 pack_26(dest0, dest1, noreg, src); 7523 } 7524 7525 // Multiply and multiply-accumulate unsigned 64-bit registers. 7526 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 7527 __ mul(prod_lo, n, m); 7528 __ umulh(prod_hi, n, m); 7529 } 7530 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 7531 wide_mul(rscratch1, rscratch2, n, m); 7532 __ adds(sum_lo, sum_lo, rscratch1); 7533 __ adc(sum_hi, sum_hi, rscratch2); 7534 } 7535 7536 // Poly1305, RFC 7539 7537 7538 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 7539 // description of the tricks used to simplify and accelerate this 7540 // computation. 
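// In C, approximately (a sketch of one block of the reference
// algorithm; in the stub the 130-bit accumulator is spread across the
// U_2:U_1:U_0 register triple rather than held as a big integer):
//
//   // for each 16-byte block:
//   //   S = U + block + 2^128;          // add the block plus its pad bit
//   //   U = (S * R) mod (2^130 - 5);    // multiply by the key and reduce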
7541 7542 address generate_poly1305_processBlocks() { 7543 __ align(CodeEntryAlignment); 7544 StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); 7545 address start = __ pc(); 7546 Label here; 7547 __ enter(); 7548 RegSet callee_saved = RegSet::range(r19, r28); 7549 __ push(callee_saved, sp); 7550 7551 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 7552 7553 // Arguments 7554 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 7555 7556 // R_n is the 128-bit randomly-generated key, packed into two 7557 // registers. The caller passes this key to us as long[5], with 7558 // BITS_PER_LIMB = 26. 7559 const Register R_0 = *++regs, R_1 = *++regs; 7560 pack_26(R_0, R_1, r_start); 7561 7562 // RR_n is (R_n >> 2) * 5 7563 const Register RR_0 = *++regs, RR_1 = *++regs; 7564 __ lsr(RR_0, R_0, 2); 7565 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 7566 __ lsr(RR_1, R_1, 2); 7567 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 7568 7569 // U_n is the current checksum 7570 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 7571 pack_26(U_0, U_1, U_2, acc_start); 7572 7573 static constexpr int BLOCK_LENGTH = 16; 7574 Label DONE, LOOP; 7575 7576 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7577 __ br(Assembler::LT, DONE); { 7578 __ bind(LOOP); 7579 7580 // S_n is to be the sum of U_n and the next block of data 7581 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 7582 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 7583 __ adds(S_0, U_0, S_0); 7584 __ adcs(S_1, U_1, S_1); 7585 __ adc(S_2, U_2, zr); 7586 __ add(S_2, S_2, 1); 7587 7588 const Register U_0HI = *++regs, U_1HI = *++regs; 7589 7590 // NB: this logic depends on some of the special properties of 7591 // Poly1305 keys. In particular, because we know that the top 7592 // four bits of R_0 and R_1 are zero, we can add together 7593 // partial products without any risk of needing to propagate a 7594 // carry out. 7595 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 7596 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 7597 __ andr(U_2, R_0, 3); 7598 __ mul(U_2, S_2, U_2); 7599 7600 // Recycle registers S_0, S_1, S_2 7601 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 7602 7603 // Partial reduction mod 2**130 - 5 7604 __ adds(U_1, U_0HI, U_1); 7605 __ adc(U_2, U_1HI, U_2); 7606 // Sum now in U_2:U_1:U_0. 7607 // Dead: U_0HI, U_1HI. 
7608 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 7609 7610 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 7611 7612 // First, U_2:U_1:U_0 += (U_2 >> 2) 7613 __ lsr(rscratch1, U_2, 2); 7614 __ andr(U_2, U_2, (u8)3); 7615 __ adds(U_0, U_0, rscratch1); 7616 __ adcs(U_1, U_1, zr); 7617 __ adc(U_2, U_2, zr); 7618 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 7619 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 7620 __ adcs(U_1, U_1, zr); 7621 __ adc(U_2, U_2, zr); 7622 7623 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 7624 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7625 __ br(~ Assembler::LT, LOOP); 7626 } 7627 7628 // Further reduce modulo 2^130 - 5 7629 __ lsr(rscratch1, U_2, 2); 7630 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 7631 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 7632 __ adcs(U_1, U_1, zr); 7633 __ andr(U_2, U_2, (u1)3); 7634 __ adc(U_2, U_2, zr); 7635 7636 // Unpack the sum into five 26-bit limbs and write to memory. 7637 __ ubfiz(rscratch1, U_0, 0, 26); 7638 __ ubfx(rscratch2, U_0, 26, 26); 7639 __ stp(rscratch1, rscratch2, Address(acc_start)); 7640 __ ubfx(rscratch1, U_0, 52, 12); 7641 __ bfi(rscratch1, U_1, 12, 14); 7642 __ ubfx(rscratch2, U_1, 14, 26); 7643 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 7644 __ ubfx(rscratch1, U_1, 40, 24); 7645 __ bfi(rscratch1, U_2, 24, 3); 7646 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 7647 7648 __ bind(DONE); 7649 __ pop(callee_saved, sp); 7650 __ leave(); 7651 __ ret(lr); 7652 7653 return start; 7654 } 7655 7656 // exception handler for upcall stubs 7657 address generate_upcall_stub_exception_handler() { 7658 StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler"); 7659 address start = __ pc(); 7660 7661 // Native caller has no idea how to handle exceptions, 7662 // so we just crash here. Up to callee to catch exceptions. 
7663 __ verify_oop(r0); 7664 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 7665 __ blr(rscratch1); 7666 __ should_not_reach_here(); 7667 7668 return start; 7669 } 7670 7671 // load Method* target of MethodHandle 7672 // j_rarg0 = jobject receiver 7673 // rmethod = result 7674 address generate_upcall_stub_load_target() { 7675 StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target"); 7676 address start = __ pc(); 7677 7678 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 7679 // Load target method from receiver 7680 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 7681 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 7682 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 7683 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 7684 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 7685 noreg, noreg); 7686 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 7687 7688 __ ret(lr); 7689 7690 return start; 7691 } 7692 7693 #undef __ 7694 #define __ masm-> 7695 7696 class MontgomeryMultiplyGenerator : public MacroAssembler { 7697 7698 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7699 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7700 7701 RegSet _toSave; 7702 bool _squaring; 7703 7704 public: 7705 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7706 : MacroAssembler(as->code()), _squaring(squaring) { 7707 7708 // Register allocation 7709 7710 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7711 Pa_base = *regs; // Argument registers 7712 if (squaring) 7713 Pb_base = Pa_base; 7714 else 7715 Pb_base = *++regs; 7716 Pn_base = *++regs; 7717 Rlen= *++regs; 7718 inv = *++regs; 7719 Pm_base = *++regs; 7720 7721 // Working registers: 7722 Ra = *++regs; // The current digit of a, b, n, and m. 7723 Rb = *++regs; 7724 Rm = *++regs; 7725 Rn = *++regs; 7726 7727 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7728 Pb = *++regs; 7729 Pm = *++regs; 7730 Pn = *++regs; 7731 7732 t0 = *++regs; // Three registers which form a 7733 t1 = *++regs; // triple-precision accumuator. 7734 t2 = *++regs; 7735 7736 Ri = *++regs; // Inner and outer loop indexes. 7737 Rj = *++regs; 7738 7739 Rhi_ab = *++regs; // Product registers: low and high parts 7740 Rlo_ab = *++regs; // of a*b and m*n. 7741 Rhi_mn = *++regs; 7742 Rlo_mn = *++regs; 7743 7744 // r19 and up are callee-saved. 
7745 _toSave = RegSet::range(r19, *regs) + Pm_base; 7746 } 7747 7748 private: 7749 void save_regs() { 7750 push(_toSave, sp); 7751 } 7752 7753 void restore_regs() { 7754 pop(_toSave, sp); 7755 } 7756 7757 template <typename T> 7758 void unroll_2(Register count, T block) { 7759 Label loop, end, odd; 7760 tbnz(count, 0, odd); 7761 cbz(count, end); 7762 align(16); 7763 bind(loop); 7764 (this->*block)(); 7765 bind(odd); 7766 (this->*block)(); 7767 subs(count, count, 2); 7768 br(Assembler::GT, loop); 7769 bind(end); 7770 } 7771 7772 template <typename T> 7773 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7774 Label loop, end, odd; 7775 tbnz(count, 0, odd); 7776 cbz(count, end); 7777 align(16); 7778 bind(loop); 7779 (this->*block)(d, s, tmp); 7780 bind(odd); 7781 (this->*block)(d, s, tmp); 7782 subs(count, count, 2); 7783 br(Assembler::GT, loop); 7784 bind(end); 7785 } 7786 7787 void pre1(RegisterOrConstant i) { 7788 block_comment("pre1"); 7789 // Pa = Pa_base; 7790 // Pb = Pb_base + i; 7791 // Pm = Pm_base; 7792 // Pn = Pn_base + i; 7793 // Ra = *Pa; 7794 // Rb = *Pb; 7795 // Rm = *Pm; 7796 // Rn = *Pn; 7797 ldr(Ra, Address(Pa_base)); 7798 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7799 ldr(Rm, Address(Pm_base)); 7800 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7801 lea(Pa, Address(Pa_base)); 7802 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7803 lea(Pm, Address(Pm_base)); 7804 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7805 7806 // Zero the m*n result. 7807 mov(Rhi_mn, zr); 7808 mov(Rlo_mn, zr); 7809 } 7810 7811 // The core multiply-accumulate step of a Montgomery 7812 // multiplication. The idea is to schedule operations as a 7813 // pipeline so that instructions with long latencies (loads and 7814 // multiplies) have time to complete before their results are 7815 // used. This most benefits in-order implementations of the 7816 // architecture but out-of-order ones also benefit. 7817 void step() { 7818 block_comment("step"); 7819 // MACC(Ra, Rb, t0, t1, t2); 7820 // Ra = *++Pa; 7821 // Rb = *--Pb; 7822 umulh(Rhi_ab, Ra, Rb); 7823 mul(Rlo_ab, Ra, Rb); 7824 ldr(Ra, pre(Pa, wordSize)); 7825 ldr(Rb, pre(Pb, -wordSize)); 7826 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 7827 // previous iteration. 7828 // MACC(Rm, Rn, t0, t1, t2); 7829 // Rm = *++Pm; 7830 // Rn = *--Pn; 7831 umulh(Rhi_mn, Rm, Rn); 7832 mul(Rlo_mn, Rm, Rn); 7833 ldr(Rm, pre(Pm, wordSize)); 7834 ldr(Rn, pre(Pn, -wordSize)); 7835 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7836 } 7837 7838 void post1() { 7839 block_comment("post1"); 7840 7841 // MACC(Ra, Rb, t0, t1, t2); 7842 // Ra = *++Pa; 7843 // Rb = *--Pb; 7844 umulh(Rhi_ab, Ra, Rb); 7845 mul(Rlo_ab, Ra, Rb); 7846 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 7847 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7848 7849 // *Pm = Rm = t0 * inv; 7850 mul(Rm, t0, inv); 7851 str(Rm, Address(Pm)); 7852 7853 // MACC(Rm, Rn, t0, t1, t2); 7854 // t0 = t1; t1 = t2; t2 = 0; 7855 umulh(Rhi_mn, Rm, Rn); 7856 7857 #ifndef PRODUCT 7858 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 7859 { 7860 mul(Rlo_mn, Rm, Rn); 7861 add(Rlo_mn, t0, Rlo_mn); 7862 Label ok; 7863 cbz(Rlo_mn, ok); { 7864 stop("broken Montgomery multiply"); 7865 } bind(ok); 7866 } 7867 #endif 7868 // We have very carefully set things up so that 7869 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 7870 // the lower half of Rm * Rn because we know the result already: 7871 // it must be -t0. 
t0 + (-t0) must generate a carry iff 7872 // t0 != 0. So, rather than do a mul and an adds we just set 7873 // the carry flag iff t0 is nonzero. 7874 // 7875 // mul(Rlo_mn, Rm, Rn); 7876 // adds(zr, t0, Rlo_mn); 7877 subs(zr, t0, 1); // Set carry iff t0 is nonzero 7878 adcs(t0, t1, Rhi_mn); 7879 adc(t1, t2, zr); 7880 mov(t2, zr); 7881 } 7882 7883 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 7884 block_comment("pre2"); 7885 // Pa = Pa_base + i-len; 7886 // Pb = Pb_base + len; 7887 // Pm = Pm_base + i-len; 7888 // Pn = Pn_base + len; 7889 7890 if (i.is_register()) { 7891 sub(Rj, i.as_register(), len); 7892 } else { 7893 mov(Rj, i.as_constant()); 7894 sub(Rj, Rj, len); 7895 } 7896 // Rj == i-len 7897 7898 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 7899 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 7900 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7901 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 7902 7903 // Ra = *++Pa; 7904 // Rb = *--Pb; 7905 // Rm = *++Pm; 7906 // Rn = *--Pn; 7907 ldr(Ra, pre(Pa, wordSize)); 7908 ldr(Rb, pre(Pb, -wordSize)); 7909 ldr(Rm, pre(Pm, wordSize)); 7910 ldr(Rn, pre(Pn, -wordSize)); 7911 7912 mov(Rhi_mn, zr); 7913 mov(Rlo_mn, zr); 7914 } 7915 7916 void post2(RegisterOrConstant i, RegisterOrConstant len) { 7917 block_comment("post2"); 7918 if (i.is_constant()) { 7919 mov(Rj, i.as_constant()-len.as_constant()); 7920 } else { 7921 sub(Rj, i.as_register(), len); 7922 } 7923 7924 adds(t0, t0, Rlo_mn); // The pending m*n, low part 7925 7926 // As soon as we know the least significant digit of our result, 7927 // store it. 7928 // Pm_base[i-len] = t0; 7929 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 7930 7931 // t0 = t1; t1 = t2; t2 = 0; 7932 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 7933 adc(t1, t2, zr); 7934 mov(t2, zr); 7935 } 7936 7937 // A carry in t0 after Montgomery multiplication means that we 7938 // should subtract multiples of n from our result in m. We'll 7939 // keep doing that until there is no carry. 7940 void normalize(RegisterOrConstant len) { 7941 block_comment("normalize"); 7942 // while (t0) 7943 // t0 = sub(Pm_base, Pn_base, t0, len); 7944 Label loop, post, again; 7945 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 7946 cbz(t0, post); { 7947 bind(again); { 7948 mov(i, zr); 7949 mov(cnt, len); 7950 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7951 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7952 subs(zr, zr, zr); // set carry flag, i.e. no borrow 7953 align(16); 7954 bind(loop); { 7955 sbcs(Rm, Rm, Rn); 7956 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7957 add(i, i, 1); 7958 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 7959 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7960 sub(cnt, cnt, 1); 7961 } cbnz(cnt, loop); 7962 sbc(t0, t0, zr); 7963 } cbnz(t0, again); 7964 } bind(post); 7965 } 7966 7967 // Move memory at s to d, reversing words. 
7968 // Increments d to end of copied memory 7969 // Destroys tmp1, tmp2 7970 // Preserves len 7971 // Leaves s pointing to the address which was in d at start 7972 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 7973 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 7974 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 7975 7976 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 7977 mov(tmp1, len); 7978 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 7979 sub(s, d, len, ext::uxtw, LogBytesPerWord); 7980 } 7981 // where 7982 void reverse1(Register d, Register s, Register tmp) { 7983 ldr(tmp, pre(s, -wordSize)); 7984 ror(tmp, tmp, 32); 7985 str(tmp, post(d, wordSize)); 7986 } 7987 7988 void step_squaring() { 7989 // An extra ACC 7990 step(); 7991 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 7992 } 7993 7994 void last_squaring(RegisterOrConstant i) { 7995 Label dont; 7996 // if ((i & 1) == 0) { 7997 tbnz(i.as_register(), 0, dont); { 7998 // MACC(Ra, Rb, t0, t1, t2); 7999 // Ra = *++Pa; 8000 // Rb = *--Pb; 8001 umulh(Rhi_ab, Ra, Rb); 8002 mul(Rlo_ab, Ra, Rb); 8003 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 8004 } bind(dont); 8005 } 8006 8007 void extra_step_squaring() { 8008 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 8009 8010 // MACC(Rm, Rn, t0, t1, t2); 8011 // Rm = *++Pm; 8012 // Rn = *--Pn; 8013 umulh(Rhi_mn, Rm, Rn); 8014 mul(Rlo_mn, Rm, Rn); 8015 ldr(Rm, pre(Pm, wordSize)); 8016 ldr(Rn, pre(Pn, -wordSize)); 8017 } 8018 8019 void post1_squaring() { 8020 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 8021 8022 // *Pm = Rm = t0 * inv; 8023 mul(Rm, t0, inv); 8024 str(Rm, Address(Pm)); 8025 8026 // MACC(Rm, Rn, t0, t1, t2); 8027 // t0 = t1; t1 = t2; t2 = 0; 8028 umulh(Rhi_mn, Rm, Rn); 8029 8030 #ifndef PRODUCT 8031 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 8032 { 8033 mul(Rlo_mn, Rm, Rn); 8034 add(Rlo_mn, t0, Rlo_mn); 8035 Label ok; 8036 cbz(Rlo_mn, ok); { 8037 stop("broken Montgomery multiply"); 8038 } bind(ok); 8039 } 8040 #endif 8041 // We have very carefully set things up so that 8042 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 8043 // the lower half of Rm * Rn because we know the result already: 8044 // it must be -t0. t0 + (-t0) must generate a carry iff 8045 // t0 != 0. So, rather than do a mul and an adds we just set 8046 // the carry flag iff t0 is nonzero. 8047 // 8048 // mul(Rlo_mn, Rm, Rn); 8049 // adds(zr, t0, Rlo_mn); 8050 subs(zr, t0, 1); // Set carry iff t0 is nonzero 8051 adcs(t0, t1, Rhi_mn); 8052 adc(t1, t2, zr); 8053 mov(t2, zr); 8054 } 8055 8056 void acc(Register Rhi, Register Rlo, 8057 Register t0, Register t1, Register t2) { 8058 adds(t0, t0, Rlo); 8059 adcs(t1, t1, Rhi); 8060 adc(t2, t2, zr); 8061 } 8062 8063 public: 8064 /** 8065 * Fast Montgomery multiplication. The derivation of the 8066 * algorithm is in A Cryptographic Library for the Motorola 8067 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
8068 * 8069 * Arguments: 8070 * 8071 * Inputs for multiplication: 8072 * c_rarg0 - int array elements a 8073 * c_rarg1 - int array elements b 8074 * c_rarg2 - int array elements n (the modulus) 8075 * c_rarg3 - int length 8076 * c_rarg4 - int inv 8077 * c_rarg5 - int array elements m (the result) 8078 * 8079 * Inputs for squaring: 8080 * c_rarg0 - int array elements a 8081 * c_rarg1 - int array elements n (the modulus) 8082 * c_rarg2 - int length 8083 * c_rarg3 - int inv 8084 * c_rarg4 - int array elements m (the result) 8085 * 8086 */ 8087 address generate_multiply() { 8088 Label argh, nothing; 8089 bind(argh); 8090 stop("MontgomeryMultiply total_allocation must be <= 8192"); 8091 8092 align(CodeEntryAlignment); 8093 address entry = pc(); 8094 8095 cbzw(Rlen, nothing); 8096 8097 enter(); 8098 8099 // Make room. 8100 cmpw(Rlen, 512); 8101 br(Assembler::HI, argh); 8102 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 8103 andr(sp, Ra, -2 * wordSize); 8104 8105 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 8106 8107 { 8108 // Copy input args, reversing as we go. We use Ra as a 8109 // temporary variable. 8110 reverse(Ra, Pa_base, Rlen, t0, t1); 8111 if (!_squaring) 8112 reverse(Ra, Pb_base, Rlen, t0, t1); 8113 reverse(Ra, Pn_base, Rlen, t0, t1); 8114 } 8115 8116 // Push all call-saved registers and also Pm_base which we'll need 8117 // at the end. 8118 save_regs(); 8119 8120 #ifndef PRODUCT 8121 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 8122 { 8123 ldr(Rn, Address(Pn_base, 0)); 8124 mul(Rlo_mn, Rn, inv); 8125 subs(zr, Rlo_mn, -1); 8126 Label ok; 8127 br(EQ, ok); { 8128 stop("broken inverse in Montgomery multiply"); 8129 } bind(ok); 8130 } 8131 #endif 8132 8133 mov(Pm_base, Ra); 8134 8135 mov(t0, zr); 8136 mov(t1, zr); 8137 mov(t2, zr); 8138 8139 block_comment("for (int i = 0; i < len; i++) {"); 8140 mov(Ri, zr); { 8141 Label loop, end; 8142 cmpw(Ri, Rlen); 8143 br(Assembler::GE, end); 8144 8145 bind(loop); 8146 pre1(Ri); 8147 8148 block_comment(" for (j = i; j; j--) {"); { 8149 movw(Rj, Ri); 8150 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 8151 } block_comment(" } // j"); 8152 8153 post1(); 8154 addw(Ri, Ri, 1); 8155 cmpw(Ri, Rlen); 8156 br(Assembler::LT, loop); 8157 bind(end); 8158 block_comment("} // i"); 8159 } 8160 8161 block_comment("for (int i = len; i < 2*len; i++) {"); 8162 mov(Ri, Rlen); { 8163 Label loop, end; 8164 cmpw(Ri, Rlen, Assembler::LSL, 1); 8165 br(Assembler::GE, end); 8166 8167 bind(loop); 8168 pre2(Ri, Rlen); 8169 8170 block_comment(" for (j = len*2-i-1; j; j--) {"); { 8171 lslw(Rj, Rlen, 1); 8172 subw(Rj, Rj, Ri); 8173 subw(Rj, Rj, 1); 8174 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 8175 } block_comment(" } // j"); 8176 8177 post2(Ri, Rlen); 8178 addw(Ri, Ri, 1); 8179 cmpw(Ri, Rlen, Assembler::LSL, 1); 8180 br(Assembler::LT, loop); 8181 bind(end); 8182 } 8183 block_comment("} // i"); 8184 8185 normalize(Rlen); 8186 8187 mov(Ra, Pm_base); // Save Pm_base in Ra 8188 restore_regs(); // Restore caller's Pm_base 8189 8190 // Copy our result into caller's Pm_base 8191 reverse(Pm_base, Ra, Rlen, t0, t1); 8192 8193 leave(); 8194 bind(nothing); 8195 ret(lr); 8196 8197 return entry; 8198 } 8199 // In C, approximately: 8200 8201 // void 8202 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 8203 // julong Pn_base[], julong Pm_base[], 8204 // julong inv, int len) { 8205 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8206 // julong *Pa, *Pb, *Pn, *Pm; 8207 // julong Ra, Rb, Rn, Rm; 8208 8209 // 
int i; 8210 8211 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8212 8213 // for (i = 0; i < len; i++) { 8214 // int j; 8215 8216 // Pa = Pa_base; 8217 // Pb = Pb_base + i; 8218 // Pm = Pm_base; 8219 // Pn = Pn_base + i; 8220 8221 // Ra = *Pa; 8222 // Rb = *Pb; 8223 // Rm = *Pm; 8224 // Rn = *Pn; 8225 8226 // int iters = i; 8227 // for (j = 0; iters--; j++) { 8228 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8229 // MACC(Ra, Rb, t0, t1, t2); 8230 // Ra = *++Pa; 8231 // Rb = *--Pb; 8232 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8233 // MACC(Rm, Rn, t0, t1, t2); 8234 // Rm = *++Pm; 8235 // Rn = *--Pn; 8236 // } 8237 8238 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 8239 // MACC(Ra, Rb, t0, t1, t2); 8240 // *Pm = Rm = t0 * inv; 8241 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8242 // MACC(Rm, Rn, t0, t1, t2); 8243 8244 // assert(t0 == 0, "broken Montgomery multiply"); 8245 8246 // t0 = t1; t1 = t2; t2 = 0; 8247 // } 8248 8249 // for (i = len; i < 2*len; i++) { 8250 // int j; 8251 8252 // Pa = Pa_base + i-len; 8253 // Pb = Pb_base + len; 8254 // Pm = Pm_base + i-len; 8255 // Pn = Pn_base + len; 8256 8257 // Ra = *++Pa; 8258 // Rb = *--Pb; 8259 // Rm = *++Pm; 8260 // Rn = *--Pn; 8261 8262 // int iters = len*2-i-1; 8263 // for (j = i-len+1; iters--; j++) { 8264 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 8265 // MACC(Ra, Rb, t0, t1, t2); 8266 // Ra = *++Pa; 8267 // Rb = *--Pb; 8268 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8269 // MACC(Rm, Rn, t0, t1, t2); 8270 // Rm = *++Pm; 8271 // Rn = *--Pn; 8272 // } 8273 8274 // Pm_base[i-len] = t0; 8275 // t0 = t1; t1 = t2; t2 = 0; 8276 // } 8277 8278 // while (t0) 8279 // t0 = sub(Pm_base, Pn_base, t0, len); 8280 // } 8281 8282 /** 8283 * Fast Montgomery squaring. This uses asymptotically 25% fewer 8284 * multiplies than Montgomery multiplication so it should be up to 8285 * 25% faster. However, its loop control is more complex and it 8286 * may actually run slower on some machines. 8287 * 8288 * Arguments: 8289 * 8290 * Inputs: 8291 * c_rarg0 - int array elements a 8292 * c_rarg1 - int array elements n (the modulus) 8293 * c_rarg2 - int length 8294 * c_rarg3 - int inv 8295 * c_rarg4 - int array elements m (the result) 8296 * 8297 */ 8298 address generate_square() { 8299 Label argh; 8300 bind(argh); 8301 stop("MontgomeryMultiply total_allocation must be <= 8192"); 8302 8303 align(CodeEntryAlignment); 8304 address entry = pc(); 8305 8306 enter(); 8307 8308 // Make room. 8309 cmpw(Rlen, 512); 8310 br(Assembler::HI, argh); 8311 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 8312 andr(sp, Ra, -2 * wordSize); 8313 8314 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 8315 8316 { 8317 // Copy input args, reversing as we go. We use Ra as a 8318 // temporary variable. 8319 reverse(Ra, Pa_base, Rlen, t0, t1); 8320 reverse(Ra, Pn_base, Rlen, t0, t1); 8321 } 8322 8323 // Push all call-saved registers and also Pm_base which we'll need 8324 // at the end. 
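    // (Note: Pm_base is repointed below at the freshly reversed working
    // copy via "mov(Pm_base, Ra)"; the caller's Pm_base is only brought
    // back by restore_regs() near the end, after which reverse() copies
    // the result into the caller's array.)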
8325 save_regs(); 8326 8327 mov(Pm_base, Ra); 8328 8329 mov(t0, zr); 8330 mov(t1, zr); 8331 mov(t2, zr); 8332 8333 block_comment("for (int i = 0; i < len; i++) {"); 8334 mov(Ri, zr); { 8335 Label loop, end; 8336 bind(loop); 8337 cmp(Ri, Rlen); 8338 br(Assembler::GE, end); 8339 8340 pre1(Ri); 8341 8342 block_comment("for (j = (i+1)/2; j; j--) {"); { 8343 add(Rj, Ri, 1); 8344 lsr(Rj, Rj, 1); 8345 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8346 } block_comment(" } // j"); 8347 8348 last_squaring(Ri); 8349 8350 block_comment(" for (j = i/2; j; j--) {"); { 8351 lsr(Rj, Ri, 1); 8352 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8353 } block_comment(" } // j"); 8354 8355 post1_squaring(); 8356 add(Ri, Ri, 1); 8357 cmp(Ri, Rlen); 8358 br(Assembler::LT, loop); 8359 8360 bind(end); 8361 block_comment("} // i"); 8362 } 8363 8364 block_comment("for (int i = len; i < 2*len; i++) {"); 8365 mov(Ri, Rlen); { 8366 Label loop, end; 8367 bind(loop); 8368 cmp(Ri, Rlen, Assembler::LSL, 1); 8369 br(Assembler::GE, end); 8370 8371 pre2(Ri, Rlen); 8372 8373 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 8374 lsl(Rj, Rlen, 1); 8375 sub(Rj, Rj, Ri); 8376 sub(Rj, Rj, 1); 8377 lsr(Rj, Rj, 1); 8378 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 8379 } block_comment(" } // j"); 8380 8381 last_squaring(Ri); 8382 8383 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 8384 lsl(Rj, Rlen, 1); 8385 sub(Rj, Rj, Ri); 8386 lsr(Rj, Rj, 1); 8387 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 8388 } block_comment(" } // j"); 8389 8390 post2(Ri, Rlen); 8391 add(Ri, Ri, 1); 8392 cmp(Ri, Rlen, Assembler::LSL, 1); 8393 8394 br(Assembler::LT, loop); 8395 bind(end); 8396 block_comment("} // i"); 8397 } 8398 8399 normalize(Rlen); 8400 8401 mov(Ra, Pm_base); // Save Pm_base in Ra 8402 restore_regs(); // Restore caller's Pm_base 8403 8404 // Copy our result into caller's Pm_base 8405 reverse(Pm_base, Ra, Rlen, t0, t1); 8406 8407 leave(); 8408 ret(lr); 8409 8410 return entry; 8411 } 8412 // In C, approximately: 8413 8414 // void 8415 // montgomery_square(julong Pa_base[], julong Pn_base[], 8416 // julong Pm_base[], julong inv, int len) { 8417 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 8418 // julong *Pa, *Pb, *Pn, *Pm; 8419 // julong Ra, Rb, Rn, Rm; 8420 8421 // int i; 8422 8423 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 8424 8425 // for (i = 0; i < len; i++) { 8426 // int j; 8427 8428 // Pa = Pa_base; 8429 // Pb = Pa_base + i; 8430 // Pm = Pm_base; 8431 // Pn = Pn_base + i; 8432 8433 // Ra = *Pa; 8434 // Rb = *Pb; 8435 // Rm = *Pm; 8436 // Rn = *Pn; 8437 8438 // int iters = (i+1)/2; 8439 // for (j = 0; iters--; j++) { 8440 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8441 // MACC2(Ra, Rb, t0, t1, t2); 8442 // Ra = *++Pa; 8443 // Rb = *--Pb; 8444 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8445 // MACC(Rm, Rn, t0, t1, t2); 8446 // Rm = *++Pm; 8447 // Rn = *--Pn; 8448 // } 8449 // if ((i & 1) == 0) { 8450 // assert(Ra == Pa_base[j], "must be"); 8451 // MACC(Ra, Ra, t0, t1, t2); 8452 // } 8453 // iters = i/2; 8454 // assert(iters == i-j, "must be"); 8455 // for (; iters--; j++) { 8456 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8457 // MACC(Rm, Rn, t0, t1, t2); 8458 // Rm = *++Pm; 8459 // Rn = *--Pn; 8460 // } 8461 8462 // *Pm = Rm = t0 * inv; 8463 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 8464 // MACC(Rm, Rn, t0, t1, t2); 8465 8466 // 
assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };

  void generate_vector_math_stubs() {
    // Get native vector math stub routine addresses
    void* libsleef = nullptr;
    char ebuf[1024];
    char dll_name[JVM_MAXPATHLEN];
    if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) {
      libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf);
    }
    if (libsleef == nullptr) {
      log_info(library)("Failed to load native vector math library, %s!", ebuf);
      return;
    }
    // Method naming convention
    //   All the methods are named as <OP><T><N>_<U><suffix>
    //   Where:
    //     <OP>     is the operation name, e.g. sin
    //     <T>      is optional to indicate float/double
    //              "f/d" for vector float/double operation
    //     <N>      is the number of elements in the vector
    //              "2/4" for neon, and "x" for sve
    //     <U>      is the precision level
    //              "u10/u05" represents 1.0/0.5 ULP error bounds
    //              We use "u10" for all operations by default
    //              But for those functions that do not have u10 support, we use "u05" instead
    //     <suffix> indicates neon/sve
    //              "sve/advsimd" for sve/neon implementations
    //     e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions
    //          cosd2_u10advsimd is the method for computing a 2-element vector double cos using NEON instructions
    //
    log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef));

    // Math vector stubs implemented with SVE for scalable vector size.
    if (UseSVE > 0) {
      for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
        int vop = VectorSupport::VECTOR_OP_MATH_START + op;
        // Skip "tanh" because there is a performance regression
        if (vop == VectorSupport::VECTOR_OP_TANH) {
          continue;
        }

        // The native library does not support the u10 level of "hypot".
        const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";

        snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf);
        StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);

        snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf);
        StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf);
      }
    }

    // Math vector stubs implemented with NEON for 64/128 bits vector size.
    for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) {
      int vop = VectorSupport::VECTOR_OP_MATH_START + op;
      // Skip "tanh" because there is a performance regression
      if (vop == VectorSupport::VECTOR_OP_TANH) {
        continue;
      }

      // The native library does not support the u10 level of "hypot".
      const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10";

      snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf);

      snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf);
      StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);

      snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf);
      StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf);
    }
  }

  // Call here from the interpreter or compiled code to either load
  // multiple returned values from the inline type instance being
  // returned to registers or to store returned values to a newly
  // allocated inline type instance.
  address generate_return_value_stub(address destination, const char* name, bool has_res) {
    // We need to save all registers the calling convention may use so
    // that the runtime calls can read or update those registers. This
    // needs to be in sync with SharedRuntime::java_return_convention().
    // n.b.
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 8601 enum layout { 8602 j_rarg7_off = 0, j_rarg7_2, // j_rarg7 is r0 8603 j_rarg6_off, j_rarg6_2, 8604 j_rarg5_off, j_rarg5_2, 8605 j_rarg4_off, j_rarg4_2, 8606 j_rarg3_off, j_rarg3_2, 8607 j_rarg2_off, j_rarg2_2, 8608 j_rarg1_off, j_rarg1_2, 8609 j_rarg0_off, j_rarg0_2, 8610 8611 j_farg7_off, j_farg7_2, 8612 j_farg6_off, j_farg6_2, 8613 j_farg5_off, j_farg5_2, 8614 j_farg4_off, j_farg4_2, 8615 j_farg3_off, j_farg3_2, 8616 j_farg2_off, j_farg2_2, 8617 j_farg1_off, j_farg1_2, 8618 j_farg0_off, j_farg0_2, 8619 8620 rfp_off, rfp_off2, 8621 return_off, return_off2, 8622 8623 framesize // inclusive of return address 8624 }; 8625 8626 CodeBuffer code(name, 512, 64); 8627 MacroAssembler* masm = new MacroAssembler(&code); 8628 8629 int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16); 8630 assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned"); 8631 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; 8632 int frame_size_in_words = frame_size_in_bytes / wordSize; 8633 8634 OopMapSet* oop_maps = new OopMapSet(); 8635 OopMap* map = new OopMap(frame_size_in_slots, 0); 8636 8637 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg()); 8638 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg()); 8639 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg()); 8640 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg()); 8641 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg()); 8642 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg()); 8643 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg()); 8644 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg()); 8645 8646 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg()); 8647 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg()); 8648 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg()); 8649 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg()); 8650 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg()); 8651 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg()); 8652 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg()); 8653 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg()); 8654 8655 address start = __ pc(); 8656 8657 __ enter(); // Save FP and LR before call 8658 8659 __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize))); 8660 __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize))); 8661 __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize))); 8662 __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize))); 8663 8664 __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize))); 8665 __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize))); 8666 __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize))); 8667 __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize))); 8668 8669 int frame_complete = __ offset(); 8670 8671 // Set up last_Java_sp and last_Java_fp 8672 address the_pc = __ pc(); 8673 __ set_last_Java_frame(sp, noreg, the_pc, rscratch1); 8674 8675 // Call runtime 8676 __ mov(c_rarg1, r0); 8677 __ mov(c_rarg0, rthread); 8678 8679 __ mov(rscratch1, destination); 8680 __ blr(rscratch1); 8681 8682 oop_maps->add_gc_map(the_pc - start, map); 8683 8684 __ 
reset_last_Java_frame(false);

    __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));

    __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));

    __ leave();

    // check for pending exceptions
    Label pending;
    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
    __ cbnz(rscratch1, pending);

    if (has_res) {
      __ get_vm_result(r0, rthread);
    }

    __ ret(lr);

    __ bind(pending);
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    // -------------
    // make sure all code is generated
    masm->flush();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
    return stub->entry_point();
  }

  // Initialization
  void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points

    // entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    if (UseCRC32Intrinsics) {
      // set table address before stub generation which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }

    if (InlineTypeReturnedAsFields) {
      StubRoutines::_load_inline_type_fields_in_regs =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
      StubRoutines::_store_inline_type_fields_to_buf =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
    }

  }

  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
  }

  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != nullptr) {
      StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
    }

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (!InlineSecondarySupersTest) {
        for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
          StubRoutines::_lookup_secondary_supers_table_stubs[slot]
            = generate_lookup_secondary_supers_table_stub(slot);
        }
      }
    }
#endif

    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }

  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // arrays_hashcode stub for large arrays.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
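      // (A sketch of the effect, not the generated code: with
      // /*squaring*/true both multiplicands refer to the same input
      // array, so the multiply stub effectively computes a * a,
      // i.e. roughly montgomery_multiply(a, a, n, m, inv, len) in the
      // C approximation above; generate_multiply() then skips the
      // second operand reversal when _squaring is set.)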
8885 StubRoutines::_montgomerySquare = g.generate_multiply(); 8886 } 8887 8888 generate_vector_math_stubs(); 8889 8890 #endif // COMPILER2 8891 8892 if (UseChaCha20Intrinsics) { 8893 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 8894 } 8895 8896 if (UseBASE64Intrinsics) { 8897 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 8898 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 8899 } 8900 8901 // data cache line writeback 8902 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 8903 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 8904 8905 if (UseAESIntrinsics) { 8906 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 8907 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 8908 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 8909 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 8910 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 8911 } 8912 if (UseGHASHIntrinsics) { 8913 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 8914 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 8915 } 8916 if (UseAESIntrinsics && UseGHASHIntrinsics) { 8917 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 8918 } 8919 8920 if (UseMD5Intrinsics) { 8921 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); 8922 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); 8923 } 8924 if (UseSHA1Intrinsics) { 8925 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 8926 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 8927 } 8928 if (UseSHA256Intrinsics) { 8929 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 8930 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 8931 } 8932 if (UseSHA512Intrinsics) { 8933 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 8934 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 8935 } 8936 if (UseSHA3Intrinsics) { 8937 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress"); 8938 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB"); 8939 } 8940 8941 // generate Adler32 intrinsics code 8942 if (UseAdler32Intrinsics) { 8943 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 8944 } 8945 8946 #endif // COMPILER2_OR_JVMCI 8947 } 8948 8949 public: 8950 StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) { 8951 switch(kind) { 8952 case Initial_stubs: 8953 generate_initial_stubs(); 8954 break; 8955 case Continuation_stubs: 8956 generate_continuation_stubs(); 8957 break; 8958 case Compiler_stubs: 8959 generate_compiler_stubs(); 8960 break; 8961 case Final_stubs: 8962 generate_final_stubs(); 8963 break; 8964 default: 8965 fatal("unexpected stubs kind: %d", kind); 8966 break; 8967 }; 8968 } 8969 }; // end class declaration 8970 8971 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) { 8972 StubGenerator g(code, kind); 8973 } 8974 8975 8976 #if defined (LINUX) 8977 
// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                          \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl  \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                                   \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl        \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX
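// For reference, one expansion of DEFAULT_ATOMIC_OP above, e.g.
// DEFAULT_ATOMIC_OP(fetch_add, 4, ), looks approximately like this:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// so each _impl pointer initially targets the hand-written default in
// atomic_aarch64.S; generate_atomic_entry_points() (called from
// generate_final_stubs() above) may later repoint it at generated code.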