1 /* 2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "asm/macroAssembler.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "asm/register.hpp" 29 #include "atomic_aarch64.hpp" 30 #include "compiler/oopMap.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/gc_globals.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/interpreter.hpp" 36 #include "memory/universe.hpp" 37 #include "nativeInst_aarch64.hpp" 38 #include "oops/instanceOop.hpp" 39 #include "oops/method.hpp" 40 #include "oops/objArrayKlass.hpp" 41 #include "oops/oop.inline.hpp" 42 #include "prims/methodHandles.hpp" 43 #include "prims/upcallLinker.hpp" 44 #include "runtime/arguments.hpp" 45 #include "runtime/atomic.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/continuationEntry.inline.hpp" 48 #include "runtime/frame.inline.hpp" 49 #include "runtime/handles.inline.hpp" 50 #include "runtime/javaThread.hpp" 51 #include "runtime/sharedRuntime.hpp" 52 #include "runtime/stubCodeGenerator.hpp" 53 #include "runtime/stubRoutines.hpp" 54 #include "utilities/align.hpp" 55 #include "utilities/checkedCast.hpp" 56 #include "utilities/debug.hpp" 57 #include "utilities/globalDefinitions.hpp" 58 #include "utilities/intpow.hpp" 59 #include "utilities/powerOfTwo.hpp" 60 #ifdef COMPILER2 61 #include "opto/runtime.hpp" 62 #endif 63 #if INCLUDE_ZGC 64 #include "gc/z/zThreadLocalData.hpp" 65 #endif 66 67 // Declaration and definition of StubGenerator (no .hpp file). 
68 // For a more detailed description of the stub routine structure 69 // see the comment in stubRoutines.hpp 70 71 #undef __ 72 #define __ _masm-> 73 74 #ifdef PRODUCT 75 #define BLOCK_COMMENT(str) /* nothing */ 76 #else 77 #define BLOCK_COMMENT(str) __ block_comment(str) 78 #endif 79 80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 81 82 // Stub Code definitions 83 84 class StubGenerator: public StubCodeGenerator { 85 private: 86 87 #ifdef PRODUCT 88 #define inc_counter_np(counter) ((void)0) 89 #else 90 void inc_counter_np_(uint& counter) { 91 __ incrementw(ExternalAddress((address)&counter)); 92 } 93 #define inc_counter_np(counter) \ 94 BLOCK_COMMENT("inc_counter " #counter); \ 95 inc_counter_np_(counter); 96 #endif 97 98 // Call stubs are used to call Java from C 99 // 100 // Arguments: 101 // c_rarg0: call wrapper address address 102 // c_rarg1: result address 103 // c_rarg2: result type BasicType 104 // c_rarg3: method Method* 105 // c_rarg4: (interpreter) entry point address 106 // c_rarg5: parameters intptr_t* 107 // c_rarg6: parameter size (in words) int 108 // c_rarg7: thread Thread* 109 // 110 // There is no return from the stub itself as any Java result 111 // is written to result 112 // 113 // we save r30 (lr) as the return PC at the base of the frame and 114 // link r29 (fp) below it as the frame pointer installing sp (r31) 115 // into fp. 116 // 117 // we save r0-r7, which accounts for all the c arguments. 118 // 119 // TODO: strictly do we need to save them all? they are treated as 120 // volatile by C so could we omit saving the ones we are going to 121 // place in global registers (thread? method?) or those we only use 122 // during setup of the Java call? 123 // 124 // we don't need to save r8 which C uses as an indirect result location 125 // return register. 126 // 127 // we don't need to save r9-r15 which both C and Java treat as 128 // volatile 129 // 130 // we don't need to save r16-18 because Java does not use them 131 // 132 // we save r19-r28 which Java uses as scratch registers and C 133 // expects to be callee-save 134 // 135 // we save the bottom 64 bits of each value stored in v8-v15; it is 136 // the responsibility of the caller to preserve larger values. 137 // 138 // so the stub frame looks like this when we enter Java code 139 // 140 // [ return_from_Java ] <--- sp 141 // [ argument word n ] 142 // ... 
143 // -29 [ argument word 1 ] 144 // -28 [ saved Floating-point Control Register ] 145 // -26 [ saved v15 ] <--- sp_after_call 146 // -25 [ saved v14 ] 147 // -24 [ saved v13 ] 148 // -23 [ saved v12 ] 149 // -22 [ saved v11 ] 150 // -21 [ saved v10 ] 151 // -20 [ saved v9 ] 152 // -19 [ saved v8 ] 153 // -18 [ saved r28 ] 154 // -17 [ saved r27 ] 155 // -16 [ saved r26 ] 156 // -15 [ saved r25 ] 157 // -14 [ saved r24 ] 158 // -13 [ saved r23 ] 159 // -12 [ saved r22 ] 160 // -11 [ saved r21 ] 161 // -10 [ saved r20 ] 162 // -9 [ saved r19 ] 163 // -8 [ call wrapper (r0) ] 164 // -7 [ result (r1) ] 165 // -6 [ result type (r2) ] 166 // -5 [ method (r3) ] 167 // -4 [ entry point (r4) ] 168 // -3 [ parameters (r5) ] 169 // -2 [ parameter size (r6) ] 170 // -1 [ thread (r7) ] 171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 172 // 1 [ saved lr (r30) ] 173 174 // Call stub stack layout word offsets from fp 175 enum call_stub_layout { 176 sp_after_call_off = -28, 177 178 fpcr_off = sp_after_call_off, 179 d15_off = -26, 180 d13_off = -24, 181 d11_off = -22, 182 d9_off = -20, 183 184 r28_off = -18, 185 r26_off = -16, 186 r24_off = -14, 187 r22_off = -12, 188 r20_off = -10, 189 call_wrapper_off = -8, 190 result_off = -7, 191 result_type_off = -6, 192 method_off = -5, 193 entry_point_off = -4, 194 parameter_size_off = -2, 195 thread_off = -1, 196 fp_f = 0, 197 retaddr_off = 1, 198 }; 199 200 address generate_call_stub(address& return_address) { 201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 203 "adjust this code"); 204 205 StubGenStubId stub_id = StubGenStubId::call_stub_id; 206 StubCodeMark mark(this, stub_id); 207 address start = __ pc(); 208 209 const Address sp_after_call (rfp, sp_after_call_off * wordSize); 210 211 const Address fpcr_save (rfp, fpcr_off * wordSize); 212 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 213 const Address result (rfp, result_off * wordSize); 214 const Address result_type (rfp, result_type_off * wordSize); 215 const Address method (rfp, method_off * wordSize); 216 const Address entry_point (rfp, entry_point_off * wordSize); 217 const Address parameter_size(rfp, parameter_size_off * wordSize); 218 219 const Address thread (rfp, thread_off * wordSize); 220 221 const Address d15_save (rfp, d15_off * wordSize); 222 const Address d13_save (rfp, d13_off * wordSize); 223 const Address d11_save (rfp, d11_off * wordSize); 224 const Address d9_save (rfp, d9_off * wordSize); 225 226 const Address r28_save (rfp, r28_off * wordSize); 227 const Address r26_save (rfp, r26_off * wordSize); 228 const Address r24_save (rfp, r24_off * wordSize); 229 const Address r22_save (rfp, r22_off * wordSize); 230 const Address r20_save (rfp, r20_off * wordSize); 231 232 // stub code 233 234 address aarch64_entry = __ pc(); 235 236 // set up frame and move sp to end of save area 237 __ enter(); 238 __ sub(sp, rfp, -sp_after_call_off * wordSize); 239 240 // save register parameters and Java scratch/global registers 241 // n.b. 
we save thread even though it gets installed in 242 // rthread because we want to sanity check rthread later 243 __ str(c_rarg7, thread); 244 __ strw(c_rarg6, parameter_size); 245 __ stp(c_rarg4, c_rarg5, entry_point); 246 __ stp(c_rarg2, c_rarg3, result_type); 247 __ stp(c_rarg0, c_rarg1, call_wrapper); 248 249 __ stp(r20, r19, r20_save); 250 __ stp(r22, r21, r22_save); 251 __ stp(r24, r23, r24_save); 252 __ stp(r26, r25, r26_save); 253 __ stp(r28, r27, r28_save); 254 255 __ stpd(v9, v8, d9_save); 256 __ stpd(v11, v10, d11_save); 257 __ stpd(v13, v12, d13_save); 258 __ stpd(v15, v14, d15_save); 259 260 __ get_fpcr(rscratch1); 261 __ str(rscratch1, fpcr_save); 262 // Set FPCR to the state we need. We do want Round to Nearest. We 263 // don't want non-IEEE rounding modes or floating-point traps. 264 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode 265 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12) 266 __ set_fpcr(rscratch1); 267 268 // install Java thread in global register now we have saved 269 // whatever value it held 270 __ mov(rthread, c_rarg7); 271 // And method 272 __ mov(rmethod, c_rarg3); 273 274 // set up the heapbase register 275 __ reinit_heapbase(); 276 277 #ifdef ASSERT 278 // make sure we have no pending exceptions 279 { 280 Label L; 281 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 282 __ cmp(rscratch1, (u1)NULL_WORD); 283 __ br(Assembler::EQ, L); 284 __ stop("StubRoutines::call_stub: entered with pending exception"); 285 __ BIND(L); 286 } 287 #endif 288 // pass parameters if any 289 __ mov(esp, sp); 290 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 291 __ andr(sp, rscratch1, -2 * wordSize); 292 293 BLOCK_COMMENT("pass parameters if any"); 294 Label parameters_done; 295 // parameter count is still in c_rarg6 296 // and parameter pointer identifying param 1 is in c_rarg5 297 __ cbzw(c_rarg6, parameters_done); 298 299 address loop = __ pc(); 300 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 301 __ subsw(c_rarg6, c_rarg6, 1); 302 __ push(rscratch1); 303 __ br(Assembler::GT, loop); 304 305 __ BIND(parameters_done); 306 307 // call Java entry -- passing methdoOop, and current sp 308 // rmethod: Method* 309 // r19_sender_sp: sender sp 310 BLOCK_COMMENT("call Java function"); 311 __ mov(r19_sender_sp, sp); 312 __ blr(c_rarg4); 313 314 // we do this here because the notify will already have been done 315 // if we get to the next instruction via an exception 316 // 317 // n.b. adding this instruction here affects the calculation of 318 // whether or not a routine returns to the call stub (used when 319 // doing stack walks) since the normal test is to check the return 320 // pc against the address saved below. so we may need to allow for 321 // this extra instruction in the check. 322 323 // save current address for use by exception handling code 324 325 return_address = __ pc(); 326 327 // store result depending on type (everything that is not 328 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 329 // n.b. this assumes Java returns an integral result in r0 330 // and a floating result in j_farg0 331 // All of j_rargN may be used to return inline type fields so be careful 332 // not to clobber those. 333 // SharedRuntime::generate_buffered_inline_type_adapter() knows the register 334 // assignment of Rresult below. 
335 Register Rresult = r14, Rresult_type = r15; 336 __ ldr(Rresult, result); 337 Label is_long, is_float, is_double, check_prim, exit; 338 __ ldr(Rresult_type, result_type); 339 __ cmp(Rresult_type, (u1)T_OBJECT); 340 __ br(Assembler::EQ, check_prim); 341 __ cmp(Rresult_type, (u1)T_LONG); 342 __ br(Assembler::EQ, is_long); 343 __ cmp(Rresult_type, (u1)T_FLOAT); 344 __ br(Assembler::EQ, is_float); 345 __ cmp(Rresult_type, (u1)T_DOUBLE); 346 __ br(Assembler::EQ, is_double); 347 348 // handle T_INT case 349 __ strw(r0, Address(Rresult)); 350 351 __ BIND(exit); 352 353 // pop parameters 354 __ sub(esp, rfp, -sp_after_call_off * wordSize); 355 356 #ifdef ASSERT 357 // verify that threads correspond 358 { 359 Label L, S; 360 __ ldr(rscratch1, thread); 361 __ cmp(rthread, rscratch1); 362 __ br(Assembler::NE, S); 363 __ get_thread(rscratch1); 364 __ cmp(rthread, rscratch1); 365 __ br(Assembler::EQ, L); 366 __ BIND(S); 367 __ stop("StubRoutines::call_stub: threads must correspond"); 368 __ BIND(L); 369 } 370 #endif 371 372 __ pop_cont_fastpath(rthread); 373 374 // restore callee-save registers 375 __ ldpd(v15, v14, d15_save); 376 __ ldpd(v13, v12, d13_save); 377 __ ldpd(v11, v10, d11_save); 378 __ ldpd(v9, v8, d9_save); 379 380 __ ldp(r28, r27, r28_save); 381 __ ldp(r26, r25, r26_save); 382 __ ldp(r24, r23, r24_save); 383 __ ldp(r22, r21, r22_save); 384 __ ldp(r20, r19, r20_save); 385 386 // restore fpcr 387 __ ldr(rscratch1, fpcr_save); 388 __ set_fpcr(rscratch1); 389 390 __ ldp(c_rarg0, c_rarg1, call_wrapper); 391 __ ldrw(c_rarg2, result_type); 392 __ ldr(c_rarg3, method); 393 __ ldp(c_rarg4, c_rarg5, entry_point); 394 __ ldp(c_rarg6, c_rarg7, parameter_size); 395 396 // leave frame and return to caller 397 __ leave(); 398 __ ret(lr); 399 400 // handle return types different from T_INT 401 __ BIND(check_prim); 402 if (InlineTypeReturnedAsFields) { 403 // Check for scalarized return value 404 __ tbz(r0, 0, is_long); 405 // Load pack handler address 406 __ andr(rscratch1, r0, -2); 407 __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset())); 408 __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset())); 409 __ blr(rscratch1); 410 __ b(exit); 411 } 412 413 __ BIND(is_long); 414 __ str(r0, Address(Rresult, 0)); 415 __ br(Assembler::AL, exit); 416 417 __ BIND(is_float); 418 __ strs(j_farg0, Address(Rresult, 0)); 419 __ br(Assembler::AL, exit); 420 421 __ BIND(is_double); 422 __ strd(j_farg0, Address(Rresult, 0)); 423 __ br(Assembler::AL, exit); 424 425 return start; 426 } 427 428 // Return point for a Java call if there's an exception thrown in 429 // Java code. The exception is caught and transformed into a 430 // pending exception stored in JavaThread that can be tested from 431 // within the VM. 432 // 433 // Note: Usually the parameters are removed by the callee. In case 434 // of an exception crossing an activation frame boundary, that is 435 // not the case if the callee is compiled code => need to setup the 436 // rsp. 
437 // 438 // r0: exception oop 439 440 address generate_catch_exception() { 441 StubGenStubId stub_id = StubGenStubId::catch_exception_id; 442 StubCodeMark mark(this, stub_id); 443 address start = __ pc(); 444 445 // same as in generate_call_stub(): 446 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 447 const Address thread (rfp, thread_off * wordSize); 448 449 #ifdef ASSERT 450 // verify that threads correspond 451 { 452 Label L, S; 453 __ ldr(rscratch1, thread); 454 __ cmp(rthread, rscratch1); 455 __ br(Assembler::NE, S); 456 __ get_thread(rscratch1); 457 __ cmp(rthread, rscratch1); 458 __ br(Assembler::EQ, L); 459 __ bind(S); 460 __ stop("StubRoutines::catch_exception: threads must correspond"); 461 __ bind(L); 462 } 463 #endif 464 465 // set pending exception 466 __ verify_oop(r0); 467 468 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 469 __ mov(rscratch1, (address)__FILE__); 470 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 471 __ movw(rscratch1, (int)__LINE__); 472 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 473 474 // complete return to VM 475 assert(StubRoutines::_call_stub_return_address != nullptr, 476 "_call_stub_return_address must have been generated before"); 477 __ b(StubRoutines::_call_stub_return_address); 478 479 return start; 480 } 481 482 // Continuation point for runtime calls returning with a pending 483 // exception. The pending exception check happened in the runtime 484 // or native call stub. The pending exception in Thread is 485 // converted into a Java-level exception. 486 // 487 // Contract with Java-level exception handlers: 488 // r0: exception 489 // r3: throwing pc 490 // 491 // NOTE: At entry of this stub, exception-pc must be in LR !! 492 493 // NOTE: this is always used as a jump target within generated code 494 // so it just needs to be generated code with no x86 prolog 495 496 address generate_forward_exception() { 497 StubGenStubId stub_id = StubGenStubId::forward_exception_id; 498 StubCodeMark mark(this, stub_id); 499 address start = __ pc(); 500 501 // Upon entry, LR points to the return address returning into 502 // Java (interpreted or compiled) code; i.e., the return address 503 // becomes the throwing pc. 504 // 505 // Arguments pushed before the runtime call are still on the stack 506 // but the exception handler will reset the stack pointer -> 507 // ignore them. A potential result in registers can be ignored as 508 // well. 509 510 #ifdef ASSERT 511 // make sure this code is only executed if there is a pending exception 512 { 513 Label L; 514 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 515 __ cbnz(rscratch1, L); 516 __ stop("StubRoutines::forward exception: no pending exception (1)"); 517 __ bind(L); 518 } 519 #endif 520 521 // compute exception handler into r19 522 523 // call the VM to find the handler address associated with the 524 // caller address. pass thread in r0 and caller pc (ret address) 525 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 526 // the stack. 527 __ mov(c_rarg1, lr); 528 // lr will be trashed by the VM call so we move it to R19 529 // (callee-saved) because we also need to pass it to the handler 530 // returned by this call. 
531 __ mov(r19, lr); 532 BLOCK_COMMENT("call exception_handler_for_return_address"); 533 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 534 SharedRuntime::exception_handler_for_return_address), 535 rthread, c_rarg1); 536 // Reinitialize the ptrue predicate register, in case the external runtime 537 // call clobbers ptrue reg, as we may return to SVE compiled code. 538 __ reinitialize_ptrue(); 539 540 // we should not really care that lr is no longer the callee 541 // address. we saved the value the handler needs in r19 so we can 542 // just copy it to r3. however, the C2 handler will push its own 543 // frame and then calls into the VM and the VM code asserts that 544 // the PC for the frame above the handler belongs to a compiled 545 // Java method. So, we restore lr here to satisfy that assert. 546 __ mov(lr, r19); 547 // setup r0 & r3 & clear pending exception 548 __ mov(r3, r19); 549 __ mov(r19, r0); 550 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 551 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 552 553 #ifdef ASSERT 554 // make sure exception is set 555 { 556 Label L; 557 __ cbnz(r0, L); 558 __ stop("StubRoutines::forward exception: no pending exception (2)"); 559 __ bind(L); 560 } 561 #endif 562 563 // continue at exception handler 564 // r0: exception 565 // r3: throwing pc 566 // r19: exception handler 567 __ verify_oop(r0); 568 __ br(r19); 569 570 return start; 571 } 572 573 // Non-destructive plausibility checks for oops 574 // 575 // Arguments: 576 // r0: oop to verify 577 // rscratch1: error message 578 // 579 // Stack after saving c_rarg3: 580 // [tos + 0]: saved c_rarg3 581 // [tos + 1]: saved c_rarg2 582 // [tos + 2]: saved lr 583 // [tos + 3]: saved rscratch2 584 // [tos + 4]: saved r0 585 // [tos + 5]: saved rscratch1 586 address generate_verify_oop() { 587 StubGenStubId stub_id = StubGenStubId::verify_oop_id; 588 StubCodeMark mark(this, stub_id); 589 address start = __ pc(); 590 591 Label exit, error; 592 593 // save c_rarg2 and c_rarg3 594 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 595 596 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 597 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 598 __ ldr(c_rarg3, Address(c_rarg2)); 599 __ add(c_rarg3, c_rarg3, 1); 600 __ str(c_rarg3, Address(c_rarg2)); 601 602 // object is in r0 603 // make sure object is 'reasonable' 604 __ cbz(r0, exit); // if obj is null it is OK 605 606 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 607 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error); 608 609 // return if everything seems ok 610 __ bind(exit); 611 612 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 613 __ ret(lr); 614 615 // handle errors 616 __ bind(error); 617 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 618 619 __ push(RegSet::range(r0, r29), sp); 620 // debug(char* msg, int64_t pc, int64_t regs[]) 621 __ mov(c_rarg0, rscratch1); // pass address of error message 622 __ mov(c_rarg1, lr); // pass return address 623 __ mov(c_rarg2, sp); // pass address of regs on stack 624 #ifndef PRODUCT 625 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 626 #endif 627 BLOCK_COMMENT("call MacroAssembler::debug"); 628 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 629 __ blr(rscratch1); 630 __ hlt(0); 631 632 return start; 633 } 634 635 // Generate indices for iota vector. 
636 address generate_iota_indices(StubGenStubId stub_id) { 637 __ align(CodeEntryAlignment); 638 StubCodeMark mark(this, stub_id); 639 address start = __ pc(); 640 // B 641 __ emit_data64(0x0706050403020100, relocInfo::none); 642 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 643 // H 644 __ emit_data64(0x0003000200010000, relocInfo::none); 645 __ emit_data64(0x0007000600050004, relocInfo::none); 646 // S 647 __ emit_data64(0x0000000100000000, relocInfo::none); 648 __ emit_data64(0x0000000300000002, relocInfo::none); 649 // D 650 __ emit_data64(0x0000000000000000, relocInfo::none); 651 __ emit_data64(0x0000000000000001, relocInfo::none); 652 // S - FP 653 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 654 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 655 // D - FP 656 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 657 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 658 return start; 659 } 660 661 // The inner part of zero_words(). This is the bulk operation, 662 // zeroing words in blocks, possibly using DC ZVA to do it. The 663 // caller is responsible for zeroing the last few words. 664 // 665 // Inputs: 666 // r10: the HeapWord-aligned base address of an array to zero. 667 // r11: the count in HeapWords, r11 > 0. 668 // 669 // Returns r10 and r11, adjusted for the caller to clear. 670 // r10: the base address of the tail of words left to clear. 671 // r11: the number of words in the tail. 672 // r11 < MacroAssembler::zero_words_block_size. 673 674 address generate_zero_blocks() { 675 Label done; 676 Label base_aligned; 677 678 Register base = r10, cnt = r11; 679 680 __ align(CodeEntryAlignment); 681 StubGenStubId stub_id = StubGenStubId::zero_blocks_id; 682 StubCodeMark mark(this, stub_id); 683 address start = __ pc(); 684 685 if (UseBlockZeroing) { 686 int zva_length = VM_Version::zva_length(); 687 688 // Ensure ZVA length can be divided by 16. This is required by 689 // the subsequent operations. 690 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 691 692 __ tbz(base, 3, base_aligned); 693 __ str(zr, Address(__ post(base, 8))); 694 __ sub(cnt, cnt, 1); 695 __ bind(base_aligned); 696 697 // Ensure count >= zva_length * 2 so that it still deserves a zva after 698 // alignment. 699 Label small; 700 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 701 __ subs(rscratch1, cnt, low_limit >> 3); 702 __ br(Assembler::LT, small); 703 __ zero_dcache_blocks(base, cnt); 704 __ bind(small); 705 } 706 707 { 708 // Number of stp instructions we'll unroll 709 const int unroll = 710 MacroAssembler::zero_words_block_size / 2; 711 // Clear the remaining blocks. 712 Label loop; 713 __ subs(cnt, cnt, unroll * 2); 714 __ br(Assembler::LT, done); 715 __ bind(loop); 716 for (int i = 0; i < unroll; i++) 717 __ stp(zr, zr, __ post(base, 16)); 718 __ subs(cnt, cnt, unroll * 2); 719 __ br(Assembler::GE, loop); 720 __ bind(done); 721 __ add(cnt, cnt, unroll * 2); 722 } 723 724 __ ret(lr); 725 726 return start; 727 } 728 729 730 typedef enum { 731 copy_forwards = 1, 732 copy_backwards = -1 733 } copy_direction; 734 735 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 736 // for arraycopy stubs. 
737 class ArrayCopyBarrierSetHelper : StackObj { 738 BarrierSetAssembler* _bs_asm; 739 MacroAssembler* _masm; 740 DecoratorSet _decorators; 741 BasicType _type; 742 Register _gct1; 743 Register _gct2; 744 Register _gct3; 745 FloatRegister _gcvt1; 746 FloatRegister _gcvt2; 747 FloatRegister _gcvt3; 748 749 public: 750 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 751 DecoratorSet decorators, 752 BasicType type, 753 Register gct1, 754 Register gct2, 755 Register gct3, 756 FloatRegister gcvt1, 757 FloatRegister gcvt2, 758 FloatRegister gcvt3) 759 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 760 _masm(masm), 761 _decorators(decorators), 762 _type(type), 763 _gct1(gct1), 764 _gct2(gct2), 765 _gct3(gct3), 766 _gcvt1(gcvt1), 767 _gcvt2(gcvt2), 768 _gcvt3(gcvt3) { 769 } 770 771 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 772 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 773 dst1, dst2, src, 774 _gct1, _gct2, _gcvt1); 775 } 776 777 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 778 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 779 dst, src1, src2, 780 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 781 } 782 783 void copy_load_at_16(Register dst1, Register dst2, Address src) { 784 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 785 dst1, dst2, src, 786 _gct1); 787 } 788 789 void copy_store_at_16(Address dst, Register src1, Register src2) { 790 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 791 dst, src1, src2, 792 _gct1, _gct2, _gct3); 793 } 794 795 void copy_load_at_8(Register dst, Address src) { 796 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 797 dst, noreg, src, 798 _gct1); 799 } 800 801 void copy_store_at_8(Address dst, Register src) { 802 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 803 dst, src, noreg, 804 _gct1, _gct2, _gct3); 805 } 806 }; 807 808 // Bulk copy of blocks of 8 words. 809 // 810 // count is a count of words. 811 // 812 // Precondition: count >= 8 813 // 814 // Postconditions: 815 // 816 // The least significant bit of count contains the remaining count 817 // of words to copy. The rest of count is trash. 818 // 819 // s and d are adjusted to point to the remaining words to copy 820 // 821 void generate_copy_longs(StubGenStubId stub_id, DecoratorSet decorators, Label &start, Register s, Register d, Register count) { 822 BasicType type; 823 copy_direction direction; 824 825 switch (stub_id) { 826 case copy_byte_f_id: 827 direction = copy_forwards; 828 type = T_BYTE; 829 break; 830 case copy_byte_b_id: 831 direction = copy_backwards; 832 type = T_BYTE; 833 break; 834 case copy_oop_f_id: 835 direction = copy_forwards; 836 type = T_OBJECT; 837 break; 838 case copy_oop_b_id: 839 direction = copy_backwards; 840 type = T_OBJECT; 841 break; 842 case copy_oop_uninit_f_id: 843 direction = copy_forwards; 844 type = T_OBJECT; 845 break; 846 case copy_oop_uninit_b_id: 847 direction = copy_backwards; 848 type = T_OBJECT; 849 break; 850 default: 851 ShouldNotReachHere(); 852 } 853 854 int unit = wordSize * direction; 855 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 856 857 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 858 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 859 const Register stride = r14; 860 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 861 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 862 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 863 864 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 865 assert_different_registers(s, d, count, rscratch1, rscratch2); 866 867 Label again, drain; 868 869 __ align(CodeEntryAlignment); 870 871 StubCodeMark mark(this, stub_id); 872 873 __ bind(start); 874 875 Label unaligned_copy_long; 876 if (AvoidUnalignedAccesses) { 877 __ tbnz(d, 3, unaligned_copy_long); 878 } 879 880 if (direction == copy_forwards) { 881 __ sub(s, s, bias); 882 __ sub(d, d, bias); 883 } 884 885 #ifdef ASSERT 886 // Make sure we are never given < 8 words 887 { 888 Label L; 889 __ cmp(count, (u1)8); 890 __ br(Assembler::GE, L); 891 __ stop("genrate_copy_longs called with < 8 words"); 892 __ bind(L); 893 } 894 #endif 895 896 // Fill 8 registers 897 if (UseSIMDForMemoryOps) { 898 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 899 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 900 } else { 901 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 902 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 903 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 904 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 905 } 906 907 __ subs(count, count, 16); 908 __ br(Assembler::LO, drain); 909 910 int prefetch = PrefetchCopyIntervalInBytes; 911 bool use_stride = false; 912 if (direction == copy_backwards) { 913 use_stride = prefetch > 256; 914 prefetch = -prefetch; 915 if (use_stride) __ mov(stride, prefetch); 916 } 917 918 __ bind(again); 919 920 if (PrefetchCopyIntervalInBytes > 0) 921 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 922 923 if (UseSIMDForMemoryOps) { 924 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 925 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 926 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 927 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 928 } else { 929 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 930 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 931 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 932 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 933 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 934 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 935 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 936 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 937 } 938 939 __ subs(count, count, 8); 940 __ br(Assembler::HS, again); 941 942 // Drain 943 __ bind(drain); 944 if (UseSIMDForMemoryOps) { 945 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 946 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 947 } else { 948 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 949 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 950 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 951 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 952 } 953 954 { 955 Label L1, L2; 956 __ tbz(count, exact_log2(4), L1); 957 if (UseSIMDForMemoryOps) { 958 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 959 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 960 } else { 961 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 962 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 963 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 964 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 965 } 966 __ bind(L1); 967 968 if (direction == copy_forwards) { 969 __ add(s, s, bias); 970 __ add(d, d, bias); 971 } 972 973 __ tbz(count, 1, L2); 974 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 975 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 976 __ bind(L2); 977 } 978 979 __ ret(lr); 980 981 if (AvoidUnalignedAccesses) { 982 Label drain, again; 983 // Register order for storing. Order is different for backward copy. 984 985 __ bind(unaligned_copy_long); 986 987 // source address is even aligned, target odd aligned 988 // 989 // when forward copying word pairs we read long pairs at offsets 990 // {0, 2, 4, 6} (in long words). when backwards copying we read 991 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 992 // address by -2 in the forwards case so we can compute the 993 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 994 // or -1. 995 // 996 // when forward copying we need to store 1 word, 3 pairs and 997 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 998 // zero offset We adjust the destination by -1 which means we 999 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 1000 // 1001 // When backwards copyng we need to store 1 word, 3 pairs and 1002 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 1003 // offsets {1, 3, 5, 7, 8} * unit. 
1004 1005 if (direction == copy_forwards) { 1006 __ sub(s, s, 16); 1007 __ sub(d, d, 8); 1008 } 1009 1010 // Fill 8 registers 1011 // 1012 // for forwards copy s was offset by -16 from the original input 1013 // value of s so the register contents are at these offsets 1014 // relative to the 64 bit block addressed by that original input 1015 // and so on for each successive 64 byte block when s is updated 1016 // 1017 // t0 at offset 0, t1 at offset 8 1018 // t2 at offset 16, t3 at offset 24 1019 // t4 at offset 32, t5 at offset 40 1020 // t6 at offset 48, t7 at offset 56 1021 1022 // for backwards copy s was not offset so the register contents 1023 // are at these offsets into the preceding 64 byte block 1024 // relative to that original input and so on for each successive 1025 // preceding 64 byte block when s is updated. this explains the 1026 // slightly counter-intuitive looking pattern of register usage 1027 // in the stp instructions for backwards copy. 1028 // 1029 // t0 at offset -16, t1 at offset -8 1030 // t2 at offset -32, t3 at offset -24 1031 // t4 at offset -48, t5 at offset -40 1032 // t6 at offset -64, t7 at offset -56 1033 1034 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1035 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1036 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1037 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1038 1039 __ subs(count, count, 16); 1040 __ br(Assembler::LO, drain); 1041 1042 int prefetch = PrefetchCopyIntervalInBytes; 1043 bool use_stride = false; 1044 if (direction == copy_backwards) { 1045 use_stride = prefetch > 256; 1046 prefetch = -prefetch; 1047 if (use_stride) __ mov(stride, prefetch); 1048 } 1049 1050 __ bind(again); 1051 1052 if (PrefetchCopyIntervalInBytes > 0) 1053 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1054 1055 if (direction == copy_forwards) { 1056 // allowing for the offset of -8 the store instructions place 1057 // registers into the target 64 bit block at the following 1058 // offsets 1059 // 1060 // t0 at offset 0 1061 // t1 at offset 8, t2 at offset 16 1062 // t3 at offset 24, t4 at offset 32 1063 // t5 at offset 40, t6 at offset 48 1064 // t7 at offset 56 1065 1066 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1067 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1068 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1069 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1070 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1071 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1072 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1073 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1074 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1075 } else { 1076 // d was not offset when we started so the registers are 1077 // written into the 64 bit block preceding d with the following 1078 // offsets 1079 // 1080 // t1 at offset -8 1081 // t3 at offset -24, t0 at offset -16 1082 // t5 at offset -48, t2 at offset -32 1083 // t7 at offset -56, t4 at offset -48 1084 // t6 at offset -64 1085 // 1086 // note that this matches the offsets previously noted for the 1087 // loads 1088 1089 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1090 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1091 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1092 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1093 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1094 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1095 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1096 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1097 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1098 } 1099 1100 __ subs(count, count, 8); 1101 __ br(Assembler::HS, again); 1102 1103 // Drain 1104 // 1105 // this uses the same pattern of offsets and register arguments 1106 // as above 1107 __ bind(drain); 1108 if (direction == copy_forwards) { 1109 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1110 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1111 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1112 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1113 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1114 } else { 1115 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1116 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1117 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1118 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1119 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1120 } 1121 // now we need to copy any remaining part block which may 1122 // include a 4 word block subblock and/or a 2 word subblock. 
1123 // bits 2 and 1 in the count are the tell-tale for whether we 1124 // have each such subblock 1125 { 1126 Label L1, L2; 1127 __ tbz(count, exact_log2(4), L1); 1128 // this is the same as above but copying only 4 longs hence 1129 // with only one intervening stp between the str instructions 1130 // but note that the offsets and registers still follow the 1131 // same pattern 1132 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1133 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 1134 if (direction == copy_forwards) { 1135 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1136 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1137 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3); 1138 } else { 1139 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1140 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1141 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2); 1142 } 1143 __ bind(L1); 1144 1145 __ tbz(count, 1, L2); 1146 // this is the same as above but copying only 2 longs hence 1147 // there is no intervening stp between the str instructions 1148 // but note that the offset and register patterns are still 1149 // the same 1150 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit))); 1151 if (direction == copy_forwards) { 1152 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1153 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1); 1154 } else { 1155 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1156 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0); 1157 } 1158 __ bind(L2); 1159 1160 // for forwards copy we need to re-adjust the offsets we 1161 // applied so that s and d are follow the last words written 1162 1163 if (direction == copy_forwards) { 1164 __ add(s, s, 16); 1165 __ add(d, d, 8); 1166 } 1167 1168 } 1169 1170 __ ret(lr); 1171 } 1172 } 1173 1174 // Small copy: less than 16 bytes. 1175 // 1176 // NB: Ignores all of the bits of count which represent more than 15 1177 // bytes, so a caller doesn't have to mask them. 1178 1179 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) { 1180 bool is_backwards = step < 0; 1181 size_t granularity = uabs(step); 1182 int direction = is_backwards ? -1 : 1; 1183 1184 Label Lword, Lint, Lshort, Lbyte; 1185 1186 assert(granularity 1187 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1188 1189 const Register t0 = r3; 1190 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1191 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); 1192 1193 // ??? I don't know if this bit-test-and-branch is the right thing 1194 // to do. It does a lot of jumping, resulting in several 1195 // mispredicted branches. It might make more sense to do this 1196 // with something like Duff's device with a single computed branch. 
1197 1198 __ tbz(count, 3 - exact_log2(granularity), Lword); 1199 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1200 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1201 __ bind(Lword); 1202 1203 if (granularity <= sizeof (jint)) { 1204 __ tbz(count, 2 - exact_log2(granularity), Lint); 1205 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1206 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1207 __ bind(Lint); 1208 } 1209 1210 if (granularity <= sizeof (jshort)) { 1211 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1212 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1213 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1214 __ bind(Lshort); 1215 } 1216 1217 if (granularity <= sizeof (jbyte)) { 1218 __ tbz(count, 0, Lbyte); 1219 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1220 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1221 __ bind(Lbyte); 1222 } 1223 } 1224 1225 Label copy_f, copy_b; 1226 Label copy_obj_f, copy_obj_b; 1227 Label copy_obj_uninit_f, copy_obj_uninit_b; 1228 1229 // All-singing all-dancing memory copy. 1230 // 1231 // Copy count units of memory from s to d. The size of a unit is 1232 // step, which can be positive or negative depending on the direction 1233 // of copy. If is_aligned is false, we align the source address. 1234 // 1235 1236 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1237 Register s, Register d, Register count, int step) { 1238 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1239 bool is_backwards = step < 0; 1240 unsigned int granularity = uabs(step); 1241 const Register t0 = r3, t1 = r4; 1242 1243 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always 1244 // load all the data before writing anything 1245 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1246 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1247 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1248 const Register send = r17, dend = r16; 1249 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1250 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1251 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1252 1253 if (PrefetchCopyIntervalInBytes > 0) 1254 __ prfm(Address(s, 0), PLDL1KEEP); 1255 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity)); 1256 __ br(Assembler::HI, copy_big); 1257 1258 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); 1259 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); 1260 1261 __ cmp(count, u1(16/granularity)); 1262 __ br(Assembler::LS, copy16); 1263 1264 __ cmp(count, u1(64/granularity)); 1265 __ br(Assembler::HI, copy80); 1266 1267 __ cmp(count, u1(32/granularity)); 1268 __ br(Assembler::LS, copy32); 1269 1270 // 33..64 bytes 1271 if (UseSIMDForMemoryOps) { 1272 bs.copy_load_at_32(v0, v1, Address(s, 0)); 1273 bs.copy_load_at_32(v2, v3, Address(send, -32)); 1274 bs.copy_store_at_32(Address(d, 0), v0, v1); 1275 bs.copy_store_at_32(Address(dend, -32), v2, v3); 1276 } else { 1277 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1278 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1279 bs.copy_load_at_16(t4, t5, Address(send, -32)); 1280 bs.copy_load_at_16(t6, t7, Address(send, -16)); 1281 1282 bs.copy_store_at_16(Address(d, 0), t0, t1); 1283 bs.copy_store_at_16(Address(d, 16), t2, t3); 1284 bs.copy_store_at_16(Address(dend, -32), t4, t5); 1285 bs.copy_store_at_16(Address(dend, -16), t6, t7); 1286 } 1287 __ b(finish); 1288 1289 // 17..32 bytes 1290 __ bind(copy32); 1291 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1292 bs.copy_load_at_16(t6, t7, Address(send, -16)); 1293 1294 bs.copy_store_at_16(Address(d, 0), t0, t1); 1295 bs.copy_store_at_16(Address(dend, -16), t6, t7); 1296 __ b(finish); 1297 1298 // 65..80/96 bytes 1299 // (96 bytes if SIMD because we do 32 byes per instruction) 1300 __ bind(copy80); 1301 if (UseSIMDForMemoryOps) { 1302 bs.copy_load_at_32(v0, v1, Address(s, 0)); 1303 bs.copy_load_at_32(v2, v3, Address(s, 32)); 1304 // Unaligned pointers can be an issue for copying. 1305 // The issue has more chances to happen when granularity of data is 1306 // less than 4(sizeof(jint)). Pointers for arrays of jint are at least 1307 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned. 1308 // The most performance drop has been seen for the range 65-80 bytes. 1309 // For such cases using the pair of ldp/stp instead of the third pair of 1310 // ldpq/stpq fixes the performance issue. 
1311 if (granularity < sizeof (jint)) { 1312 Label copy96; 1313 __ cmp(count, u1(80/granularity)); 1314 __ br(Assembler::HI, copy96); 1315 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1316 1317 bs.copy_store_at_32(Address(d, 0), v0, v1); 1318 bs.copy_store_at_32(Address(d, 32), v2, v3); 1319 1320 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1321 __ b(finish); 1322 1323 __ bind(copy96); 1324 } 1325 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1326 1327 bs.copy_store_at_32(Address(d, 0), v0, v1); 1328 bs.copy_store_at_32(Address(d, 32), v2, v3); 1329 1330 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1331 } else { 1332 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1333 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1334 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1335 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1336 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1337 1338 bs.copy_store_at_16(Address(d, 0), t0, t1); 1339 bs.copy_store_at_16(Address(d, 16), t2, t3); 1340 bs.copy_store_at_16(Address(d, 32), t4, t5); 1341 bs.copy_store_at_16(Address(d, 48), t6, t7); 1342 bs.copy_store_at_16(Address(dend, -16), t8, t9); 1343 } 1344 __ b(finish); 1345 1346 // 0..16 bytes 1347 __ bind(copy16); 1348 __ cmp(count, u1(8/granularity)); 1349 __ br(Assembler::LO, copy8); 1350 1351 // 8..16 bytes 1352 bs.copy_load_at_8(t0, Address(s, 0)); 1353 bs.copy_load_at_8(t1, Address(send, -8)); 1354 bs.copy_store_at_8(Address(d, 0), t0); 1355 bs.copy_store_at_8(Address(dend, -8), t1); 1356 __ b(finish); 1357 1358 if (granularity < 8) { 1359 // 4..7 bytes 1360 __ bind(copy8); 1361 __ tbz(count, 2 - exact_log2(granularity), copy4); 1362 __ ldrw(t0, Address(s, 0)); 1363 __ ldrw(t1, Address(send, -4)); 1364 __ strw(t0, Address(d, 0)); 1365 __ strw(t1, Address(dend, -4)); 1366 __ b(finish); 1367 if (granularity < 4) { 1368 // 0..3 bytes 1369 __ bind(copy4); 1370 __ cbz(count, finish); // get rid of 0 case 1371 if (granularity == 2) { 1372 __ ldrh(t0, Address(s, 0)); 1373 __ strh(t0, Address(d, 0)); 1374 } else { // granularity == 1 1375 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1376 // the first and last byte. 1377 // Handle the 3 byte case by loading and storing base + count/2 1378 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1379 // This does means in the 1 byte case we load/store the same 1380 // byte 3 times. 1381 __ lsr(count, count, 1); 1382 __ ldrb(t0, Address(s, 0)); 1383 __ ldrb(t1, Address(send, -1)); 1384 __ ldrb(t2, Address(s, count)); 1385 __ strb(t0, Address(d, 0)); 1386 __ strb(t1, Address(dend, -1)); 1387 __ strb(t2, Address(d, count)); 1388 } 1389 __ b(finish); 1390 } 1391 } 1392 1393 __ bind(copy_big); 1394 if (is_backwards) { 1395 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1396 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1397 } 1398 1399 // Now we've got the small case out of the way we can align the 1400 // source address on a 2-word boundary. 1401 1402 // Here we will materialize a count in r15, which is used by copy_memory_small 1403 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1404 // Up until here, we have used t9, which aliases r15, but from here on, that register 1405 // can not be used as a temp register, as it contains the count. 1406 1407 Label aligned; 1408 1409 if (is_aligned) { 1410 // We may have to adjust by 1 word to get s 2-word-aligned. 
1411 __ tbz(s, exact_log2(wordSize), aligned); 1412 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1413 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1414 __ sub(count, count, wordSize/granularity); 1415 } else { 1416 if (is_backwards) { 1417 __ andr(r15, s, 2 * wordSize - 1); 1418 } else { 1419 __ neg(r15, s); 1420 __ andr(r15, r15, 2 * wordSize - 1); 1421 } 1422 // r15 is the byte adjustment needed to align s. 1423 __ cbz(r15, aligned); 1424 int shift = exact_log2(granularity); 1425 if (shift > 0) { 1426 __ lsr(r15, r15, shift); 1427 } 1428 __ sub(count, count, r15); 1429 1430 #if 0 1431 // ?? This code is only correct for a disjoint copy. It may or 1432 // may not make sense to use it in that case. 1433 1434 // Copy the first pair; s and d may not be aligned. 1435 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1436 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1437 1438 // Align s and d, adjust count 1439 if (is_backwards) { 1440 __ sub(s, s, r15); 1441 __ sub(d, d, r15); 1442 } else { 1443 __ add(s, s, r15); 1444 __ add(d, d, r15); 1445 } 1446 #else 1447 copy_memory_small(decorators, type, s, d, r15, step); 1448 #endif 1449 } 1450 1451 __ bind(aligned); 1452 1453 // s is now 2-word-aligned. 1454 1455 // We have a count of units and some trailing bytes. Adjust the 1456 // count and do a bulk copy of words. If the shift is zero 1457 // perform a move instead to benefit from zero latency moves. 1458 int shift = exact_log2(wordSize/granularity); 1459 if (shift > 0) { 1460 __ lsr(r15, count, shift); 1461 } else { 1462 __ mov(r15, count); 1463 } 1464 if (direction == copy_forwards) { 1465 if (type != T_OBJECT) { 1466 __ bl(copy_f); 1467 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1468 __ bl(copy_obj_uninit_f); 1469 } else { 1470 __ bl(copy_obj_f); 1471 } 1472 } else { 1473 if (type != T_OBJECT) { 1474 __ bl(copy_b); 1475 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1476 __ bl(copy_obj_uninit_b); 1477 } else { 1478 __ bl(copy_obj_b); 1479 } 1480 } 1481 1482 // And the tail. 1483 copy_memory_small(decorators, type, s, d, count, step); 1484 1485 if (granularity >= 8) __ bind(copy8); 1486 if (granularity >= 4) __ bind(copy4); 1487 __ bind(finish); 1488 } 1489 1490 1491 void clobber_registers() { 1492 #ifdef ASSERT 1493 RegSet clobbered 1494 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1495 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1496 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1497 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1498 __ mov(*it, rscratch1); 1499 } 1500 #endif 1501 1502 } 1503 1504 // Scan over array at a for count oops, verifying each one. 1505 // Preserves a and count, clobbers rscratch1 and rscratch2. 
1506 void verify_oop_array (int size, Register a, Register count, Register temp) { 1507 Label loop, end; 1508 __ mov(rscratch1, a); 1509 __ mov(rscratch2, zr); 1510 __ bind(loop); 1511 __ cmp(rscratch2, count); 1512 __ br(Assembler::HS, end); 1513 if (size == wordSize) { 1514 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1515 __ verify_oop(temp); 1516 } else { 1517 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1518 __ decode_heap_oop(temp); // calls verify_oop 1519 } 1520 __ add(rscratch2, rscratch2, 1); 1521 __ b(loop); 1522 __ bind(end); 1523 } 1524 1525 // Arguments: 1526 // stub_id - is used to name the stub and identify all details of 1527 // how to perform the copy. 1528 // 1529 // entry - is assigned to the stub's post push entry point unless 1530 // it is null 1531 // 1532 // Inputs: 1533 // c_rarg0 - source array address 1534 // c_rarg1 - destination array address 1535 // c_rarg2 - element count, treated as ssize_t, can be zero 1536 // 1537 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1538 // the hardware handle it. The two dwords within qwords that span 1539 // cache line boundaries will still be loaded and stored atomically. 1540 // 1541 // Side Effects: entry is set to the (post push) entry point so it 1542 // can be used by the corresponding conjoint copy 1543 // method 1544 // 1545 address generate_disjoint_copy(StubGenStubId stub_id, address *entry) { 1546 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1547 RegSet saved_reg = RegSet::of(s, d, count); 1548 int size; 1549 bool aligned; 1550 bool is_oop; 1551 bool dest_uninitialized; 1552 switch (stub_id) { 1553 case jbyte_disjoint_arraycopy_id: 1554 size = sizeof(jbyte); 1555 aligned = false; 1556 is_oop = false; 1557 dest_uninitialized = false; 1558 break; 1559 case arrayof_jbyte_disjoint_arraycopy_id: 1560 size = sizeof(jbyte); 1561 aligned = true; 1562 is_oop = false; 1563 dest_uninitialized = false; 1564 break; 1565 case jshort_disjoint_arraycopy_id: 1566 size = sizeof(jshort); 1567 aligned = false; 1568 is_oop = false; 1569 dest_uninitialized = false; 1570 break; 1571 case arrayof_jshort_disjoint_arraycopy_id: 1572 size = sizeof(jshort); 1573 aligned = true; 1574 is_oop = false; 1575 dest_uninitialized = false; 1576 break; 1577 case jint_disjoint_arraycopy_id: 1578 size = sizeof(jint); 1579 aligned = false; 1580 is_oop = false; 1581 dest_uninitialized = false; 1582 break; 1583 case arrayof_jint_disjoint_arraycopy_id: 1584 size = sizeof(jint); 1585 aligned = true; 1586 is_oop = false; 1587 dest_uninitialized = false; 1588 break; 1589 case jlong_disjoint_arraycopy_id: 1590 // since this is always aligned we can (should!) use the same 1591 // stub as for case arrayof_jlong_disjoint_arraycopy 1592 ShouldNotReachHere(); 1593 break; 1594 case arrayof_jlong_disjoint_arraycopy_id: 1595 size = sizeof(jlong); 1596 aligned = true; 1597 is_oop = false; 1598 dest_uninitialized = false; 1599 break; 1600 case oop_disjoint_arraycopy_id: 1601 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1602 aligned = !UseCompressedOops; 1603 is_oop = true; 1604 dest_uninitialized = false; 1605 break; 1606 case arrayof_oop_disjoint_arraycopy_id: 1607 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1608 aligned = !UseCompressedOops; 1609 is_oop = true; 1610 dest_uninitialized = false; 1611 break; 1612 case oop_disjoint_arraycopy_uninit_id: 1613 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1614 aligned = !UseCompressedOops; 1615 is_oop = true; 1616 dest_uninitialized = true; 1617 break; 1618 case arrayof_oop_disjoint_arraycopy_uninit_id: 1619 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1620 aligned = !UseCompressedOops; 1621 is_oop = true; 1622 dest_uninitialized = true; 1623 break; 1624 default: 1625 ShouldNotReachHere(); 1626 break; 1627 } 1628 1629 __ align(CodeEntryAlignment); 1630 StubCodeMark mark(this, stub_id); 1631 address start = __ pc(); 1632 __ enter(); 1633 1634 if (entry != nullptr) { 1635 *entry = __ pc(); 1636 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1637 BLOCK_COMMENT("Entry:"); 1638 } 1639 1640 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1641 if (dest_uninitialized) { 1642 decorators |= IS_DEST_UNINITIALIZED; 1643 } 1644 if (aligned) { 1645 decorators |= ARRAYCOPY_ALIGNED; 1646 } 1647 1648 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1649 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1650 1651 if (is_oop) { 1652 // save regs before copy_memory 1653 __ push(RegSet::of(d, count), sp); 1654 } 1655 { 1656 // UnsafeMemoryAccess page error: continue after unsafe access 1657 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1658 UnsafeMemoryAccessMark umam(this, add_entry, true); 1659 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size); 1660 } 1661 1662 if (is_oop) { 1663 __ pop(RegSet::of(d, count), sp); 1664 if (VerifyOops) 1665 verify_oop_array(size, d, count, r16); 1666 } 1667 1668 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1669 1670 __ leave(); 1671 __ mov(r0, zr); // return 0 1672 __ ret(lr); 1673 return start; 1674 } 1675 1676 // Arguments: 1677 // stub_id - is used to name the stub and identify all details of 1678 // how to perform the copy. 1679 // 1680 // nooverlap_target - identifes the (post push) entry for the 1681 // corresponding disjoint copy routine which can be 1682 // jumped to if the ranges do not actually overlap 1683 // 1684 // entry - is assigned to the stub's post push entry point unless 1685 // it is null 1686 // 1687 // 1688 // Inputs: 1689 // c_rarg0 - source array address 1690 // c_rarg1 - destination array address 1691 // c_rarg2 - element count, treated as ssize_t, can be zero 1692 // 1693 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1694 // the hardware handle it. The two dwords within qwords that span 1695 // cache line boundaries will still be loaded and stored atomically. 
1696 // 1697 // Side Effects: 1698 // entry is set to the no-overlap entry point so it can be used by 1699 // some other conjoint copy method 1700 // 1701 address generate_conjoint_copy(StubGenStubId stub_id, address nooverlap_target, address *entry) { 1702 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1703 RegSet saved_regs = RegSet::of(s, d, count); 1704 int size; 1705 bool aligned; 1706 bool is_oop; 1707 bool dest_uninitialized; 1708 switch (stub_id) { 1709 case jbyte_arraycopy_id: 1710 size = sizeof(jbyte); 1711 aligned = false; 1712 is_oop = false; 1713 dest_uninitialized = false; 1714 break; 1715 case arrayof_jbyte_arraycopy_id: 1716 size = sizeof(jbyte); 1717 aligned = true; 1718 is_oop = false; 1719 dest_uninitialized = false; 1720 break; 1721 case jshort_arraycopy_id: 1722 size = sizeof(jshort); 1723 aligned = false; 1724 is_oop = false; 1725 dest_uninitialized = false; 1726 break; 1727 case arrayof_jshort_arraycopy_id: 1728 size = sizeof(jshort); 1729 aligned = true; 1730 is_oop = false; 1731 dest_uninitialized = false; 1732 break; 1733 case jint_arraycopy_id: 1734 size = sizeof(jint); 1735 aligned = false; 1736 is_oop = false; 1737 dest_uninitialized = false; 1738 break; 1739 case arrayof_jint_arraycopy_id: 1740 size = sizeof(jint); 1741 aligned = true; 1742 is_oop = false; 1743 dest_uninitialized = false; 1744 break; 1745 case jlong_arraycopy_id: 1746 // since this is always aligned we can (should!) use the same 1747 // stub as for case arrayof_jlong_disjoint_arraycopy 1748 ShouldNotReachHere(); 1749 break; 1750 case arrayof_jlong_arraycopy_id: 1751 size = sizeof(jlong); 1752 aligned = true; 1753 is_oop = false; 1754 dest_uninitialized = false; 1755 break; 1756 case oop_arraycopy_id: 1757 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1758 aligned = !UseCompressedOops; 1759 is_oop = true; 1760 dest_uninitialized = false; 1761 break; 1762 case arrayof_oop_arraycopy_id: 1763 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1764 aligned = !UseCompressedOops; 1765 is_oop = true; 1766 dest_uninitialized = false; 1767 break; 1768 case oop_arraycopy_uninit_id: 1769 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1770 aligned = !UseCompressedOops; 1771 is_oop = true; 1772 dest_uninitialized = true; 1773 break; 1774 case arrayof_oop_arraycopy_uninit_id: 1775 size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1776 aligned = !UseCompressedOops; 1777 is_oop = true; 1778 dest_uninitialized = true; 1779 break; 1780 default: 1781 ShouldNotReachHere(); 1782 } 1783 1784 StubCodeMark mark(this, stub_id); 1785 address start = __ pc(); 1786 __ enter(); 1787 1788 if (entry != nullptr) { 1789 *entry = __ pc(); 1790 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1791 BLOCK_COMMENT("Entry:"); 1792 } 1793 1794 // use fwd copy when (d-s) above_equal (count*size) 1795 __ sub(rscratch1, d, s); 1796 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1797 __ br(Assembler::HS, nooverlap_target); 1798 1799 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1800 if (dest_uninitialized) { 1801 decorators |= IS_DEST_UNINITIALIZED; 1802 } 1803 if (aligned) { 1804 decorators |= ARRAYCOPY_ALIGNED; 1805 } 1806 1807 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1808 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1809 1810 if (is_oop) { 1811 // save regs before copy_memory 1812 __ push(RegSet::of(d, count), sp); 1813 } 1814 { 1815 // UnsafeMemoryAccess page error: continue after unsafe access 1816 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1817 UnsafeMemoryAccessMark umam(this, add_entry, true); 1818 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1819 } 1820 if (is_oop) { 1821 __ pop(RegSet::of(d, count), sp); 1822 if (VerifyOops) 1823 verify_oop_array(size, d, count, r16); 1824 } 1825 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1826 __ leave(); 1827 __ mov(r0, zr); // return 0 1828 __ ret(lr); 1829 return start; 1830 } 1831 1832 // Helper for generating a dynamic type check. 1833 // Smashes rscratch1, rscratch2. 1834 void generate_type_check(Register sub_klass, 1835 Register super_check_offset, 1836 Register super_klass, 1837 Register temp1, 1838 Register temp2, 1839 Register result, 1840 Label& L_success) { 1841 assert_different_registers(sub_klass, super_check_offset, super_klass); 1842 1843 BLOCK_COMMENT("type_check:"); 1844 1845 Label L_miss; 1846 1847 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, 1848 super_check_offset); 1849 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr); 1850 1851 // Fall through on failure! 
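    // Note: reaching L_miss means neither check accepted the type. The fast
    // path compares the word at sub_klass + super_check_offset against
    // super_klass (which covers self and the primary-supers display); the
    // slow path scans sub_klass's secondary supers. A rough sketch of the
    // decision in plain C++ (illustrative only; the helper name below is
    // hypothetical and stands in for the secondary-supers scan):
    //
    //   bool is_subtype_of(Klass* sub, Klass* super, int super_check_offset) {
    //     if (*(Klass**)((address)sub + super_check_offset) == super) {
    //       return true;                                 // fast path hit
    //     }
    //     if (super_check_offset != in_bytes(Klass::secondary_super_cache_offset())) {
    //       return false;                                // display miss is definitive
    //     }
    //     return scan_secondary_supers(sub, super);      // slow path
    //   }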
1852 __ BIND(L_miss); 1853 } 1854 1855 // 1856 // Generate checkcasting array copy stub 1857 // 1858 // Input: 1859 // c_rarg0 - source array address 1860 // c_rarg1 - destination array address 1861 // c_rarg2 - element count, treated as ssize_t, can be zero 1862 // c_rarg3 - size_t ckoff (super_check_offset) 1863 // c_rarg4 - oop ckval (super_klass) 1864 // 1865 // Output: 1866 // r0 == 0 - success 1867 // r0 == -1^K - failure, where K is partial transfer count 1868 // 1869 address generate_checkcast_copy(StubGenStubId stub_id, address *entry) { 1870 bool dest_uninitialized; 1871 switch (stub_id) { 1872 case checkcast_arraycopy_id: 1873 dest_uninitialized = false; 1874 break; 1875 case checkcast_arraycopy_uninit_id: 1876 dest_uninitialized = true; 1877 break; 1878 default: 1879 ShouldNotReachHere(); 1880 } 1881 1882 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1883 1884 // Input registers (after setup_arg_regs) 1885 const Register from = c_rarg0; // source array address 1886 const Register to = c_rarg1; // destination array address 1887 const Register count = c_rarg2; // elementscount 1888 const Register ckoff = c_rarg3; // super_check_offset 1889 const Register ckval = c_rarg4; // super_klass 1890 1891 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1892 RegSet wb_post_saved_regs = RegSet::of(count); 1893 1894 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1895 const Register copied_oop = r22; // actual oop copied 1896 const Register count_save = r21; // orig elementscount 1897 const Register start_to = r20; // destination array start address 1898 const Register r19_klass = r19; // oop._klass 1899 1900 // Registers used as gc temps (r5, r6, r7 are save-on-call) 1901 const Register gct1 = r5, gct2 = r6, gct3 = r7; 1902 1903 //--------------------------------------------------------------- 1904 // Assembler stub will be used for this call to arraycopy 1905 // if the two arrays are subtypes of Object[] but the 1906 // destination array type is not equal to or a supertype 1907 // of the source type. Each element must be separately 1908 // checked. 1909 1910 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1911 copied_oop, r19_klass, count_save); 1912 1913 __ align(CodeEntryAlignment); 1914 StubCodeMark mark(this, stub_id); 1915 address start = __ pc(); 1916 1917 __ enter(); // required for proper stackwalking of RuntimeStub frame 1918 1919 #ifdef ASSERT 1920 // caller guarantees that the arrays really are different 1921 // otherwise, we would have to make conjoint checks 1922 { Label L; 1923 __ b(L); // conjoint check not yet implemented 1924 __ stop("checkcast_copy within a single array"); 1925 __ bind(L); 1926 } 1927 #endif //ASSERT 1928 1929 // Caller of this entry point must set up the argument registers. 1930 if (entry != nullptr) { 1931 *entry = __ pc(); 1932 BLOCK_COMMENT("Entry:"); 1933 } 1934 1935 // Empty array: Nothing to do. 1936 __ cbz(count, L_done); 1937 __ push(RegSet::of(r19, r20, r21, r22), sp); 1938 1939 #ifdef ASSERT 1940 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1941 // The ckoff and ckval must be mutually consistent, 1942 // even though caller generates both. 
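    // ("Consistent" here means ckoff == ckval->super_check_offset(); the
    // assert below simply reloads that field from ckval and compares it
    // against the ckoff value the caller passed in c_rarg3.)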
1943 { Label L; 1944 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1945 __ ldrw(start_to, Address(ckval, sco_offset)); 1946 __ cmpw(ckoff, start_to); 1947 __ br(Assembler::EQ, L); 1948 __ stop("super_check_offset inconsistent"); 1949 __ bind(L); 1950 } 1951 #endif //ASSERT 1952 1953 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1954 bool is_oop = true; 1955 int element_size = UseCompressedOops ? 4 : 8; 1956 if (dest_uninitialized) { 1957 decorators |= IS_DEST_UNINITIALIZED; 1958 } 1959 1960 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1961 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1962 1963 // save the original count 1964 __ mov(count_save, count); 1965 1966 // Copy from low to high addresses 1967 __ mov(start_to, to); // Save destination array start address 1968 __ b(L_load_element); 1969 1970 // ======== begin loop ======== 1971 // (Loop is rotated; its entry is L_load_element.) 1972 // Loop control: 1973 // for (; count != 0; count--) { 1974 // copied_oop = load_heap_oop(from++); 1975 // ... generate_type_check ...; 1976 // store_heap_oop(to++, copied_oop); 1977 // } 1978 __ align(OptoLoopAlignment); 1979 1980 __ BIND(L_store_element); 1981 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size, 1982 __ post(to, element_size), copied_oop, noreg, 1983 gct1, gct2, gct3); 1984 __ sub(count, count, 1); 1985 __ cbz(count, L_do_card_marks); 1986 1987 // ======== loop entry is here ======== 1988 __ BIND(L_load_element); 1989 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size, 1990 copied_oop, noreg, __ post(from, element_size), 1991 gct1); 1992 __ cbz(copied_oop, L_store_element); 1993 1994 __ load_klass(r19_klass, copied_oop);// query the object klass 1995 1996 BLOCK_COMMENT("type_check:"); 1997 generate_type_check(/*sub_klass*/r19_klass, 1998 /*super_check_offset*/ckoff, 1999 /*super_klass*/ckval, 2000 /*r_array_base*/gct1, 2001 /*temp2*/gct2, 2002 /*result*/r10, L_store_element); 2003 2004 // Fall through on failure! 2005 2006 // ======== end loop ======== 2007 2008 // It was a real error; we must depend on the caller to finish the job. 2009 // Register count = remaining oops, count_orig = total oops. 2010 // Emit GC store barriers for the oops we have copied and report 2011 // their number to the caller. 2012 2013 __ subs(count, count_save, count); // K = partially copied oop count 2014 __ eon(count, count, zr); // report (-1^K) to caller 2015 __ br(Assembler::EQ, L_done_pop); 2016 2017 __ BIND(L_do_card_marks); 2018 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 2019 2020 __ bind(L_done_pop); 2021 __ pop(RegSet::of(r19, r20, r21, r22), sp); 2022 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 2023 2024 __ bind(L_done); 2025 __ mov(r0, count); 2026 __ leave(); 2027 __ ret(lr); 2028 2029 return start; 2030 } 2031 2032 // Perform range checks on the proposed arraycopy. 2033 // Kills temp, but nothing else. 2034 // Also, clean the sign bits of src_pos and dst_pos. 
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
2085 // 2086 address generate_unsafe_copy(address byte_copy_entry, 2087 address short_copy_entry, 2088 address int_copy_entry, 2089 address long_copy_entry) { 2090 StubGenStubId stub_id = StubGenStubId::unsafe_arraycopy_id; 2091 2092 Label L_long_aligned, L_int_aligned, L_short_aligned; 2093 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2094 2095 __ align(CodeEntryAlignment); 2096 StubCodeMark mark(this, stub_id); 2097 address start = __ pc(); 2098 __ enter(); // required for proper stackwalking of RuntimeStub frame 2099 2100 // bump this on entry, not on exit: 2101 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2102 2103 __ orr(rscratch1, s, d); 2104 __ orr(rscratch1, rscratch1, count); 2105 2106 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2107 __ cbz(rscratch1, L_long_aligned); 2108 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2109 __ cbz(rscratch1, L_int_aligned); 2110 __ tbz(rscratch1, 0, L_short_aligned); 2111 __ b(RuntimeAddress(byte_copy_entry)); 2112 2113 __ BIND(L_short_aligned); 2114 __ lsr(count, count, LogBytesPerShort); // size => short_count 2115 __ b(RuntimeAddress(short_copy_entry)); 2116 __ BIND(L_int_aligned); 2117 __ lsr(count, count, LogBytesPerInt); // size => int_count 2118 __ b(RuntimeAddress(int_copy_entry)); 2119 __ BIND(L_long_aligned); 2120 __ lsr(count, count, LogBytesPerLong); // size => long_count 2121 __ b(RuntimeAddress(long_copy_entry)); 2122 2123 return start; 2124 } 2125 2126 // 2127 // Generate generic array copy stubs 2128 // 2129 // Input: 2130 // c_rarg0 - src oop 2131 // c_rarg1 - src_pos (32-bits) 2132 // c_rarg2 - dst oop 2133 // c_rarg3 - dst_pos (32-bits) 2134 // c_rarg4 - element count (32-bits) 2135 // 2136 // Output: 2137 // r0 == 0 - success 2138 // r0 == -1^K - failure, where K is partial transfer count 2139 // 2140 address generate_generic_copy(address byte_copy_entry, address short_copy_entry, 2141 address int_copy_entry, address oop_copy_entry, 2142 address long_copy_entry, address checkcast_copy_entry) { 2143 StubGenStubId stub_id = StubGenStubId::generic_arraycopy_id; 2144 2145 Label L_failed, L_objArray; 2146 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2147 2148 // Input registers 2149 const Register src = c_rarg0; // source array oop 2150 const Register src_pos = c_rarg1; // source position 2151 const Register dst = c_rarg2; // destination array oop 2152 const Register dst_pos = c_rarg3; // destination position 2153 const Register length = c_rarg4; 2154 2155 2156 // Registers used as temps 2157 const Register dst_klass = c_rarg5; 2158 2159 __ align(CodeEntryAlignment); 2160 2161 StubCodeMark mark(this, stub_id); 2162 2163 address start = __ pc(); 2164 2165 __ enter(); // required for proper stackwalking of RuntimeStub frame 2166 2167 // bump this on entry, not on exit: 2168 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2169 2170 //----------------------------------------------------------------------- 2171 // Assembler stub will be used for this call to arraycopy 2172 // if the following conditions are met: 2173 // 2174 // (1) src and dst must not be null. 2175 // (2) src_pos must not be negative. 2176 // (3) dst_pos must not be negative. 2177 // (4) length must not be negative. 2178 // (5) src klass and dst klass should be the same and not null. 2179 // (6) src and dst should be arrays. 2180 // (7) src_pos + length must not exceed length of src. 2181 // (8) dst_pos + length must not exceed length of dst. 
2182 // 2183 2184 // if (src == nullptr) return -1; 2185 __ cbz(src, L_failed); 2186 2187 // if (src_pos < 0) return -1; 2188 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2189 2190 // if (dst == nullptr) return -1; 2191 __ cbz(dst, L_failed); 2192 2193 // if (dst_pos < 0) return -1; 2194 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2195 2196 // registers used as temp 2197 const Register scratch_length = r16; // elements count to copy 2198 const Register scratch_src_klass = r17; // array klass 2199 const Register lh = r15; // layout helper 2200 2201 // if (length < 0) return -1; 2202 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2203 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2204 2205 __ load_klass(scratch_src_klass, src); 2206 #ifdef ASSERT 2207 // assert(src->klass() != nullptr); 2208 { 2209 BLOCK_COMMENT("assert klasses not null {"); 2210 Label L1, L2; 2211 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null 2212 __ bind(L1); 2213 __ stop("broken null klass"); 2214 __ bind(L2); 2215 __ load_klass(rscratch1, dst); 2216 __ cbz(rscratch1, L1); // this would be broken also 2217 BLOCK_COMMENT("} assert klasses not null done"); 2218 } 2219 #endif 2220 2221 // Load layout helper (32-bits) 2222 // 2223 // |array_tag| | header_size | element_type | |log2_element_size| 2224 // 32 30 24 16 8 2 0 2225 // 2226 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2227 // 2228 2229 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2230 2231 // Handle objArrays completely differently... 2232 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2233 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2234 __ movw(rscratch1, objArray_lh); 2235 __ eorw(rscratch2, lh, rscratch1); 2236 __ cbzw(rscratch2, L_objArray); 2237 2238 // if (src->klass() != dst->klass()) return -1; 2239 __ load_klass(rscratch2, dst); 2240 __ eor(rscratch2, rscratch2, scratch_src_klass); 2241 __ cbnz(rscratch2, L_failed); 2242 2243 // Check for flat inline type array -> return -1 2244 __ test_flat_array_oop(src, rscratch2, L_failed); 2245 2246 // Check for null-free (non-flat) inline type array -> handle as object array 2247 __ test_null_free_array_oop(src, rscratch2, L_objArray); 2248 2249 // if (!src->is_Array()) return -1; 2250 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2251 2252 // At this point, it is known to be a typeArray (array_tag 0x3). 
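    // For reference, the layout helper being decoded below encodes, for an
    // int[] array (typical values; the header size depends on the object
    // layout in use):
    //
    //   (0x3 << 30) | (header_size << 16) | (T_INT << 8) | log2(sizeof(jint))
    //
    // e.g. 0xC0100A02 with a 16-byte array header. The ubfx below extracts
    // the header size, and the two low bits (log2 element size: 0 = byte,
    // 1 = short, 2 = int, 3 = long) drive the tbnz binary search that picks
    // the byte/short/int/long copy loop.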
2253 #ifdef ASSERT 2254 { 2255 BLOCK_COMMENT("assert primitive array {"); 2256 Label L; 2257 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2258 __ cmpw(lh, rscratch2); 2259 __ br(Assembler::GE, L); 2260 __ stop("must be a primitive array"); 2261 __ bind(L); 2262 BLOCK_COMMENT("} assert primitive array done"); 2263 } 2264 #endif 2265 2266 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2267 rscratch2, L_failed); 2268 2269 // TypeArrayKlass 2270 // 2271 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2272 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2273 // 2274 2275 const Register rscratch1_offset = rscratch1; // array offset 2276 const Register r15_elsize = lh; // element size 2277 2278 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2279 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2280 __ add(src, src, rscratch1_offset); // src array offset 2281 __ add(dst, dst, rscratch1_offset); // dst array offset 2282 BLOCK_COMMENT("choose copy loop based on element size"); 2283 2284 // next registers should be set before the jump to corresponding stub 2285 const Register from = c_rarg0; // source array address 2286 const Register to = c_rarg1; // destination array address 2287 const Register count = c_rarg2; // elements count 2288 2289 // 'from', 'to', 'count' registers should be set in such order 2290 // since they are the same as 'src', 'src_pos', 'dst'. 2291 2292 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2293 2294 // The possible values of elsize are 0-3, i.e. exact_log2(element 2295 // size in bytes). We do a simple bitwise binary search. 2296 __ BIND(L_copy_bytes); 2297 __ tbnz(r15_elsize, 1, L_copy_ints); 2298 __ tbnz(r15_elsize, 0, L_copy_shorts); 2299 __ lea(from, Address(src, src_pos));// src_addr 2300 __ lea(to, Address(dst, dst_pos));// dst_addr 2301 __ movw(count, scratch_length); // length 2302 __ b(RuntimeAddress(byte_copy_entry)); 2303 2304 __ BIND(L_copy_shorts); 2305 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2306 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2307 __ movw(count, scratch_length); // length 2308 __ b(RuntimeAddress(short_copy_entry)); 2309 2310 __ BIND(L_copy_ints); 2311 __ tbnz(r15_elsize, 0, L_copy_longs); 2312 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2313 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2314 __ movw(count, scratch_length); // length 2315 __ b(RuntimeAddress(int_copy_entry)); 2316 2317 __ BIND(L_copy_longs); 2318 #ifdef ASSERT 2319 { 2320 BLOCK_COMMENT("assert long copy {"); 2321 Label L; 2322 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2323 __ cmpw(r15_elsize, LogBytesPerLong); 2324 __ br(Assembler::EQ, L); 2325 __ stop("must be long copy, but elsize is wrong"); 2326 __ bind(L); 2327 BLOCK_COMMENT("} assert long copy done"); 2328 } 2329 #endif 2330 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2331 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2332 __ movw(count, scratch_length); // length 2333 __ b(RuntimeAddress(long_copy_entry)); 2334 2335 // ObjArrayKlass 2336 __ BIND(L_objArray); 2337 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2338 2339 Label L_plain_copy, L_checkcast_copy; 2340 // test array classes for subtyping 2341 __ load_klass(r15, dst); 2342 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2343 __ br(Assembler::NE, L_checkcast_copy); 2344 2345 // Identically typed arrays can be copied without element-wise checks. 2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2347 rscratch2, L_failed); 2348 2349 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2350 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2351 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2352 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2353 __ movw(count, scratch_length); // length 2354 __ BIND(L_plain_copy); 2355 __ b(RuntimeAddress(oop_copy_entry)); 2356 2357 __ BIND(L_checkcast_copy); 2358 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2359 { 2360 // Before looking at dst.length, make sure dst is also an objArray. 2361 __ ldrw(rscratch1, Address(r15, lh_offset)); 2362 __ movw(rscratch2, objArray_lh); 2363 __ eorw(rscratch1, rscratch1, rscratch2); 2364 __ cbnzw(rscratch1, L_failed); 2365 2366 // It is safe to examine both src.length and dst.length. 2367 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2368 r15, L_failed); 2369 2370 __ load_klass(dst_klass, dst); // reload 2371 2372 // Marshal the base address arguments now, freeing registers. 2373 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2374 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2375 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2376 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2377 __ movw(count, length); // length (reloaded) 2378 Register sco_temp = c_rarg3; // this register is free now 2379 assert_different_registers(from, to, count, sco_temp, 2380 dst_klass, scratch_src_klass); 2381 // assert_clean_int(count, sco_temp); 2382 2383 // Generate the type check. 2384 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2385 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2386 2387 // Smashes rscratch1, rscratch2 2388 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg, 2389 L_plain_copy); 2390 2391 // Fetch destination element klass from the ObjArrayKlass header. 2392 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2393 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2394 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2395 2396 // the checkcast_copy loop needs two extra arguments: 2397 assert(c_rarg3 == sco_temp, "#3 already in place"); 2398 // Set up arguments for checkcast_copy_entry. 2399 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2400 __ b(RuntimeAddress(checkcast_copy_entry)); 2401 } 2402 2403 __ BIND(L_failed); 2404 __ mov(r0, -1); 2405 __ leave(); // required for proper stackwalking of RuntimeStub frame 2406 __ ret(lr); 2407 2408 return start; 2409 } 2410 2411 // 2412 // Generate stub for array fill. If "aligned" is true, the 2413 // "to" address is assumed to be heapword aligned. 
2414 // 2415 // Arguments for generated stub: 2416 // to: c_rarg0 2417 // value: c_rarg1 2418 // count: c_rarg2 treated as signed 2419 // 2420 address generate_fill(StubGenStubId stub_id) { 2421 BasicType t; 2422 bool aligned; 2423 2424 switch (stub_id) { 2425 case jbyte_fill_id: 2426 t = T_BYTE; 2427 aligned = false; 2428 break; 2429 case jshort_fill_id: 2430 t = T_SHORT; 2431 aligned = false; 2432 break; 2433 case jint_fill_id: 2434 t = T_INT; 2435 aligned = false; 2436 break; 2437 case arrayof_jbyte_fill_id: 2438 t = T_BYTE; 2439 aligned = true; 2440 break; 2441 case arrayof_jshort_fill_id: 2442 t = T_SHORT; 2443 aligned = true; 2444 break; 2445 case arrayof_jint_fill_id: 2446 t = T_INT; 2447 aligned = true; 2448 break; 2449 default: 2450 ShouldNotReachHere(); 2451 }; 2452 2453 __ align(CodeEntryAlignment); 2454 StubCodeMark mark(this, stub_id); 2455 address start = __ pc(); 2456 2457 BLOCK_COMMENT("Entry:"); 2458 2459 const Register to = c_rarg0; // source array address 2460 const Register value = c_rarg1; // value 2461 const Register count = c_rarg2; // elements count 2462 2463 const Register bz_base = r10; // base for block_zero routine 2464 const Register cnt_words = r11; // temp register 2465 2466 __ enter(); 2467 2468 Label L_fill_elements, L_exit1; 2469 2470 int shift = -1; 2471 switch (t) { 2472 case T_BYTE: 2473 shift = 0; 2474 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2475 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2476 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2477 __ br(Assembler::LO, L_fill_elements); 2478 break; 2479 case T_SHORT: 2480 shift = 1; 2481 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2482 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2483 __ br(Assembler::LO, L_fill_elements); 2484 break; 2485 case T_INT: 2486 shift = 2; 2487 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2488 __ br(Assembler::LO, L_fill_elements); 2489 break; 2490 default: ShouldNotReachHere(); 2491 } 2492 2493 // Align source address at 8 bytes address boundary. 2494 Label L_skip_align1, L_skip_align2, L_skip_align4; 2495 if (!aligned) { 2496 switch (t) { 2497 case T_BYTE: 2498 // One byte misalignment happens only for byte arrays. 2499 __ tbz(to, 0, L_skip_align1); 2500 __ strb(value, Address(__ post(to, 1))); 2501 __ subw(count, count, 1); 2502 __ bind(L_skip_align1); 2503 // Fallthrough 2504 case T_SHORT: 2505 // Two bytes misalignment happens only for byte and short (char) arrays. 2506 __ tbz(to, 1, L_skip_align2); 2507 __ strh(value, Address(__ post(to, 2))); 2508 __ subw(count, count, 2 >> shift); 2509 __ bind(L_skip_align2); 2510 // Fallthrough 2511 case T_INT: 2512 // Align to 8 bytes, we know we are 4 byte aligned to start. 2513 __ tbz(to, 2, L_skip_align4); 2514 __ strw(value, Address(__ post(to, 4))); 2515 __ subw(count, count, 4 >> shift); 2516 __ bind(L_skip_align4); 2517 break; 2518 default: ShouldNotReachHere(); 2519 } 2520 } 2521 2522 // 2523 // Fill large chunks 2524 // 2525 __ lsrw(cnt_words, count, 3 - shift); // number of words 2526 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2527 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2528 if (UseBlockZeroing) { 2529 Label non_block_zeroing, rest; 2530 // If the fill value is zero we can use the fast zero_words(). 
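      // (State at this point, taking T_BYTE as an example: a fill value of
      // 0xAB has been replicated by the bfi sequence into the 64-bit
      // pattern 0xABABABABABABABAB, and cnt_words holds the number of
      // 8-byte words to store. The cbnz below falls back to fill_words
      // whenever that pattern is non-zero.)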
2531 __ cbnz(value, non_block_zeroing); 2532 __ mov(bz_base, to); 2533 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2534 address tpc = __ zero_words(bz_base, cnt_words); 2535 if (tpc == nullptr) { 2536 fatal("CodeCache is full at generate_fill"); 2537 } 2538 __ b(rest); 2539 __ bind(non_block_zeroing); 2540 __ fill_words(to, cnt_words, value); 2541 __ bind(rest); 2542 } else { 2543 __ fill_words(to, cnt_words, value); 2544 } 2545 2546 // Remaining count is less than 8 bytes. Fill it by a single store. 2547 // Note that the total length is no less than 8 bytes. 2548 if (t == T_BYTE || t == T_SHORT) { 2549 Label L_exit1; 2550 __ cbzw(count, L_exit1); 2551 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2552 __ str(value, Address(to, -8)); // overwrite some elements 2553 __ bind(L_exit1); 2554 __ leave(); 2555 __ ret(lr); 2556 } 2557 2558 // Handle copies less than 8 bytes. 2559 Label L_fill_2, L_fill_4, L_exit2; 2560 __ bind(L_fill_elements); 2561 switch (t) { 2562 case T_BYTE: 2563 __ tbz(count, 0, L_fill_2); 2564 __ strb(value, Address(__ post(to, 1))); 2565 __ bind(L_fill_2); 2566 __ tbz(count, 1, L_fill_4); 2567 __ strh(value, Address(__ post(to, 2))); 2568 __ bind(L_fill_4); 2569 __ tbz(count, 2, L_exit2); 2570 __ strw(value, Address(to)); 2571 break; 2572 case T_SHORT: 2573 __ tbz(count, 0, L_fill_4); 2574 __ strh(value, Address(__ post(to, 2))); 2575 __ bind(L_fill_4); 2576 __ tbz(count, 1, L_exit2); 2577 __ strw(value, Address(to)); 2578 break; 2579 case T_INT: 2580 __ cbzw(count, L_exit2); 2581 __ strw(value, Address(to)); 2582 break; 2583 default: ShouldNotReachHere(); 2584 } 2585 __ bind(L_exit2); 2586 __ leave(); 2587 __ ret(lr); 2588 return start; 2589 } 2590 2591 address generate_data_cache_writeback() { 2592 const Register line = c_rarg0; // address of line to write back 2593 2594 __ align(CodeEntryAlignment); 2595 2596 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_id; 2597 StubCodeMark mark(this, stub_id); 2598 2599 address start = __ pc(); 2600 __ enter(); 2601 __ cache_wb(Address(line, 0)); 2602 __ leave(); 2603 __ ret(lr); 2604 2605 return start; 2606 } 2607 2608 address generate_data_cache_writeback_sync() { 2609 const Register is_pre = c_rarg0; // pre or post sync 2610 2611 __ align(CodeEntryAlignment); 2612 2613 StubGenStubId stub_id = StubGenStubId::data_cache_writeback_sync_id; 2614 StubCodeMark mark(this, stub_id); 2615 2616 // pre wbsync is a no-op 2617 // post wbsync translates to an sfence 2618 2619 Label skip; 2620 address start = __ pc(); 2621 __ enter(); 2622 __ cbnz(is_pre, skip); 2623 __ cache_wbsync(false); 2624 __ bind(skip); 2625 __ leave(); 2626 __ ret(lr); 2627 2628 return start; 2629 } 2630 2631 void generate_arraycopy_stubs() { 2632 address entry; 2633 address entry_jbyte_arraycopy; 2634 address entry_jshort_arraycopy; 2635 address entry_jint_arraycopy; 2636 address entry_oop_arraycopy; 2637 address entry_jlong_arraycopy; 2638 address entry_checkcast_arraycopy; 2639 2640 generate_copy_longs(StubGenStubId::copy_byte_f_id, IN_HEAP | IS_ARRAY, copy_f, r0, r1, r15); 2641 generate_copy_longs(StubGenStubId::copy_byte_b_id, IN_HEAP | IS_ARRAY, copy_b, r0, r1, r15); 2642 2643 generate_copy_longs(StubGenStubId::copy_oop_f_id, IN_HEAP | IS_ARRAY, copy_obj_f, r0, r1, r15); 2644 generate_copy_longs(StubGenStubId::copy_oop_b_id, IN_HEAP | IS_ARRAY, copy_obj_b, r0, r1, r15); 2645 2646 generate_copy_longs(StubGenStubId::copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_f, r0, r1, r15); 
2647 generate_copy_longs(StubGenStubId::copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, copy_obj_uninit_b, r0, r1, r15); 2648 2649 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2650 2651 //*** jbyte 2652 // Always need aligned and unaligned versions 2653 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jbyte_disjoint_arraycopy_id, &entry); 2654 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::jbyte_arraycopy_id, entry, &entry_jbyte_arraycopy); 2655 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jbyte_disjoint_arraycopy_id, &entry); 2656 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jbyte_arraycopy_id, entry, nullptr); 2657 2658 //*** jshort 2659 // Always need aligned and unaligned versions 2660 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jshort_disjoint_arraycopy_id, &entry); 2661 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::jshort_arraycopy_id, entry, &entry_jshort_arraycopy); 2662 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jshort_disjoint_arraycopy_id, &entry); 2663 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jshort_arraycopy_id, entry, nullptr); 2664 2665 //*** jint 2666 // Aligned versions 2667 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jint_disjoint_arraycopy_id, &entry); 2668 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jint_arraycopy_id, entry, &entry_jint_arraycopy); 2669 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2670 // entry_jint_arraycopy always points to the unaligned version 2671 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::jint_disjoint_arraycopy_id, &entry); 2672 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubGenStubId::jint_arraycopy_id, entry, &entry_jint_arraycopy); 2673 2674 //*** jlong 2675 // It is always aligned 2676 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubGenStubId::arrayof_jlong_disjoint_arraycopy_id, &entry); 2677 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubGenStubId::arrayof_jlong_arraycopy_id, entry, &entry_jlong_arraycopy); 2678 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2679 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2680 2681 //*** oops 2682 { 2683 // With compressed oops we need unaligned versions; notice that 2684 // we overwrite entry_oop_arraycopy. 
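      // (With compressed oops each element is a 4-byte narrowOop, so these
      // "arrayof" stubs are generated assuming only 4-byte alignment and can
      // also serve as the unaligned entry points; the plain oop entries are
      // aliased to them right after this block.)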
2685 bool aligned = !UseCompressedOops; 2686 2687 StubRoutines::_arrayof_oop_disjoint_arraycopy 2688 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_id, &entry); 2689 StubRoutines::_arrayof_oop_arraycopy 2690 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_id, entry, &entry_oop_arraycopy); 2691 // Aligned versions without pre-barriers 2692 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2693 = generate_disjoint_copy(StubGenStubId::arrayof_oop_disjoint_arraycopy_uninit_id, &entry); 2694 StubRoutines::_arrayof_oop_arraycopy_uninit 2695 = generate_conjoint_copy(StubGenStubId::arrayof_oop_arraycopy_uninit_id, entry, nullptr); 2696 } 2697 2698 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2699 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2700 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2701 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2702 2703 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_id, &entry_checkcast_arraycopy); 2704 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubGenStubId::checkcast_arraycopy_uninit_id, nullptr); 2705 2706 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(entry_jbyte_arraycopy, 2707 entry_jshort_arraycopy, 2708 entry_jint_arraycopy, 2709 entry_jlong_arraycopy); 2710 2711 StubRoutines::_generic_arraycopy = generate_generic_copy(entry_jbyte_arraycopy, 2712 entry_jshort_arraycopy, 2713 entry_jint_arraycopy, 2714 entry_oop_arraycopy, 2715 entry_jlong_arraycopy, 2716 entry_checkcast_arraycopy); 2717 2718 StubRoutines::_jbyte_fill = generate_fill(StubGenStubId::jbyte_fill_id); 2719 StubRoutines::_jshort_fill = generate_fill(StubGenStubId::jshort_fill_id); 2720 StubRoutines::_jint_fill = generate_fill(StubGenStubId::jint_fill_id); 2721 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubGenStubId::arrayof_jbyte_fill_id); 2722 StubRoutines::_arrayof_jshort_fill = generate_fill(StubGenStubId::arrayof_jshort_fill_id); 2723 StubRoutines::_arrayof_jint_fill = generate_fill(StubGenStubId::arrayof_jint_fill_id); 2724 } 2725 2726 void generate_math_stubs() { Unimplemented(); } 2727 2728 // Arguments: 2729 // 2730 // Inputs: 2731 // c_rarg0 - source byte array address 2732 // c_rarg1 - destination byte array address 2733 // c_rarg2 - K (key) in little endian int array 2734 // 2735 address generate_aescrypt_encryptBlock() { 2736 __ align(CodeEntryAlignment); 2737 StubGenStubId stub_id = StubGenStubId::aescrypt_encryptBlock_id; 2738 StubCodeMark mark(this, stub_id); 2739 2740 const Register from = c_rarg0; // source array address 2741 const Register to = c_rarg1; // destination array address 2742 const Register key = c_rarg2; // key array address 2743 const Register keylen = rscratch1; 2744 2745 address start = __ pc(); 2746 __ enter(); 2747 2748 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2749 2750 __ aesenc_loadkeys(key, keylen); 2751 __ aesecb_encrypt(from, to, keylen); 2752 2753 __ mov(r0, 0); 2754 2755 __ leave(); 2756 __ ret(lr); 2757 2758 return start; 2759 } 2760 2761 // Arguments: 2762 // 2763 // Inputs: 2764 // c_rarg0 - source byte array address 2765 // c_rarg1 - destination byte array address 2766 // c_rarg2 - K (key) in little endian int array 2767 // 2768 address generate_aescrypt_decryptBlock() { 2769 assert(UseAES, "need 
AES cryptographic extension support"); 2770 __ align(CodeEntryAlignment); 2771 StubGenStubId stub_id = StubGenStubId::aescrypt_decryptBlock_id; 2772 StubCodeMark mark(this, stub_id); 2773 Label L_doLast; 2774 2775 const Register from = c_rarg0; // source array address 2776 const Register to = c_rarg1; // destination array address 2777 const Register key = c_rarg2; // key array address 2778 const Register keylen = rscratch1; 2779 2780 address start = __ pc(); 2781 __ enter(); // required for proper stackwalking of RuntimeStub frame 2782 2783 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2784 2785 __ aesecb_decrypt(from, to, key, keylen); 2786 2787 __ mov(r0, 0); 2788 2789 __ leave(); 2790 __ ret(lr); 2791 2792 return start; 2793 } 2794 2795 // Arguments: 2796 // 2797 // Inputs: 2798 // c_rarg0 - source byte array address 2799 // c_rarg1 - destination byte array address 2800 // c_rarg2 - K (key) in little endian int array 2801 // c_rarg3 - r vector byte array address 2802 // c_rarg4 - input length 2803 // 2804 // Output: 2805 // x0 - input length 2806 // 2807 address generate_cipherBlockChaining_encryptAESCrypt() { 2808 assert(UseAES, "need AES cryptographic extension support"); 2809 __ align(CodeEntryAlignment); 2810 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_encryptAESCrypt_id; 2811 StubCodeMark mark(this, stub_id); 2812 2813 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2814 2815 const Register from = c_rarg0; // source array address 2816 const Register to = c_rarg1; // destination array address 2817 const Register key = c_rarg2; // key array address 2818 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2819 // and left with the results of the last encryption block 2820 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2821 const Register keylen = rscratch1; 2822 2823 address start = __ pc(); 2824 2825 __ enter(); 2826 2827 __ movw(rscratch2, len_reg); 2828 2829 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2830 2831 __ ld1(v0, __ T16B, rvec); 2832 2833 __ cmpw(keylen, 52); 2834 __ br(Assembler::CC, L_loadkeys_44); 2835 __ br(Assembler::EQ, L_loadkeys_52); 2836 2837 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2838 __ rev32(v17, __ T16B, v17); 2839 __ rev32(v18, __ T16B, v18); 2840 __ BIND(L_loadkeys_52); 2841 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2842 __ rev32(v19, __ T16B, v19); 2843 __ rev32(v20, __ T16B, v20); 2844 __ BIND(L_loadkeys_44); 2845 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2846 __ rev32(v21, __ T16B, v21); 2847 __ rev32(v22, __ T16B, v22); 2848 __ rev32(v23, __ T16B, v23); 2849 __ rev32(v24, __ T16B, v24); 2850 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2851 __ rev32(v25, __ T16B, v25); 2852 __ rev32(v26, __ T16B, v26); 2853 __ rev32(v27, __ T16B, v27); 2854 __ rev32(v28, __ T16B, v28); 2855 __ ld1(v29, v30, v31, __ T16B, key); 2856 __ rev32(v29, __ T16B, v29); 2857 __ rev32(v30, __ T16B, v30); 2858 __ rev32(v31, __ T16B, v31); 2859 2860 __ BIND(L_aes_loop); 2861 __ ld1(v1, __ T16B, __ post(from, 16)); 2862 __ eor(v0, __ T16B, v0, v1); 2863 2864 __ br(Assembler::CC, L_rounds_44); 2865 __ br(Assembler::EQ, L_rounds_52); 2866 2867 __ aese(v0, v17); __ aesmc(v0, v0); 2868 __ aese(v0, v18); __ aesmc(v0, v0); 2869 __ BIND(L_rounds_52); 2870 __ aese(v0, v19); __ aesmc(v0, v0); 2871 __ aese(v0, v20); 
__ aesmc(v0, v0); 2872 __ BIND(L_rounds_44); 2873 __ aese(v0, v21); __ aesmc(v0, v0); 2874 __ aese(v0, v22); __ aesmc(v0, v0); 2875 __ aese(v0, v23); __ aesmc(v0, v0); 2876 __ aese(v0, v24); __ aesmc(v0, v0); 2877 __ aese(v0, v25); __ aesmc(v0, v0); 2878 __ aese(v0, v26); __ aesmc(v0, v0); 2879 __ aese(v0, v27); __ aesmc(v0, v0); 2880 __ aese(v0, v28); __ aesmc(v0, v0); 2881 __ aese(v0, v29); __ aesmc(v0, v0); 2882 __ aese(v0, v30); 2883 __ eor(v0, __ T16B, v0, v31); 2884 2885 __ st1(v0, __ T16B, __ post(to, 16)); 2886 2887 __ subw(len_reg, len_reg, 16); 2888 __ cbnzw(len_reg, L_aes_loop); 2889 2890 __ st1(v0, __ T16B, rvec); 2891 2892 __ mov(r0, rscratch2); 2893 2894 __ leave(); 2895 __ ret(lr); 2896 2897 return start; 2898 } 2899 2900 // Arguments: 2901 // 2902 // Inputs: 2903 // c_rarg0 - source byte array address 2904 // c_rarg1 - destination byte array address 2905 // c_rarg2 - K (key) in little endian int array 2906 // c_rarg3 - r vector byte array address 2907 // c_rarg4 - input length 2908 // 2909 // Output: 2910 // r0 - input length 2911 // 2912 address generate_cipherBlockChaining_decryptAESCrypt() { 2913 assert(UseAES, "need AES cryptographic extension support"); 2914 __ align(CodeEntryAlignment); 2915 StubGenStubId stub_id = StubGenStubId::cipherBlockChaining_decryptAESCrypt_id; 2916 StubCodeMark mark(this, stub_id); 2917 2918 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2919 2920 const Register from = c_rarg0; // source array address 2921 const Register to = c_rarg1; // destination array address 2922 const Register key = c_rarg2; // key array address 2923 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2924 // and left with the results of the last encryption block 2925 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2926 const Register keylen = rscratch1; 2927 2928 address start = __ pc(); 2929 2930 __ enter(); 2931 2932 __ movw(rscratch2, len_reg); 2933 2934 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2935 2936 __ ld1(v2, __ T16B, rvec); 2937 2938 __ ld1(v31, __ T16B, __ post(key, 16)); 2939 __ rev32(v31, __ T16B, v31); 2940 2941 __ cmpw(keylen, 52); 2942 __ br(Assembler::CC, L_loadkeys_44); 2943 __ br(Assembler::EQ, L_loadkeys_52); 2944 2945 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2946 __ rev32(v17, __ T16B, v17); 2947 __ rev32(v18, __ T16B, v18); 2948 __ BIND(L_loadkeys_52); 2949 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2950 __ rev32(v19, __ T16B, v19); 2951 __ rev32(v20, __ T16B, v20); 2952 __ BIND(L_loadkeys_44); 2953 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2954 __ rev32(v21, __ T16B, v21); 2955 __ rev32(v22, __ T16B, v22); 2956 __ rev32(v23, __ T16B, v23); 2957 __ rev32(v24, __ T16B, v24); 2958 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2959 __ rev32(v25, __ T16B, v25); 2960 __ rev32(v26, __ T16B, v26); 2961 __ rev32(v27, __ T16B, v27); 2962 __ rev32(v28, __ T16B, v28); 2963 __ ld1(v29, v30, __ T16B, key); 2964 __ rev32(v29, __ T16B, v29); 2965 __ rev32(v30, __ T16B, v30); 2966 2967 __ BIND(L_aes_loop); 2968 __ ld1(v0, __ T16B, __ post(from, 16)); 2969 __ orr(v1, __ T16B, v0, v0); 2970 2971 __ br(Assembler::CC, L_rounds_44); 2972 __ br(Assembler::EQ, L_rounds_52); 2973 2974 __ aesd(v0, v17); __ aesimc(v0, v0); 2975 __ aesd(v0, v18); __ aesimc(v0, v0); 2976 __ BIND(L_rounds_52); 2977 __ aesd(v0, v19); __ aesimc(v0, v0); 2978 __ aesd(v0, v20); __ aesimc(v0, v0); 2979 
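    // L_rounds_44 is the entry used for a 44-word key schedule (AES-128,
    // 10 rounds); the aesd/aesimc pairs above it are the extra rounds only
    // executed for 52- and 60-word schedules (AES-192/AES-256), as selected
    // by the keylen comparison at the top of the loop.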
__ BIND(L_rounds_44); 2980 __ aesd(v0, v21); __ aesimc(v0, v0); 2981 __ aesd(v0, v22); __ aesimc(v0, v0); 2982 __ aesd(v0, v23); __ aesimc(v0, v0); 2983 __ aesd(v0, v24); __ aesimc(v0, v0); 2984 __ aesd(v0, v25); __ aesimc(v0, v0); 2985 __ aesd(v0, v26); __ aesimc(v0, v0); 2986 __ aesd(v0, v27); __ aesimc(v0, v0); 2987 __ aesd(v0, v28); __ aesimc(v0, v0); 2988 __ aesd(v0, v29); __ aesimc(v0, v0); 2989 __ aesd(v0, v30); 2990 __ eor(v0, __ T16B, v0, v31); 2991 __ eor(v0, __ T16B, v0, v2); 2992 2993 __ st1(v0, __ T16B, __ post(to, 16)); 2994 __ orr(v2, __ T16B, v1, v1); 2995 2996 __ subw(len_reg, len_reg, 16); 2997 __ cbnzw(len_reg, L_aes_loop); 2998 2999 __ st1(v2, __ T16B, rvec); 3000 3001 __ mov(r0, rscratch2); 3002 3003 __ leave(); 3004 __ ret(lr); 3005 3006 return start; 3007 } 3008 3009 // Big-endian 128-bit + 64-bit -> 128-bit addition. 3010 // Inputs: 128-bits. in is preserved. 3011 // The least-significant 64-bit word is in the upper dword of each vector. 3012 // inc (the 64-bit increment) is preserved. Its lower dword must be zero. 3013 // Output: result 3014 void be_add_128_64(FloatRegister result, FloatRegister in, 3015 FloatRegister inc, FloatRegister tmp) { 3016 assert_different_registers(result, tmp, inc); 3017 3018 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of 3019 // input 3020 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing 3021 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and 3022 // MSD == 0 (must be!) to LSD 3023 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow 3024 } 3025 3026 // CTR AES crypt. 3027 // Arguments: 3028 // 3029 // Inputs: 3030 // c_rarg0 - source byte array address 3031 // c_rarg1 - destination byte array address 3032 // c_rarg2 - K (key) in little endian int array 3033 // c_rarg3 - counter vector byte array address 3034 // c_rarg4 - input length 3035 // c_rarg5 - saved encryptedCounter start 3036 // c_rarg6 - saved used length 3037 // 3038 // Output: 3039 // r0 - input length 3040 // 3041 address generate_counterMode_AESCrypt() { 3042 const Register in = c_rarg0; 3043 const Register out = c_rarg1; 3044 const Register key = c_rarg2; 3045 const Register counter = c_rarg3; 3046 const Register saved_len = c_rarg4, len = r10; 3047 const Register saved_encrypted_ctr = c_rarg5; 3048 const Register used_ptr = c_rarg6, used = r12; 3049 3050 const Register offset = r7; 3051 const Register keylen = r11; 3052 3053 const unsigned char block_size = 16; 3054 const int bulk_width = 4; 3055 // NB: bulk_width can be 4 or 8. 8 gives slightly faster 3056 // performance with larger data sizes, but it also means that the 3057 // fast path isn't used until you have at least 8 blocks, and up 3058 // to 127 bytes of data will be executed on the slow path. For 3059 // that reason, and also so as not to blow away too much icache, 4 3060 // blocks seems like a sensible compromise. 
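    // (Concretely: with bulk_width == 4 the bulk path requires at least
    // 4 * 16 = 64 bytes, so at most 63 bytes are ever handled one block at
    // a time; with bulk_width == 8 those numbers become 128 and 127.)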
3061 3062 // Algorithm: 3063 // 3064 // if (len == 0) { 3065 // goto DONE; 3066 // } 3067 // int result = len; 3068 // do { 3069 // if (used >= blockSize) { 3070 // if (len >= bulk_width * blockSize) { 3071 // CTR_large_block(); 3072 // if (len == 0) 3073 // goto DONE; 3074 // } 3075 // for (;;) { 3076 // 16ByteVector v0 = counter; 3077 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); 3078 // used = 0; 3079 // if (len < blockSize) 3080 // break; /* goto NEXT */ 3081 // 16ByteVector v1 = load16Bytes(in, offset); 3082 // v1 = v1 ^ encryptedCounter; 3083 // store16Bytes(out, offset); 3084 // used = blockSize; 3085 // offset += blockSize; 3086 // len -= blockSize; 3087 // if (len == 0) 3088 // goto DONE; 3089 // } 3090 // } 3091 // NEXT: 3092 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); 3093 // len--; 3094 // } while (len != 0); 3095 // DONE: 3096 // return result; 3097 // 3098 // CTR_large_block() 3099 // Wide bulk encryption of whole blocks. 3100 3101 __ align(CodeEntryAlignment); 3102 StubGenStubId stub_id = StubGenStubId::counterMode_AESCrypt_id; 3103 StubCodeMark mark(this, stub_id); 3104 const address start = __ pc(); 3105 __ enter(); 3106 3107 Label DONE, CTR_large_block, large_block_return; 3108 __ ldrw(used, Address(used_ptr)); 3109 __ cbzw(saved_len, DONE); 3110 3111 __ mov(len, saved_len); 3112 __ mov(offset, 0); 3113 3114 // Compute #rounds for AES based on the length of the key array 3115 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3116 3117 __ aesenc_loadkeys(key, keylen); 3118 3119 { 3120 Label L_CTR_loop, NEXT; 3121 3122 __ bind(L_CTR_loop); 3123 3124 __ cmp(used, block_size); 3125 __ br(__ LO, NEXT); 3126 3127 // Maybe we have a lot of data 3128 __ subsw(rscratch1, len, bulk_width * block_size); 3129 __ br(__ HS, CTR_large_block); 3130 __ BIND(large_block_return); 3131 __ cbzw(len, DONE); 3132 3133 // Setup the counter 3134 __ movi(v4, __ T4S, 0); 3135 __ movi(v5, __ T4S, 1); 3136 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 } 3137 3138 // 128-bit big-endian increment 3139 __ ld1(v0, __ T16B, counter); 3140 __ rev64(v16, __ T16B, v0); 3141 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3142 __ rev64(v16, __ T16B, v16); 3143 __ st1(v16, __ T16B, counter); 3144 // Previous counter value is in v0 3145 // v4 contains { 0, 1 } 3146 3147 { 3148 // We have fewer than bulk_width blocks of data left. Encrypt 3149 // them one by one until there is less than a full block 3150 // remaining, being careful to save both the encrypted counter 3151 // and the counter. 3152 3153 Label inner_loop; 3154 __ bind(inner_loop); 3155 // Counter to encrypt is in v0 3156 __ aesecb_encrypt(noreg, noreg, keylen); 3157 __ st1(v0, __ T16B, saved_encrypted_ctr); 3158 3159 // Do we have a remaining full block? 
3160 3161 __ mov(used, 0); 3162 __ cmp(len, block_size); 3163 __ br(__ LO, NEXT); 3164 3165 // Yes, we have a full block 3166 __ ldrq(v1, Address(in, offset)); 3167 __ eor(v1, __ T16B, v1, v0); 3168 __ strq(v1, Address(out, offset)); 3169 __ mov(used, block_size); 3170 __ add(offset, offset, block_size); 3171 3172 __ subw(len, len, block_size); 3173 __ cbzw(len, DONE); 3174 3175 // Increment the counter, store it back 3176 __ orr(v0, __ T16B, v16, v16); 3177 __ rev64(v16, __ T16B, v16); 3178 be_add_128_64(v16, v16, v4, /*tmp*/v5); 3179 __ rev64(v16, __ T16B, v16); 3180 __ st1(v16, __ T16B, counter); // Save the incremented counter back 3181 3182 __ b(inner_loop); 3183 } 3184 3185 __ BIND(NEXT); 3186 3187 // Encrypt a single byte, and loop. 3188 // We expect this to be a rare event. 3189 __ ldrb(rscratch1, Address(in, offset)); 3190 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); 3191 __ eor(rscratch1, rscratch1, rscratch2); 3192 __ strb(rscratch1, Address(out, offset)); 3193 __ add(offset, offset, 1); 3194 __ add(used, used, 1); 3195 __ subw(len, len,1); 3196 __ cbnzw(len, L_CTR_loop); 3197 } 3198 3199 __ bind(DONE); 3200 __ strw(used, Address(used_ptr)); 3201 __ mov(r0, saved_len); 3202 3203 __ leave(); // required for proper stackwalking of RuntimeStub frame 3204 __ ret(lr); 3205 3206 // Bulk encryption 3207 3208 __ BIND (CTR_large_block); 3209 assert(bulk_width == 4 || bulk_width == 8, "must be"); 3210 3211 if (bulk_width == 8) { 3212 __ sub(sp, sp, 4 * 16); 3213 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3214 } 3215 __ sub(sp, sp, 4 * 16); 3216 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3217 RegSet saved_regs = (RegSet::of(in, out, offset) 3218 + RegSet::of(saved_encrypted_ctr, used_ptr, len)); 3219 __ push(saved_regs, sp); 3220 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption 3221 __ add(in, in, offset); 3222 __ add(out, out, offset); 3223 3224 // Keys should already be loaded into the correct registers 3225 3226 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3227 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter 3228 3229 // AES/CTR loop 3230 { 3231 Label L_CTR_loop; 3232 __ BIND(L_CTR_loop); 3233 3234 // Setup the counters 3235 __ movi(v8, __ T4S, 0); 3236 __ movi(v9, __ T4S, 1); 3237 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 } 3238 3239 for (int i = 0; i < bulk_width; i++) { 3240 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3241 __ rev64(v0_ofs, __ T16B, v16); 3242 be_add_128_64(v16, v16, v8, /*tmp*/v9); 3243 } 3244 3245 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3246 3247 // Encrypt the counters 3248 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); 3249 3250 if (bulk_width == 8) { 3251 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3252 } 3253 3254 // XOR the encrypted counters with the inputs 3255 for (int i = 0; i < bulk_width; i++) { 3256 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3257 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3258 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3259 } 3260 3261 // Write the encrypted data 3262 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); 3263 if (bulk_width == 8) { 3264 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3265 } 3266 3267 __ subw(len, len, 16 * bulk_width); 3268 __ cbnzw(len, L_CTR_loop); 3269 } 3270 3271 // Save the counter back where it goes 3272 __ rev64(v16, __ T16B, v16); 3273 __ st1(v16, __ T16B, counter); 3274 3275 __ pop(saved_regs, sp); 
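    // Restore the SIMD registers that were spilled on entry to
    // CTR_large_block: the bulk loop uses v8-v15 (callee-saved) as scratch,
    // so their full 128-bit contents were stored to the stack with st1 and
    // are reloaded here; the pop above already restored the general-purpose
    // registers.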
3276 3277 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3278 if (bulk_width == 8) { 3279 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3280 } 3281 3282 __ andr(rscratch1, len, -16 * bulk_width); 3283 __ sub(len, len, rscratch1); 3284 __ add(offset, offset, rscratch1); 3285 __ mov(used, 16); 3286 __ strw(used, Address(used_ptr)); 3287 __ b(large_block_return); 3288 3289 return start; 3290 } 3291 3292 // Vector AES Galois Counter Mode implementation. Parameters: 3293 // 3294 // in = c_rarg0 3295 // len = c_rarg1 3296 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt) 3297 // out = c_rarg3 3298 // key = c_rarg4 3299 // state = c_rarg5 - GHASH.state 3300 // subkeyHtbl = c_rarg6 - powers of H 3301 // counter = c_rarg7 - 16 bytes of CTR 3302 // return - number of processed bytes 3303 address generate_galoisCounterMode_AESCrypt() { 3304 address ghash_polynomial = __ pc(); 3305 __ emit_int64(0x87); // The low-order bits of the field 3306 // polynomial (i.e. p = z^7+z^2+z+1) 3307 // repeated in the low and high parts of a 3308 // 128-bit vector 3309 __ emit_int64(0x87); 3310 3311 __ align(CodeEntryAlignment); 3312 StubGenStubId stub_id = StubGenStubId::galoisCounterMode_AESCrypt_id; 3313 StubCodeMark mark(this, stub_id); 3314 address start = __ pc(); 3315 __ enter(); 3316 3317 const Register in = c_rarg0; 3318 const Register len = c_rarg1; 3319 const Register ct = c_rarg2; 3320 const Register out = c_rarg3; 3321 // and updated with the incremented counter in the end 3322 3323 const Register key = c_rarg4; 3324 const Register state = c_rarg5; 3325 3326 const Register subkeyHtbl = c_rarg6; 3327 3328 const Register counter = c_rarg7; 3329 3330 const Register keylen = r10; 3331 // Save state before entering routine 3332 __ sub(sp, sp, 4 * 16); 3333 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 3334 __ sub(sp, sp, 4 * 16); 3335 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 3336 3337 // __ andr(len, len, -512); 3338 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption 3339 __ str(len, __ pre(sp, -2 * wordSize)); 3340 3341 Label DONE; 3342 __ cbz(len, DONE); 3343 3344 // Compute #rounds for AES based on the length of the key array 3345 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3346 3347 __ aesenc_loadkeys(key, keylen); 3348 __ ld1(v0, __ T16B, counter); // v0 contains the first counter 3349 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter 3350 3351 // AES/CTR loop 3352 { 3353 Label L_CTR_loop; 3354 __ BIND(L_CTR_loop); 3355 3356 // Setup the counters 3357 __ movi(v8, __ T4S, 0); 3358 __ movi(v9, __ T4S, 1); 3359 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } 3360 3361 assert(v0->encoding() < v8->encoding(), ""); 3362 for (int i = v0->encoding(); i < v8->encoding(); i++) { 3363 FloatRegister f = as_FloatRegister(i); 3364 __ rev32(f, __ T16B, v16); 3365 __ addv(v16, __ T4S, v16, v8); 3366 } 3367 3368 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); 3369 3370 // Encrypt the counters 3371 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8); 3372 3373 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); 3374 3375 // XOR the encrypted counters with the inputs 3376 for (int i = 0; i < 8; i++) { 3377 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i); 3378 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i); 3379 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs); 3380 } 3381 __ st1(v0, v1, v2, v3, __ T16B, __ 
post(out, 4 * 16)); 3382 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); 3383 3384 __ subw(len, len, 16 * 8); 3385 __ cbnzw(len, L_CTR_loop); 3386 } 3387 3388 __ rev32(v16, __ T16B, v16); 3389 __ st1(v16, __ T16B, counter); 3390 3391 __ ldr(len, Address(sp)); 3392 __ lsr(len, len, exact_log2(16)); // We want the count of blocks 3393 3394 // GHASH/CTR loop 3395 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct, 3396 len, /*unrolls*/4); 3397 3398 #ifdef ASSERT 3399 { Label L; 3400 __ cmp(len, (unsigned char)0); 3401 __ br(Assembler::EQ, L); 3402 __ stop("stubGenerator: abort"); 3403 __ bind(L); 3404 } 3405 #endif 3406 3407 __ bind(DONE); 3408 // Return the number of bytes processed 3409 __ ldr(r0, __ post(sp, 2 * wordSize)); 3410 3411 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 3412 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 3413 3414 __ leave(); // required for proper stackwalking of RuntimeStub frame 3415 __ ret(lr); 3416 return start; 3417 } 3418 3419 class Cached64Bytes { 3420 private: 3421 MacroAssembler *_masm; 3422 Register _regs[8]; 3423 3424 public: 3425 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) { 3426 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size()); 3427 auto it = rs.begin(); 3428 for (auto &r: _regs) { 3429 r = *it; 3430 ++it; 3431 } 3432 } 3433 3434 void gen_loads(Register base) { 3435 for (int i = 0; i < 8; i += 2) { 3436 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i)); 3437 } 3438 } 3439 3440 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes. 3441 void extract_u32(Register dest, int i) { 3442 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32); 3443 } 3444 }; 3445 3446 // Utility routines for md5. 3447 // Clobbers r10 and r11. 
3448 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3449 int k, int s, int t) { 3450 Register rscratch3 = r10; 3451 Register rscratch4 = r11; 3452 3453 __ eorw(rscratch3, r3, r4); 3454 __ movw(rscratch2, t); 3455 __ andw(rscratch3, rscratch3, r2); 3456 __ addw(rscratch4, r1, rscratch2); 3457 reg_cache.extract_u32(rscratch1, k); 3458 __ eorw(rscratch3, rscratch3, r4); 3459 __ addw(rscratch4, rscratch4, rscratch1); 3460 __ addw(rscratch3, rscratch3, rscratch4); 3461 __ rorw(rscratch2, rscratch3, 32 - s); 3462 __ addw(r1, rscratch2, r2); 3463 } 3464 3465 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3466 int k, int s, int t) { 3467 Register rscratch3 = r10; 3468 Register rscratch4 = r11; 3469 3470 reg_cache.extract_u32(rscratch1, k); 3471 __ movw(rscratch2, t); 3472 __ addw(rscratch4, r1, rscratch2); 3473 __ addw(rscratch4, rscratch4, rscratch1); 3474 __ bicw(rscratch2, r3, r4); 3475 __ andw(rscratch3, r2, r4); 3476 __ addw(rscratch2, rscratch2, rscratch4); 3477 __ addw(rscratch2, rscratch2, rscratch3); 3478 __ rorw(rscratch2, rscratch2, 32 - s); 3479 __ addw(r1, rscratch2, r2); 3480 } 3481 3482 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3483 int k, int s, int t) { 3484 Register rscratch3 = r10; 3485 Register rscratch4 = r11; 3486 3487 __ eorw(rscratch3, r3, r4); 3488 __ movw(rscratch2, t); 3489 __ addw(rscratch4, r1, rscratch2); 3490 reg_cache.extract_u32(rscratch1, k); 3491 __ eorw(rscratch3, rscratch3, r2); 3492 __ addw(rscratch4, rscratch4, rscratch1); 3493 __ addw(rscratch3, rscratch3, rscratch4); 3494 __ rorw(rscratch2, rscratch3, 32 - s); 3495 __ addw(r1, rscratch2, r2); 3496 } 3497 3498 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4, 3499 int k, int s, int t) { 3500 Register rscratch3 = r10; 3501 Register rscratch4 = r11; 3502 3503 __ movw(rscratch3, t); 3504 __ ornw(rscratch2, r2, r4); 3505 __ addw(rscratch4, r1, rscratch3); 3506 reg_cache.extract_u32(rscratch1, k); 3507 __ eorw(rscratch3, rscratch2, r3); 3508 __ addw(rscratch4, rscratch4, rscratch1); 3509 __ addw(rscratch3, rscratch3, rscratch4); 3510 __ rorw(rscratch2, rscratch3, 32 - s); 3511 __ addw(r1, rscratch2, r2); 3512 } 3513 3514 // Arguments: 3515 // 3516 // Inputs: 3517 // c_rarg0 - byte[] source+offset 3518 // c_rarg1 - int[] SHA.state 3519 // c_rarg2 - int offset 3520 // c_rarg3 - int limit 3521 // 3522 address generate_md5_implCompress(StubGenStubId stub_id) { 3523 bool multi_block; 3524 switch (stub_id) { 3525 case md5_implCompress_id: 3526 multi_block = false; 3527 break; 3528 case md5_implCompressMB_id: 3529 multi_block = true; 3530 break; 3531 default: 3532 ShouldNotReachHere(); 3533 } 3534 __ align(CodeEntryAlignment); 3535 3536 StubCodeMark mark(this, stub_id); 3537 address start = __ pc(); 3538 3539 Register buf = c_rarg0; 3540 Register state = c_rarg1; 3541 Register ofs = c_rarg2; 3542 Register limit = c_rarg3; 3543 Register a = r4; 3544 Register b = r5; 3545 Register c = r6; 3546 Register d = r7; 3547 Register rscratch3 = r10; 3548 Register rscratch4 = r11; 3549 3550 Register state_regs[2] = { r12, r13 }; 3551 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls; 3552 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers 3553 3554 __ push(saved_regs, sp); 3555 3556 __ ldp(state_regs[0], state_regs[1], Address(state)); 3557 __ ubfx(a, state_regs[0], 0, 32); 3558 __ ubfx(b, state_regs[0], 32, 32); 3559 __ 
ubfx(c, state_regs[1], 0, 32); 3560 __ ubfx(d, state_regs[1], 32, 32); 3561 3562 Label md5_loop; 3563 __ BIND(md5_loop); 3564 3565 reg_cache.gen_loads(buf); 3566 3567 // Round 1 3568 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478); 3569 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756); 3570 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db); 3571 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee); 3572 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf); 3573 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a); 3574 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613); 3575 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501); 3576 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8); 3577 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af); 3578 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1); 3579 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be); 3580 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122); 3581 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193); 3582 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e); 3583 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821); 3584 3585 // Round 2 3586 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562); 3587 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340); 3588 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51); 3589 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa); 3590 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d); 3591 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453); 3592 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681); 3593 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8); 3594 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6); 3595 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6); 3596 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87); 3597 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed); 3598 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905); 3599 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8); 3600 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9); 3601 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a); 3602 3603 // Round 3 3604 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942); 3605 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681); 3606 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122); 3607 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c); 3608 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44); 3609 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9); 3610 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60); 3611 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70); 3612 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6); 3613 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa); 3614 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085); 3615 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05); 3616 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039); 3617 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5); 3618 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8); 3619 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665); 3620 3621 // Round 4 3622 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244); 3623 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97); 3624 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7); 3625 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039); 3626 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3); 3627 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92); 3628 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d); 3629 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1); 3630 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f); 3631 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0); 3632 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314); 3633 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1); 3634 
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82); 3635 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235); 3636 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb); 3637 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391); 3638 3639 __ addw(a, state_regs[0], a); 3640 __ ubfx(rscratch2, state_regs[0], 32, 32); 3641 __ addw(b, rscratch2, b); 3642 __ addw(c, state_regs[1], c); 3643 __ ubfx(rscratch4, state_regs[1], 32, 32); 3644 __ addw(d, rscratch4, d); 3645 3646 __ orr(state_regs[0], a, b, Assembler::LSL, 32); 3647 __ orr(state_regs[1], c, d, Assembler::LSL, 32); 3648 3649 if (multi_block) { 3650 __ add(buf, buf, 64); 3651 __ add(ofs, ofs, 64); 3652 __ cmp(ofs, limit); 3653 __ br(Assembler::LE, md5_loop); 3654 __ mov(c_rarg0, ofs); // return ofs 3655 } 3656 3657 // write hash values back in the correct order 3658 __ stp(state_regs[0], state_regs[1], Address(state)); 3659 3660 __ pop(saved_regs, sp); 3661 3662 __ ret(lr); 3663 3664 return start; 3665 } 3666 3667 // Arguments: 3668 // 3669 // Inputs: 3670 // c_rarg0 - byte[] source+offset 3671 // c_rarg1 - int[] SHA.state 3672 // c_rarg2 - int offset 3673 // c_rarg3 - int limit 3674 // 3675 address generate_sha1_implCompress(StubGenStubId stub_id) { 3676 bool multi_block; 3677 switch (stub_id) { 3678 case sha1_implCompress_id: 3679 multi_block = false; 3680 break; 3681 case sha1_implCompressMB_id: 3682 multi_block = true; 3683 break; 3684 default: 3685 ShouldNotReachHere(); 3686 } 3687 3688 __ align(CodeEntryAlignment); 3689 3690 StubCodeMark mark(this, stub_id); 3691 address start = __ pc(); 3692 3693 Register buf = c_rarg0; 3694 Register state = c_rarg1; 3695 Register ofs = c_rarg2; 3696 Register limit = c_rarg3; 3697 3698 Label keys; 3699 Label sha1_loop; 3700 3701 // load the keys into v0..v3 3702 __ adr(rscratch1, keys); 3703 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3704 // load 5 words state into v6, v7 3705 __ ldrq(v6, Address(state, 0)); 3706 __ ldrs(v7, Address(state, 16)); 3707 3708 3709 __ BIND(sha1_loop); 3710 // load 64 bytes of data into v16..v19 3711 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3712 __ rev32(v16, __ T16B, v16); 3713 __ rev32(v17, __ T16B, v17); 3714 __ rev32(v18, __ T16B, v18); 3715 __ rev32(v19, __ T16B, v19); 3716 3717 // do the sha1 3718 __ addv(v4, __ T4S, v16, v0); 3719 __ orr(v20, __ T16B, v6, v6); 3720 3721 FloatRegister d0 = v16; 3722 FloatRegister d1 = v17; 3723 FloatRegister d2 = v18; 3724 FloatRegister d3 = v19; 3725 3726 for (int round = 0; round < 20; round++) { 3727 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3728 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3729 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3730 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3731 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3732 3733 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3734 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3735 __ sha1h(tmp2, __ T4S, v20); 3736 if (round < 5) 3737 __ sha1c(v20, __ T4S, tmp3, tmp4); 3738 else if (round < 10 || round >= 15) 3739 __ sha1p(v20, __ T4S, tmp3, tmp4); 3740 else 3741 __ sha1m(v20, __ T4S, tmp3, tmp4); 3742 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3743 3744 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3745 } 3746 3747 __ addv(v7, __ T2S, v7, v21); 3748 __ addv(v6, __ T4S, v6, v20); 3749 3750 if (multi_block) { 3751 __ add(ofs, ofs, 64); 3752 __ cmp(ofs, limit); 3753 __ br(Assembler::LE, sha1_loop); 3754 __ mov(c_rarg0, ofs); // return ofs 3755 } 3756 3757 __ strq(v6, Address(state, 0)); 3758 __ strs(v7, Address(state, 16)); 3759 3760 __ ret(lr); 3761 3762 __ bind(keys); 3763 __ emit_int32(0x5a827999); 3764 __ emit_int32(0x6ed9eba1); 3765 __ emit_int32(0x8f1bbcdc); 3766 __ emit_int32(0xca62c1d6); 3767 3768 return start; 3769 } 3770 3771 3772 // Arguments: 3773 // 3774 // Inputs: 3775 // c_rarg0 - byte[] source+offset 3776 // c_rarg1 - int[] SHA.state 3777 // c_rarg2 - int offset 3778 // c_rarg3 - int limit 3779 // 3780 address generate_sha256_implCompress(StubGenStubId stub_id) { 3781 bool multi_block; 3782 switch (stub_id) { 3783 case sha256_implCompress_id: 3784 multi_block = false; 3785 break; 3786 case sha256_implCompressMB_id: 3787 multi_block = true; 3788 break; 3789 default: 3790 ShouldNotReachHere(); 3791 } 3792 3793 static const uint32_t round_consts[64] = { 3794 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3795 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3796 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3797 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3798 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3799 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3800 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3801 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3802 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3803 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3804 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3805 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3806 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3807 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3808 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3809 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3810 }; 3811 3812 __ align(CodeEntryAlignment); 3813 3814 StubCodeMark mark(this, stub_id); 3815 address start = __ pc(); 3816 3817 Register buf = c_rarg0; 3818 Register state = c_rarg1; 3819 Register ofs = c_rarg2; 3820 Register limit = c_rarg3; 3821 3822 Label sha1_loop; 3823 3824 __ stpd(v8, v9, __ pre(sp, -32)); 3825 __ stpd(v10, v11, Address(sp, 16)); 3826 3827 // dga == v0 3828 // dgb == v1 3829 // dg0 == v2 3830 // dg1 == v3 3831 // dg2 == v4 3832 // t0 == v6 3833 // t1 == v7 3834 3835 // load 16 keys to v16..v31 3836 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3837 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3838 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3839 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3840 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3841 3842 // load 8 words (256 bits) state 3843 __ ldpq(v0, v1, state); 3844 3845 __ BIND(sha1_loop); 3846 // load 64 bytes of data into v8..v11 3847 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3848 __ rev32(v8, __ T16B, v8); 3849 __ rev32(v9, __ T16B, v9); 3850 __ rev32(v10, __ T16B, v10); 3851 __ rev32(v11, __ T16B, v11); 3852 3853 __ addv(v6, __ T4S, v8, v16); 3854 __ orr(v2, __ T16B, v0, v0); 3855 __ orr(v3, __ T16B, v1, v1); 3856 3857 FloatRegister d0 = v8; 3858 FloatRegister d1 = v9; 3859 FloatRegister d2 = v10; 3860 FloatRegister d3 = v11; 3861 3862 3863 for (int round = 0; round < 16; round++) { 3864 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3865 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3866 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3867 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3868 3869 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3870 __ orr(v4, __ T16B, v2, v2); 3871 if (round < 15) 3872 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3873 __ sha256h(v2, __ T4S, v3, tmp2); 3874 __ sha256h2(v3, __ T4S, v4, tmp2); 3875 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3876 3877 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3878 } 3879 3880 __ addv(v0, __ T4S, v0, v2); 3881 __ addv(v1, __ T4S, v1, v3); 3882 3883 if (multi_block) { 3884 __ add(ofs, ofs, 64); 3885 __ cmp(ofs, limit); 3886 __ br(Assembler::LE, sha1_loop); 3887 __ mov(c_rarg0, ofs); // return ofs 3888 } 3889 3890 __ ldpd(v10, v11, Address(sp, 16)); 3891 __ ldpd(v8, v9, __ post(sp, 32)); 3892 3893 __ stpq(v0, v1, state); 3894 3895 __ ret(lr); 3896 3897 return start; 3898 } 3899 3900 // Double rounds for sha512. 3901 void sha512_dround(int dr, 3902 FloatRegister vi0, FloatRegister vi1, 3903 FloatRegister vi2, FloatRegister vi3, 3904 FloatRegister vi4, FloatRegister vrc0, 3905 FloatRegister vrc1, FloatRegister vin0, 3906 FloatRegister vin1, FloatRegister vin2, 3907 FloatRegister vin3, FloatRegister vin4) { 3908 if (dr < 36) { 3909 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3910 } 3911 __ addv(v5, __ T2D, vrc0, vin0); 3912 __ ext(v6, __ T16B, vi2, vi3, 8); 3913 __ ext(v5, __ T16B, v5, v5, 8); 3914 __ ext(v7, __ T16B, vi1, vi2, 8); 3915 __ addv(vi3, __ T2D, vi3, v5); 3916 if (dr < 32) { 3917 __ ext(v5, __ T16B, vin3, vin4, 8); 3918 __ sha512su0(vin0, __ T2D, vin1); 3919 } 3920 __ sha512h(vi3, __ T2D, v6, v7); 3921 if (dr < 32) { 3922 __ sha512su1(vin0, __ T2D, vin2, v5); 3923 } 3924 __ addv(vi4, __ T2D, vi1, vi3); 3925 __ sha512h2(vi3, __ T2D, vi1, vi0); 3926 } 3927 3928 // Arguments: 3929 // 3930 // Inputs: 3931 // c_rarg0 - byte[] source+offset 3932 // c_rarg1 - int[] SHA.state 3933 // c_rarg2 - int offset 3934 // c_rarg3 - int limit 3935 // 3936 address generate_sha512_implCompress(StubGenStubId stub_id) { 3937 bool multi_block; 3938 switch (stub_id) { 3939 case sha512_implCompress_id: 3940 multi_block = false; 3941 break; 3942 case sha512_implCompressMB_id: 3943 multi_block = true; 3944 break; 3945 default: 3946 ShouldNotReachHere(); 3947 } 3948 3949 static const uint64_t round_consts[80] = { 3950 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3951 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3952 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3953 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3954 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3955 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3956 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3957 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3958 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3959 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 
3960 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3961 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3962 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3963 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3964 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3965 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3966 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3967 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3968 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3969 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3970 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3971 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3972 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3973 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3974 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3975 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3976 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3977 }; 3978 3979 __ align(CodeEntryAlignment); 3980 3981 StubCodeMark mark(this, stub_id); 3982 address start = __ pc(); 3983 3984 Register buf = c_rarg0; 3985 Register state = c_rarg1; 3986 Register ofs = c_rarg2; 3987 Register limit = c_rarg3; 3988 3989 __ stpd(v8, v9, __ pre(sp, -64)); 3990 __ stpd(v10, v11, Address(sp, 16)); 3991 __ stpd(v12, v13, Address(sp, 32)); 3992 __ stpd(v14, v15, Address(sp, 48)); 3993 3994 Label sha512_loop; 3995 3996 // load state 3997 __ ld1(v8, v9, v10, v11, __ T2D, state); 3998 3999 // load first 4 round constants 4000 __ lea(rscratch1, ExternalAddress((address)round_consts)); 4001 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 4002 4003 __ BIND(sha512_loop); 4004 // load 128B of data into v12..v19 4005 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 4006 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 4007 __ rev64(v12, __ T16B, v12); 4008 __ rev64(v13, __ T16B, v13); 4009 __ rev64(v14, __ T16B, v14); 4010 __ rev64(v15, __ T16B, v15); 4011 __ rev64(v16, __ T16B, v16); 4012 __ rev64(v17, __ T16B, v17); 4013 __ rev64(v18, __ T16B, v18); 4014 __ rev64(v19, __ T16B, v19); 4015 4016 __ mov(rscratch2, rscratch1); 4017 4018 __ mov(v0, __ T16B, v8); 4019 __ mov(v1, __ T16B, v9); 4020 __ mov(v2, __ T16B, v10); 4021 __ mov(v3, __ T16B, v11); 4022 4023 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 4024 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 4025 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 4026 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 4027 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 4028 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 4029 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 4030 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 4031 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 4032 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 4033 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19); 4034 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 4035 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 4036 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, 
v14); 4037 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 4038 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 4039 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 4040 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 4041 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 4042 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 4043 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 4044 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 4045 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 4046 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 4047 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 4048 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 4049 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 4050 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 4051 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 4052 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 4053 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 4054 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 4055 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 4056 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 4057 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 4058 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 4059 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 4060 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 4061 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 4062 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 4063 4064 __ addv(v8, __ T2D, v8, v0); 4065 __ addv(v9, __ T2D, v9, v1); 4066 __ addv(v10, __ T2D, v10, v2); 4067 __ addv(v11, __ T2D, v11, v3); 4068 4069 if (multi_block) { 4070 __ add(ofs, ofs, 128); 4071 __ cmp(ofs, limit); 4072 __ br(Assembler::LE, sha512_loop); 4073 __ mov(c_rarg0, ofs); // return ofs 4074 } 4075 4076 __ st1(v8, v9, v10, v11, __ T2D, state); 4077 4078 __ ldpd(v14, v15, Address(sp, 48)); 4079 __ ldpd(v12, v13, Address(sp, 32)); 4080 __ ldpd(v10, v11, Address(sp, 16)); 4081 __ ldpd(v8, v9, __ post(sp, 64)); 4082 4083 __ ret(lr); 4084 4085 return start; 4086 } 4087 4088 // Execute one round of keccak of two computations in parallel. 4089 // One of the states should be loaded into the lower halves of 4090 // the vector registers v0-v24, the other should be loaded into 4091 // the upper halves of those registers. The ld1r instruction loads 4092 // the round constant into both halves of register v31. 4093 // Intermediate results c0...c5 and d0...d5 are computed 4094 // in registers v25...v30. 4095 // All vector instructions that are used operate on both register 4096 // halves in parallel. 4097 // If only a single computation is needed, one can only load the lower halves. 
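  // For reference, a scalar sketch of one such round on a 25-lane state
  // a[0..24] (lane i holds A[x,y] with i == x + 5*y). This is reconstructed
  // from the instruction comments below, so treat it as illustrative
  // pseudocode rather than a normative specification:
  //
  //   // theta
  //   c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20]       (x = 0..4)
  //   d[x] = c[(x+4)%5] ^ rol64(c[(x+1)%5], 1)                 (x = 0..4)
  //   // rho + pi, fused with the d[] xor: each destination lane is a
  //   // rotated (a[src] ^ d[src%5]); the (dst, src, rotation) triples are
  //   // exactly those listed in the xar comments, e.g. a1 = rol(a6^d1, 44)
  //   b[dst] = rol64(a[src] ^ d[src%5], rot)
  //   // chi
  //   a[x+5*y] = b[x+5*y] ^ (~b[(x+1)%5 + 5*y] & b[(x+2)%5 + 5*y])
  //   // iota
  //   a[0] ^= round_constants[round]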
  void keccak_round(Register rscratch1) {
    __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
    __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
    __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
    __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
    __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
    __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
    __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
    __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
    __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
    __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22

    __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
    __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
    __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
    __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
    __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)

    __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
    __ xar(v29, __ T2D, v1, v25, (64 - 1));   // a10' = rol((a1^d1), 1)
    __ xar(v1, __ T2D, v6, v25, (64 - 44));   // a1 = rol((a6^d1), 44)
    __ xar(v6, __ T2D, v9, v28, (64 - 20));   // a6 = rol((a9^d4), 20)
    __ xar(v9, __ T2D, v22, v26, (64 - 61));  // a9 = rol((a22^d2), 61)
    __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
    __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
    __ xar(v31, __ T2D, v2, v26, (64 - 62));  // a20' = rol((a2^d2), 62)
    __ xar(v2, __ T2D, v12, v26, (64 - 43));  // a2 = rol((a12^d2), 43)
    __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
    __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
    __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
    __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
    __ xar(v15, __ T2D, v4, v28, (64 - 27));  // a15 = rol((a4^d4), 27)
    __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
    __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
    __ xar(v8, __ T2D, v8, v27, (64 - 55));   // a21' = rol((a8^d3), 55)
    __ xar(v4, __ T2D, v16, v25, (64 - 45));  // a8' = rol((a16^d1), 45)
    __ xar(v16, __ T2D, v5, v30, (64 - 36));  // a16 = rol((a5^d0), 36)
    __ xar(v5, __ T2D, v3, v27, (64 - 28));   // a5 = rol((a3^d3), 28)
    __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
    __ xar(v3, __ T2D, v17, v26, (64 - 15));  // a18' = rol((a17^d2), 15)
    __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
    __ xar(v26, __ T2D, v7, v26, (64 - 6));   // a11' = rol((a7^d2), 6)
    __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)

    __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21 & a22')
    __ bcax(v21, __ T16B, v8, v23, v22);      // a21 = a21' ^ (~a22 & a23)
    __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
    __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
    __ bcax(v24, __ T16B, v24, v8, v31);      // a24 = a24 ^ (~a20' & a21')

    __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]

    __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
    __ bcax(v18, __ T16B, v3, v15, v19);      // a18 = a18' ^ (~a19 & a15')
    __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
    __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
    __ bcax(v16, __ T16B, v16, v3, v25);
// a16 = a16 ^ (~a17' & a18') 4155 4156 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12) 4157 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13) 4158 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14) 4159 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10') 4160 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11') 4161 4162 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9) 4163 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5) 4164 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6) 4165 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7) 4166 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8') 4167 4168 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0) 4169 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1) 4170 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2) 4171 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3) 4172 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4') 4173 4174 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc 4175 } 4176 4177 // Arguments: 4178 // 4179 // Inputs: 4180 // c_rarg0 - byte[] source+offset 4181 // c_rarg1 - byte[] SHA.state 4182 // c_rarg2 - int block_size 4183 // c_rarg3 - int offset 4184 // c_rarg4 - int limit 4185 // 4186 address generate_sha3_implCompress(StubGenStubId stub_id) { 4187 bool multi_block; 4188 switch (stub_id) { 4189 case sha3_implCompress_id: 4190 multi_block = false; 4191 break; 4192 case sha3_implCompressMB_id: 4193 multi_block = true; 4194 break; 4195 default: 4196 ShouldNotReachHere(); 4197 } 4198 4199 static const uint64_t round_consts[24] = { 4200 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4201 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4202 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4203 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4204 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4205 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4206 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4207 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4208 }; 4209 4210 __ align(CodeEntryAlignment); 4211 4212 StubCodeMark mark(this, stub_id); 4213 address start = __ pc(); 4214 4215 Register buf = c_rarg0; 4216 Register state = c_rarg1; 4217 Register block_size = c_rarg2; 4218 Register ofs = c_rarg3; 4219 Register limit = c_rarg4; 4220 4221 Label sha3_loop, rounds24_loop; 4222 Label sha3_512_or_sha3_384, shake128; 4223 4224 __ stpd(v8, v9, __ pre(sp, -64)); 4225 __ stpd(v10, v11, Address(sp, 16)); 4226 __ stpd(v12, v13, Address(sp, 32)); 4227 __ stpd(v14, v15, Address(sp, 48)); 4228 4229 // load state 4230 __ add(rscratch1, state, 32); 4231 __ ld1(v0, v1, v2, v3, __ T1D, state); 4232 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4233 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4234 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4235 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4236 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4237 __ ld1(v24, __ T1D, rscratch1); 4238 4239 __ BIND(sha3_loop); 4240 4241 // 24 keccak rounds 4242 __ movw(rscratch2, 24); 4243 4244 // load round_constants base 4245 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4246 4247 // load input 4248 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4249 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 
24)); 4250 __ eor(v0, __ T8B, v0, v25); 4251 __ eor(v1, __ T8B, v1, v26); 4252 __ eor(v2, __ T8B, v2, v27); 4253 __ eor(v3, __ T8B, v3, v28); 4254 __ eor(v4, __ T8B, v4, v29); 4255 __ eor(v5, __ T8B, v5, v30); 4256 __ eor(v6, __ T8B, v6, v31); 4257 4258 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4259 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4260 4261 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4262 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4263 __ eor(v7, __ T8B, v7, v25); 4264 __ eor(v8, __ T8B, v8, v26); 4265 __ eor(v9, __ T8B, v9, v27); 4266 __ eor(v10, __ T8B, v10, v28); 4267 __ eor(v11, __ T8B, v11, v29); 4268 __ eor(v12, __ T8B, v12, v30); 4269 __ eor(v13, __ T8B, v13, v31); 4270 4271 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4272 __ eor(v14, __ T8B, v14, v25); 4273 __ eor(v15, __ T8B, v15, v26); 4274 __ eor(v16, __ T8B, v16, v27); 4275 4276 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4277 __ andw(c_rarg5, block_size, 48); 4278 __ cbzw(c_rarg5, rounds24_loop); 4279 4280 __ tbnz(block_size, 5, shake128); 4281 // block_size == 144, bit5 == 0, SHA3-224 4282 __ ldrd(v28, __ post(buf, 8)); 4283 __ eor(v17, __ T8B, v17, v28); 4284 __ b(rounds24_loop); 4285 4286 __ BIND(shake128); 4287 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4288 __ eor(v17, __ T8B, v17, v28); 4289 __ eor(v18, __ T8B, v18, v29); 4290 __ eor(v19, __ T8B, v19, v30); 4291 __ eor(v20, __ T8B, v20, v31); 4292 __ b(rounds24_loop); // block_size == 168, SHAKE128 4293 4294 __ BIND(sha3_512_or_sha3_384); 4295 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4296 __ eor(v7, __ T8B, v7, v25); 4297 __ eor(v8, __ T8B, v8, v26); 4298 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4299 4300 // SHA3-384 4301 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4302 __ eor(v9, __ T8B, v9, v27); 4303 __ eor(v10, __ T8B, v10, v28); 4304 __ eor(v11, __ T8B, v11, v29); 4305 __ eor(v12, __ T8B, v12, v30); 4306 4307 __ BIND(rounds24_loop); 4308 __ subw(rscratch2, rscratch2, 1); 4309 4310 keccak_round(rscratch1); 4311 4312 __ cbnzw(rscratch2, rounds24_loop); 4313 4314 if (multi_block) { 4315 __ add(ofs, ofs, block_size); 4316 __ cmp(ofs, limit); 4317 __ br(Assembler::LE, sha3_loop); 4318 __ mov(c_rarg0, ofs); // return ofs 4319 } 4320 4321 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4322 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4323 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4324 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4325 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4326 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4327 __ st1(v24, __ T1D, state); 4328 4329 // restore callee-saved registers 4330 __ ldpd(v14, v15, Address(sp, 48)); 4331 __ ldpd(v12, v13, Address(sp, 32)); 4332 __ ldpd(v10, v11, Address(sp, 16)); 4333 __ ldpd(v8, v9, __ post(sp, 64)); 4334 4335 __ ret(lr); 4336 4337 return start; 4338 } 4339 4340 // Inputs: 4341 // c_rarg0 - long[] state0 4342 // c_rarg1 - long[] state1 4343 address generate_double_keccak() { 4344 static const uint64_t round_consts[24] = { 4345 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4346 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4347 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4348 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4349 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4350 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4351 0x000000000000800AL, 
0x800000008000000AL, 0x8000000080008081L, 4352 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4353 }; 4354 4355 // Implements the double_keccak() method of the 4356 // sun.secyrity.provider.SHA3Parallel class 4357 __ align(CodeEntryAlignment); 4358 StubCodeMark mark(this, "StubRoutines", "double_keccak"); 4359 address start = __ pc(); 4360 __ enter(); 4361 4362 Register state0 = c_rarg0; 4363 Register state1 = c_rarg1; 4364 4365 Label rounds24_loop; 4366 4367 // save callee-saved registers 4368 __ stpd(v8, v9, __ pre(sp, -64)); 4369 __ stpd(v10, v11, Address(sp, 16)); 4370 __ stpd(v12, v13, Address(sp, 32)); 4371 __ stpd(v14, v15, Address(sp, 48)); 4372 4373 // load states 4374 __ add(rscratch1, state0, 32); 4375 __ ld4(v0, v1, v2, v3, __ D, 0, state0); 4376 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32)); 4377 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32)); 4378 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32)); 4379 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32)); 4380 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32)); 4381 __ ld1(v24, __ D, 0, rscratch1); 4382 __ add(rscratch1, state1, 32); 4383 __ ld4(v0, v1, v2, v3, __ D, 1, state1); 4384 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32)); 4385 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32)); 4386 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32)); 4387 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32)); 4388 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32)); 4389 __ ld1(v24, __ D, 1, rscratch1); 4390 4391 // 24 keccak rounds 4392 __ movw(rscratch2, 24); 4393 4394 // load round_constants base 4395 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4396 4397 __ BIND(rounds24_loop); 4398 __ subw(rscratch2, rscratch2, 1); 4399 keccak_round(rscratch1); 4400 __ cbnzw(rscratch2, rounds24_loop); 4401 4402 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); 4403 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32)); 4404 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32)); 4405 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32)); 4406 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32)); 4407 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32)); 4408 __ st1(v24, __ D, 0, state0); 4409 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); 4410 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32)); 4411 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32)); 4412 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32)); 4413 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32)); 4414 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32)); 4415 __ st1(v24, __ D, 1, state1); 4416 4417 // restore callee-saved vector registers 4418 __ ldpd(v14, v15, Address(sp, 48)); 4419 __ ldpd(v12, v13, Address(sp, 32)); 4420 __ ldpd(v10, v11, Address(sp, 16)); 4421 __ ldpd(v8, v9, __ post(sp, 64)); 4422 4423 __ leave(); // required for proper stackwalking of RuntimeStub frame 4424 __ mov(r0, zr); // return 0 4425 __ ret(lr); 4426 4427 return start; 4428 } 4429 4430 /** 4431 * Arguments: 4432 * 4433 * Inputs: 4434 * c_rarg0 - int crc 4435 * c_rarg1 - byte* buf 4436 * c_rarg2 - int length 4437 * 4438 * Output: 4439 * rax - int crc result 4440 */ 4441 address generate_updateBytesCRC32() { 4442 assert(UseCRC32Intrinsics, "what are we doing here?"); 4443 4444 __ align(CodeEntryAlignment); 4445 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32_id; 4446 StubCodeMark mark(this, stub_id); 4447 4448 address start = __ pc(); 4449 
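    // For reference, a minimal bit-at-a-time model of the CRC-32 being
    // computed (reflected polynomial 0xEDB88320, as used by
    // java.util.zip.CRC32). This is only a sketch for cross-checking the
    // table-driven kernel_crc32 below; the inversion of crc on entry and
    // exit may be handled inside kernel_crc32 or by the Java caller:
    //
    //   uint32_t crc32_ref(uint32_t crc, const uint8_t* buf, size_t len) {
    //     crc = ~crc;
    //     while (len--) {
    //       crc ^= *buf++;
    //       for (int k = 0; k < 8; k++) {
    //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
    //       }
    //     }
    //     return ~crc;
    //   }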
4450 const Register crc = c_rarg0; // crc 4451 const Register buf = c_rarg1; // source java byte array address 4452 const Register len = c_rarg2; // length 4453 const Register table0 = c_rarg3; // crc_table address 4454 const Register table1 = c_rarg4; 4455 const Register table2 = c_rarg5; 4456 const Register table3 = c_rarg6; 4457 const Register tmp3 = c_rarg7; 4458 4459 BLOCK_COMMENT("Entry:"); 4460 __ enter(); // required for proper stackwalking of RuntimeStub frame 4461 4462 __ kernel_crc32(crc, buf, len, 4463 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4464 4465 __ leave(); // required for proper stackwalking of RuntimeStub frame 4466 __ ret(lr); 4467 4468 return start; 4469 } 4470 4471 // ChaCha20 block function. This version parallelizes 4 quarter 4472 // round operations at a time. It uses 16 SIMD registers to 4473 // produce 4 blocks of key stream. 4474 // 4475 // state (int[16]) = c_rarg0 4476 // keystream (byte[256]) = c_rarg1 4477 // return - number of bytes of keystream (always 256) 4478 // 4479 // In this approach, we load the 512-bit start state sequentially into 4480 // 4 128-bit vectors. We then make 4 4-vector copies of that starting 4481 // state, with each successive set of 4 vectors having a +1 added into 4482 // the first 32-bit lane of the 4th vector in that group (the counter). 4483 // By doing this, we can perform the block function on 4 512-bit blocks 4484 // within one run of this intrinsic. 4485 // The alignment of the data across the 4-vector group is such that at 4486 // the start it is already aligned for the first round of each two-round 4487 // loop iteration. In other words, the corresponding lanes of each vector 4488 // will contain the values needed for that quarter round operation (e.g. 4489 // elements 0/4/8/12, 1/5/9/13, 2/6/10/14, etc.). 4490 // In between each full round, a lane shift must occur. Within a loop 4491 // iteration, between the first and second rounds, the 2nd, 3rd, and 4th 4492 // vectors are rotated left 32, 64 and 96 bits, respectively. The result 4493 // is effectively a diagonal orientation in columnar form. After the 4494 // second full round, those registers are left-rotated again, this time 4495 // 96, 64, and 32 bits - returning the vectors to their columnar organization. 4496 // After all 10 iterations, the original state is added to each 4-vector 4497 // working state along with the add mask, and the 4 vector groups are 4498 // sequentially written to the memory dedicated for the output key stream. 4499 // 4500 // For a more detailed explanation, see Goll and Gueron, "Vectorization of 4501 // ChaCha Stream Cipher", 2014 11th Int. Conf. on Information Technology: 4502 // New Generations, Las Vegas, NV, USA, April 2014, DOI: 10.1109/ITNG.2014.33 4503 address generate_chacha20Block_qrpar() { 4504 Label L_Q_twoRounds, L_Q_cc20_const; 4505 // The constant data is broken into two 128-bit segments to be loaded 4506 // onto SIMD registers. The first 128 bits are a counter add overlay 4507 // that adds +1/+0/+0/+0 to the vectors holding replicated state[12]. 4508 // The second 128-bits is a table constant used for 8-bit left rotations. 4509 // on 32-bit lanes within a SIMD register. 
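    // Viewed as byte indices, the rotation table below is
    //   { 3, 0, 1, 2,  7, 4, 5, 6,  11, 8, 9, 10,  15, 12, 13, 14 }
    // so a tbl-style byte shuffle through it produces rotl(x, 8) on each
    // 32-bit lane (presumably how cc20_quarter_round, defined in the macro
    // assembler, implements the 8-bit rotate). For reference, the scalar
    // quarter round being vectorized is the standard RFC 7539 one:
    //   a += b; d ^= a; d = rotl32(d, 16);
    //   c += d; b ^= c; b = rotl32(b, 12);
    //   a += b; d ^= a; d = rotl32(d, 8);
    //   c += d; b ^= c; b = rotl32(b, 7);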
4510 __ BIND(L_Q_cc20_const); 4511 __ emit_int64(0x0000000000000001UL); 4512 __ emit_int64(0x0000000000000000UL); 4513 __ emit_int64(0x0605040702010003UL); 4514 __ emit_int64(0x0E0D0C0F0A09080BUL); 4515 4516 __ align(CodeEntryAlignment); 4517 StubGenStubId stub_id = StubGenStubId::chacha20Block_id; 4518 StubCodeMark mark(this, stub_id); 4519 address start = __ pc(); 4520 __ enter(); 4521 4522 const Register state = c_rarg0; 4523 const Register keystream = c_rarg1; 4524 const Register loopCtr = r10; 4525 const Register tmpAddr = r11; 4526 4527 const FloatRegister aState = v0; 4528 const FloatRegister bState = v1; 4529 const FloatRegister cState = v2; 4530 const FloatRegister dState = v3; 4531 const FloatRegister a1Vec = v4; 4532 const FloatRegister b1Vec = v5; 4533 const FloatRegister c1Vec = v6; 4534 const FloatRegister d1Vec = v7; 4535 // Skip the callee-saved registers v8 - v15 4536 const FloatRegister a2Vec = v16; 4537 const FloatRegister b2Vec = v17; 4538 const FloatRegister c2Vec = v18; 4539 const FloatRegister d2Vec = v19; 4540 const FloatRegister a3Vec = v20; 4541 const FloatRegister b3Vec = v21; 4542 const FloatRegister c3Vec = v22; 4543 const FloatRegister d3Vec = v23; 4544 const FloatRegister a4Vec = v24; 4545 const FloatRegister b4Vec = v25; 4546 const FloatRegister c4Vec = v26; 4547 const FloatRegister d4Vec = v27; 4548 const FloatRegister scratch = v28; 4549 const FloatRegister addMask = v29; 4550 const FloatRegister lrot8Tbl = v30; 4551 4552 // Load the initial state in the first 4 quadword registers, 4553 // then copy the initial state into the next 4 quadword registers 4554 // that will be used for the working state. 4555 __ ld1(aState, bState, cState, dState, __ T16B, Address(state)); 4556 4557 // Load the index register for 2 constant 128-bit data fields. 4558 // The first represents the +1/+0/+0/+0 add mask. The second is 4559 // the 8-bit left rotation. 4560 __ adr(tmpAddr, L_Q_cc20_const); 4561 __ ldpq(addMask, lrot8Tbl, Address(tmpAddr)); 4562 4563 __ mov(a1Vec, __ T16B, aState); 4564 __ mov(b1Vec, __ T16B, bState); 4565 __ mov(c1Vec, __ T16B, cState); 4566 __ mov(d1Vec, __ T16B, dState); 4567 4568 __ mov(a2Vec, __ T16B, aState); 4569 __ mov(b2Vec, __ T16B, bState); 4570 __ mov(c2Vec, __ T16B, cState); 4571 __ addv(d2Vec, __ T4S, d1Vec, addMask); 4572 4573 __ mov(a3Vec, __ T16B, aState); 4574 __ mov(b3Vec, __ T16B, bState); 4575 __ mov(c3Vec, __ T16B, cState); 4576 __ addv(d3Vec, __ T4S, d2Vec, addMask); 4577 4578 __ mov(a4Vec, __ T16B, aState); 4579 __ mov(b4Vec, __ T16B, bState); 4580 __ mov(c4Vec, __ T16B, cState); 4581 __ addv(d4Vec, __ T4S, d3Vec, addMask); 4582 4583 // Set up the 10 iteration loop 4584 __ mov(loopCtr, 10); 4585 __ BIND(L_Q_twoRounds); 4586 4587 // The first set of operations on the vectors covers the first 4 quarter 4588 // round operations: 4589 // Qround(state, 0, 4, 8,12) 4590 // Qround(state, 1, 5, 9,13) 4591 // Qround(state, 2, 6,10,14) 4592 // Qround(state, 3, 7,11,15) 4593 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4594 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4595 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4596 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4597 4598 // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors to 4599 // diagonals. The a1Vec does not need to change orientation. 
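    // Concretely, with a/b/c/d holding lanes {0..3}/{4..7}/{8..11}/{12..15}
    // of the state, rotating b, c and d left by one, two and three 32-bit
    // lanes respectively leaves them as
    //   b: {5, 6, 7, 4}   c: {10, 11, 8, 9}   d: {15, 12, 13, 14}
    // so lane j of (a, b, c, d) now holds diagonal j, e.g. lane 0 holds
    // state elements {0, 5, 10, 15}.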
4600 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, true); 4601 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, true); 4602 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, true); 4603 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, true); 4604 4605 // The second set of operations on the vectors covers the second 4 quarter 4606 // round operations, now acting on the diagonals: 4607 // Qround(state, 0, 5,10,15) 4608 // Qround(state, 1, 6,11,12) 4609 // Qround(state, 2, 7, 8,13) 4610 // Qround(state, 3, 4, 9,14) 4611 __ cc20_quarter_round(a1Vec, b1Vec, c1Vec, d1Vec, scratch, lrot8Tbl); 4612 __ cc20_quarter_round(a2Vec, b2Vec, c2Vec, d2Vec, scratch, lrot8Tbl); 4613 __ cc20_quarter_round(a3Vec, b3Vec, c3Vec, d3Vec, scratch, lrot8Tbl); 4614 __ cc20_quarter_round(a4Vec, b4Vec, c4Vec, d4Vec, scratch, lrot8Tbl); 4615 4616 // Before we start the next iteration, we need to perform shuffles 4617 // on the b/c/d vectors to move them back to columnar organizations 4618 // from their current diagonal orientation. 4619 __ cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, false); 4620 __ cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, false); 4621 __ cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, false); 4622 __ cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, false); 4623 4624 // Decrement and iterate 4625 __ sub(loopCtr, loopCtr, 1); 4626 __ cbnz(loopCtr, L_Q_twoRounds); 4627 4628 // Once the counter reaches zero, we fall out of the loop 4629 // and need to add the initial state back into the working state 4630 // represented by the a/b/c/d1Vec registers. This is destructive 4631 // on the dState register but we no longer will need it. 4632 __ addv(a1Vec, __ T4S, a1Vec, aState); 4633 __ addv(b1Vec, __ T4S, b1Vec, bState); 4634 __ addv(c1Vec, __ T4S, c1Vec, cState); 4635 __ addv(d1Vec, __ T4S, d1Vec, dState); 4636 4637 __ addv(a2Vec, __ T4S, a2Vec, aState); 4638 __ addv(b2Vec, __ T4S, b2Vec, bState); 4639 __ addv(c2Vec, __ T4S, c2Vec, cState); 4640 __ addv(dState, __ T4S, dState, addMask); 4641 __ addv(d2Vec, __ T4S, d2Vec, dState); 4642 4643 __ addv(a3Vec, __ T4S, a3Vec, aState); 4644 __ addv(b3Vec, __ T4S, b3Vec, bState); 4645 __ addv(c3Vec, __ T4S, c3Vec, cState); 4646 __ addv(dState, __ T4S, dState, addMask); 4647 __ addv(d3Vec, __ T4S, d3Vec, dState); 4648 4649 __ addv(a4Vec, __ T4S, a4Vec, aState); 4650 __ addv(b4Vec, __ T4S, b4Vec, bState); 4651 __ addv(c4Vec, __ T4S, c4Vec, cState); 4652 __ addv(dState, __ T4S, dState, addMask); 4653 __ addv(d4Vec, __ T4S, d4Vec, dState); 4654 4655 // Write the final state back to the result buffer 4656 __ st1(a1Vec, b1Vec, c1Vec, d1Vec, __ T16B, __ post(keystream, 64)); 4657 __ st1(a2Vec, b2Vec, c2Vec, d2Vec, __ T16B, __ post(keystream, 64)); 4658 __ st1(a3Vec, b3Vec, c3Vec, d3Vec, __ T16B, __ post(keystream, 64)); 4659 __ st1(a4Vec, b4Vec, c4Vec, d4Vec, __ T16B, __ post(keystream, 64)); 4660 4661 __ mov(r0, 256); // Return length of output keystream 4662 __ leave(); 4663 __ ret(lr); 4664 4665 return start; 4666 } 4667 4668 // Helpers to schedule parallel operation bundles across vector 4669 // register sequences of size 2, 4 or 8. 
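  // For example, assuming VSeq<N>(base) names the N consecutive vector
  // registers starting at v<base> (as the uses further down suggest), a
  // call such as
  //
  //   VSeq<4> va(0), vb(4), vc(8);
  //   vs_addv(va, __ T4S, vb, vc);
  //
  // expands to four independent instructions
  //
  //   addv(v0, T4S, v4, v8);  addv(v1, T4S, v5, v9);
  //   addv(v2, T4S, v6, v10); addv(v3, T4S, v7, v11);
  //
  // giving the out-of-order core four parallel dependency chains to
  // schedule.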
4670 4671 // Implement various primitive computations across vector sequences 4672 4673 template<int N> 4674 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4675 const VSeq<N>& v1, const VSeq<N>& v2) { 4676 for (int i = 0; i < N; i++) { 4677 __ addv(v[i], T, v1[i], v2[i]); 4678 } 4679 } 4680 4681 template<int N> 4682 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4683 const VSeq<N>& v1, const VSeq<N>& v2) { 4684 for (int i = 0; i < N; i++) { 4685 __ subv(v[i], T, v1[i], v2[i]); 4686 } 4687 } 4688 4689 template<int N> 4690 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4691 const VSeq<N>& v1, const VSeq<N>& v2) { 4692 for (int i = 0; i < N; i++) { 4693 __ mulv(v[i], T, v1[i], v2[i]); 4694 } 4695 } 4696 4697 template<int N> 4698 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) { 4699 for (int i = 0; i < N; i++) { 4700 __ negr(v[i], T, v1[i]); 4701 } 4702 } 4703 4704 template<int N> 4705 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, 4706 const VSeq<N>& v1, int shift) { 4707 for (int i = 0; i < N; i++) { 4708 __ sshr(v[i], T, v1[i], shift); 4709 } 4710 } 4711 4712 template<int N> 4713 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4714 for (int i = 0; i < N; i++) { 4715 __ andr(v[i], __ T16B, v1[i], v2[i]); 4716 } 4717 } 4718 4719 template<int N> 4720 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) { 4721 for (int i = 0; i < N; i++) { 4722 __ orr(v[i], __ T16B, v1[i], v2[i]); 4723 } 4724 } 4725 4726 template<int N> 4727 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) { 4728 for (int i = 0; i < N; i++) { 4729 __ notr(v[i], __ T16B, v1[i]); 4730 } 4731 } 4732 4733 // load N/2 successive pairs of quadword values from memory in order 4734 // into N successive vector registers of the sequence via the 4735 // address supplied in base. 
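  // (e.g. for a VSeq<4> starting at v0 this emits ldp q0, q1, [base] then
  //  ldp q2, q3, [base, #32], i.e. 64 contiguous bytes)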
  template<int N>
  void vs_ldpq(const VSeq<N>& v, Register base) {
    for (int i = 0; i < N; i += 2) {
      __ ldpq(v[i], v[i+1], Address(base, 16 * i));
    }
  }

  // load N/2 successive pairs of quadword values from memory in order
  // into N vector registers of the sequence via the address supplied
  // in base using post-increment addressing
  template<int N>
  void vs_ldpq_post(const VSeq<N>& v, Register base) {
    for (int i = 0; i < N; i += 2) {
      __ ldpq(v[i], v[i+1], __ post(base, 32));
    }
  }

  // store N successive vector registers of the sequence into N/2
  // successive pairs of quadword memory locations via the address
  // supplied in base using post-increment addressing
  template<int N>
  void vs_stpq_post(const VSeq<N>& v, Register base) {
    for (int i = 0; i < N; i += 2) {
      __ stpq(v[i], v[i+1], __ post(base, 32));
    }
  }

  // load N/2 pairs of quadword values from memory into N vector
  // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
  // offsets array
  template<int N>
  void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
    }
  }

  // store N vector registers into N/2 pairs of quadword memory
  // locations via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
  // offsets array
  template<int N>
  void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
    }
  }

  // load N single quadword values from memory into N vector registers
  // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
  // array
  template<int N>
  void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
                      int start, int (&offsets)[N]) {
    for (int i = 0; i < N; i++) {
      __ ldr(v[i], T, Address(base, start + offsets[i]));
    }
  }

  // store N vector registers into N single quadword memory locations
  // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
  // array
  template<int N>
  void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
                      int start, int (&offsets)[N]) {
    for (int i = 0; i < N; i++) {
      __ str(v[i], T, Address(base, start + offsets[i]));
    }
  }

  // load N/2 pairs of quadword values from memory de-interleaved into
  // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
  // corresponding entry in the offsets array
  template<int N>
  void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
                      Register tmp, int start, int (&offsets)[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ add(tmp, base, start + offsets[i]);
      __ ld2(v[2*i], v[2*i+1], T, tmp);
    }
  }

  // store N vector registers 2 at a time interleaved into N/2 pairs
  // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
  // corresponding entry in the offsets array
  template<int N>
  void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
                      Register tmp, int start, int (&offsets)[N/2]) {
    for (int i = 0; i < N/2; i++) {
      __ add(tmp, base, start + offsets[i]);
      __ st2(v[2*i], v[2*i+1], T, tmp);
    }
  }

  // Helper routines for various flavours of dilithium montgomery
  // multiply

  // Perform 16 32-bit Montgomery multiplications in parallel
  // See the montMul() method of the sun.security.provider.ML_DSA class.
  //
  // Computes 4x4S results
  //    a = b * c * 2^-32 mod MONT_Q
  // Inputs:  vb, vc - 4x4S vector register sequences
  //          vq - 2x4S constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 4x4S vector sequence trashed after call
  // Outputs: va - 4x4S vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // n.b. MONT_R_BITS is 32, so the right shift by it is implicit.
  void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                           const VSeq<4>& vtmp, const VSeq<2>& vq) {
    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");

    // schedule 4 streams of instructions across the vector sequences
    for (int i = 0; i < 4; i++) {
      __ sqdmulh(vtmp[i], __ T4S, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
      __ mulv(va[i], __ T4S, vb[i], vc[i]);      // aLow = lo32(b * c)
    }

    for (int i = 0; i < 4; i++) {
      __ mulv(va[i], __ T4S, va[i], vq[0]);      // m = aLow * qinv
    }

    for (int i = 0; i < 4; i++) {
      __ sqdmulh(va[i], __ T4S, va[i], vq[1]);   // n = hi32(2 * m * q)
    }

    for (int i = 0; i < 4; i++) {
      __ shsubv(va[i], __ T4S, vtmp[i], va[i]);  // a = (aHigh - n) / 2
    }
  }

  // Perform 2x16 32-bit Montgomery multiplications in parallel
  // See the montMul() method of the sun.security.provider.ML_DSA class.
  //
  // Computes 8x4S results
  //    a = b * c * 2^-32 mod MONT_Q
  // Inputs:  vb, vc - 8x4S vector register sequences
  //          vq - 2x4S constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 4x4S vector sequence trashed after call
  // Outputs: va - 8x4S vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // n.b. MONT_R_BITS is 32, so the right shift by it is implicit.
  void vs_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
                    const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // vb, vc, vtmp and vq must be disjoint.
va must either be
4900 // disjoint from all other registers or equal vc
4901
4902 assert(vs_disjoint(vb, vc), "vb and vc overlap");
4903 assert(vs_disjoint(vb, vq), "vb and vq overlap");
4904 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
4905
4906 assert(vs_disjoint(vc, vq), "vc and vq overlap");
4907 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
4908
4909 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
4910
4911 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
4912 assert(vs_disjoint(va, vb), "va and vb overlap");
4913 assert(vs_disjoint(va, vq), "va and vq overlap");
4914 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
4915
4916 // we need to multiply the front and back halves of each sequence
4917 // 4x4S at a time because
4918 //
4919 // 1) we are currently only able to get 4-way instruction
4920 // parallelism at best
4921 //
4922 // 2) we need registers for the constants in vq and temporary
4923 // scratch registers to hold intermediate results so vtmp can only
4924 // be a VSeq<4> which means we only have 4 scratch slots
4925
4926 dilithium_montmul16(vs_front(va), vs_front(vb), vs_front(vc), vtmp, vq);
4927 dilithium_montmul16(vs_back(va), vs_back(vb), vs_back(vc), vtmp, vq);
4928 }
4929
4930 // perform combined montmul then add/sub on 4x4S vectors
4931
4932 void dilithium_montmul16_sub_add(const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
4933 const VSeq<4>& vtmp, const VSeq<2>& vq) {
4934 // compute a = montmul(a1, c)
4935 dilithium_montmul16(vc, va1, vc, vtmp, vq);
4936 // output a1 = a0 - a
4937 vs_subv(va1, __ T4S, va0, vc);
4938 // and a0 = a0 + a
4939 vs_addv(va0, __ T4S, va0, vc);
4940 }
4941
4942 // perform combined add/sub then montmul on 4x4S vectors
4943
4944 void dilithium_sub_add_montmul16(const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
4945 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
4946 // compute c = a0 - a1
4947 vs_subv(vtmp1, __ T4S, va0, va1);
4948 // output a0 = a0 + a1
4949 vs_addv(va0, __ T4S, va0, va1);
4950 // output a1 = b montmul c
4951 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
4952 }
4953
4954 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
4955 // in the Java implementation come in sequences of at least 8, so we
4956 // can use ldpq to collect the corresponding data into pairs of vector
4957 // registers.
4958 // We collect the coefficients corresponding to the 'j+l' indexes into
4959 // the vector registers v0-v7, the zetas into the vector registers v16-v23
4960 // then we do the (Montgomery) multiplications by the zetas in parallel
4961 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
4962 // v0-v7, then do the additions into v24-v31 and the subtractions into
4963 // v0-v7 and finally save the results back to the coeffs array.
4964 void dilithiumNttLevel0_4(const Register dilithiumConsts,
4965 const Register coeffs, const Register zetas) {
4966 int c1 = 0;
4967 int c2 = 512;
4968 int startIncr;
4969 // don't use callee save registers v8 - v15
4970 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
4971 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
4972 VSeq<2> vq(30); // n.b.
constants overlap vs3
4973 int offsets[4] = { 0, 32, 64, 96 };
4974
4975 for (int level = 0; level < 5; level++) {
4976 int c1Start = c1;
4977 int c2Start = c2;
4978 if (level == 3) {
4979 offsets[1] = 32;
4980 offsets[2] = 128;
4981 offsets[3] = 160;
4982 } else if (level == 4) {
4983 offsets[1] = 64;
4984 offsets[2] = 128;
4985 offsets[3] = 192;
4986 }
4987
4988 // for levels 1 - 4 we simply load 2 x 4 adjacent values at a
4989 // time at 4 different offsets and multiply them in order by the
4990 // next set of input values. So we employ indexed load and store
4991 // pair instructions with arrangement 4S
4992 for (int i = 0; i < 4; i++) {
4993 // reload q and qinv
4994 vs_ldpq(vq, dilithiumConsts); // qInv, q
4995 // load 8x4S coefficients via second start pos == c2
4996 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
4997 // load next 8x4S inputs == b
4998 vs_ldpq_post(vs2, zetas);
4999 // compute a == c2 * b mod MONT_Q
5000 vs_montmul32(vs2, vs1, vs2, vtmp, vq);
5001 // load 8x4s coefficients via first start pos == c1
5002 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
5003 // compute a1 = c1 + a
5004 vs_addv(vs3, __ T4S, vs1, vs2);
5005 // compute a2 = c1 - a
5006 vs_subv(vs1, __ T4S, vs1, vs2);
5007 // output a1 and a2
5008 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
5009 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
5010
5011 int k = 4 * level + i;
5012
5013 if (k > 7) {
5014 startIncr = 256;
5015 } else if (k == 5) {
5016 startIncr = 384;
5017 } else {
5018 startIncr = 128;
5019 }
5020
5021 c1Start += startIncr;
5022 c2Start += startIncr;
5023 }
5024
5025 c2 /= 2;
5026 }
5027 }
5028
5029 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
5030 // Implements the method
5031 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
5032 // of the Java class sun.security.provider.ML_DSA
5033 //
5034 // coeffs (int[256]) = c_rarg0
5035 // zetas (int[256]) = c_rarg1
5036 address generate_dilithiumAlmostNtt() {
5037
5038 __ align(CodeEntryAlignment);
5039 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostNtt_id;
5040 StubCodeMark mark(this, stub_id);
5041 address start = __ pc();
5042 __ enter();
5043
5044 const Register coeffs = c_rarg0;
5045 const Register zetas = c_rarg1;
5046
5047 const Register tmpAddr = r9;
5048 const Register dilithiumConsts = r10;
5049 const Register result = r11;
5050 // don't use callee save registers v8 - v15
5051 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
5052 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5053 VSeq<2> vq(30); // n.b. constants overlap vs3
5054 int offsets[4] = {0, 32, 64, 96};
5055 int offsets1[8] = {16, 48, 80, 112, 144, 176, 208, 240 };
5056 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5057 __ add(result, coeffs, 0);
5058 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
5059
5060 // Each level represents one iteration of the outer for loop of the Java version
5061
5062 // level 0-4
5063 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
5064
5065 // level 5
5066
5067 // at level 5 the coefficients we need to combine with the zetas
5068 // are grouped in memory in blocks of size 4. So, for both sets of
5069 // coefficients we load 4 adjacent values at 8 different offsets
5070 // using an indexed ldr with register variant Q and multiply them
5071 // in sequence order by the next set of inputs. Likewise we store
5072 // the results using an indexed str with register variant Q.
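  // For reference, a scalar sketch (not generated code) of the butterfly
  // that every level of this stub applies to a pair of indices j and
  // j + l, where montMul denotes the Montgomery product
  // b * c * 2^-32 mod MONT_Q computed by vs_montmul32 above:
  //
  //   int a = montMul(zeta, coeffs[j + l]);
  //   coeffs[j + l] = coeffs[j] - a;
  //   coeffs[j]     = coeffs[j] + a;
  //
  // The loops below batch 32 such butterflies per iteration; only the
  // layout of the j and j + l indices in memory changes from level to
  // level, which is what the different load/store patterns reflect.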
5073 for (int i = 0; i < 1024; i += 256) {
5074 // reload constants q, qinv each iteration as they get clobbered later
5075 vs_ldpq(vq, dilithiumConsts); // qInv, q
5076 // load 32 (8x4S) coefficients via first offsets = c1
5077 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
5078 // load next 32 (8x4S) inputs = b
5079 vs_ldpq_post(vs2, zetas);
5080 // a = b montmul c1
5081 vs_montmul32(vs2, vs1, vs2, vtmp, vq);
5082 // load 32 (8x4S) coefficients via second offsets = c2
5083 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
5084 // add/sub with result of multiply
5085 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
5086 vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a
5087 // write back new coefficients using same offsets
5088 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
5089 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
5090 }
5091
5092 // level 6
5093 // at level 6 the coefficients we need to combine with the zetas
5094 // are grouped in memory in pairs, the first two being montmul
5095 // inputs and the second add/sub inputs. We can still implement
5096 // the montmul+sub+add using 4-way parallelism but only if we
5097 // combine the coefficients with the zetas 16 at a time. We load 8
5098 // adjacent values at 4 different offsets using an ld2 load with
5099 // arrangement 2D. That interleaves the lower and upper halves of
5100 // each pair of quadwords into successive vector registers. We
5101 // then need to montmul the 4 even elements of the coefficients
5102 // register sequence by the zetas in order and then add/sub the 4
5103 // odd elements of the coefficients register sequence. We use an
5104 // equivalent st2 operation to store the results back into memory
5105 // de-interleaved.
5106 for (int i = 0; i < 1024; i += 128) {
5107 // reload constants q, qinv each iteration as they get clobbered later
5108 vs_ldpq(vq, dilithiumConsts); // qInv, q
5109 // load interleaved 16 (4x2D) coefficients via offsets
5110 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
5111 // load next 16 (4x4S) inputs
5112 vs_ldpq_post(vs_front(vs2), zetas);
5113 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
5114 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
5115 vs_front(vs2), vtmp, vq);
5116 // store interleaved 16 (4x2D) coefficients via offsets
5117 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
5118 }
5119
5120 // level 7
5121 // at level 7 the coefficients we need to combine with the zetas
5122 // occur singly with montmul inputs alternating with add/sub
5123 // inputs. Once again we can use 4-way parallelism to combine 16
5124 // zetas at a time. However, we have to load 8 adjacent values at
5125 // 4 different offsets using an ld2 load with arrangement 4S. That
5126 // interleaves the odd words of each pair into one
5127 // coefficients vector register and the even words of the pair
5128 // into the next register. We then need to montmul the 4 even
5129 // elements of the coefficients register sequence by the zetas in
5130 // order and then add/sub the 4 odd elements of the coefficients
5131 // register sequence. We use an equivalent st2 operation to store
5132 // the results back into memory de-interleaved.
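  // To illustrate the ld2/st2 access pattern used below (a sketch, not
  // generated code): given 8 int words w0..w7 at the effective address,
  // an ld2 with arrangement 4S de-interleaves them into two vector
  // registers as
  //
  //   first register  <- { w0, w2, w4, w6 }   // even words: add/sub inputs
  //   second register <- { w1, w3, w5, w7 }   // odd words: montmul inputs
  //
  // and the matching st2 writes the two registers back re-interleaved in
  // the original word order.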
5133
5134 for (int i = 0; i < 1024; i += 128) {
5135 // reload constants q, qinv each iteration as they get clobbered later
5136 vs_ldpq(vq, dilithiumConsts); // qInv, q
5137 // load interleaved 16 (4x4S) coefficients via offsets
5138 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
5139 // load next 16 (4x4S) inputs
5140 vs_ldpq_post(vs_front(vs2), zetas);
5141 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
5142 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
5143 vs_front(vs2), vtmp, vq);
5144 // store interleaved 16 (4x4S) coefficients via offsets
5145 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
5146 }
5147 __ leave(); // required for proper stackwalking of RuntimeStub frame
5148 __ mov(r0, zr); // return 0
5149 __ ret(lr);
5150
5151 return start;
5152 }
5153
5154 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
5155 // in the Java implementation come in sequences of at least 8, so we
5156 // can use ldpq to collect the corresponding data into pairs of vector
5157 // registers
5158 // We collect the coefficients that correspond to the 'j's into vs1
5159 // the coefficients that correspond to the 'j+l's into vs2 then
5160 // do the additions into vs3 and the subtractions into vs1 then
5161 // save the result of the additions, load the zetas into vs2
5162 // do the (Montgomery) multiplications by zeta in parallel into vs2
5163 // finally save the results back to the coeffs array
5164 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
5165 const Register coeffs, const Register zetas) {
5166 int c1 = 0;
5167 int c2 = 32;
5168 int startIncr;
5169 int offsets[4];
5170 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
5171 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5172 VSeq<2> vq(30); // n.b. constants overlap vs3
5173
5174 offsets[0] = 0;
5175
5176 for (int level = 3; level < 8; level++) {
5177 int c1Start = c1;
5178 int c2Start = c2;
5179 if (level == 3) {
5180 offsets[1] = 64;
5181 offsets[2] = 128;
5182 offsets[3] = 192;
5183 } else if (level == 4) {
5184 offsets[1] = 32;
5185 offsets[2] = 128;
5186 offsets[3] = 160;
5187 } else {
5188 offsets[1] = 32;
5189 offsets[2] = 64;
5190 offsets[3] = 96;
5191 }
5192
5193 // for levels 3 - 7 we simply load 2 x 4 adjacent values at a
5194 // time at 4 different offsets and multiply them in order by the
5195 // next set of input values. So we employ indexed load and store
5196 // pair instructions with arrangement 4S
5197 for (int i = 0; i < 4; i++) {
5198 // load v1 32 (8x4S) coefficients relative to first start index
5199 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
5200 // load v2 32 (8x4S) coefficients relative to second start index
5201 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
5202 // a0 = v1 + v2 -- n.b.
clobbers vq
5203 vs_addv(vs3, __ T4S, vs1, vs2);
5204 // a1 = v1 - v2
5205 vs_subv(vs1, __ T4S, vs1, vs2);
5206 // save a0 relative to first start index
5207 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
5208 // load constants q, qinv each iteration as they get clobbered above
5209 vs_ldpq(vq, dilithiumConsts); // qInv, q
5210 // load b next 32 (8x4S) inputs
5211 vs_ldpq_post(vs2, zetas);
5212 // a = a1 montmul b
5213 vs_montmul32(vs2, vs1, vs2, vtmp, vq);
5214 // save a relative to second start index
5215 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
5216
5217 int k = 4 * level + i;
5218
5219 if (k < 24) {
5220 startIncr = 256;
5221 } else if (k == 25) {
5222 startIncr = 384;
5223 } else {
5224 startIncr = 128;
5225 }
5226
5227 c1Start += startIncr;
5228 c2Start += startIncr;
5229 }
5230
5231 c2 *= 2;
5232 }
5233 }
5234
5235 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
5236 // Implements the method
5237 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
5238 // the sun.security.provider.ML_DSA class.
5239 //
5240 // coeffs (int[256]) = c_rarg0
5241 // zetas (int[256]) = c_rarg1
5242 address generate_dilithiumAlmostInverseNtt() {
5243
5244 __ align(CodeEntryAlignment);
5245 StubGenStubId stub_id = StubGenStubId::dilithiumAlmostInverseNtt_id;
5246 StubCodeMark mark(this, stub_id);
5247 address start = __ pc();
5248 __ enter();
5249
5250 const Register coeffs = c_rarg0;
5251 const Register zetas = c_rarg1;
5252
5253 const Register tmpAddr = r9;
5254 const Register dilithiumConsts = r10;
5255 const Register result = r11;
5256 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
5257 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5258 VSeq<2> vq(30); // n.b. constants overlap vs3
5259 int offsets[4] = { 0, 32, 64, 96 };
5260 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5261 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
5262
5263 __ add(result, coeffs, 0);
5264 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
5265
5266 // Each level represents one iteration of the outer for loop of the Java version
5267
5268
5269 // level 0
5270 // At level 0 we need to interleave adjacent quartets of
5271 // coefficients before we multiply and add/sub by the next 16
5272 // zetas just as we did for level 7 in the multiply code. So we
5273 // load and store the values using an ld2/st2 with arrangement 4S
5274 for (int i = 0; i < 1024; i += 128) {
5275 // load constants q, qinv
5276 // n.b. this can be moved out of the loop as they do not get
5277 // clobbered by first two loops
5278 vs_ldpq(vq, dilithiumConsts); // qInv, q
5279 // a0/a1 load interleaved 32 (8x4S) coefficients
5280 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
5281 // b load next 32 (8x4S) inputs
5282 vs_ldpq_post(vs_front(vs2), zetas);
5283 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
5284 // n.b. second half of vs2 provides temporary register storage
5285 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
5286 vs_front(vs2), vs_back(vs2), vtmp, vq);
5287 // a0/a1 store interleaved 32 (8x4S) coefficients
5288 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
5289 }
5290
5291 // level 1
5292 // At level 1 we need to interleave pairs of adjacent pairs of
5293 // coefficients before we multiply by the next 16 zetas just as we
5294 // did for level 6 in the multiply code.
So we load and store the
5295 // values using an ld2/st2 with arrangement 2D
5296 for (int i = 0; i < 1024; i += 128) {
5297 // a0/a1 load interleaved 32 (8x2D) coefficients
5298 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
5299 // b load next 16 (4x4S) inputs
5300 vs_ldpq_post(vs_front(vs2), zetas);
5301 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
5302 // n.b. second half of vs2 provides temporary register storage
5303 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
5304 vs_front(vs2), vs_back(vs2), vtmp, vq);
5305 // a0/a1 store interleaved 32 (8x2D) coefficients
5306 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
5307 }
5308
5309 // level 2
5310 // At level 2 coefficients come in blocks of 4. So, we load 4
5311 // adjacent coefficients at 8 distinct offsets for both the first
5312 // and second coefficient sequences, using an ldr with register
5313 // variant Q then combine them with next set of 32 zetas. Likewise
5314 // we store the results using an str with register variant Q.
5315 for (int i = 0; i < 1024; i += 256) {
5316 // c0 load 32 (8x4S) coefficients via first offsets
5317 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
5318 // c1 load 32 (8x4S) coefficients via second offsets
5319 vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
5320 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
5321 vs_addv(vs3, __ T4S, vs1, vs2);
5322 // c = c0 - c1
5323 vs_subv(vs1, __ T4S, vs1, vs2);
5324 // store a0 32 (8x4S) coefficients via first offsets
5325 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
5326 // b load 32 (8x4S) next inputs
5327 vs_ldpq_post(vs2, zetas);
5328 // reload constants q, qinv -- they were clobbered earlier
5329 vs_ldpq(vq, dilithiumConsts); // qInv, q
5330 // compute a1 = b montmul c
5331 vs_montmul32(vs2, vs1, vs2, vtmp, vq);
5332 // store a1 32 (8x4S) coefficients via second offsets
5333 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
5334 }
5335
5336 // level 3-7
5337 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
5338
5339 __ leave(); // required for proper stackwalking of RuntimeStub frame
5340 __ mov(r0, zr); // return 0
5341 __ ret(lr);
5342
5343 return start;
5344
5345 }
5346
5347 // Dilithium multiply polynomials in the NTT domain.
5348 // Straightforward implementation of the method
5349 // static int implDilithiumNttMult(
5350 // int[] result, int[] ntta, int[] nttb) {} of
5351 // the sun.security.provider.ML_DSA class.
5352 //
5353 // result (int[256]) = c_rarg0
5354 // poly1 (int[256]) = c_rarg1
5355 // poly2 (int[256]) = c_rarg2
5356 address generate_dilithiumNttMult() {
5357
5358 __ align(CodeEntryAlignment);
5359 StubGenStubId stub_id = StubGenStubId::dilithiumNttMult_id;
5360 StubCodeMark mark(this, stub_id);
5361 address start = __ pc();
5362 __ enter();
5363
5364 Label L_loop;
5365
5366 const Register result = c_rarg0;
5367 const Register poly1 = c_rarg1;
5368 const Register poly2 = c_rarg2;
5369
5370 const Register dilithiumConsts = r10;
5371 const Register len = r11;
5372
5373 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
5374 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5375 VSeq<2> vq(30); // n.b.
constants overlap vs3
5376 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
5377
5378 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
5379
5380 // load constants q, qinv
5381 vs_ldpq(vq, dilithiumConsts); // qInv, q
5382 // load constant rSquare into v29
5383 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
5384
5385 __ mov(len, zr);
5386 __ add(len, len, 1024);
5387
5388 __ BIND(L_loop);
5389
5390 // b load 32 (8x4S) next inputs from poly1
5391 vs_ldpq_post(vs1, poly1);
5392 // c load 32 (8x4S) next inputs from poly2
5393 vs_ldpq_post(vs2, poly2);
5394 // compute a = b montmul c
5395 vs_montmul32(vs2, vs1, vs2, vtmp, vq);
5396 // compute a = rsquare montmul a
5397 vs_montmul32(vs2, vrsquare, vs2, vtmp, vq);
5398 // save a 32 (8x4S) results
5399 vs_stpq_post(vs2, result);
5400
5401 __ sub(len, len, 128);
5402 __ cmp(len, (u1)128);
5403 __ br(Assembler::GE, L_loop);
5404
5405 __ leave(); // required for proper stackwalking of RuntimeStub frame
5406 __ mov(r0, zr); // return 0
5407 __ ret(lr);
5408
5409 return start;
5410
5411 }
5412
5413 // Dilithium Montgomery multiply an array by a constant.
5414 // A straightforward implementation of the method
5415 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
5416 // of the sun.security.provider.ML_DSA class
5417 //
5418 // coeffs (int[256]) = c_rarg0
5419 // constant (int) = c_rarg1
5420 address generate_dilithiumMontMulByConstant() {
5421
5422 __ align(CodeEntryAlignment);
5423 StubGenStubId stub_id = StubGenStubId::dilithiumMontMulByConstant_id;
5424 StubCodeMark mark(this, stub_id);
5425 address start = __ pc();
5426 __ enter();
5427
5428 Label L_loop;
5429
5430 const Register coeffs = c_rarg0;
5431 const Register constant = c_rarg1;
5432
5433 const Register dilithiumConsts = r10;
5434 const Register result = r11;
5435 const Register len = r12;
5436
5437 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
5438 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5439 VSeq<2> vq(30); // n.b. constants overlap vs3
5440 VSeq<8> vconst(29, 0); // for montmul by constant
5441
5442 // results track inputs
5443 __ add(result, coeffs, 0);
5444 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
5445
5446 // load constants q, qinv -- they do not get clobbered by first two loops
5447 vs_ldpq(vq, dilithiumConsts); // qInv, q
5448 // copy caller supplied constant across vconst
5449 __ dup(vconst[0], __ T4S, constant);
5450 __ mov(len, zr);
5451 __ add(len, len, 1024);
5452
5453 __ BIND(L_loop);
5454
5455 // load next 32 inputs
5456 vs_ldpq_post(vs2, coeffs);
5457 // mont mul by constant
5458 vs_montmul32(vs2, vconst, vs2, vtmp, vq);
5459 // write next 32 results
5460 vs_stpq_post(vs2, result);
5461
5462 __ sub(len, len, 128);
5463 __ cmp(len, (u1)128);
5464 __ br(Assembler::GE, L_loop);
5465
5466 __ leave(); // required for proper stackwalking of RuntimeStub frame
5467 __ mov(r0, zr); // return 0
5468 __ ret(lr);
5469
5470 return start;
5471
5472 }
5473
5474 // Dilithium decompose poly.
5475 // Implements the method 5476 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {} 5477 // of the sun.security.provider.ML_DSA class 5478 // 5479 // input (int[256]) = c_rarg0 5480 // lowPart (int[256]) = c_rarg1 5481 // highPart (int[256]) = c_rarg2 5482 // twoGamma2 (int) = c_rarg3 5483 // multiplier (int) = c_rarg4 5484 address generate_dilithiumDecomposePoly() { 5485 5486 __ align(CodeEntryAlignment); 5487 StubGenStubId stub_id = StubGenStubId::dilithiumDecomposePoly_id; 5488 StubCodeMark mark(this, stub_id); 5489 address start = __ pc(); 5490 Label L_loop; 5491 5492 const Register input = c_rarg0; 5493 const Register lowPart = c_rarg1; 5494 const Register highPart = c_rarg2; 5495 const Register twoGamma2 = c_rarg3; 5496 const Register multiplier = c_rarg4; 5497 5498 const Register len = r9; 5499 const Register dilithiumConsts = r10; 5500 const Register tmp = r11; 5501 5502 VSeq<4> vs1(0), vs2(4), vs3(8); // 6 independent sets of 4x4s values 5503 VSeq<4> vs4(12), vs5(16), vtmp(20); 5504 VSeq<4> one(25, 0); // 7 constants for cross-multiplying 5505 VSeq<4> qminus1(26, 0); 5506 VSeq<4> g2(27, 0); 5507 VSeq<4> twog2(28, 0); 5508 VSeq<4> mult(29, 0); 5509 VSeq<4> q(30, 0); 5510 VSeq<4> qadd(31, 0); 5511 5512 __ enter(); 5513 5514 __ lea(dilithiumConsts, ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts)); 5515 5516 // save callee-saved registers 5517 __ stpd(v8, v9, __ pre(sp, -64)); 5518 __ stpd(v10, v11, Address(sp, 16)); 5519 __ stpd(v12, v13, Address(sp, 32)); 5520 __ stpd(v14, v15, Address(sp, 48)); 5521 5522 // populate constant registers 5523 __ mov(tmp, zr); 5524 __ add(tmp, tmp, 1); 5525 __ dup(one[0], __ T4S, tmp); // 1 5526 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q 5527 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce 5528 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2 5529 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce 5530 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1 5531 __ sshr(g2[0], __ T4S, v28, 1); // gamma2 5532 5533 __ mov(len, zr); 5534 __ add(len, len, 1024); 5535 5536 __ BIND(L_loop); 5537 5538 // load next 4x4S inputs interleaved: rplus --> vs1 5539 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64)); 5540 5541 // rplus = rplus - ((rplus + qadd) >> 23) * q 5542 vs_addv(vtmp, __ T4S, vs1, qadd); 5543 vs_sshr(vtmp, __ T4S, vtmp, 23); 5544 vs_mulv(vtmp, __ T4S, vtmp, q); 5545 vs_subv(vs1, __ T4S, vs1, vtmp); 5546 5547 // rplus = rplus + ((rplus >> 31) & dilithium_q); 5548 vs_sshr(vtmp, __ T4S, vs1, 31); 5549 vs_andr(vtmp, vtmp, q); 5550 vs_addv(vs1, __ T4S, vs1, vtmp); 5551 5552 // quotient --> vs2 5553 // int quotient = (rplus * multiplier) >> 22; 5554 vs_mulv(vtmp, __ T4S, vs1, mult); 5555 vs_sshr(vs2, __ T4S, vtmp, 22); 5556 5557 // r0 --> vs3 5558 // int r0 = rplus - quotient * twoGamma2; 5559 vs_mulv(vtmp, __ T4S, vs2, twog2); 5560 vs_subv(vs3, __ T4S, vs1, vtmp); 5561 5562 // mask --> vs4 5563 // int mask = (twoGamma2 - r0) >> 22; 5564 vs_subv(vtmp, __ T4S, twog2, vs3); 5565 vs_sshr(vs4, __ T4S, vtmp, 22); 5566 5567 // r0 -= (mask & twoGamma2); 5568 vs_andr(vtmp, vs4, twog2); 5569 vs_subv(vs3, __ T4S, vs3, vtmp); 5570 5571 // quotient += (mask & 1); 5572 vs_andr(vtmp, vs4, one); 5573 vs_addv(vs2, __ T4S, vs2, vtmp); 5574 5575 // mask = (twoGamma2 / 2 - r0) >> 31; 5576 vs_subv(vtmp, __ T4S, g2, vs3); 5577 vs_sshr(vs4, __ T4S, vtmp, 31); 5578 5579 // r0 -= (mask & twoGamma2); 5580 vs_andr(vtmp, vs4, twog2); 5581 vs_subv(vs3, __ T4S, 
vs3, vtmp); 5582 5583 // quotient += (mask & 1); 5584 vs_andr(vtmp, vs4, one); 5585 vs_addv(vs2, __ T4S, vs2, vtmp); 5586 5587 // r1 --> vs5 5588 // int r1 = rplus - r0 - (dilithium_q - 1); 5589 vs_subv(vtmp, __ T4S, vs1, vs3); 5590 vs_subv(vs5, __ T4S, vtmp, qminus1); 5591 5592 // r1 --> vs1 (overwriting rplus) 5593 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise 5594 vs_negr(vtmp, __ T4S, vs5); 5595 vs_orr(vtmp, vs5, vtmp); 5596 vs_sshr(vs1, __ T4S, vtmp, 31); 5597 5598 // r0 += ~r1; 5599 vs_notr(vtmp, vs1); 5600 vs_addv(vs3, __ T4S, vs3, vtmp); 5601 5602 // r1 = r1 & quotient; 5603 vs_andr(vs1, vs2, vs1); 5604 5605 // store results inteleaved 5606 // lowPart[m] = r0; 5607 // highPart[m] = r1; 5608 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64)); 5609 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64)); 5610 5611 5612 __ sub(len, len, 64); 5613 __ cmp(len, (u1)64); 5614 __ br(Assembler::GE, L_loop); 5615 5616 // restore callee-saved vector registers 5617 __ ldpd(v14, v15, Address(sp, 48)); 5618 __ ldpd(v12, v13, Address(sp, 32)); 5619 __ ldpd(v10, v11, Address(sp, 16)); 5620 __ ldpd(v8, v9, __ post(sp, 64)); 5621 5622 __ leave(); // required for proper stackwalking of RuntimeStub frame 5623 __ mov(r0, zr); // return 0 5624 __ ret(lr); 5625 5626 return start; 5627 5628 } 5629 5630 /** 5631 * Arguments: 5632 * 5633 * Inputs: 5634 * c_rarg0 - int crc 5635 * c_rarg1 - byte* buf 5636 * c_rarg2 - int length 5637 * c_rarg3 - int* table 5638 * 5639 * Output: 5640 * r0 - int crc result 5641 */ 5642 address generate_updateBytesCRC32C() { 5643 assert(UseCRC32CIntrinsics, "what are we doing here?"); 5644 5645 __ align(CodeEntryAlignment); 5646 StubGenStubId stub_id = StubGenStubId::updateBytesCRC32C_id; 5647 StubCodeMark mark(this, stub_id); 5648 5649 address start = __ pc(); 5650 5651 const Register crc = c_rarg0; // crc 5652 const Register buf = c_rarg1; // source java byte array address 5653 const Register len = c_rarg2; // length 5654 const Register table0 = c_rarg3; // crc_table address 5655 const Register table1 = c_rarg4; 5656 const Register table2 = c_rarg5; 5657 const Register table3 = c_rarg6; 5658 const Register tmp3 = c_rarg7; 5659 5660 BLOCK_COMMENT("Entry:"); 5661 __ enter(); // required for proper stackwalking of RuntimeStub frame 5662 5663 __ kernel_crc32c(crc, buf, len, 5664 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 5665 5666 __ leave(); // required for proper stackwalking of RuntimeStub frame 5667 __ ret(lr); 5668 5669 return start; 5670 } 5671 5672 /*** 5673 * Arguments: 5674 * 5675 * Inputs: 5676 * c_rarg0 - int adler 5677 * c_rarg1 - byte* buff 5678 * c_rarg2 - int len 5679 * 5680 * Output: 5681 * c_rarg0 - int adler result 5682 */ 5683 address generate_updateBytesAdler32() { 5684 __ align(CodeEntryAlignment); 5685 StubGenStubId stub_id = StubGenStubId::updateBytesAdler32_id; 5686 StubCodeMark mark(this, stub_id); 5687 address start = __ pc(); 5688 5689 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 5690 5691 // Aliases 5692 Register adler = c_rarg0; 5693 Register s1 = c_rarg0; 5694 Register s2 = c_rarg3; 5695 Register buff = c_rarg1; 5696 Register len = c_rarg2; 5697 Register nmax = r4; 5698 Register base = r5; 5699 Register count = r6; 5700 Register temp0 = rscratch1; 5701 Register temp1 = rscratch2; 5702 FloatRegister vbytes = v0; 5703 FloatRegister vs1acc = v1; 5704 FloatRegister vs2acc = v2; 5705 FloatRegister vtable = v3; 
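  // For reference, the scalar Adler-32 recurrence this stub vectorizes
  // (a sketch, not generated code), with BASE = 65521:
  //
  //   for (int i = 0; i < len; i++) {
  //     s1 = (s1 + (buff[i] & 0xff)) % BASE;
  //     s2 = (s2 + s1) % BASE;
  //   }
  //   adler = (s2 << 16) | s1;
  //
  // The code below defers the expensive % BASE reductions, processing up
  // to NMAX bytes between reductions (see the comment that follows).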
5706 5707 // Max number of bytes we can process before having to take the mod 5708 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 5709 uint64_t BASE = 0xfff1; 5710 uint64_t NMAX = 0x15B0; 5711 5712 __ mov(base, BASE); 5713 __ mov(nmax, NMAX); 5714 5715 // Load accumulation coefficients for the upper 16 bits 5716 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 5717 __ ld1(vtable, __ T16B, Address(temp0)); 5718 5719 // s1 is initialized to the lower 16 bits of adler 5720 // s2 is initialized to the upper 16 bits of adler 5721 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 5722 __ uxth(s1, adler); // s1 = (adler & 0xffff) 5723 5724 // The pipelined loop needs at least 16 elements for 1 iteration 5725 // It does check this, but it is more effective to skip to the cleanup loop 5726 __ cmp(len, (u1)16); 5727 __ br(Assembler::HS, L_nmax); 5728 __ cbz(len, L_combine); 5729 5730 __ bind(L_simple_by1_loop); 5731 __ ldrb(temp0, Address(__ post(buff, 1))); 5732 __ add(s1, s1, temp0); 5733 __ add(s2, s2, s1); 5734 __ subs(len, len, 1); 5735 __ br(Assembler::HI, L_simple_by1_loop); 5736 5737 // s1 = s1 % BASE 5738 __ subs(temp0, s1, base); 5739 __ csel(s1, temp0, s1, Assembler::HS); 5740 5741 // s2 = s2 % BASE 5742 __ lsr(temp0, s2, 16); 5743 __ lsl(temp1, temp0, 4); 5744 __ sub(temp1, temp1, temp0); 5745 __ add(s2, temp1, s2, ext::uxth); 5746 5747 __ subs(temp0, s2, base); 5748 __ csel(s2, temp0, s2, Assembler::HS); 5749 5750 __ b(L_combine); 5751 5752 __ bind(L_nmax); 5753 __ subs(len, len, nmax); 5754 __ sub(count, nmax, 16); 5755 __ br(Assembler::LO, L_by16); 5756 5757 __ bind(L_nmax_loop); 5758 5759 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 5760 vbytes, vs1acc, vs2acc, vtable); 5761 5762 __ subs(count, count, 16); 5763 __ br(Assembler::HS, L_nmax_loop); 5764 5765 // s1 = s1 % BASE 5766 __ lsr(temp0, s1, 16); 5767 __ lsl(temp1, temp0, 4); 5768 __ sub(temp1, temp1, temp0); 5769 __ add(temp1, temp1, s1, ext::uxth); 5770 5771 __ lsr(temp0, temp1, 16); 5772 __ lsl(s1, temp0, 4); 5773 __ sub(s1, s1, temp0); 5774 __ add(s1, s1, temp1, ext:: uxth); 5775 5776 __ subs(temp0, s1, base); 5777 __ csel(s1, temp0, s1, Assembler::HS); 5778 5779 // s2 = s2 % BASE 5780 __ lsr(temp0, s2, 16); 5781 __ lsl(temp1, temp0, 4); 5782 __ sub(temp1, temp1, temp0); 5783 __ add(temp1, temp1, s2, ext::uxth); 5784 5785 __ lsr(temp0, temp1, 16); 5786 __ lsl(s2, temp0, 4); 5787 __ sub(s2, s2, temp0); 5788 __ add(s2, s2, temp1, ext:: uxth); 5789 5790 __ subs(temp0, s2, base); 5791 __ csel(s2, temp0, s2, Assembler::HS); 5792 5793 __ subs(len, len, nmax); 5794 __ sub(count, nmax, 16); 5795 __ br(Assembler::HS, L_nmax_loop); 5796 5797 __ bind(L_by16); 5798 __ adds(len, len, count); 5799 __ br(Assembler::LO, L_by1); 5800 5801 __ bind(L_by16_loop); 5802 5803 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 5804 vbytes, vs1acc, vs2acc, vtable); 5805 5806 __ subs(len, len, 16); 5807 __ br(Assembler::HS, L_by16_loop); 5808 5809 __ bind(L_by1); 5810 __ adds(len, len, 15); 5811 __ br(Assembler::LO, L_do_mod); 5812 5813 __ bind(L_by1_loop); 5814 __ ldrb(temp0, Address(__ post(buff, 1))); 5815 __ add(s1, temp0, s1); 5816 __ add(s2, s2, s1); 5817 __ subs(len, len, 1); 5818 __ br(Assembler::HS, L_by1_loop); 5819 5820 __ bind(L_do_mod); 5821 // s1 = s1 % BASE 5822 __ lsr(temp0, s1, 16); 5823 __ lsl(temp1, temp0, 4); 5824 __ sub(temp1, temp1, temp0); 5825 __ add(temp1, temp1, s1, ext::uxth); 5826 5827 __ lsr(temp0, temp1, 16); 
5828 __ lsl(s1, temp0, 4);
5829 __ sub(s1, s1, temp0);
5830 __ add(s1, s1, temp1, ext:: uxth);
5831
5832 __ subs(temp0, s1, base);
5833 __ csel(s1, temp0, s1, Assembler::HS);
5834
5835 // s2 = s2 % BASE
5836 __ lsr(temp0, s2, 16);
5837 __ lsl(temp1, temp0, 4);
5838 __ sub(temp1, temp1, temp0);
5839 __ add(temp1, temp1, s2, ext::uxth);
5840
5841 __ lsr(temp0, temp1, 16);
5842 __ lsl(s2, temp0, 4);
5843 __ sub(s2, s2, temp0);
5844 __ add(s2, s2, temp1, ext:: uxth);
5845
5846 __ subs(temp0, s2, base);
5847 __ csel(s2, temp0, s2, Assembler::HS);
5848
5849 // Combine lower bits and higher bits
5850 __ bind(L_combine);
5851 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
5852
5853 __ ret(lr);
5854
5855 return start;
5856 }
5857
5858 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
5859 Register temp0, Register temp1, FloatRegister vbytes,
5860 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
5861 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
5862 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
5863 // In non-vectorized code, we update s1 and s2 as:
5864 // s1 <- s1 + b1
5865 // s2 <- s2 + s1
5866 // s1 <- s1 + b2
5867 // s2 <- s2 + s1
5868 // ...
5869 // s1 <- s1 + b16
5870 // s2 <- s2 + s1
5871 // Putting above assignments together, we have:
5872 // s1_new = s1 + b1 + b2 + ... + b16
5873 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
5874 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
5875 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
5876 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
5877
5878 // s2 = s2 + s1 * 16
5879 __ add(s2, s2, s1, Assembler::LSL, 4);
5880
5881 // vs1acc = b1 + b2 + b3 + ... + b16
5882 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ...
+ (b16 * 1) 5883 __ umullv(vs2acc, __ T8B, vtable, vbytes); 5884 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 5885 __ uaddlv(vs1acc, __ T16B, vbytes); 5886 __ uaddlv(vs2acc, __ T8H, vs2acc); 5887 5888 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 5889 __ fmovd(temp0, vs1acc); 5890 __ fmovd(temp1, vs2acc); 5891 __ add(s1, s1, temp0); 5892 __ add(s2, s2, temp1); 5893 } 5894 5895 /** 5896 * Arguments: 5897 * 5898 * Input: 5899 * c_rarg0 - x address 5900 * c_rarg1 - x length 5901 * c_rarg2 - y address 5902 * c_rarg3 - y length 5903 * c_rarg4 - z address 5904 */ 5905 address generate_multiplyToLen() { 5906 __ align(CodeEntryAlignment); 5907 StubGenStubId stub_id = StubGenStubId::multiplyToLen_id; 5908 StubCodeMark mark(this, stub_id); 5909 5910 address start = __ pc(); 5911 const Register x = r0; 5912 const Register xlen = r1; 5913 const Register y = r2; 5914 const Register ylen = r3; 5915 const Register z = r4; 5916 5917 const Register tmp0 = r5; 5918 const Register tmp1 = r10; 5919 const Register tmp2 = r11; 5920 const Register tmp3 = r12; 5921 const Register tmp4 = r13; 5922 const Register tmp5 = r14; 5923 const Register tmp6 = r15; 5924 const Register tmp7 = r16; 5925 5926 BLOCK_COMMENT("Entry:"); 5927 __ enter(); // required for proper stackwalking of RuntimeStub frame 5928 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 5929 __ leave(); // required for proper stackwalking of RuntimeStub frame 5930 __ ret(lr); 5931 5932 return start; 5933 } 5934 5935 address generate_squareToLen() { 5936 // squareToLen algorithm for sizes 1..127 described in java code works 5937 // faster than multiply_to_len on some CPUs and slower on others, but 5938 // multiply_to_len shows a bit better overall results 5939 __ align(CodeEntryAlignment); 5940 StubGenStubId stub_id = StubGenStubId::squareToLen_id; 5941 StubCodeMark mark(this, stub_id); 5942 address start = __ pc(); 5943 5944 const Register x = r0; 5945 const Register xlen = r1; 5946 const Register z = r2; 5947 const Register y = r4; // == x 5948 const Register ylen = r5; // == xlen 5949 5950 const Register tmp0 = r3; 5951 const Register tmp1 = r10; 5952 const Register tmp2 = r11; 5953 const Register tmp3 = r12; 5954 const Register tmp4 = r13; 5955 const Register tmp5 = r14; 5956 const Register tmp6 = r15; 5957 const Register tmp7 = r16; 5958 5959 RegSet spilled_regs = RegSet::of(y, ylen); 5960 BLOCK_COMMENT("Entry:"); 5961 __ enter(); 5962 __ push(spilled_regs, sp); 5963 __ mov(y, x); 5964 __ mov(ylen, xlen); 5965 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 5966 __ pop(spilled_regs, sp); 5967 __ leave(); 5968 __ ret(lr); 5969 return start; 5970 } 5971 5972 address generate_mulAdd() { 5973 __ align(CodeEntryAlignment); 5974 StubGenStubId stub_id = StubGenStubId::mulAdd_id; 5975 StubCodeMark mark(this, stub_id); 5976 5977 address start = __ pc(); 5978 5979 const Register out = r0; 5980 const Register in = r1; 5981 const Register offset = r2; 5982 const Register len = r3; 5983 const Register k = r4; 5984 5985 BLOCK_COMMENT("Entry:"); 5986 __ enter(); 5987 __ mul_add(out, in, offset, len, k); 5988 __ leave(); 5989 __ ret(lr); 5990 5991 return start; 5992 } 5993 5994 // Arguments: 5995 // 5996 // Input: 5997 // c_rarg0 - newArr address 5998 // c_rarg1 - oldArr address 5999 // c_rarg2 - newIdx 6000 // c_rarg3 - shiftCount 6001 // c_rarg4 - numIter 6002 // 6003 address generate_bigIntegerRightShift() { 6004 __ align(CodeEntryAlignment); 6005 StubGenStubId stub_id = 
StubGenStubId::bigIntegerRightShiftWorker_id; 6006 StubCodeMark mark(this, stub_id); 6007 address start = __ pc(); 6008 6009 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 6010 6011 Register newArr = c_rarg0; 6012 Register oldArr = c_rarg1; 6013 Register newIdx = c_rarg2; 6014 Register shiftCount = c_rarg3; 6015 Register numIter = c_rarg4; 6016 Register idx = numIter; 6017 6018 Register newArrCur = rscratch1; 6019 Register shiftRevCount = rscratch2; 6020 Register oldArrCur = r13; 6021 Register oldArrNext = r14; 6022 6023 FloatRegister oldElem0 = v0; 6024 FloatRegister oldElem1 = v1; 6025 FloatRegister newElem = v2; 6026 FloatRegister shiftVCount = v3; 6027 FloatRegister shiftVRevCount = v4; 6028 6029 __ cbz(idx, Exit); 6030 6031 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 6032 6033 // left shift count 6034 __ movw(shiftRevCount, 32); 6035 __ subw(shiftRevCount, shiftRevCount, shiftCount); 6036 6037 // numIter too small to allow a 4-words SIMD loop, rolling back 6038 __ cmp(numIter, (u1)4); 6039 __ br(Assembler::LT, ShiftThree); 6040 6041 __ dup(shiftVCount, __ T4S, shiftCount); 6042 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 6043 __ negr(shiftVCount, __ T4S, shiftVCount); 6044 6045 __ BIND(ShiftSIMDLoop); 6046 6047 // Calculate the load addresses 6048 __ sub(idx, idx, 4); 6049 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 6050 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 6051 __ add(oldArrCur, oldArrNext, 4); 6052 6053 // Load 4 words and process 6054 __ ld1(oldElem0, __ T4S, Address(oldArrCur)); 6055 __ ld1(oldElem1, __ T4S, Address(oldArrNext)); 6056 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 6057 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 6058 __ orr(newElem, __ T16B, oldElem0, oldElem1); 6059 __ st1(newElem, __ T4S, Address(newArrCur)); 6060 6061 __ cmp(idx, (u1)4); 6062 __ br(Assembler::LT, ShiftTwoLoop); 6063 __ b(ShiftSIMDLoop); 6064 6065 __ BIND(ShiftTwoLoop); 6066 __ cbz(idx, Exit); 6067 __ cmp(idx, (u1)1); 6068 __ br(Assembler::EQ, ShiftOne); 6069 6070 // Calculate the load addresses 6071 __ sub(idx, idx, 2); 6072 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); 6073 __ add(newArrCur, newArr, idx, Assembler::LSL, 2); 6074 __ add(oldArrCur, oldArrNext, 4); 6075 6076 // Load 2 words and process 6077 __ ld1(oldElem0, __ T2S, Address(oldArrCur)); 6078 __ ld1(oldElem1, __ T2S, Address(oldArrNext)); 6079 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 6080 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 6081 __ orr(newElem, __ T8B, oldElem0, oldElem1); 6082 __ st1(newElem, __ T2S, Address(newArrCur)); 6083 __ b(ShiftTwoLoop); 6084 6085 __ BIND(ShiftThree); 6086 __ tbz(idx, 1, ShiftOne); 6087 __ tbz(idx, 0, ShiftTwo); 6088 __ ldrw(r10, Address(oldArr, 12)); 6089 __ ldrw(r11, Address(oldArr, 8)); 6090 __ lsrvw(r10, r10, shiftCount); 6091 __ lslvw(r11, r11, shiftRevCount); 6092 __ orrw(r12, r10, r11); 6093 __ strw(r12, Address(newArr, 8)); 6094 6095 __ BIND(ShiftTwo); 6096 __ ldrw(r10, Address(oldArr, 8)); 6097 __ ldrw(r11, Address(oldArr, 4)); 6098 __ lsrvw(r10, r10, shiftCount); 6099 __ lslvw(r11, r11, shiftRevCount); 6100 __ orrw(r12, r10, r11); 6101 __ strw(r12, Address(newArr, 4)); 6102 6103 __ BIND(ShiftOne); 6104 __ ldrw(r10, Address(oldArr, 4)); 6105 __ ldrw(r11, Address(oldArr)); 6106 __ lsrvw(r10, r10, shiftCount); 6107 __ lslvw(r11, r11, shiftRevCount); 6108 __ orrw(r12, r10, r11); 6109 __ strw(r12, Address(newArr)); 6110 6111 __ BIND(Exit); 6112 __ ret(lr); 6113 6114 return start; 6115 } 6116 6117 // 
Arguments: 6118 // 6119 // Input: 6120 // c_rarg0 - newArr address 6121 // c_rarg1 - oldArr address 6122 // c_rarg2 - newIdx 6123 // c_rarg3 - shiftCount 6124 // c_rarg4 - numIter 6125 // 6126 address generate_bigIntegerLeftShift() { 6127 __ align(CodeEntryAlignment); 6128 StubGenStubId stub_id = StubGenStubId::bigIntegerLeftShiftWorker_id; 6129 StubCodeMark mark(this, stub_id); 6130 address start = __ pc(); 6131 6132 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; 6133 6134 Register newArr = c_rarg0; 6135 Register oldArr = c_rarg1; 6136 Register newIdx = c_rarg2; 6137 Register shiftCount = c_rarg3; 6138 Register numIter = c_rarg4; 6139 6140 Register shiftRevCount = rscratch1; 6141 Register oldArrNext = rscratch2; 6142 6143 FloatRegister oldElem0 = v0; 6144 FloatRegister oldElem1 = v1; 6145 FloatRegister newElem = v2; 6146 FloatRegister shiftVCount = v3; 6147 FloatRegister shiftVRevCount = v4; 6148 6149 __ cbz(numIter, Exit); 6150 6151 __ add(oldArrNext, oldArr, 4); 6152 __ add(newArr, newArr, newIdx, Assembler::LSL, 2); 6153 6154 // right shift count 6155 __ movw(shiftRevCount, 32); 6156 __ subw(shiftRevCount, shiftRevCount, shiftCount); 6157 6158 // numIter too small to allow a 4-words SIMD loop, rolling back 6159 __ cmp(numIter, (u1)4); 6160 __ br(Assembler::LT, ShiftThree); 6161 6162 __ dup(shiftVCount, __ T4S, shiftCount); 6163 __ dup(shiftVRevCount, __ T4S, shiftRevCount); 6164 __ negr(shiftVRevCount, __ T4S, shiftVRevCount); 6165 6166 __ BIND(ShiftSIMDLoop); 6167 6168 // load 4 words and process 6169 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); 6170 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); 6171 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); 6172 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); 6173 __ orr(newElem, __ T16B, oldElem0, oldElem1); 6174 __ st1(newElem, __ T4S, __ post(newArr, 16)); 6175 __ sub(numIter, numIter, 4); 6176 6177 __ cmp(numIter, (u1)4); 6178 __ br(Assembler::LT, ShiftTwoLoop); 6179 __ b(ShiftSIMDLoop); 6180 6181 __ BIND(ShiftTwoLoop); 6182 __ cbz(numIter, Exit); 6183 __ cmp(numIter, (u1)1); 6184 __ br(Assembler::EQ, ShiftOne); 6185 6186 // load 2 words and process 6187 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); 6188 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 6189 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); 6190 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); 6191 __ orr(newElem, __ T8B, oldElem0, oldElem1); 6192 __ st1(newElem, __ T2S, __ post(newArr, 8)); 6193 __ sub(numIter, numIter, 2); 6194 __ b(ShiftTwoLoop); 6195 6196 __ BIND(ShiftThree); 6197 __ ldrw(r10, __ post(oldArr, 4)); 6198 __ ldrw(r11, __ post(oldArrNext, 4)); 6199 __ lslvw(r10, r10, shiftCount); 6200 __ lsrvw(r11, r11, shiftRevCount); 6201 __ orrw(r12, r10, r11); 6202 __ strw(r12, __ post(newArr, 4)); 6203 __ tbz(numIter, 1, Exit); 6204 __ tbz(numIter, 0, ShiftOne); 6205 6206 __ BIND(ShiftTwo); 6207 __ ldrw(r10, __ post(oldArr, 4)); 6208 __ ldrw(r11, __ post(oldArrNext, 4)); 6209 __ lslvw(r10, r10, shiftCount); 6210 __ lsrvw(r11, r11, shiftRevCount); 6211 __ orrw(r12, r10, r11); 6212 __ strw(r12, __ post(newArr, 4)); 6213 6214 __ BIND(ShiftOne); 6215 __ ldrw(r10, Address(oldArr)); 6216 __ ldrw(r11, Address(oldArrNext)); 6217 __ lslvw(r10, r10, shiftCount); 6218 __ lsrvw(r11, r11, shiftRevCount); 6219 __ orrw(r12, r10, r11); 6220 __ strw(r12, Address(newArr)); 6221 6222 __ BIND(Exit); 6223 __ ret(lr); 6224 6225 return start; 6226 } 6227 6228 address generate_count_positives(address &count_positives_long) { 6229 const u1 
large_loop_size = 64; 6230 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 6231 int dcache_line = VM_Version::dcache_line_size(); 6232 6233 Register ary1 = r1, len = r2, result = r0; 6234 6235 __ align(CodeEntryAlignment); 6236 6237 StubGenStubId stub_id = StubGenStubId::count_positives_id; 6238 StubCodeMark mark(this, stub_id); 6239 6240 address entry = __ pc(); 6241 6242 __ enter(); 6243 // precondition: a copy of len is already in result 6244 // __ mov(result, len); 6245 6246 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16, 6247 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 6248 6249 __ cmp(len, (u1)15); 6250 __ br(Assembler::GT, LEN_OVER_15); 6251 // The only case when execution falls into this code is when pointer is near 6252 // the end of memory page and we have to avoid reading next page 6253 __ add(ary1, ary1, len); 6254 __ subs(len, len, 8); 6255 __ br(Assembler::GT, LEN_OVER_8); 6256 __ ldr(rscratch2, Address(ary1, -8)); 6257 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 6258 __ lsrv(rscratch2, rscratch2, rscratch1); 6259 __ tst(rscratch2, UPPER_BIT_MASK); 6260 __ csel(result, zr, result, Assembler::NE); 6261 __ leave(); 6262 __ ret(lr); 6263 __ bind(LEN_OVER_8); 6264 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 6265 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 6266 __ tst(rscratch2, UPPER_BIT_MASK); 6267 __ br(Assembler::NE, RET_NO_POP); 6268 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 6269 __ lsrv(rscratch1, rscratch1, rscratch2); 6270 __ tst(rscratch1, UPPER_BIT_MASK); 6271 __ bind(RET_NO_POP); 6272 __ csel(result, zr, result, Assembler::NE); 6273 __ leave(); 6274 __ ret(lr); 6275 6276 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 6277 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 6278 6279 count_positives_long = __ pc(); // 2nd entry point 6280 6281 __ enter(); 6282 6283 __ bind(LEN_OVER_15); 6284 __ push(spilled_regs, sp); 6285 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 6286 __ cbz(rscratch2, ALIGNED); 6287 __ ldp(tmp6, tmp1, Address(ary1)); 6288 __ mov(tmp5, 16); 6289 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 6290 __ add(ary1, ary1, rscratch1); 6291 __ orr(tmp6, tmp6, tmp1); 6292 __ tst(tmp6, UPPER_BIT_MASK); 6293 __ br(Assembler::NE, RET_ADJUST); 6294 __ sub(len, len, rscratch1); 6295 6296 __ bind(ALIGNED); 6297 __ cmp(len, large_loop_size); 6298 __ br(Assembler::LT, CHECK_16); 6299 // Perform 16-byte load as early return in pre-loop to handle situation 6300 // when initially aligned large array has negative values at starting bytes, 6301 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 6302 // slower. Cases with negative bytes further ahead won't be affected that 6303 // much. In fact, it'll be faster due to early loads, less instructions and 6304 // less branches in LARGE_LOOP. 
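  // A sketch (not generated code) of the per-word test the loops below
  // rely on: a byte of a 64-bit word w is negative exactly when its top
  // bit is set, so
  //
  //   if ((w & UPPER_BIT_MASK) != 0)  ->  at least one negative byte
  //
  // which is why the code simply ORs loaded words together and tests the
  // accumulated value against UPPER_BIT_MASK.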
6305 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 6306 __ sub(len, len, 16); 6307 __ orr(tmp6, tmp6, tmp1); 6308 __ tst(tmp6, UPPER_BIT_MASK); 6309 __ br(Assembler::NE, RET_ADJUST_16); 6310 __ cmp(len, large_loop_size); 6311 __ br(Assembler::LT, CHECK_16); 6312 6313 if (SoftwarePrefetchHintDistance >= 0 6314 && SoftwarePrefetchHintDistance >= dcache_line) { 6315 // initial prefetch 6316 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 6317 } 6318 __ bind(LARGE_LOOP); 6319 if (SoftwarePrefetchHintDistance >= 0) { 6320 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 6321 } 6322 // Issue load instructions first, since it can save few CPU/MEM cycles, also 6323 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 6324 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 6325 // instructions per cycle and have less branches, but this approach disables 6326 // early return, thus, all 64 bytes are loaded and checked every time. 6327 __ ldp(tmp2, tmp3, Address(ary1)); 6328 __ ldp(tmp4, tmp5, Address(ary1, 16)); 6329 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 6330 __ ldp(tmp6, tmp1, Address(ary1, 48)); 6331 __ add(ary1, ary1, large_loop_size); 6332 __ sub(len, len, large_loop_size); 6333 __ orr(tmp2, tmp2, tmp3); 6334 __ orr(tmp4, tmp4, tmp5); 6335 __ orr(rscratch1, rscratch1, rscratch2); 6336 __ orr(tmp6, tmp6, tmp1); 6337 __ orr(tmp2, tmp2, tmp4); 6338 __ orr(rscratch1, rscratch1, tmp6); 6339 __ orr(tmp2, tmp2, rscratch1); 6340 __ tst(tmp2, UPPER_BIT_MASK); 6341 __ br(Assembler::NE, RET_ADJUST_LONG); 6342 __ cmp(len, large_loop_size); 6343 __ br(Assembler::GE, LARGE_LOOP); 6344 6345 __ bind(CHECK_16); // small 16-byte load pre-loop 6346 __ cmp(len, (u1)16); 6347 __ br(Assembler::LT, POST_LOOP16); 6348 6349 __ bind(LOOP16); // small 16-byte load loop 6350 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 6351 __ sub(len, len, 16); 6352 __ orr(tmp2, tmp2, tmp3); 6353 __ tst(tmp2, UPPER_BIT_MASK); 6354 __ br(Assembler::NE, RET_ADJUST_16); 6355 __ cmp(len, (u1)16); 6356 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 6357 6358 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 6359 __ cmp(len, (u1)8); 6360 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 6361 __ ldr(tmp3, Address(__ post(ary1, 8))); 6362 __ tst(tmp3, UPPER_BIT_MASK); 6363 __ br(Assembler::NE, RET_ADJUST); 6364 __ sub(len, len, 8); 6365 6366 __ bind(POST_LOOP16_LOAD_TAIL); 6367 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0 6368 __ ldr(tmp1, Address(ary1)); 6369 __ mov(tmp2, 64); 6370 __ sub(tmp4, tmp2, len, __ LSL, 3); 6371 __ lslv(tmp1, tmp1, tmp4); 6372 __ tst(tmp1, UPPER_BIT_MASK); 6373 __ br(Assembler::NE, RET_ADJUST); 6374 // Fallthrough 6375 6376 __ bind(RET_LEN); 6377 __ pop(spilled_regs, sp); 6378 __ leave(); 6379 __ ret(lr); 6380 6381 // difference result - len is the count of guaranteed to be 6382 // positive bytes 6383 6384 __ bind(RET_ADJUST_LONG); 6385 __ add(len, len, (u1)(large_loop_size - 16)); 6386 __ bind(RET_ADJUST_16); 6387 __ add(len, len, 16); 6388 __ bind(RET_ADJUST); 6389 __ pop(spilled_regs, sp); 6390 __ leave(); 6391 __ sub(result, result, len); 6392 __ ret(lr); 6393 6394 return entry; 6395 } 6396 6397 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 6398 bool usePrefetch, Label &NOT_EQUAL) { 6399 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6400 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 6401 tmp7 = r12, tmp8 = r13; 6402 Label LOOP; 6403 
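  // A sketch (not generated code) of the test applied to each pair of
  // 64-bit words wa, wb loaded from a1 and a2 below:
  //
  //   if ((wa ^ wb) != 0)  ->  the arrays differ
  //
  // The loop XORs several pairs, ORs the XOR results together and only
  // then branches, so one cbnz covers multiple words per iteration.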
6404 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6405 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6406 __ bind(LOOP); 6407 if (usePrefetch) { 6408 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 6409 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 6410 } 6411 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 6412 __ eor(tmp1, tmp1, tmp2); 6413 __ eor(tmp3, tmp3, tmp4); 6414 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 6415 __ orr(tmp1, tmp1, tmp3); 6416 __ cbnz(tmp1, NOT_EQUAL); 6417 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6418 __ eor(tmp5, tmp5, tmp6); 6419 __ eor(tmp7, tmp7, tmp8); 6420 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6421 __ orr(tmp5, tmp5, tmp7); 6422 __ cbnz(tmp5, NOT_EQUAL); 6423 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 6424 __ eor(tmp1, tmp1, tmp2); 6425 __ eor(tmp3, tmp3, tmp4); 6426 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 6427 __ orr(tmp1, tmp1, tmp3); 6428 __ cbnz(tmp1, NOT_EQUAL); 6429 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 6430 __ eor(tmp5, tmp5, tmp6); 6431 __ sub(cnt1, cnt1, 8 * wordSize); 6432 __ eor(tmp7, tmp7, tmp8); 6433 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 6434 // tmp6 is not used. MacroAssembler::subs is used here (rather than 6435 // cmp) because subs allows an unlimited range of immediate operand. 6436 __ subs(tmp6, cnt1, loopThreshold); 6437 __ orr(tmp5, tmp5, tmp7); 6438 __ cbnz(tmp5, NOT_EQUAL); 6439 __ br(__ GE, LOOP); 6440 // post-loop 6441 __ eor(tmp1, tmp1, tmp2); 6442 __ eor(tmp3, tmp3, tmp4); 6443 __ orr(tmp1, tmp1, tmp3); 6444 __ sub(cnt1, cnt1, 2 * wordSize); 6445 __ cbnz(tmp1, NOT_EQUAL); 6446 } 6447 6448 void generate_large_array_equals_loop_simd(int loopThreshold, 6449 bool usePrefetch, Label &NOT_EQUAL) { 6450 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6451 tmp2 = rscratch2; 6452 Label LOOP; 6453 6454 __ bind(LOOP); 6455 if (usePrefetch) { 6456 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 6457 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 6458 } 6459 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 6460 __ sub(cnt1, cnt1, 8 * wordSize); 6461 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 6462 __ subs(tmp1, cnt1, loopThreshold); 6463 __ eor(v0, __ T16B, v0, v4); 6464 __ eor(v1, __ T16B, v1, v5); 6465 __ eor(v2, __ T16B, v2, v6); 6466 __ eor(v3, __ T16B, v3, v7); 6467 __ orr(v0, __ T16B, v0, v1); 6468 __ orr(v1, __ T16B, v2, v3); 6469 __ orr(v0, __ T16B, v0, v1); 6470 __ umov(tmp1, v0, __ D, 0); 6471 __ umov(tmp2, v0, __ D, 1); 6472 __ orr(tmp1, tmp1, tmp2); 6473 __ cbnz(tmp1, NOT_EQUAL); 6474 __ br(__ GE, LOOP); 6475 } 6476 6477 // a1 = r1 - array1 address 6478 // a2 = r2 - array2 address 6479 // result = r0 - return value. Already contains "false" 6480 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 6481 // r3-r5 are reserved temporary registers 6482 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 6483 address generate_large_array_equals() { 6484 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 6485 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 6486 tmp7 = r12, tmp8 = r13; 6487 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 6488 SMALL_LOOP, POST_LOOP; 6489 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 
0 : 16; 6490 // calculate if at least 32 prefetched bytes are used 6491 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 6492 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 6493 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 6494 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 6495 tmp5, tmp6, tmp7, tmp8); 6496 6497 __ align(CodeEntryAlignment); 6498 6499 StubGenStubId stub_id = StubGenStubId::large_array_equals_id; 6500 StubCodeMark mark(this, stub_id); 6501 6502 address entry = __ pc(); 6503 __ enter(); 6504 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 6505 // also advance pointers to use post-increment instead of pre-increment 6506 __ add(a1, a1, wordSize); 6507 __ add(a2, a2, wordSize); 6508 if (AvoidUnalignedAccesses) { 6509 // both implementations (SIMD/nonSIMD) are using relatively large load 6510 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 6511 // on some CPUs in case of address is not at least 16-byte aligned. 6512 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 6513 // load if needed at least for 1st address and make if 16-byte aligned. 6514 Label ALIGNED16; 6515 __ tbz(a1, 3, ALIGNED16); 6516 __ ldr(tmp1, Address(__ post(a1, wordSize))); 6517 __ ldr(tmp2, Address(__ post(a2, wordSize))); 6518 __ sub(cnt1, cnt1, wordSize); 6519 __ eor(tmp1, tmp1, tmp2); 6520 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 6521 __ bind(ALIGNED16); 6522 } 6523 if (UseSIMDForArrayEquals) { 6524 if (SoftwarePrefetchHintDistance >= 0) { 6525 __ subs(tmp1, cnt1, prefetchLoopThreshold); 6526 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 6527 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 6528 /* prfm = */ true, NOT_EQUAL); 6529 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 6530 __ br(__ LT, TAIL); 6531 } 6532 __ bind(NO_PREFETCH_LARGE_LOOP); 6533 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 6534 /* prfm = */ false, NOT_EQUAL); 6535 } else { 6536 __ push(spilled_regs, sp); 6537 if (SoftwarePrefetchHintDistance >= 0) { 6538 __ subs(tmp1, cnt1, prefetchLoopThreshold); 6539 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 6540 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 6541 /* prfm = */ true, NOT_EQUAL); 6542 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 6543 __ br(__ LT, TAIL); 6544 } 6545 __ bind(NO_PREFETCH_LARGE_LOOP); 6546 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 6547 /* prfm = */ false, NOT_EQUAL); 6548 } 6549 __ bind(TAIL); 6550 __ cbz(cnt1, EQUAL); 6551 __ subs(cnt1, cnt1, wordSize); 6552 __ br(__ LE, POST_LOOP); 6553 __ bind(SMALL_LOOP); 6554 __ ldr(tmp1, Address(__ post(a1, wordSize))); 6555 __ ldr(tmp2, Address(__ post(a2, wordSize))); 6556 __ subs(cnt1, cnt1, wordSize); 6557 __ eor(tmp1, tmp1, tmp2); 6558 __ cbnz(tmp1, NOT_EQUAL); 6559 __ br(__ GT, SMALL_LOOP); 6560 __ bind(POST_LOOP); 6561 __ ldr(tmp1, Address(a1, cnt1)); 6562 __ ldr(tmp2, Address(a2, cnt1)); 6563 __ eor(tmp1, tmp1, tmp2); 6564 __ cbnz(tmp1, NOT_EQUAL); 6565 __ bind(EQUAL); 6566 __ mov(result, true); 6567 __ bind(NOT_EQUAL); 6568 if (!UseSIMDForArrayEquals) { 6569 __ pop(spilled_regs, sp); 6570 } 6571 __ bind(NOT_EQUAL_NO_POP); 6572 __ leave(); 6573 __ ret(lr); 6574 return entry; 6575 } 6576 6577 // result = r0 - return value. Contains initial hashcode value on entry. 
6578 // ary = r1 - array address 6579 // cnt = r2 - elements count 6580 // Clobbers: v0-v13, rscratch1, rscratch2 6581 address generate_large_arrays_hashcode(BasicType eltype) { 6582 const Register result = r0, ary = r1, cnt = r2; 6583 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 6584 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 6585 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 6586 const FloatRegister vpowm = v13; 6587 6588 ARRAYS_HASHCODE_REGISTERS; 6589 6590 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 6591 6592 unsigned int vf; // vectorization factor 6593 bool multiply_by_halves; 6594 Assembler::SIMD_Arrangement load_arrangement; 6595 switch (eltype) { 6596 case T_BOOLEAN: 6597 case T_BYTE: 6598 load_arrangement = Assembler::T8B; 6599 multiply_by_halves = true; 6600 vf = 8; 6601 break; 6602 case T_CHAR: 6603 case T_SHORT: 6604 load_arrangement = Assembler::T8H; 6605 multiply_by_halves = true; 6606 vf = 8; 6607 break; 6608 case T_INT: 6609 load_arrangement = Assembler::T4S; 6610 multiply_by_halves = false; 6611 vf = 4; 6612 break; 6613 default: 6614 ShouldNotReachHere(); 6615 } 6616 6617 // Unroll factor 6618 const unsigned uf = 4; 6619 6620 // Effective vectorization factor 6621 const unsigned evf = vf * uf; 6622 6623 __ align(CodeEntryAlignment); 6624 6625 StubGenStubId stub_id; 6626 switch (eltype) { 6627 case T_BOOLEAN: 6628 stub_id = StubGenStubId::large_arrays_hashcode_boolean_id; 6629 break; 6630 case T_BYTE: 6631 stub_id = StubGenStubId::large_arrays_hashcode_byte_id; 6632 break; 6633 case T_CHAR: 6634 stub_id = StubGenStubId::large_arrays_hashcode_char_id; 6635 break; 6636 case T_SHORT: 6637 stub_id = StubGenStubId::large_arrays_hashcode_short_id; 6638 break; 6639 case T_INT: 6640 stub_id = StubGenStubId::large_arrays_hashcode_int_id; 6641 break; 6642 default: 6643 stub_id = StubGenStubId::NO_STUBID; 6644 ShouldNotReachHere(); 6645 }; 6646 6647 StubCodeMark mark(this, stub_id); 6648 6649 address entry = __ pc(); 6650 __ enter(); 6651 6652 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 6653 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 6654 // value shouldn't change throughout both loops. 6655 __ movw(rscratch1, intpow(31U, 3)); 6656 __ mov(vpow, Assembler::S, 0, rscratch1); 6657 __ movw(rscratch1, intpow(31U, 2)); 6658 __ mov(vpow, Assembler::S, 1, rscratch1); 6659 __ movw(rscratch1, intpow(31U, 1)); 6660 __ mov(vpow, Assembler::S, 2, rscratch1); 6661 __ movw(rscratch1, intpow(31U, 0)); 6662 __ mov(vpow, Assembler::S, 3, rscratch1); 6663 6664 __ mov(vmul0, Assembler::T16B, 0); 6665 __ mov(vmul0, Assembler::S, 3, result); 6666 6667 __ andr(rscratch2, cnt, (uf - 1) * vf); 6668 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 6669 6670 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf));
6671 __ mov(vpowm, Assembler::S, 0, rscratch1);
6672
6673 // SMALL LOOP
6674 __ bind(SMALL_LOOP);
6675
6676 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
6677 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
6678 __ subsw(rscratch2, rscratch2, vf);
6679
6680 if (load_arrangement == Assembler::T8B) {
6681 // Extend 8B to 8H to be able to use vector multiply
6682 // instructions
6683 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
6684 if (is_signed_subword_type(eltype)) {
6685 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
6686 } else {
6687 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
6688 }
6689 }
6690
6691 switch (load_arrangement) {
6692 case Assembler::T4S:
6693 __ addv(vmul0, load_arrangement, vmul0, vdata0);
6694 break;
6695 case Assembler::T8B:
6696 case Assembler::T8H:
6697 assert(is_subword_type(eltype), "subword type expected");
6698 if (is_signed_subword_type(eltype)) {
6699 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
6700 } else {
6701 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
6702 }
6703 break;
6704 default:
6705 __ should_not_reach_here();
6706 }
6707
6708 // Process the upper half of a vector
6709 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
6710 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
6711 if (is_signed_subword_type(eltype)) {
6712 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
6713 } else {
6714 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
6715 }
6716 }
6717
6718 __ br(Assembler::HI, SMALL_LOOP);
6719
6720 // SMALL LOOP'S EPILOGUE
6721 __ lsr(rscratch2, cnt, exact_log2(evf));
6722 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
6723
6724 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
6725 __ addv(vmul0, Assembler::T4S, vmul0);
6726 __ umov(result, vmul0, Assembler::S, 0);
6727
6728 // TAIL
6729 __ bind(TAIL);
6730
6731 // The andr computes cnt % vf. Subtracting (cnt % vf) << 3 from the address of BR_BASE skips
6732 // the first vf - 1 - (cnt % vf) load + madd pairs, so only the last cnt % vf pairs execute.
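    // For example (illustrative numbers only): with vf == 8 and cnt % vf == 3, rscratch2 == 3,
    // so the computed branch targets BR_BASE - 3 * 8 bytes and exactly the last 3 of the 7
    // unrolled load + madd pairs run. Each pair is two 4-byte instructions, which is why the
    // offset is scaled by 8 (the uxtw shift of 3 in the sub below). The movw reloads rscratch2
    // with 0x1f == 31, the hash multiplier used by those scalar maddw steps.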
6733 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 6734 __ andr(rscratch2, cnt, vf - 1); 6735 __ bind(TAIL_SHORTCUT); 6736 __ adr(rscratch1, BR_BASE); 6737 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 6738 __ movw(rscratch2, 0x1f); 6739 __ br(rscratch1); 6740 6741 for (size_t i = 0; i < vf - 1; ++i) { 6742 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 6743 eltype); 6744 __ maddw(result, result, rscratch2, rscratch1); 6745 } 6746 __ bind(BR_BASE); 6747 6748 __ leave(); 6749 __ ret(lr); 6750 6751 // LARGE LOOP 6752 __ bind(LARGE_LOOP_PREHEADER); 6753 6754 __ lsr(rscratch2, cnt, exact_log2(evf)); 6755 6756 if (multiply_by_halves) { 6757 // 31^4 - multiplier between lower and upper parts of a register 6758 __ movw(rscratch1, intpow(31U, vf / 2)); 6759 __ mov(vpowm, Assembler::S, 1, rscratch1); 6760 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 6761 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 6762 __ mov(vpowm, Assembler::S, 0, rscratch1); 6763 } else { 6764 // 31^16 6765 __ movw(rscratch1, intpow(31U, evf)); 6766 __ mov(vpowm, Assembler::S, 0, rscratch1); 6767 } 6768 6769 __ mov(vmul3, Assembler::T16B, 0); 6770 __ mov(vmul2, Assembler::T16B, 0); 6771 __ mov(vmul1, Assembler::T16B, 0); 6772 6773 __ bind(LARGE_LOOP); 6774 6775 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 6776 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 6777 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 6778 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 6779 6780 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 6781 Address(__ post(ary, evf * type2aelembytes(eltype)))); 6782 6783 if (load_arrangement == Assembler::T8B) { 6784 // Extend 8B to 8H to be able to use vector multiply 6785 // instructions 6786 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 6787 if (is_signed_subword_type(eltype)) { 6788 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 6789 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 6790 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 6791 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6792 } else { 6793 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 6794 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 6795 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 6796 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 6797 } 6798 } 6799 6800 switch (load_arrangement) { 6801 case Assembler::T4S: 6802 __ addv(vmul3, load_arrangement, vmul3, vdata3); 6803 __ addv(vmul2, load_arrangement, vmul2, vdata2); 6804 __ addv(vmul1, load_arrangement, vmul1, vdata1); 6805 __ addv(vmul0, load_arrangement, vmul0, vdata0); 6806 break; 6807 case Assembler::T8B: 6808 case Assembler::T8H: 6809 assert(is_subword_type(eltype), "subword type expected"); 6810 if (is_signed_subword_type(eltype)) { 6811 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 6812 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 6813 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 6814 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6815 } else { 6816 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 6817 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 6818 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 6819 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 6820 } 6821 break; 6822 default: 6823 __ should_not_reach_here(); 
6824 } 6825 6826 // Process the upper half of a vector 6827 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 6828 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 6829 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 6830 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1); 6831 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1); 6832 if (is_signed_subword_type(eltype)) { 6833 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 6834 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 6835 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 6836 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6837 } else { 6838 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H); 6839 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H); 6840 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H); 6841 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 6842 } 6843 } 6844 6845 __ subsw(rscratch2, rscratch2, 1); 6846 __ br(Assembler::HI, LARGE_LOOP); 6847 6848 __ mulv(vmul3, Assembler::T4S, vmul3, vpow); 6849 __ addv(vmul3, Assembler::T4S, vmul3); 6850 __ umov(result, vmul3, Assembler::S, 0); 6851 6852 __ mov(rscratch2, intpow(31U, vf)); 6853 6854 __ mulv(vmul2, Assembler::T4S, vmul2, vpow); 6855 __ addv(vmul2, Assembler::T4S, vmul2); 6856 __ umov(rscratch1, vmul2, Assembler::S, 0); 6857 __ maddw(result, result, rscratch2, rscratch1); 6858 6859 __ mulv(vmul1, Assembler::T4S, vmul1, vpow); 6860 __ addv(vmul1, Assembler::T4S, vmul1); 6861 __ umov(rscratch1, vmul1, Assembler::S, 0); 6862 __ maddw(result, result, rscratch2, rscratch1); 6863 6864 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 6865 __ addv(vmul0, Assembler::T4S, vmul0); 6866 __ umov(rscratch1, vmul0, Assembler::S, 0); 6867 __ maddw(result, result, rscratch2, rscratch1); 6868 6869 __ andr(rscratch2, cnt, vf - 1); 6870 __ cbnz(rscratch2, TAIL_SHORTCUT); 6871 6872 __ leave(); 6873 __ ret(lr); 6874 6875 return entry; 6876 } 6877 6878 address generate_dsin_dcos(bool isCos) { 6879 __ align(CodeEntryAlignment); 6880 StubGenStubId stub_id = (isCos ? 
StubGenStubId::dcos_id : StubGenStubId::dsin_id); 6881 StubCodeMark mark(this, stub_id); 6882 address start = __ pc(); 6883 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 6884 (address)StubRoutines::aarch64::_two_over_pi, 6885 (address)StubRoutines::aarch64::_pio2, 6886 (address)StubRoutines::aarch64::_dsin_coef, 6887 (address)StubRoutines::aarch64::_dcos_coef); 6888 return start; 6889 } 6890 6891 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 6892 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 6893 Label &DIFF2) { 6894 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 6895 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 6896 6897 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 6898 __ ldr(tmpU, Address(__ post(cnt1, 8))); 6899 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 6900 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 6901 6902 __ fmovd(tmpL, vtmp3); 6903 __ eor(rscratch2, tmp3, tmpL); 6904 __ cbnz(rscratch2, DIFF2); 6905 6906 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6907 __ umov(tmpL, vtmp3, __ D, 1); 6908 __ eor(rscratch2, tmpU, tmpL); 6909 __ cbnz(rscratch2, DIFF1); 6910 6911 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 6912 __ ldr(tmpU, Address(__ post(cnt1, 8))); 6913 __ fmovd(tmpL, vtmp); 6914 __ eor(rscratch2, tmp3, tmpL); 6915 __ cbnz(rscratch2, DIFF2); 6916 6917 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6918 __ umov(tmpL, vtmp, __ D, 1); 6919 __ eor(rscratch2, tmpU, tmpL); 6920 __ cbnz(rscratch2, DIFF1); 6921 } 6922 6923 // r0 = result 6924 // r1 = str1 6925 // r2 = cnt1 6926 // r3 = str2 6927 // r4 = cnt2 6928 // r10 = tmp1 6929 // r11 = tmp2 6930 address generate_compare_long_string_different_encoding(bool isLU) { 6931 __ align(CodeEntryAlignment); 6932 StubGenStubId stub_id = (isLU ? StubGenStubId::compare_long_string_LU_id : StubGenStubId::compare_long_string_UL_id); 6933 StubCodeMark mark(this, stub_id); 6934 address entry = __ pc(); 6935 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 6936 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 6937 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 6938 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6939 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 6940 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 6941 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 6942 6943 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 6944 6945 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 6946 // cnt2 == amount of characters left to compare 6947 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 6948 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 6949 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 6950 __ add(str2, str2, isLU ? wordSize : wordSize/2); 6951 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 6952 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 6953 __ eor(rscratch2, tmp1, tmp2); 6954 __ mov(rscratch1, tmp2); 6955 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 6956 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 6957 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 6958 __ push(spilled_regs, sp); 6959 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 6960 __ mov(cnt1, isLU ? 
str2 : str1); // init the pointer to U next load 6961 6962 __ ldr(tmp3, Address(__ post(cnt1, 8))); 6963 6964 if (SoftwarePrefetchHintDistance >= 0) { 6965 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 6966 __ br(__ LT, NO_PREFETCH); 6967 __ bind(LARGE_LOOP_PREFETCH); 6968 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 6969 __ mov(tmp4, 2); 6970 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 6971 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 6972 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6973 __ subs(tmp4, tmp4, 1); 6974 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 6975 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 6976 __ mov(tmp4, 2); 6977 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 6978 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6979 __ subs(tmp4, tmp4, 1); 6980 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 6981 __ sub(cnt2, cnt2, 64); 6982 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 6983 __ br(__ GE, LARGE_LOOP_PREFETCH); 6984 } 6985 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 6986 __ bind(NO_PREFETCH); 6987 __ subs(cnt2, cnt2, 16); 6988 __ br(__ LT, TAIL); 6989 __ align(OptoLoopAlignment); 6990 __ bind(SMALL_LOOP); // smaller loop 6991 __ subs(cnt2, cnt2, 16); 6992 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 6993 __ br(__ GE, SMALL_LOOP); 6994 __ cmn(cnt2, (u1)16); 6995 __ br(__ EQ, LOAD_LAST); 6996 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 6997 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 6998 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 6999 __ ldr(tmp3, Address(cnt1, -8)); 7000 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 7001 __ b(LOAD_LAST); 7002 __ bind(DIFF2); 7003 __ mov(tmpU, tmp3); 7004 __ bind(DIFF1); 7005 __ pop(spilled_regs, sp); 7006 __ b(CALCULATE_DIFFERENCE); 7007 __ bind(LOAD_LAST); 7008 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 7009 // No need to load it again 7010 __ mov(tmpU, tmp3); 7011 __ pop(spilled_regs, sp); 7012 7013 // tmp2 points to the address of the last 4 Latin1 characters right now 7014 __ ldrs(vtmp, Address(tmp2)); 7015 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 7016 __ fmovd(tmpL, vtmp); 7017 7018 __ eor(rscratch2, tmpU, tmpL); 7019 __ cbz(rscratch2, DONE); 7020 7021 // Find the first different characters in the longwords and 7022 // compute their difference. 
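    // rscratch2 holds the XOR of the two words being compared, so its lowest set bit marks the
    // first differing character. rev reverses the byte order so that clz counts, to within a
    // byte, how many low-order bits of the XOR are zero; masking with -16 rounds that down to a
    // 16-bit character boundary. Shifting tmp1 and rscratch1 right by this amount and
    // zero-extending the halfword isolates the two differing characters for the final subw.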
7023 __ bind(CALCULATE_DIFFERENCE); 7024 __ rev(rscratch2, rscratch2); 7025 __ clz(rscratch2, rscratch2); 7026 __ andr(rscratch2, rscratch2, -16); 7027 __ lsrv(tmp1, tmp1, rscratch2); 7028 __ uxthw(tmp1, tmp1); 7029 __ lsrv(rscratch1, rscratch1, rscratch2); 7030 __ uxthw(rscratch1, rscratch1); 7031 __ subw(result, tmp1, rscratch1); 7032 __ bind(DONE); 7033 __ ret(lr); 7034 return entry; 7035 } 7036 7037 // r0 = input (float16) 7038 // v0 = result (float) 7039 // v1 = temporary float register 7040 address generate_float16ToFloat() { 7041 __ align(CodeEntryAlignment); 7042 StubGenStubId stub_id = StubGenStubId::hf2f_id; 7043 StubCodeMark mark(this, stub_id); 7044 address entry = __ pc(); 7045 BLOCK_COMMENT("Entry:"); 7046 __ flt16_to_flt(v0, r0, v1); 7047 __ ret(lr); 7048 return entry; 7049 } 7050 7051 // v0 = input (float) 7052 // r0 = result (float16) 7053 // v1 = temporary float register 7054 address generate_floatToFloat16() { 7055 __ align(CodeEntryAlignment); 7056 StubGenStubId stub_id = StubGenStubId::f2hf_id; 7057 StubCodeMark mark(this, stub_id); 7058 address entry = __ pc(); 7059 BLOCK_COMMENT("Entry:"); 7060 __ flt_to_flt16(r0, v0, v1); 7061 __ ret(lr); 7062 return entry; 7063 } 7064 7065 address generate_method_entry_barrier() { 7066 __ align(CodeEntryAlignment); 7067 StubGenStubId stub_id = StubGenStubId::method_entry_barrier_id; 7068 StubCodeMark mark(this, stub_id); 7069 7070 Label deoptimize_label; 7071 7072 address start = __ pc(); 7073 7074 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler(); 7075 7076 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) { 7077 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 7078 // We can get here despite the nmethod being good, if we have not 7079 // yet applied our cross modification fence (or data fence). 7080 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4); 7081 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr())); 7082 __ ldrw(rscratch2, rscratch2); 7083 __ strw(rscratch2, thread_epoch_addr); 7084 __ isb(); 7085 __ membar(__ LoadLoad); 7086 } 7087 7088 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 7089 7090 __ enter(); 7091 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 7092 7093 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 7094 7095 __ push_call_clobbered_registers(); 7096 7097 __ mov(c_rarg0, rscratch2); 7098 __ call_VM_leaf 7099 (CAST_FROM_FN_PTR 7100 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 7101 7102 __ reset_last_Java_frame(true); 7103 7104 __ mov(rscratch1, r0); 7105 7106 __ pop_call_clobbered_registers(); 7107 7108 __ cbnz(rscratch1, deoptimize_label); 7109 7110 __ leave(); 7111 __ ret(lr); 7112 7113 __ BIND(deoptimize_label); 7114 7115 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 7116 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 7117 7118 __ mov(sp, rscratch1); 7119 __ br(rscratch2); 7120 7121 return start; 7122 } 7123 7124 // r0 = result 7125 // r1 = str1 7126 // r2 = cnt1 7127 // r3 = str2 7128 // r4 = cnt2 7129 // r10 = tmp1 7130 // r11 = tmp2 7131 address generate_compare_long_string_same_encoding(bool isLL) { 7132 __ align(CodeEntryAlignment); 7133 StubGenStubId stub_id = (isLL ? 
StubGenStubId::compare_long_string_LL_id : StubGenStubId::compare_long_string_UU_id); 7134 StubCodeMark mark(this, stub_id); 7135 address entry = __ pc(); 7136 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 7137 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2; 7138 7139 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF; 7140 7141 // exit from large loop when less than 64 bytes left to read or we're about 7142 // to prefetch memory behind array border 7143 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 7144 7145 // before jumping to stub, pre-load 8 bytes already, so do comparison directly 7146 __ eor(rscratch2, tmp1, tmp2); 7147 __ cbnz(rscratch2, CAL_DIFFERENCE); 7148 7149 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 7150 // update pointers, because of previous read 7151 __ add(str1, str1, wordSize); 7152 __ add(str2, str2, wordSize); 7153 if (SoftwarePrefetchHintDistance >= 0) { 7154 __ align(OptoLoopAlignment); 7155 __ bind(LARGE_LOOP_PREFETCH); 7156 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 7157 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 7158 7159 for (int i = 0; i < 4; i++) { 7160 __ ldp(tmp1, tmp1h, Address(str1, i * 16)); 7161 __ ldp(tmp2, tmp2h, Address(str2, i * 16)); 7162 __ cmp(tmp1, tmp2); 7163 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7164 __ br(Assembler::NE, DIFF); 7165 } 7166 __ sub(cnt2, cnt2, isLL ? 64 : 32); 7167 __ add(str1, str1, 64); 7168 __ add(str2, str2, 64); 7169 __ subs(rscratch2, cnt2, largeLoopExitCondition); 7170 __ br(Assembler::GE, LARGE_LOOP_PREFETCH); 7171 __ cbz(cnt2, LENGTH_DIFF); // no more chars left? 7172 } 7173 7174 __ subs(rscratch1, cnt2, isLL ? 16 : 8); 7175 __ br(Assembler::LE, LESS16); 7176 __ align(OptoLoopAlignment); 7177 __ bind(LOOP_COMPARE16); 7178 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 7179 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 7180 __ cmp(tmp1, tmp2); 7181 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7182 __ br(Assembler::NE, DIFF); 7183 __ sub(cnt2, cnt2, isLL ? 16 : 8); 7184 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 7185 __ br(Assembler::LT, LESS16); 7186 7187 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16))); 7188 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16))); 7189 __ cmp(tmp1, tmp2); 7190 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ); 7191 __ br(Assembler::NE, DIFF); 7192 __ sub(cnt2, cnt2, isLL ? 16 : 8); 7193 __ subs(rscratch2, cnt2, isLL ? 16 : 8); 7194 __ br(Assembler::GE, LOOP_COMPARE16); 7195 __ cbz(cnt2, LENGTH_DIFF); 7196 7197 __ bind(LESS16); 7198 // each 8 compare 7199 __ subs(cnt2, cnt2, isLL ? 8 : 4); 7200 __ br(Assembler::LE, LESS8); 7201 __ ldr(tmp1, Address(__ post(str1, 8))); 7202 __ ldr(tmp2, Address(__ post(str2, 8))); 7203 __ eor(rscratch2, tmp1, tmp2); 7204 __ cbnz(rscratch2, CAL_DIFFERENCE); 7205 __ sub(cnt2, cnt2, isLL ? 
8 : 4); 7206 7207 __ bind(LESS8); // directly load last 8 bytes 7208 if (!isLL) { 7209 __ add(cnt2, cnt2, cnt2); 7210 } 7211 __ ldr(tmp1, Address(str1, cnt2)); 7212 __ ldr(tmp2, Address(str2, cnt2)); 7213 __ eor(rscratch2, tmp1, tmp2); 7214 __ cbz(rscratch2, LENGTH_DIFF); 7215 __ b(CAL_DIFFERENCE); 7216 7217 __ bind(DIFF); 7218 __ cmp(tmp1, tmp2); 7219 __ csel(tmp1, tmp1, tmp1h, Assembler::NE); 7220 __ csel(tmp2, tmp2, tmp2h, Assembler::NE); 7221 // reuse rscratch2 register for the result of eor instruction 7222 __ eor(rscratch2, tmp1, tmp2); 7223 7224 __ bind(CAL_DIFFERENCE); 7225 __ rev(rscratch2, rscratch2); 7226 __ clz(rscratch2, rscratch2); 7227 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 7228 __ lsrv(tmp1, tmp1, rscratch2); 7229 __ lsrv(tmp2, tmp2, rscratch2); 7230 if (isLL) { 7231 __ uxtbw(tmp1, tmp1); 7232 __ uxtbw(tmp2, tmp2); 7233 } else { 7234 __ uxthw(tmp1, tmp1); 7235 __ uxthw(tmp2, tmp2); 7236 } 7237 __ subw(result, tmp1, tmp2); 7238 7239 __ bind(LENGTH_DIFF); 7240 __ ret(lr); 7241 return entry; 7242 } 7243 7244 enum string_compare_mode { 7245 LL, 7246 LU, 7247 UL, 7248 UU, 7249 }; 7250 7251 // The following registers are declared in aarch64.ad 7252 // r0 = result 7253 // r1 = str1 7254 // r2 = cnt1 7255 // r3 = str2 7256 // r4 = cnt2 7257 // r10 = tmp1 7258 // r11 = tmp2 7259 // z0 = ztmp1 7260 // z1 = ztmp2 7261 // p0 = pgtmp1 7262 // p1 = pgtmp2 7263 address generate_compare_long_string_sve(string_compare_mode mode) { 7264 StubGenStubId stub_id; 7265 switch (mode) { 7266 case LL: stub_id = StubGenStubId::compare_long_string_LL_id; break; 7267 case LU: stub_id = StubGenStubId::compare_long_string_LU_id; break; 7268 case UL: stub_id = StubGenStubId::compare_long_string_UL_id; break; 7269 case UU: stub_id = StubGenStubId::compare_long_string_UU_id; break; 7270 default: ShouldNotReachHere(); 7271 } 7272 7273 __ align(CodeEntryAlignment); 7274 address entry = __ pc(); 7275 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 7276 tmp1 = r10, tmp2 = r11; 7277 7278 Label LOOP, DONE, MISMATCH; 7279 Register vec_len = tmp1; 7280 Register idx = tmp2; 7281 // The minimum of the string lengths has been stored in cnt2. 7282 Register cnt = cnt2; 7283 FloatRegister ztmp1 = z0, ztmp2 = z1; 7284 PRegister pgtmp1 = p0, pgtmp2 = p1; 7285 7286 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 7287 switch (mode) { \ 7288 case LL: \ 7289 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 7290 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 7291 break; \ 7292 case LU: \ 7293 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 7294 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 7295 break; \ 7296 case UL: \ 7297 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 7298 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 7299 break; \ 7300 case UU: \ 7301 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 7302 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 7303 break; \ 7304 default: \ 7305 ShouldNotReachHere(); \ 7306 } 7307 7308 StubCodeMark mark(this, stub_id); 7309 7310 __ mov(idx, 0); 7311 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 7312 7313 if (mode == LL) { 7314 __ sve_cntb(vec_len); 7315 } else { 7316 __ sve_cnth(vec_len); 7317 } 7318 7319 __ sub(rscratch1, cnt, vec_len); 7320 7321 __ bind(LOOP); 7322 7323 // main loop 7324 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 7325 __ add(idx, idx, vec_len); 7326 // Compare strings. 
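    // The sve_cmp below sets a bit in pgtmp2 for every active lane whose characters differ and
    // updates the condition flags; for SVE predicate-setting compares the Z flag means "no
    // active lane was true", so the br(NE, ...) that follows is taken as soon as any lane
    // mismatches.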
7327 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 7328 __ br(__ NE, MISMATCH); 7329 __ cmp(idx, rscratch1); 7330 __ br(__ LT, LOOP); 7331 7332 // post loop, last iteration 7333 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 7334 7335 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 7336 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 7337 __ br(__ EQ, DONE); 7338 7339 __ bind(MISMATCH); 7340 7341 // Crop the vector to find its location. 7342 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 7343 // Extract the first different characters of each string. 7344 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 7345 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 7346 7347 // Compute the difference of the first different characters. 7348 __ sub(result, rscratch1, rscratch2); 7349 7350 __ bind(DONE); 7351 __ ret(lr); 7352 #undef LOAD_PAIR 7353 return entry; 7354 } 7355 7356 void generate_compare_long_strings() { 7357 if (UseSVE == 0) { 7358 StubRoutines::aarch64::_compare_long_string_LL 7359 = generate_compare_long_string_same_encoding(true); 7360 StubRoutines::aarch64::_compare_long_string_UU 7361 = generate_compare_long_string_same_encoding(false); 7362 StubRoutines::aarch64::_compare_long_string_LU 7363 = generate_compare_long_string_different_encoding(true); 7364 StubRoutines::aarch64::_compare_long_string_UL 7365 = generate_compare_long_string_different_encoding(false); 7366 } else { 7367 StubRoutines::aarch64::_compare_long_string_LL 7368 = generate_compare_long_string_sve(LL); 7369 StubRoutines::aarch64::_compare_long_string_UU 7370 = generate_compare_long_string_sve(UU); 7371 StubRoutines::aarch64::_compare_long_string_LU 7372 = generate_compare_long_string_sve(LU); 7373 StubRoutines::aarch64::_compare_long_string_UL 7374 = generate_compare_long_string_sve(UL); 7375 } 7376 } 7377 7378 // R0 = result 7379 // R1 = str2 7380 // R2 = cnt1 7381 // R3 = str1 7382 // R4 = cnt2 7383 // Clobbers: rscratch1, rscratch2, v0, v1, rflags 7384 // 7385 // This generic linear code use few additional ideas, which makes it faster: 7386 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 7387 // in order to skip initial loading(help in systems with 1 ld pipeline) 7388 // 2) we can use "fast" algorithm of finding single character to search for 7389 // first symbol with less branches(1 branch per each loaded register instead 7390 // of branch for each symbol), so, this is where constants like 7391 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 7392 // 3) after loading and analyzing 1st register of source string, it can be 7393 // used to search for every 1st character entry, saving few loads in 7394 // comparison with "simplier-but-slower" implementation 7395 // 4) in order to avoid lots of push/pop operations, code below is heavily 7396 // re-using/re-initializing/compressing register values, which makes code 7397 // larger and a bit less readable, however, most of extra operations are 7398 // issued during loads or branches, so, penalty is minimal 7399 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 7400 StubGenStubId stub_id; 7401 if (str1_isL) { 7402 if (str2_isL) { 7403 stub_id = StubGenStubId::string_indexof_linear_ll_id; 7404 } else { 7405 stub_id = StubGenStubId::string_indexof_linear_ul_id; 7406 } 7407 } else { 7408 if (str2_isL) { 7409 ShouldNotReachHere(); 7410 } else { 7411 stub_id = 
StubGenStubId::string_indexof_linear_uu_id; 7412 } 7413 } 7414 __ align(CodeEntryAlignment); 7415 StubCodeMark mark(this, stub_id); 7416 address entry = __ pc(); 7417 7418 int str1_chr_size = str1_isL ? 1 : 2; 7419 int str2_chr_size = str2_isL ? 1 : 2; 7420 int str1_chr_shift = str1_isL ? 0 : 1; 7421 int str2_chr_shift = str2_isL ? 0 : 1; 7422 bool isL = str1_isL && str2_isL; 7423 // parameters 7424 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 7425 // temporary registers 7426 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 7427 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 7428 // redefinitions 7429 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 7430 7431 __ push(spilled_regs, sp); 7432 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 7433 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 7434 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 7435 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 7436 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 7437 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 7438 // Read whole register from str1. It is safe, because length >=8 here 7439 __ ldr(ch1, Address(str1)); 7440 // Read whole register from str2. It is safe, because length >=8 here 7441 __ ldr(ch2, Address(str2)); 7442 __ sub(cnt2, cnt2, cnt1); 7443 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 7444 if (str1_isL != str2_isL) { 7445 __ eor(v0, __ T16B, v0, v0); 7446 } 7447 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 7448 __ mul(first, first, tmp1); 7449 // check if we have less than 1 register to check 7450 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 7451 if (str1_isL != str2_isL) { 7452 __ fmovd(v1, ch1); 7453 } 7454 __ br(__ LE, L_SMALL); 7455 __ eor(ch2, first, ch2); 7456 if (str1_isL != str2_isL) { 7457 __ zip1(v1, __ T16B, v1, v0); 7458 } 7459 __ sub(tmp2, ch2, tmp1); 7460 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7461 __ bics(tmp2, tmp2, ch2); 7462 if (str1_isL != str2_isL) { 7463 __ fmovd(ch1, v1); 7464 } 7465 __ br(__ NE, L_HAS_ZERO); 7466 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 7467 __ add(result, result, wordSize/str2_chr_size); 7468 __ add(str2, str2, wordSize); 7469 __ br(__ LT, L_POST_LOOP); 7470 __ BIND(L_LOOP); 7471 __ ldr(ch2, Address(str2)); 7472 __ eor(ch2, first, ch2); 7473 __ sub(tmp2, ch2, tmp1); 7474 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7475 __ bics(tmp2, tmp2, ch2); 7476 __ br(__ NE, L_HAS_ZERO); 7477 __ BIND(L_LOOP_PROCEED); 7478 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 7479 __ add(str2, str2, wordSize); 7480 __ add(result, result, wordSize/str2_chr_size); 7481 __ br(__ GE, L_LOOP); 7482 __ BIND(L_POST_LOOP); 7483 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 7484 __ br(__ LE, NOMATCH); 7485 __ ldr(ch2, Address(str2)); 7486 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 7487 __ eor(ch2, first, ch2); 7488 __ sub(tmp2, ch2, tmp1); 7489 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7490 __ mov(tmp4, -1); // all bits set 7491 __ b(L_SMALL_PROCEED); 7492 __ align(OptoLoopAlignment); 7493 __ BIND(L_SMALL); 7494 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 7495 __ eor(ch2, first, ch2); 7496 if (str1_isL != str2_isL) { 7497 __ zip1(v1, __ T16B, v1, v0); 7498 } 7499 __ sub(tmp2, ch2, tmp1); 7500 __ mov(tmp4, -1); // all bits set 7501 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 7502 if (str1_isL != str2_isL) { 7503 __ fmovd(ch1, v1); // move converted 4 symbols 7504 } 7505 __ BIND(L_SMALL_PROCEED); 7506 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 7507 __ bic(tmp2, tmp2, ch2); 7508 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 7509 __ rbit(tmp2, tmp2); 7510 __ br(__ EQ, NOMATCH); 7511 __ BIND(L_SMALL_HAS_ZERO_LOOP); 7512 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 7513 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 7514 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 7515 if (str2_isL) { // LL 7516 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 7517 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 7518 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 7519 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 7520 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7521 } else { 7522 __ mov(ch2, 0xE); // all bits in byte set except last one 7523 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7524 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7525 __ lslv(tmp2, tmp2, tmp4); 7526 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7527 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7528 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7529 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7530 } 7531 __ cmp(ch1, ch2); 7532 __ mov(tmp4, wordSize/str2_chr_size); 7533 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7534 __ BIND(L_SMALL_CMP_LOOP); 7535 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 7536 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 7537 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 7538 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 7539 __ add(tmp4, tmp4, 1); 7540 __ cmp(tmp4, cnt1); 7541 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 7542 __ cmp(first, ch2); 7543 __ br(__ EQ, L_SMALL_CMP_LOOP); 7544 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 7545 __ cbz(tmp2, NOMATCH); // no more matches. exit 7546 __ clz(tmp4, tmp2); 7547 __ add(result, result, 1); // advance index 7548 __ add(str2, str2, str2_chr_size); // advance pointer 7549 __ b(L_SMALL_HAS_ZERO_LOOP); 7550 __ align(OptoLoopAlignment); 7551 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 7552 __ cmp(first, ch2); 7553 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7554 __ b(DONE); 7555 __ align(OptoLoopAlignment); 7556 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 7557 if (str2_isL) { // LL 7558 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 7559 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 7560 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 7561 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 7562 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7563 } else { 7564 __ mov(ch2, 0xE); // all bits in byte set except last one 7565 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7566 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
7567 __ lslv(tmp2, tmp2, tmp4); 7568 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7569 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7570 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 7571 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7572 } 7573 __ cmp(ch1, ch2); 7574 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 7575 __ b(DONE); 7576 __ align(OptoLoopAlignment); 7577 __ BIND(L_HAS_ZERO); 7578 __ rbit(tmp2, tmp2); 7579 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 7580 // Now, perform compression of counters(cnt2 and cnt1) into one register. 7581 // It's fine because both counters are 32bit and are not changed in this 7582 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 7583 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 7584 __ sub(result, result, 1); 7585 __ BIND(L_HAS_ZERO_LOOP); 7586 __ mov(cnt1, wordSize/str2_chr_size); 7587 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 7588 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 7589 if (str2_isL) { 7590 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 7591 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7592 __ lslv(tmp2, tmp2, tmp4); 7593 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7594 __ add(tmp4, tmp4, 1); 7595 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7596 __ lsl(tmp2, tmp2, 1); 7597 __ mov(tmp4, wordSize/str2_chr_size); 7598 } else { 7599 __ mov(ch2, 0xE); 7600 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7601 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7602 __ lslv(tmp2, tmp2, tmp4); 7603 __ add(tmp4, tmp4, 1); 7604 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7605 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 7606 __ lsl(tmp2, tmp2, 1); 7607 __ mov(tmp4, wordSize/str2_chr_size); 7608 __ sub(str2, str2, str2_chr_size); 7609 } 7610 __ cmp(ch1, ch2); 7611 __ mov(tmp4, wordSize/str2_chr_size); 7612 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7613 __ BIND(L_CMP_LOOP); 7614 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 7615 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 7616 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 7617 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 7618 __ add(tmp4, tmp4, 1); 7619 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 7620 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 7621 __ cmp(cnt1, ch2); 7622 __ br(__ EQ, L_CMP_LOOP); 7623 __ BIND(L_CMP_LOOP_NOMATCH); 7624 // here we're not matched 7625 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 7626 __ clz(tmp4, tmp2); 7627 __ add(str2, str2, str2_chr_size); // advance pointer 7628 __ b(L_HAS_ZERO_LOOP); 7629 __ align(OptoLoopAlignment); 7630 __ BIND(L_CMP_LOOP_LAST_CMP); 7631 __ cmp(cnt1, ch2); 7632 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7633 __ b(DONE); 7634 __ align(OptoLoopAlignment); 7635 __ BIND(L_CMP_LOOP_LAST_CMP2); 7636 if (str2_isL) { 7637 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 7638 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
7639 __ lslv(tmp2, tmp2, tmp4); 7640 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7641 __ add(tmp4, tmp4, 1); 7642 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7643 __ lsl(tmp2, tmp2, 1); 7644 } else { 7645 __ mov(ch2, 0xE); 7646 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 7647 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 7648 __ lslv(tmp2, tmp2, tmp4); 7649 __ add(tmp4, tmp4, 1); 7650 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 7651 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 7652 __ lsl(tmp2, tmp2, 1); 7653 __ sub(str2, str2, str2_chr_size); 7654 } 7655 __ cmp(ch1, ch2); 7656 __ br(__ NE, L_CMP_LOOP_NOMATCH); 7657 __ b(DONE); 7658 __ align(OptoLoopAlignment); 7659 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 7660 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 7661 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 7662 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 7663 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 7664 // result by analyzed characters value, so, we can just reset lower bits 7665 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 7666 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 7667 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 7668 // index of last analyzed substring inside current octet. So, str2 in at 7669 // respective start address. We need to advance it to next octet 7670 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 7671 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 7672 __ bfm(result, zr, 0, 2 - str2_chr_shift); 7673 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 7674 __ movw(cnt2, cnt2); 7675 __ b(L_LOOP_PROCEED); 7676 __ align(OptoLoopAlignment); 7677 __ BIND(NOMATCH); 7678 __ mov(result, -1); 7679 __ BIND(DONE); 7680 __ pop(spilled_regs, sp); 7681 __ ret(lr); 7682 return entry; 7683 } 7684 7685 void generate_string_indexof_stubs() { 7686 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 7687 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 7688 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 7689 } 7690 7691 void inflate_and_store_2_fp_registers(bool generatePrfm, 7692 FloatRegister src1, FloatRegister src2) { 7693 Register dst = r1; 7694 __ zip1(v1, __ T16B, src1, v0); 7695 __ zip2(v2, __ T16B, src1, v0); 7696 if (generatePrfm) { 7697 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 7698 } 7699 __ zip1(v3, __ T16B, src2, v0); 7700 __ zip2(v4, __ T16B, src2, v0); 7701 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 7702 } 7703 7704 // R0 = src 7705 // R1 = dst 7706 // R2 = len 7707 // R3 = len >> 3 7708 // V0 = 0 7709 // v1 = loaded 8 bytes 7710 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 7711 address generate_large_byte_array_inflate() { 7712 __ align(CodeEntryAlignment); 7713 StubGenStubId stub_id = StubGenStubId::large_byte_array_inflate_id; 7714 StubCodeMark mark(this, stub_id); 7715 address entry = __ pc(); 7716 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 7717 Register src = r0, dst = r1, len = r2, octetCounter = r3; 7718 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 7719 7720 // do one more 8-byte read to have address 16-byte aligned in 
most cases 7721 // also use single store instruction 7722 __ ldrd(v2, __ post(src, 8)); 7723 __ sub(octetCounter, octetCounter, 2); 7724 __ zip1(v1, __ T16B, v1, v0); 7725 __ zip1(v2, __ T16B, v2, v0); 7726 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 7727 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7728 __ subs(rscratch1, octetCounter, large_loop_threshold); 7729 __ br(__ LE, LOOP_START); 7730 __ b(LOOP_PRFM_START); 7731 __ bind(LOOP_PRFM); 7732 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7733 __ bind(LOOP_PRFM_START); 7734 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 7735 __ sub(octetCounter, octetCounter, 8); 7736 __ subs(rscratch1, octetCounter, large_loop_threshold); 7737 inflate_and_store_2_fp_registers(true, v3, v4); 7738 inflate_and_store_2_fp_registers(true, v5, v6); 7739 __ br(__ GT, LOOP_PRFM); 7740 __ cmp(octetCounter, (u1)8); 7741 __ br(__ LT, DONE); 7742 __ bind(LOOP); 7743 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 7744 __ bind(LOOP_START); 7745 __ sub(octetCounter, octetCounter, 8); 7746 __ cmp(octetCounter, (u1)8); 7747 inflate_and_store_2_fp_registers(false, v3, v4); 7748 inflate_and_store_2_fp_registers(false, v5, v6); 7749 __ br(__ GE, LOOP); 7750 __ bind(DONE); 7751 __ ret(lr); 7752 return entry; 7753 } 7754 7755 /** 7756 * Arguments: 7757 * 7758 * Input: 7759 * c_rarg0 - current state address 7760 * c_rarg1 - H key address 7761 * c_rarg2 - data address 7762 * c_rarg3 - number of blocks 7763 * 7764 * Output: 7765 * Updated state at c_rarg0 7766 */ 7767 address generate_ghash_processBlocks() { 7768 // Bafflingly, GCM uses little-endian for the byte order, but 7769 // big-endian for the bit order. For example, the polynomial 1 is 7770 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 7771 // 7772 // So, we must either reverse the bytes in each word and do 7773 // everything big-endian or reverse the bits in each byte and do 7774 // it little-endian. On AArch64 it's more idiomatic to reverse 7775 // the bits in each byte (we have an instruction, RBIT, to do 7776 // that) and keep the data in little-endian bit order through the 7777 // calculation, bit-reversing the inputs and outputs. 7778 7779 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_id; 7780 StubCodeMark mark(this, stub_id); 7781 __ align(wordSize * 2); 7782 address p = __ pc(); 7783 __ emit_int64(0x87); // The low-order bits of the field 7784 // polynomial (i.e. 
p = z^7+z^2+z+1) 7785 // repeated in the low and high parts of a 7786 // 128-bit vector 7787 __ emit_int64(0x87); 7788 7789 __ align(CodeEntryAlignment); 7790 address start = __ pc(); 7791 7792 Register state = c_rarg0; 7793 Register subkeyH = c_rarg1; 7794 Register data = c_rarg2; 7795 Register blocks = c_rarg3; 7796 7797 FloatRegister vzr = v30; 7798 __ eor(vzr, __ T16B, vzr, vzr); // zero register 7799 7800 __ ldrq(v24, p); // The field polynomial 7801 7802 __ ldrq(v0, Address(state)); 7803 __ ldrq(v1, Address(subkeyH)); 7804 7805 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 7806 __ rbit(v0, __ T16B, v0); 7807 __ rev64(v1, __ T16B, v1); 7808 __ rbit(v1, __ T16B, v1); 7809 7810 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 7811 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 7812 7813 { 7814 Label L_ghash_loop; 7815 __ bind(L_ghash_loop); 7816 7817 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 7818 // reversing each byte 7819 __ rbit(v2, __ T16B, v2); 7820 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 7821 7822 // Multiply state in v2 by subkey in v1 7823 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 7824 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, 7825 /*temps*/v6, v3, /*reuse/clobber b*/v2); 7826 // Reduce v7:v5 by the field polynomial 7827 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3); 7828 7829 __ sub(blocks, blocks, 1); 7830 __ cbnz(blocks, L_ghash_loop); 7831 } 7832 7833 // The bit-reversed result is at this point in v0 7834 __ rev64(v0, __ T16B, v0); 7835 __ rbit(v0, __ T16B, v0); 7836 7837 __ st1(v0, __ T16B, state); 7838 __ ret(lr); 7839 7840 return start; 7841 } 7842 7843 address generate_ghash_processBlocks_wide() { 7844 address small = generate_ghash_processBlocks(); 7845 7846 StubGenStubId stub_id = StubGenStubId::ghash_processBlocks_wide_id; 7847 StubCodeMark mark(this, stub_id); 7848 __ align(wordSize * 2); 7849 address p = __ pc(); 7850 __ emit_int64(0x87); // The low-order bits of the field 7851 // polynomial (i.e. p = z^7+z^2+z+1) 7852 // repeated in the low and high parts of a 7853 // 128-bit vector 7854 __ emit_int64(0x87); 7855 7856 __ align(CodeEntryAlignment); 7857 address start = __ pc(); 7858 7859 Register state = c_rarg0; 7860 Register subkeyH = c_rarg1; 7861 Register data = c_rarg2; 7862 Register blocks = c_rarg3; 7863 7864 const int unroll = 4; 7865 7866 __ cmp(blocks, (unsigned char)(unroll * 2)); 7867 __ br(__ LT, small); 7868 7869 if (unroll > 1) { 7870 // Save state before entering routine 7871 __ sub(sp, sp, 4 * 16); 7872 __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); 7873 __ sub(sp, sp, 4 * 16); 7874 __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); 7875 } 7876 7877 __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); 7878 7879 if (unroll > 1) { 7880 // And restore state 7881 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); 7882 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); 7883 } 7884 7885 __ cmp(blocks, (unsigned char)0); 7886 __ br(__ GT, small); 7887 7888 __ ret(lr); 7889 7890 return start; 7891 } 7892 7893 void generate_base64_encode_simdround(Register src, Register dst, 7894 FloatRegister codec, u8 size) { 7895 7896 FloatRegister in0 = v4, in1 = v5, in2 = v6; 7897 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19; 7898 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23; 7899 7900 Assembler::SIMD_Arrangement arrangement = size == 16 ? 
__ T16B : __ T8B; 7901 7902 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size)); 7903 7904 __ ushr(ind0, arrangement, in0, 2); 7905 7906 __ ushr(ind1, arrangement, in1, 2); 7907 __ shl(in0, arrangement, in0, 6); 7908 __ orr(ind1, arrangement, ind1, in0); 7909 __ ushr(ind1, arrangement, ind1, 2); 7910 7911 __ ushr(ind2, arrangement, in2, 4); 7912 __ shl(in1, arrangement, in1, 4); 7913 __ orr(ind2, arrangement, in1, ind2); 7914 __ ushr(ind2, arrangement, ind2, 2); 7915 7916 __ shl(ind3, arrangement, in2, 2); 7917 __ ushr(ind3, arrangement, ind3, 2); 7918 7919 __ tbl(out0, arrangement, codec, 4, ind0); 7920 __ tbl(out1, arrangement, codec, 4, ind1); 7921 __ tbl(out2, arrangement, codec, 4, ind2); 7922 __ tbl(out3, arrangement, codec, 4, ind3); 7923 7924 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size)); 7925 } 7926 7927 /** 7928 * Arguments: 7929 * 7930 * Input: 7931 * c_rarg0 - src_start 7932 * c_rarg1 - src_offset 7933 * c_rarg2 - src_length 7934 * c_rarg3 - dest_start 7935 * c_rarg4 - dest_offset 7936 * c_rarg5 - isURL 7937 * 7938 */ 7939 address generate_base64_encodeBlock() { 7940 7941 static const char toBase64[64] = { 7942 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 7943 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 7944 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 7945 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 7946 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/' 7947 }; 7948 7949 static const char toBase64URL[64] = { 7950 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 7951 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 7952 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 7953 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 7954 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_' 7955 }; 7956 7957 __ align(CodeEntryAlignment); 7958 StubGenStubId stub_id = StubGenStubId::base64_encodeBlock_id; 7959 StubCodeMark mark(this, stub_id); 7960 address start = __ pc(); 7961 7962 Register src = c_rarg0; // source array 7963 Register soff = c_rarg1; // source start offset 7964 Register send = c_rarg2; // source end offset 7965 Register dst = c_rarg3; // dest array 7966 Register doff = c_rarg4; // position for writing to dest array 7967 Register isURL = c_rarg5; // Base64 or URL character set 7968 7969 // c_rarg6 and c_rarg7 are free to use as temps 7970 Register codec = c_rarg6; 7971 Register length = c_rarg7; 7972 7973 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit; 7974 7975 __ add(src, src, soff); 7976 __ add(dst, dst, doff); 7977 __ sub(length, send, soff); 7978 7979 // load the codec base address 7980 __ lea(codec, ExternalAddress((address) toBase64)); 7981 __ cbz(isURL, ProcessData); 7982 __ lea(codec, ExternalAddress((address) toBase64URL)); 7983 7984 __ BIND(ProcessData); 7985 7986 // too short to formup a SIMD loop, roll back 7987 __ cmp(length, (u1)24); 7988 __ br(Assembler::LT, Process3B); 7989 7990 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec)); 7991 7992 __ BIND(Process48B); 7993 __ cmp(length, (u1)48); 7994 __ br(Assembler::LT, Process24B); 7995 generate_base64_encode_simdround(src, dst, v0, 16); 7996 __ sub(length, length, 48); 7997 __ b(Process48B); 7998 7999 __ BIND(Process24B); 8000 __ cmp(length, (u1)24); 8001 __ br(Assembler::LT, SIMDExit); 8002 generate_base64_encode_simdround(src, dst, v0, 8); 8003 __ sub(length, length, 24); 8004 8005 __ BIND(SIMDExit); 8006 
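    // The scalar Process3B loop below assumes the remaining length is a multiple of 3: it packs
    // three source bytes into a 24-bit value and splits it into four 6-bit codec indices with
    // the ubfmw/andw sequence. As an illustration, the bytes 'M' 'a' 'n' (0x4D 0x61 0x6E) pack
    // to 0x4D616E, giving indices 19, 22, 5 and 46, which the codec table maps to "TWFu".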
__ cbz(length, Exit); 8007 8008 __ BIND(Process3B); 8009 // 3 src bytes, 24 bits 8010 __ ldrb(r10, __ post(src, 1)); 8011 __ ldrb(r11, __ post(src, 1)); 8012 __ ldrb(r12, __ post(src, 1)); 8013 __ orrw(r11, r11, r10, Assembler::LSL, 8); 8014 __ orrw(r12, r12, r11, Assembler::LSL, 8); 8015 // codec index 8016 __ ubfmw(r15, r12, 18, 23); 8017 __ ubfmw(r14, r12, 12, 17); 8018 __ ubfmw(r13, r12, 6, 11); 8019 __ andw(r12, r12, 63); 8020 // get the code based on the codec 8021 __ ldrb(r15, Address(codec, r15, Address::uxtw(0))); 8022 __ ldrb(r14, Address(codec, r14, Address::uxtw(0))); 8023 __ ldrb(r13, Address(codec, r13, Address::uxtw(0))); 8024 __ ldrb(r12, Address(codec, r12, Address::uxtw(0))); 8025 __ strb(r15, __ post(dst, 1)); 8026 __ strb(r14, __ post(dst, 1)); 8027 __ strb(r13, __ post(dst, 1)); 8028 __ strb(r12, __ post(dst, 1)); 8029 __ sub(length, length, 3); 8030 __ cbnz(length, Process3B); 8031 8032 __ BIND(Exit); 8033 __ ret(lr); 8034 8035 return start; 8036 } 8037 8038 void generate_base64_decode_simdround(Register src, Register dst, 8039 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) { 8040 8041 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19; 8042 FloatRegister out0 = v20, out1 = v21, out2 = v22; 8043 8044 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26; 8045 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31; 8046 8047 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData; 8048 8049 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B; 8050 8051 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size)); 8052 8053 // we need unsigned saturating subtract, to make sure all input values 8054 // in range [0, 63] will have 0U value in the higher half lookup 8055 __ uqsubv(decH0, __ T16B, in0, v27); 8056 __ uqsubv(decH1, __ T16B, in1, v27); 8057 __ uqsubv(decH2, __ T16B, in2, v27); 8058 __ uqsubv(decH3, __ T16B, in3, v27); 8059 8060 // lower half lookup 8061 __ tbl(decL0, arrangement, codecL, 4, in0); 8062 __ tbl(decL1, arrangement, codecL, 4, in1); 8063 __ tbl(decL2, arrangement, codecL, 4, in2); 8064 __ tbl(decL3, arrangement, codecL, 4, in3); 8065 8066 // higher half lookup 8067 __ tbx(decH0, arrangement, codecH, 4, decH0); 8068 __ tbx(decH1, arrangement, codecH, 4, decH1); 8069 __ tbx(decH2, arrangement, codecH, 4, decH2); 8070 __ tbx(decH3, arrangement, codecH, 4, decH3); 8071 8072 // combine lower and higher 8073 __ orr(decL0, arrangement, decL0, decH0); 8074 __ orr(decL1, arrangement, decL1, decH1); 8075 __ orr(decL2, arrangement, decL2, decH2); 8076 __ orr(decL3, arrangement, decL3, decH3); 8077 8078 // check illegal inputs, value larger than 63 (maximum of 6 bits) 8079 __ cm(Assembler::HI, decH0, arrangement, decL0, v27); 8080 __ cm(Assembler::HI, decH1, arrangement, decL1, v27); 8081 __ cm(Assembler::HI, decH2, arrangement, decL2, v27); 8082 __ cm(Assembler::HI, decH3, arrangement, decL3, v27); 8083 __ orr(in0, arrangement, decH0, decH1); 8084 __ orr(in1, arrangement, decH2, decH3); 8085 __ orr(in2, arrangement, in0, in1); 8086 __ umaxv(in3, arrangement, in2); 8087 __ umov(rscratch2, in3, __ B, 0); 8088 8089 // get the data to output 8090 __ shl(out0, arrangement, decL0, 2); 8091 __ ushr(out1, arrangement, decL1, 4); 8092 __ orr(out0, arrangement, out0, out1); 8093 __ shl(out1, arrangement, decL1, 4); 8094 __ ushr(out2, arrangement, decL2, 2); 8095 __ orr(out1, arrangement, out1, out2); 8096 __ shl(out2, arrangement, decL2, 6); 8097 __ orr(out2, arrangement, out2, decL3); 8098 8099 __ 
cbz(rscratch2, NoIllegalData); 8100 8101 // handle illegal input 8102 __ umov(r10, in2, __ D, 0); 8103 if (size == 16) { 8104 __ cbnz(r10, ErrorInLowerHalf); 8105 8106 // illegal input is in higher half, store the lower half now. 8107 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24)); 8108 8109 __ umov(r10, in2, __ D, 1); 8110 __ umov(r11, out0, __ D, 1); 8111 __ umov(r12, out1, __ D, 1); 8112 __ umov(r13, out2, __ D, 1); 8113 __ b(StoreLegalData); 8114 8115 __ BIND(ErrorInLowerHalf); 8116 } 8117 __ umov(r11, out0, __ D, 0); 8118 __ umov(r12, out1, __ D, 0); 8119 __ umov(r13, out2, __ D, 0); 8120 8121 __ BIND(StoreLegalData); 8122 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input 8123 __ strb(r11, __ post(dst, 1)); 8124 __ strb(r12, __ post(dst, 1)); 8125 __ strb(r13, __ post(dst, 1)); 8126 __ lsr(r10, r10, 8); 8127 __ lsr(r11, r11, 8); 8128 __ lsr(r12, r12, 8); 8129 __ lsr(r13, r13, 8); 8130 __ b(StoreLegalData); 8131 8132 __ BIND(NoIllegalData); 8133 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size)); 8134 } 8135 8136 8137 /** 8138 * Arguments: 8139 * 8140 * Input: 8141 * c_rarg0 - src_start 8142 * c_rarg1 - src_offset 8143 * c_rarg2 - src_length 8144 * c_rarg3 - dest_start 8145 * c_rarg4 - dest_offset 8146 * c_rarg5 - isURL 8147 * c_rarg6 - isMIME 8148 * 8149 */ 8150 address generate_base64_decodeBlock() { 8151 8152 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined 8153 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section 8154 // titled "Base64 decoding". 8155 8156 // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64, 8157 // except the trailing character '=' is also treated illegal value in this intrinsic. That 8158 // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here. 
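    // As a concrete reading of the table below: 'A' (index 65) decodes to 0, '+' (43) to 62 and
    // '/' (47) to 63, while '=' (61) and every other non-alphabet byte map to 255u, the value
    // this intrinsic treats as illegal input.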
8159 static const uint8_t fromBase64ForNoSIMD[256] = { 8160 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8161 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8162 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 8163 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8164 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 8165 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, 8166 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 8167 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 8168 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8169 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8170 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8171 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8172 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8173 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8174 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8175 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8176 }; 8177 8178 static const uint8_t fromBase64URLForNoSIMD[256] = { 8179 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8180 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8181 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 8182 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8183 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 8184 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, 8185 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, 8186 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, 8187 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8188 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8189 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8190 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8191 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8192 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8193 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8194 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8195 }; 8196 8197 // A legal value of base64 code is in range [0, 127]. We need two lookups 8198 // with tbl/tbx and combine them to get the decode data. The 1st table vector 8199 // lookup use tbl, out of range indices are set to 0 in destination. 
The 2nd 8200 // table vector lookup use tbx, out of range indices are unchanged in 8201 // destination. Input [64..126] is mapped to index [65, 127] in second lookup. 8202 // The value of index 64 is set to 0, so that we know that we already get the 8203 // decoded data with the 1st lookup. 8204 static const uint8_t fromBase64ForSIMD[128] = { 8205 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8206 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8207 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, 8208 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8209 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 8210 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 8211 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 8212 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 8213 }; 8214 8215 static const uint8_t fromBase64URLForSIMD[128] = { 8216 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8217 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 8218 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 8219 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, 8220 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 8221 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 8222 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 8223 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 8224 }; 8225 8226 __ align(CodeEntryAlignment); 8227 StubGenStubId stub_id = StubGenStubId::base64_decodeBlock_id; 8228 StubCodeMark mark(this, stub_id); 8229 address start = __ pc(); 8230 8231 Register src = c_rarg0; // source array 8232 Register soff = c_rarg1; // source start offset 8233 Register send = c_rarg2; // source end offset 8234 Register dst = c_rarg3; // dest array 8235 Register doff = c_rarg4; // position for writing to dest array 8236 Register isURL = c_rarg5; // Base64 or URL character set 8237 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation 8238 8239 Register length = send; // reuse send as length of source data to process 8240 8241 Register simd_codec = c_rarg6; 8242 Register nosimd_codec = c_rarg7; 8243 8244 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit; 8245 8246 __ enter(); 8247 8248 __ add(src, src, soff); 8249 __ add(dst, dst, doff); 8250 8251 __ mov(doff, dst); 8252 8253 __ sub(length, send, soff); 8254 __ bfm(length, zr, 0, 1); 8255 8256 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD)); 8257 __ cbz(isURL, ProcessData); 8258 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD)); 8259 8260 __ BIND(ProcessData); 8261 __ mov(rscratch1, length); 8262 __ cmp(length, (u1)144); // 144 = 80 + 64 8263 __ br(Assembler::LT, Process4B); 8264 8265 // In the MIME case, the line length cannot be more than 76 8266 // bytes (see RFC 2045). This is too short a block for SIMD 8267 // to be worthwhile, so we use non-SIMD here. 
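    // The Process4B loop below is the non-SIMD path: each iteration turns four
    // 6-bit codes into three output bytes. Approximately, in C (a sketch only;
    // the stub assembles the same bytes with bitfield insert/extract instructions,
    // and src/dst are uint8_t pointers):
    //
    //   uint32_t c0 = codec[src[0]], c1 = codec[src[1]],
    //            c2 = codec[src[2]], c3 = codec[src[3]];
    //   if ((c0 | c1 | c2 | c3) & 0x80) goto Exit;      // 255u marks illegal input
    //   uint32_t bits = (c0 << 18) | (c1 << 12) | (c2 << 6) | c3;
    //   dst[0] = bits >> 16; dst[1] = bits >> 8; dst[2] = bits;
    //   src += 4; dst += 3;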
8268 __ movw(rscratch1, 79); 8269 8270 __ BIND(Process4B); 8271 __ ldrw(r14, __ post(src, 4)); 8272 __ ubfxw(r10, r14, 0, 8); 8273 __ ubfxw(r11, r14, 8, 8); 8274 __ ubfxw(r12, r14, 16, 8); 8275 __ ubfxw(r13, r14, 24, 8); 8276 // get the de-code 8277 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0))); 8278 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0))); 8279 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0))); 8280 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0))); 8281 // error detection, 255u indicates an illegal input 8282 __ orrw(r14, r10, r11); 8283 __ orrw(r15, r12, r13); 8284 __ orrw(r14, r14, r15); 8285 __ tbnz(r14, 7, Exit); 8286 // recover the data 8287 __ lslw(r14, r10, 10); 8288 __ bfiw(r14, r11, 4, 6); 8289 __ bfmw(r14, r12, 2, 5); 8290 __ rev16w(r14, r14); 8291 __ bfiw(r13, r12, 6, 2); 8292 __ strh(r14, __ post(dst, 2)); 8293 __ strb(r13, __ post(dst, 1)); 8294 // non-simd loop 8295 __ subsw(rscratch1, rscratch1, 4); 8296 __ br(Assembler::GT, Process4B); 8297 8298 // if exiting from PreProcess80B, rscratch1 == -1; 8299 // otherwise, rscratch1 == 0. 8300 __ cbzw(rscratch1, Exit); 8301 __ sub(length, length, 80); 8302 8303 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD)); 8304 __ cbz(isURL, SIMDEnter); 8305 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD)); 8306 8307 __ BIND(SIMDEnter); 8308 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64)); 8309 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec)); 8310 __ mov(rscratch1, 63); 8311 __ dup(v27, __ T16B, rscratch1); 8312 8313 __ BIND(Process64B); 8314 __ cmp(length, (u1)64); 8315 __ br(Assembler::LT, Process32B); 8316 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit); 8317 __ sub(length, length, 64); 8318 __ b(Process64B); 8319 8320 __ BIND(Process32B); 8321 __ cmp(length, (u1)32); 8322 __ br(Assembler::LT, SIMDExit); 8323 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit); 8324 __ sub(length, length, 32); 8325 __ b(Process32B); 8326 8327 __ BIND(SIMDExit); 8328 __ cbz(length, Exit); 8329 __ movw(rscratch1, length); 8330 __ b(Process4B); 8331 8332 __ BIND(Exit); 8333 __ sub(c_rarg0, dst, doff); 8334 8335 __ leave(); 8336 __ ret(lr); 8337 8338 return start; 8339 } 8340 8341 // Support for spin waits. 8342 address generate_spin_wait() { 8343 __ align(CodeEntryAlignment); 8344 StubGenStubId stub_id = StubGenStubId::spin_wait_id; 8345 StubCodeMark mark(this, stub_id); 8346 address start = __ pc(); 8347 8348 __ spin_wait(); 8349 __ ret(lr); 8350 8351 return start; 8352 } 8353 8354 void generate_lookup_secondary_supers_table_stub() { 8355 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_id; 8356 StubCodeMark mark(this, stub_id); 8357 8358 const Register 8359 r_super_klass = r0, 8360 r_array_base = r1, 8361 r_array_length = r2, 8362 r_array_index = r3, 8363 r_sub_klass = r4, 8364 r_bitmap = rscratch2, 8365 result = r5; 8366 const FloatRegister 8367 vtemp = v0; 8368 8369 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 8370 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc(); 8371 Label L_success; 8372 __ enter(); 8373 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 8374 r_array_base, r_array_length, r_array_index, 8375 vtemp, result, slot, 8376 /*stub_is_near*/true); 8377 __ leave(); 8378 __ ret(lr); 8379 } 8380 } 8381 8382 // Slow path implementation for UseSecondarySupersTable. 
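  // Rough C sketch of the idea behind these lookup stubs (simplified; the
  // names used here are illustrative, not real fields -- see
  // MacroAssembler::lookup_secondary_supers_table* for the actual logic):
  //
  //   bool is_secondary_super(Klass* sub, Klass* super) {
  //     uint64_t bitmap = sub->secondary_supers_bitmap();
  //     int slot = super->hash_slot();              // one of 64 possible slots
  //     if (((bitmap >> slot) & 1) == 0) {
  //       return false;                             // fast negative answer
  //     }
  //     // A set bit only says "maybe": probe the packed secondary-supers
  //     // array; the slow path handles the (rare) hash-collision case.
  //     ...
  //   }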
8383 address generate_lookup_secondary_supers_table_slow_path_stub() { 8384 StubGenStubId stub_id = StubGenStubId::lookup_secondary_supers_table_slow_path_id; 8385 StubCodeMark mark(this, stub_id); 8386 8387 address start = __ pc(); 8388 const Register 8389 r_super_klass = r0, // argument 8390 r_array_base = r1, // argument 8391 temp1 = r2, // temp 8392 r_array_index = r3, // argument 8393 r_bitmap = rscratch2, // argument 8394 result = r5; // argument 8395 8396 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 8397 __ ret(lr); 8398 8399 return start; 8400 } 8401 8402 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8403 8404 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 8405 // 8406 // If LSE is in use, generate LSE versions of all the stubs. The 8407 // non-LSE versions are in atomic_aarch64.S. 8408 8409 // class AtomicStubMark records the entry point of a stub and the 8410 // stub pointer which will point to it. The stub pointer is set to 8411 // the entry point when ~AtomicStubMark() is called, which must be 8412 // after ICache::invalidate_range. This ensures safe publication of 8413 // the generated code. 8414 class AtomicStubMark { 8415 address _entry_point; 8416 aarch64_atomic_stub_t *_stub; 8417 MacroAssembler *_masm; 8418 public: 8419 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 8420 _masm = masm; 8421 __ align(32); 8422 _entry_point = __ pc(); 8423 _stub = stub; 8424 } 8425 ~AtomicStubMark() { 8426 *_stub = (aarch64_atomic_stub_t)_entry_point; 8427 } 8428 }; 8429 8430 // NB: For memory_order_conservative we need a trailing membar after 8431 // LSE atomic operations but not a leading membar. 8432 // 8433 // We don't need a leading membar because a clause in the Arm ARM 8434 // says: 8435 // 8436 // Barrier-ordered-before 8437 // 8438 // Barrier instructions order prior Memory effects before subsequent 8439 // Memory effects generated by the same Observer. A read or a write 8440 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 8441 // Observer if and only if RW1 appears in program order before RW 2 8442 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 8443 // instruction with both Acquire and Release semantics. 8444 // 8445 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 8446 // and Release semantics, therefore we don't need a leading 8447 // barrier. However, there is no corresponding Barrier-ordered-after 8448 // relationship, therefore we need a trailing membar to prevent a 8449 // later store or load from being reordered with the store in an 8450 // atomic instruction. 8451 // 8452 // This was checked by using the herd7 consistency model simulator 8453 // (http://diy.inria.fr/) with this test case: 8454 // 8455 // AArch64 LseCas 8456 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } 8457 // P0 | P1; 8458 // LDR W4, [X2] | MOV W3, #0; 8459 // DMB LD | MOV W4, #1; 8460 // LDR W3, [X1] | CASAL W3, W4, [X1]; 8461 // | DMB ISH; 8462 // | STR W4, [X2]; 8463 // exists 8464 // (0:X3=0 /\ 0:X4=1) 8465 // 8466 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered 8467 // with the store to x in P1. Without the DMB in P1 this may happen. 8468 // 8469 // At the time of writing we don't know of any AArch64 hardware that 8470 // reorders stores in this way, but the Reference Manual permits it. 
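  // In C-like terms, the conservative CAS entries generated below behave as
  // follows (illustrative sketch; CASAL and full_fence stand for the emitted
  // casal instruction and trailing dmb, they are not real functions):
  //
  //   uint64_t cmpxchg_conservative(uint64_t* ptr, uint64_t compare, uint64_t exchange) {
  //     uint64_t prev = CASAL(ptr, compare, exchange);  // acquire + release LSE CAS
  //     full_fence();                                   // trailing StoreStore|StoreLoad membar
  //     return prev;                                    // old value, as Atomic::PlatformCmpxchg expects
  //   }
  //
  // The relaxed and release variants simply use a weaker CAS and omit the
  // trailing fence.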
8471 8472 void gen_cas_entry(Assembler::operand_size size, 8473 atomic_memory_order order) { 8474 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1, 8475 exchange_val = c_rarg2; 8476 bool acquire, release; 8477 switch (order) { 8478 case memory_order_relaxed: 8479 acquire = false; 8480 release = false; 8481 break; 8482 case memory_order_release: 8483 acquire = false; 8484 release = true; 8485 break; 8486 default: 8487 acquire = true; 8488 release = true; 8489 break; 8490 } 8491 __ mov(prev, compare_val); 8492 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true); 8493 if (order == memory_order_conservative) { 8494 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8495 } 8496 if (size == Assembler::xword) { 8497 __ mov(r0, prev); 8498 } else { 8499 __ movw(r0, prev); 8500 } 8501 __ ret(lr); 8502 } 8503 8504 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) { 8505 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 8506 // If not relaxed, then default to conservative. Relaxed is the only 8507 // case we use enough to be worth specializing. 8508 if (order == memory_order_relaxed) { 8509 __ ldadd(size, incr, prev, addr); 8510 } else { 8511 __ ldaddal(size, incr, prev, addr); 8512 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8513 } 8514 if (size == Assembler::xword) { 8515 __ mov(r0, prev); 8516 } else { 8517 __ movw(r0, prev); 8518 } 8519 __ ret(lr); 8520 } 8521 8522 void gen_swpal_entry(Assembler::operand_size size) { 8523 Register prev = r2, addr = c_rarg0, incr = c_rarg1; 8524 __ swpal(size, incr, prev, addr); 8525 __ membar(Assembler::StoreStore|Assembler::StoreLoad); 8526 if (size == Assembler::xword) { 8527 __ mov(r0, prev); 8528 } else { 8529 __ movw(r0, prev); 8530 } 8531 __ ret(lr); 8532 } 8533 8534 void generate_atomic_entry_points() { 8535 if (! 
UseLSE) { 8536 return; 8537 } 8538 __ align(CodeEntryAlignment); 8539 StubGenStubId stub_id = StubGenStubId::atomic_entry_points_id; 8540 StubCodeMark mark(this, stub_id); 8541 address first_entry = __ pc(); 8542 8543 // ADD, memory_order_conservative 8544 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl); 8545 gen_ldadd_entry(Assembler::word, memory_order_conservative); 8546 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl); 8547 gen_ldadd_entry(Assembler::xword, memory_order_conservative); 8548 8549 // ADD, memory_order_relaxed 8550 AtomicStubMark mark_fetch_add_4_relaxed 8551 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl); 8552 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed); 8553 AtomicStubMark mark_fetch_add_8_relaxed 8554 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl); 8555 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed); 8556 8557 // XCHG, memory_order_conservative 8558 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl); 8559 gen_swpal_entry(Assembler::word); 8560 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl); 8561 gen_swpal_entry(Assembler::xword); 8562 8563 // CAS, memory_order_conservative 8564 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl); 8565 gen_cas_entry(MacroAssembler::byte, memory_order_conservative); 8566 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl); 8567 gen_cas_entry(MacroAssembler::word, memory_order_conservative); 8568 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl); 8569 gen_cas_entry(MacroAssembler::xword, memory_order_conservative); 8570 8571 // CAS, memory_order_relaxed 8572 AtomicStubMark mark_cmpxchg_1_relaxed 8573 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl); 8574 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed); 8575 AtomicStubMark mark_cmpxchg_4_relaxed 8576 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl); 8577 gen_cas_entry(MacroAssembler::word, memory_order_relaxed); 8578 AtomicStubMark mark_cmpxchg_8_relaxed 8579 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl); 8580 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed); 8581 8582 AtomicStubMark mark_cmpxchg_4_release 8583 (_masm, &aarch64_atomic_cmpxchg_4_release_impl); 8584 gen_cas_entry(MacroAssembler::word, memory_order_release); 8585 AtomicStubMark mark_cmpxchg_8_release 8586 (_masm, &aarch64_atomic_cmpxchg_8_release_impl); 8587 gen_cas_entry(MacroAssembler::xword, memory_order_release); 8588 8589 AtomicStubMark mark_cmpxchg_4_seq_cst 8590 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl); 8591 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst); 8592 AtomicStubMark mark_cmpxchg_8_seq_cst 8593 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl); 8594 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst); 8595 8596 ICache::invalidate_range(first_entry, __ pc() - first_entry); 8597 } 8598 #endif // LINUX 8599 8600 address generate_cont_thaw(Continuation::thaw_kind kind) { 8601 bool return_barrier = Continuation::is_thaw_return_barrier(kind); 8602 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind); 8603 8604 address start = __ pc(); 8605 8606 if (return_barrier) { 8607 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())); 8608 __ mov(sp, rscratch1); 8609 } 8610 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 8611 8612 if (return_barrier) { 8613 // preserve 
possible return value from a method returning to the return barrier 8614 __ fmovd(rscratch1, v0); 8615 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 8616 } 8617 8618 __ movw(c_rarg1, (return_barrier ? 1 : 0)); 8619 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1); 8620 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames 8621 8622 if (return_barrier) { 8623 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 8624 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 8625 __ fmovd(v0, rscratch1); 8626 } 8627 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp"); 8628 8629 8630 Label thaw_success; 8631 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames 8632 __ cbnz(rscratch2, thaw_success); 8633 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry())); 8634 __ br(rscratch1); 8635 __ bind(thaw_success); 8636 8637 // make room for the thawed frames 8638 __ sub(rscratch1, sp, rscratch2); 8639 __ andr(rscratch1, rscratch1, -16); // align 8640 __ mov(sp, rscratch1); 8641 8642 if (return_barrier) { 8643 // save original return value -- again 8644 __ fmovd(rscratch1, v0); 8645 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize))); 8646 } 8647 8648 // If we want, we can templatize thaw by kind, and have three different entries 8649 __ movw(c_rarg1, (uint32_t)kind); 8650 8651 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1); 8652 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame 8653 8654 if (return_barrier) { 8655 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK) 8656 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize))); 8657 __ fmovd(v0, rscratch1); 8658 } else { 8659 __ mov(r0, zr); // return 0 (success) from doYield 8660 } 8661 8662 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down) 8663 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill 8664 __ mov(rfp, sp); 8665 8666 if (return_barrier_exception) { 8667 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address 8668 __ authenticate_return_address(c_rarg1); 8669 __ verify_oop(r0); 8670 // save return value containing the exception oop in callee-saved R19 8671 __ mov(r19, r0); 8672 8673 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); 8674 8675 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. 
8676 // __ reinitialize_ptrue(); 8677 8678 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc 8679 8680 __ mov(r1, r0); // the exception handler 8681 __ mov(r0, r19); // restore return value containing the exception oop 8682 __ verify_oop(r0); 8683 8684 __ leave(); 8685 __ mov(r3, lr); 8686 __ br(r1); // the exception handler 8687 } else { 8688 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame 8689 __ leave(); 8690 __ ret(lr); 8691 } 8692 8693 return start; 8694 } 8695 8696 address generate_cont_thaw() { 8697 if (!Continuations::enabled()) return nullptr; 8698 8699 StubGenStubId stub_id = StubGenStubId::cont_thaw_id; 8700 StubCodeMark mark(this, stub_id); 8701 address start = __ pc(); 8702 generate_cont_thaw(Continuation::thaw_top); 8703 return start; 8704 } 8705 8706 address generate_cont_returnBarrier() { 8707 if (!Continuations::enabled()) return nullptr; 8708 8709 // TODO: will probably need multiple return barriers depending on return type 8710 StubGenStubId stub_id = StubGenStubId::cont_returnBarrier_id; 8711 StubCodeMark mark(this, stub_id); 8712 address start = __ pc(); 8713 8714 generate_cont_thaw(Continuation::thaw_return_barrier); 8715 8716 return start; 8717 } 8718 8719 address generate_cont_returnBarrier_exception() { 8720 if (!Continuations::enabled()) return nullptr; 8721 8722 StubGenStubId stub_id = StubGenStubId::cont_returnBarrierExc_id; 8723 StubCodeMark mark(this, stub_id); 8724 address start = __ pc(); 8725 8726 generate_cont_thaw(Continuation::thaw_return_barrier_exception); 8727 8728 return start; 8729 } 8730 8731 address generate_cont_preempt_stub() { 8732 if (!Continuations::enabled()) return nullptr; 8733 StubGenStubId stub_id = StubGenStubId::cont_preempt_id; 8734 StubCodeMark mark(this, stub_id); 8735 address start = __ pc(); 8736 8737 __ reset_last_Java_frame(true); 8738 8739 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap. 8740 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset())); 8741 __ mov(sp, rscratch2); 8742 8743 Label preemption_cancelled; 8744 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset())); 8745 __ cbnz(rscratch1, preemption_cancelled); 8746 8747 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount. 8748 SharedRuntime::continuation_enter_cleanup(_masm); 8749 __ leave(); 8750 __ ret(lr); 8751 8752 // We acquired the monitor after freezing the frames so call thaw to continue execution. 8753 __ bind(preemption_cancelled); 8754 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset())); 8755 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size()))); 8756 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address())); 8757 __ ldr(rscratch1, Address(rscratch1)); 8758 __ br(rscratch1); 8759 8760 return start; 8761 } 8762 8763 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers 8764 // are represented as long[5], with BITS_PER_LIMB = 26. 8765 // Pack five 26-bit limbs into three 64-bit registers. 
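  // In C, approximately (a sketch of what pack_26 computes; the bit positions
  // follow from BITS_PER_LIMB = 26, so 5 * 26 = 130 = 64 + 64 + 2 bits):
  //
  //   dest0 =  limbs[0] | (limbs[1] << 26) | (limbs[2] << 52);          // low 64 bits
  //   dest1 = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40);   // next 64 bits
  //   dest2 =  limbs[4] >> 24;                                          // top 2 bits (or asserted zero)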
8766 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 8767 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 8768 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 8769 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 8770 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 8771 8772 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 8773 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 8774 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 8775 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 8776 8777 if (dest2->is_valid()) { 8778 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 8779 } else { 8780 #ifdef ASSERT 8781 Label OK; 8782 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 8783 __ br(__ EQ, OK); 8784 __ stop("high bits of Poly1305 integer should be zero"); 8785 __ should_not_reach_here(); 8786 __ bind(OK); 8787 #endif 8788 } 8789 } 8790 8791 // As above, but return only a 128-bit integer, packed into two 8792 // 64-bit registers. 8793 void pack_26(Register dest0, Register dest1, Register src) { 8794 pack_26(dest0, dest1, noreg, src); 8795 } 8796 8797 // Multiply and multiply-accumulate unsigned 64-bit registers. 8798 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 8799 __ mul(prod_lo, n, m); 8800 __ umulh(prod_hi, n, m); 8801 } 8802 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 8803 wide_mul(rscratch1, rscratch2, n, m); 8804 __ adds(sum_lo, sum_lo, rscratch1); 8805 __ adc(sum_hi, sum_hi, rscratch2); 8806 } 8807 8808 // Poly1305, RFC 7539 8809 8810 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 8811 // description of the tricks used to simplify and accelerate this 8812 // computation. 8813 8814 address generate_poly1305_processBlocks() { 8815 __ align(CodeEntryAlignment); 8816 StubGenStubId stub_id = StubGenStubId::poly1305_processBlocks_id; 8817 StubCodeMark mark(this, stub_id); 8818 address start = __ pc(); 8819 Label here; 8820 __ enter(); 8821 RegSet callee_saved = RegSet::range(r19, r28); 8822 __ push(callee_saved, sp); 8823 8824 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 8825 8826 // Arguments 8827 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 8828 8829 // R_n is the 128-bit randomly-generated key, packed into two 8830 // registers. The caller passes this key to us as long[5], with 8831 // BITS_PER_LIMB = 26. 
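    // For orientation, the per-block update implemented by the LOOP further
    // below is, mathematically (see RFC 7539; this is a description of the
    // computation, not generated code):
    //
    //   S = U + block + 2^128          // the appended "high bit" of each block
    //   U = (S * R) mod (2^130 - 5)    // only partially reduced inside the loop
    //
    // The RR_n = (R_n >> 2) * 5 values computed below exploit the fact that
    // 2^130 == 5 (mod 2^130 - 5), which lets the high partial products be
    // folded back into the low limbs cheaply.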
8832 const Register R_0 = *++regs, R_1 = *++regs; 8833 pack_26(R_0, R_1, r_start); 8834 8835 // RR_n is (R_n >> 2) * 5 8836 const Register RR_0 = *++regs, RR_1 = *++regs; 8837 __ lsr(RR_0, R_0, 2); 8838 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 8839 __ lsr(RR_1, R_1, 2); 8840 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 8841 8842 // U_n is the current checksum 8843 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 8844 pack_26(U_0, U_1, U_2, acc_start); 8845 8846 static constexpr int BLOCK_LENGTH = 16; 8847 Label DONE, LOOP; 8848 8849 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 8850 __ br(Assembler::LT, DONE); { 8851 __ bind(LOOP); 8852 8853 // S_n is to be the sum of U_n and the next block of data 8854 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 8855 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 8856 __ adds(S_0, U_0, S_0); 8857 __ adcs(S_1, U_1, S_1); 8858 __ adc(S_2, U_2, zr); 8859 __ add(S_2, S_2, 1); 8860 8861 const Register U_0HI = *++regs, U_1HI = *++regs; 8862 8863 // NB: this logic depends on some of the special properties of 8864 // Poly1305 keys. In particular, because we know that the top 8865 // four bits of R_0 and R_1 are zero, we can add together 8866 // partial products without any risk of needing to propagate a 8867 // carry out. 8868 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 8869 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 8870 __ andr(U_2, R_0, 3); 8871 __ mul(U_2, S_2, U_2); 8872 8873 // Recycle registers S_0, S_1, S_2 8874 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 8875 8876 // Partial reduction mod 2**130 - 5 8877 __ adds(U_1, U_0HI, U_1); 8878 __ adc(U_2, U_1HI, U_2); 8879 // Sum now in U_2:U_1:U_0. 8880 // Dead: U_0HI, U_1HI. 8881 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 8882 8883 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 8884 8885 // First, U_2:U_1:U_0 += (U_2 >> 2) 8886 __ lsr(rscratch1, U_2, 2); 8887 __ andr(U_2, U_2, (u8)3); 8888 __ adds(U_0, U_0, rscratch1); 8889 __ adcs(U_1, U_1, zr); 8890 __ adc(U_2, U_2, zr); 8891 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 8892 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 8893 __ adcs(U_1, U_1, zr); 8894 __ adc(U_2, U_2, zr); 8895 8896 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 8897 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 8898 __ br(~ Assembler::LT, LOOP); 8899 } 8900 8901 // Further reduce modulo 2^130 - 5 8902 __ lsr(rscratch1, U_2, 2); 8903 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 8904 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 8905 __ adcs(U_1, U_1, zr); 8906 __ andr(U_2, U_2, (u1)3); 8907 __ adc(U_2, U_2, zr); 8908 8909 // Unpack the sum into five 26-bit limbs and write to memory. 
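    // In C, approximately (a sketch; this is the inverse of pack_26 above):
    //
    //   acc[0] =  U_0        & 0x3ffffff;               // bits   0..25
    //   acc[1] = (U_0 >> 26) & 0x3ffffff;               // bits  26..51
    //   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12);  // bits  52..77
    //   acc[3] = (U_1 >> 14) & 0x3ffffff;               // bits  78..103
    //   acc[4] = (U_1 >> 40) | ((U_2 & 0x7)  << 24);    // bits 104..129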
8910 __ ubfiz(rscratch1, U_0, 0, 26); 8911 __ ubfx(rscratch2, U_0, 26, 26); 8912 __ stp(rscratch1, rscratch2, Address(acc_start)); 8913 __ ubfx(rscratch1, U_0, 52, 12); 8914 __ bfi(rscratch1, U_1, 12, 14); 8915 __ ubfx(rscratch2, U_1, 14, 26); 8916 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 8917 __ ubfx(rscratch1, U_1, 40, 24); 8918 __ bfi(rscratch1, U_2, 24, 3); 8919 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 8920 8921 __ bind(DONE); 8922 __ pop(callee_saved, sp); 8923 __ leave(); 8924 __ ret(lr); 8925 8926 return start; 8927 } 8928 8929 // exception handler for upcall stubs 8930 address generate_upcall_stub_exception_handler() { 8931 StubGenStubId stub_id = StubGenStubId::upcall_stub_exception_handler_id; 8932 StubCodeMark mark(this, stub_id); 8933 address start = __ pc(); 8934 8935 // Native caller has no idea how to handle exceptions, 8936 // so we just crash here. Up to callee to catch exceptions. 8937 __ verify_oop(r0); 8938 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 8939 __ blr(rscratch1); 8940 __ should_not_reach_here(); 8941 8942 return start; 8943 } 8944 8945 // load Method* target of MethodHandle 8946 // j_rarg0 = jobject receiver 8947 // rmethod = result 8948 address generate_upcall_stub_load_target() { 8949 StubGenStubId stub_id = StubGenStubId::upcall_stub_load_target_id; 8950 StubCodeMark mark(this, stub_id); 8951 address start = __ pc(); 8952 8953 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 8954 // Load target method from receiver 8955 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 8956 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 8957 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 8958 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 8959 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 8960 noreg, noreg); 8961 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 8962 8963 __ ret(lr); 8964 8965 return start; 8966 } 8967 8968 #undef __ 8969 #define __ masm-> 8970 8971 class MontgomeryMultiplyGenerator : public MacroAssembler { 8972 8973 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 8974 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 8975 8976 RegSet _toSave; 8977 bool _squaring; 8978 8979 public: 8980 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 8981 : MacroAssembler(as->code()), _squaring(squaring) { 8982 8983 // Register allocation 8984 8985 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 8986 Pa_base = *regs; // Argument registers 8987 if (squaring) 8988 Pb_base = Pa_base; 8989 else 8990 Pb_base = *++regs; 8991 Pn_base = *++regs; 8992 Rlen= *++regs; 8993 inv = *++regs; 8994 Pm_base = *++regs; 8995 8996 // Working registers: 8997 Ra = *++regs; // The current digit of a, b, n, and m. 8998 Rb = *++regs; 8999 Rm = *++regs; 9000 Rn = *++regs; 9001 9002 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 9003 Pb = *++regs; 9004 Pm = *++regs; 9005 Pn = *++regs; 9006 9007 t0 = *++regs; // Three registers which form a 9008 t1 = *++regs; // triple-precision accumuator. 9009 t2 = *++regs; 9010 9011 Ri = *++regs; // Inner and outer loop indexes. 
9012 Rj = *++regs; 9013 9014 Rhi_ab = *++regs; // Product registers: low and high parts 9015 Rlo_ab = *++regs; // of a*b and m*n. 9016 Rhi_mn = *++regs; 9017 Rlo_mn = *++regs; 9018 9019 // r19 and up are callee-saved. 9020 _toSave = RegSet::range(r19, *regs) + Pm_base; 9021 } 9022 9023 private: 9024 void save_regs() { 9025 push(_toSave, sp); 9026 } 9027 9028 void restore_regs() { 9029 pop(_toSave, sp); 9030 } 9031 9032 template <typename T> 9033 void unroll_2(Register count, T block) { 9034 Label loop, end, odd; 9035 tbnz(count, 0, odd); 9036 cbz(count, end); 9037 align(16); 9038 bind(loop); 9039 (this->*block)(); 9040 bind(odd); 9041 (this->*block)(); 9042 subs(count, count, 2); 9043 br(Assembler::GT, loop); 9044 bind(end); 9045 } 9046 9047 template <typename T> 9048 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 9049 Label loop, end, odd; 9050 tbnz(count, 0, odd); 9051 cbz(count, end); 9052 align(16); 9053 bind(loop); 9054 (this->*block)(d, s, tmp); 9055 bind(odd); 9056 (this->*block)(d, s, tmp); 9057 subs(count, count, 2); 9058 br(Assembler::GT, loop); 9059 bind(end); 9060 } 9061 9062 void pre1(RegisterOrConstant i) { 9063 block_comment("pre1"); 9064 // Pa = Pa_base; 9065 // Pb = Pb_base + i; 9066 // Pm = Pm_base; 9067 // Pn = Pn_base + i; 9068 // Ra = *Pa; 9069 // Rb = *Pb; 9070 // Rm = *Pm; 9071 // Rn = *Pn; 9072 ldr(Ra, Address(Pa_base)); 9073 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 9074 ldr(Rm, Address(Pm_base)); 9075 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9076 lea(Pa, Address(Pa_base)); 9077 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 9078 lea(Pm, Address(Pm_base)); 9079 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9080 9081 // Zero the m*n result. 9082 mov(Rhi_mn, zr); 9083 mov(Rlo_mn, zr); 9084 } 9085 9086 // The core multiply-accumulate step of a Montgomery 9087 // multiplication. The idea is to schedule operations as a 9088 // pipeline so that instructions with long latencies (loads and 9089 // multiplies) have time to complete before their results are 9090 // used. This most benefits in-order implementations of the 9091 // architecture but out-of-order ones also benefit. 9092 void step() { 9093 block_comment("step"); 9094 // MACC(Ra, Rb, t0, t1, t2); 9095 // Ra = *++Pa; 9096 // Rb = *--Pb; 9097 umulh(Rhi_ab, Ra, Rb); 9098 mul(Rlo_ab, Ra, Rb); 9099 ldr(Ra, pre(Pa, wordSize)); 9100 ldr(Rb, pre(Pb, -wordSize)); 9101 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 9102 // previous iteration. 
9103 // MACC(Rm, Rn, t0, t1, t2); 9104 // Rm = *++Pm; 9105 // Rn = *--Pn; 9106 umulh(Rhi_mn, Rm, Rn); 9107 mul(Rlo_mn, Rm, Rn); 9108 ldr(Rm, pre(Pm, wordSize)); 9109 ldr(Rn, pre(Pn, -wordSize)); 9110 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9111 } 9112 9113 void post1() { 9114 block_comment("post1"); 9115 9116 // MACC(Ra, Rb, t0, t1, t2); 9117 // Ra = *++Pa; 9118 // Rb = *--Pb; 9119 umulh(Rhi_ab, Ra, Rb); 9120 mul(Rlo_ab, Ra, Rb); 9121 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9122 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9123 9124 // *Pm = Rm = t0 * inv; 9125 mul(Rm, t0, inv); 9126 str(Rm, Address(Pm)); 9127 9128 // MACC(Rm, Rn, t0, t1, t2); 9129 // t0 = t1; t1 = t2; t2 = 0; 9130 umulh(Rhi_mn, Rm, Rn); 9131 9132 #ifndef PRODUCT 9133 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 9134 { 9135 mul(Rlo_mn, Rm, Rn); 9136 add(Rlo_mn, t0, Rlo_mn); 9137 Label ok; 9138 cbz(Rlo_mn, ok); { 9139 stop("broken Montgomery multiply"); 9140 } bind(ok); 9141 } 9142 #endif 9143 // We have very carefully set things up so that 9144 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 9145 // the lower half of Rm * Rn because we know the result already: 9146 // it must be -t0. t0 + (-t0) must generate a carry iff 9147 // t0 != 0. So, rather than do a mul and an adds we just set 9148 // the carry flag iff t0 is nonzero. 9149 // 9150 // mul(Rlo_mn, Rm, Rn); 9151 // adds(zr, t0, Rlo_mn); 9152 subs(zr, t0, 1); // Set carry iff t0 is nonzero 9153 adcs(t0, t1, Rhi_mn); 9154 adc(t1, t2, zr); 9155 mov(t2, zr); 9156 } 9157 9158 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 9159 block_comment("pre2"); 9160 // Pa = Pa_base + i-len; 9161 // Pb = Pb_base + len; 9162 // Pm = Pm_base + i-len; 9163 // Pn = Pn_base + len; 9164 9165 if (i.is_register()) { 9166 sub(Rj, i.as_register(), len); 9167 } else { 9168 mov(Rj, i.as_constant()); 9169 sub(Rj, Rj, len); 9170 } 9171 // Rj == i-len 9172 9173 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 9174 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 9175 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 9176 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 9177 9178 // Ra = *++Pa; 9179 // Rb = *--Pb; 9180 // Rm = *++Pm; 9181 // Rn = *--Pn; 9182 ldr(Ra, pre(Pa, wordSize)); 9183 ldr(Rb, pre(Pb, -wordSize)); 9184 ldr(Rm, pre(Pm, wordSize)); 9185 ldr(Rn, pre(Pn, -wordSize)); 9186 9187 mov(Rhi_mn, zr); 9188 mov(Rlo_mn, zr); 9189 } 9190 9191 void post2(RegisterOrConstant i, RegisterOrConstant len) { 9192 block_comment("post2"); 9193 if (i.is_constant()) { 9194 mov(Rj, i.as_constant()-len.as_constant()); 9195 } else { 9196 sub(Rj, i.as_register(), len); 9197 } 9198 9199 adds(t0, t0, Rlo_mn); // The pending m*n, low part 9200 9201 // As soon as we know the least significant digit of our result, 9202 // store it. 9203 // Pm_base[i-len] = t0; 9204 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 9205 9206 // t0 = t1; t1 = t2; t2 = 0; 9207 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 9208 adc(t1, t2, zr); 9209 mov(t2, zr); 9210 } 9211 9212 // A carry in t0 after Montgomery multiplication means that we 9213 // should subtract multiples of n from our result in m. We'll 9214 // keep doing that until there is no carry. 
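  // In C, approximately (a sketch of the subtraction loop normalize() emits;
  // the generated code tracks the borrow in the carry flag instead):
  //
  //   while (t0 != 0) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {          // m -= n, propagating the borrow
  //       julong a = Pm_base[i], b = Pn_base[i];
  //       Pm_base[i] = a - b - borrow;
  //       borrow = (a < b) || (a == b && borrow);
  //     }
  //     t0 -= borrow;                            // i.e. t0 = sub(Pm_base, Pn_base, t0, len)
  //   }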
9215 void normalize(RegisterOrConstant len) { 9216 block_comment("normalize"); 9217 // while (t0) 9218 // t0 = sub(Pm_base, Pn_base, t0, len); 9219 Label loop, post, again; 9220 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 9221 cbz(t0, post); { 9222 bind(again); { 9223 mov(i, zr); 9224 mov(cnt, len); 9225 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9226 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9227 subs(zr, zr, zr); // set carry flag, i.e. no borrow 9228 align(16); 9229 bind(loop); { 9230 sbcs(Rm, Rm, Rn); 9231 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9232 add(i, i, 1); 9233 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 9234 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 9235 sub(cnt, cnt, 1); 9236 } cbnz(cnt, loop); 9237 sbc(t0, t0, zr); 9238 } cbnz(t0, again); 9239 } bind(post); 9240 } 9241 9242 // Move memory at s to d, reversing words. 9243 // Increments d to end of copied memory 9244 // Destroys tmp1, tmp2 9245 // Preserves len 9246 // Leaves s pointing to the address which was in d at start 9247 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 9248 assert(tmp1->encoding() < r19->encoding(), "register corruption"); 9249 assert(tmp2->encoding() < r19->encoding(), "register corruption"); 9250 9251 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 9252 mov(tmp1, len); 9253 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 9254 sub(s, d, len, ext::uxtw, LogBytesPerWord); 9255 } 9256 // where 9257 void reverse1(Register d, Register s, Register tmp) { 9258 ldr(tmp, pre(s, -wordSize)); 9259 ror(tmp, tmp, 32); 9260 str(tmp, post(d, wordSize)); 9261 } 9262 9263 void step_squaring() { 9264 // An extra ACC 9265 step(); 9266 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9267 } 9268 9269 void last_squaring(RegisterOrConstant i) { 9270 Label dont; 9271 // if ((i & 1) == 0) { 9272 tbnz(i.as_register(), 0, dont); { 9273 // MACC(Ra, Rb, t0, t1, t2); 9274 // Ra = *++Pa; 9275 // Rb = *--Pb; 9276 umulh(Rhi_ab, Ra, Rb); 9277 mul(Rlo_ab, Ra, Rb); 9278 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 9279 } bind(dont); 9280 } 9281 9282 void extra_step_squaring() { 9283 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9284 9285 // MACC(Rm, Rn, t0, t1, t2); 9286 // Rm = *++Pm; 9287 // Rn = *--Pn; 9288 umulh(Rhi_mn, Rm, Rn); 9289 mul(Rlo_mn, Rm, Rn); 9290 ldr(Rm, pre(Pm, wordSize)); 9291 ldr(Rn, pre(Pn, -wordSize)); 9292 } 9293 9294 void post1_squaring() { 9295 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 9296 9297 // *Pm = Rm = t0 * inv; 9298 mul(Rm, t0, inv); 9299 str(Rm, Address(Pm)); 9300 9301 // MACC(Rm, Rn, t0, t1, t2); 9302 // t0 = t1; t1 = t2; t2 = 0; 9303 umulh(Rhi_mn, Rm, Rn); 9304 9305 #ifndef PRODUCT 9306 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 9307 { 9308 mul(Rlo_mn, Rm, Rn); 9309 add(Rlo_mn, t0, Rlo_mn); 9310 Label ok; 9311 cbz(Rlo_mn, ok); { 9312 stop("broken Montgomery multiply"); 9313 } bind(ok); 9314 } 9315 #endif 9316 // We have very carefully set things up so that 9317 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 9318 // the lower half of Rm * Rn because we know the result already: 9319 // it must be -t0. t0 + (-t0) must generate a carry iff 9320 // t0 != 0. So, rather than do a mul and an adds we just set 9321 // the carry flag iff t0 is nonzero. 
9322 // 9323 // mul(Rlo_mn, Rm, Rn); 9324 // adds(zr, t0, Rlo_mn); 9325 subs(zr, t0, 1); // Set carry iff t0 is nonzero 9326 adcs(t0, t1, Rhi_mn); 9327 adc(t1, t2, zr); 9328 mov(t2, zr); 9329 } 9330 9331 void acc(Register Rhi, Register Rlo, 9332 Register t0, Register t1, Register t2) { 9333 adds(t0, t0, Rlo); 9334 adcs(t1, t1, Rhi); 9335 adc(t2, t2, zr); 9336 } 9337 9338 public: 9339 /** 9340 * Fast Montgomery multiplication. The derivation of the 9341 * algorithm is in A Cryptographic Library for the Motorola 9342 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 9343 * 9344 * Arguments: 9345 * 9346 * Inputs for multiplication: 9347 * c_rarg0 - int array elements a 9348 * c_rarg1 - int array elements b 9349 * c_rarg2 - int array elements n (the modulus) 9350 * c_rarg3 - int length 9351 * c_rarg4 - int inv 9352 * c_rarg5 - int array elements m (the result) 9353 * 9354 * Inputs for squaring: 9355 * c_rarg0 - int array elements a 9356 * c_rarg1 - int array elements n (the modulus) 9357 * c_rarg2 - int length 9358 * c_rarg3 - int inv 9359 * c_rarg4 - int array elements m (the result) 9360 * 9361 */ 9362 address generate_multiply() { 9363 Label argh, nothing; 9364 bind(argh); 9365 stop("MontgomeryMultiply total_allocation must be <= 8192"); 9366 9367 align(CodeEntryAlignment); 9368 address entry = pc(); 9369 9370 cbzw(Rlen, nothing); 9371 9372 enter(); 9373 9374 // Make room. 9375 cmpw(Rlen, 512); 9376 br(Assembler::HI, argh); 9377 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 9378 andr(sp, Ra, -2 * wordSize); 9379 9380 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 9381 9382 { 9383 // Copy input args, reversing as we go. We use Ra as a 9384 // temporary variable. 9385 reverse(Ra, Pa_base, Rlen, t0, t1); 9386 if (!_squaring) 9387 reverse(Ra, Pb_base, Rlen, t0, t1); 9388 reverse(Ra, Pn_base, Rlen, t0, t1); 9389 } 9390 9391 // Push all call-saved registers and also Pm_base which we'll need 9392 // at the end. 
9393 save_regs(); 9394 9395 #ifndef PRODUCT 9396 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 9397 { 9398 ldr(Rn, Address(Pn_base, 0)); 9399 mul(Rlo_mn, Rn, inv); 9400 subs(zr, Rlo_mn, -1); 9401 Label ok; 9402 br(EQ, ok); { 9403 stop("broken inverse in Montgomery multiply"); 9404 } bind(ok); 9405 } 9406 #endif 9407 9408 mov(Pm_base, Ra); 9409 9410 mov(t0, zr); 9411 mov(t1, zr); 9412 mov(t2, zr); 9413 9414 block_comment("for (int i = 0; i < len; i++) {"); 9415 mov(Ri, zr); { 9416 Label loop, end; 9417 cmpw(Ri, Rlen); 9418 br(Assembler::GE, end); 9419 9420 bind(loop); 9421 pre1(Ri); 9422 9423 block_comment(" for (j = i; j; j--) {"); { 9424 movw(Rj, Ri); 9425 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 9426 } block_comment(" } // j"); 9427 9428 post1(); 9429 addw(Ri, Ri, 1); 9430 cmpw(Ri, Rlen); 9431 br(Assembler::LT, loop); 9432 bind(end); 9433 block_comment("} // i"); 9434 } 9435 9436 block_comment("for (int i = len; i < 2*len; i++) {"); 9437 mov(Ri, Rlen); { 9438 Label loop, end; 9439 cmpw(Ri, Rlen, Assembler::LSL, 1); 9440 br(Assembler::GE, end); 9441 9442 bind(loop); 9443 pre2(Ri, Rlen); 9444 9445 block_comment(" for (j = len*2-i-1; j; j--) {"); { 9446 lslw(Rj, Rlen, 1); 9447 subw(Rj, Rj, Ri); 9448 subw(Rj, Rj, 1); 9449 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 9450 } block_comment(" } // j"); 9451 9452 post2(Ri, Rlen); 9453 addw(Ri, Ri, 1); 9454 cmpw(Ri, Rlen, Assembler::LSL, 1); 9455 br(Assembler::LT, loop); 9456 bind(end); 9457 } 9458 block_comment("} // i"); 9459 9460 normalize(Rlen); 9461 9462 mov(Ra, Pm_base); // Save Pm_base in Ra 9463 restore_regs(); // Restore caller's Pm_base 9464 9465 // Copy our result into caller's Pm_base 9466 reverse(Pm_base, Ra, Rlen, t0, t1); 9467 9468 leave(); 9469 bind(nothing); 9470 ret(lr); 9471 9472 return entry; 9473 } 9474 // In C, approximately: 9475 9476 // void 9477 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 9478 // julong Pn_base[], julong Pm_base[], 9479 // julong inv, int len) { 9480 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 9481 // julong *Pa, *Pb, *Pn, *Pm; 9482 // julong Ra, Rb, Rn, Rm; 9483 9484 // int i; 9485 9486 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 9487 9488 // for (i = 0; i < len; i++) { 9489 // int j; 9490 9491 // Pa = Pa_base; 9492 // Pb = Pb_base + i; 9493 // Pm = Pm_base; 9494 // Pn = Pn_base + i; 9495 9496 // Ra = *Pa; 9497 // Rb = *Pb; 9498 // Rm = *Pm; 9499 // Rn = *Pn; 9500 9501 // int iters = i; 9502 // for (j = 0; iters--; j++) { 9503 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 9504 // MACC(Ra, Rb, t0, t1, t2); 9505 // Ra = *++Pa; 9506 // Rb = *--Pb; 9507 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9508 // MACC(Rm, Rn, t0, t1, t2); 9509 // Rm = *++Pm; 9510 // Rn = *--Pn; 9511 // } 9512 9513 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 9514 // MACC(Ra, Rb, t0, t1, t2); 9515 // *Pm = Rm = t0 * inv; 9516 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 9517 // MACC(Rm, Rn, t0, t1, t2); 9518 9519 // assert(t0 == 0, "broken Montgomery multiply"); 9520 9521 // t0 = t1; t1 = t2; t2 = 0; 9522 // } 9523 9524 // for (i = len; i < 2*len; i++) { 9525 // int j; 9526 9527 // Pa = Pa_base + i-len; 9528 // Pb = Pb_base + len; 9529 // Pm = Pm_base + i-len; 9530 // Pn = Pn_base + len; 9531 9532 // Ra = *++Pa; 9533 // Rb = *--Pb; 9534 // Rm = *++Pm; 9535 // Rn = *--Pn; 9536 9537 // int iters = len*2-i-1; 9538 // for (j = i-len+1; iters--; j++) { 9539 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 9540 // MACC(Ra, Rb, t0, t1, t2); 9541 // Ra = *++Pa; 9542 // Rb = *--Pb; 9543 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9544 // MACC(Rm, Rn, t0, t1, t2); 9545 // Rm = *++Pm; 9546 // Rn = *--Pn; 9547 // } 9548 9549 // Pm_base[i-len] = t0; 9550 // t0 = t1; t1 = t2; t2 = 0; 9551 // } 9552 9553 // while (t0) 9554 // t0 = sub(Pm_base, Pn_base, t0, len); 9555 // } 9556 9557 /** 9558 * Fast Montgomery squaring. This uses asymptotically 25% fewer 9559 * multiplies than Montgomery multiplication so it should be up to 9560 * 25% faster. However, its loop control is more complex and it 9561 * may actually run slower on some machines. 9562 * 9563 * Arguments: 9564 * 9565 * Inputs: 9566 * c_rarg0 - int array elements a 9567 * c_rarg1 - int array elements n (the modulus) 9568 * c_rarg2 - int length 9569 * c_rarg3 - int inv 9570 * c_rarg4 - int array elements m (the result) 9571 * 9572 */ 9573 address generate_square() { 9574 Label argh; 9575 bind(argh); 9576 stop("MontgomeryMultiply total_allocation must be <= 8192"); 9577 9578 align(CodeEntryAlignment); 9579 address entry = pc(); 9580 9581 enter(); 9582 9583 // Make room. 9584 cmpw(Rlen, 512); 9585 br(Assembler::HI, argh); 9586 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 9587 andr(sp, Ra, -2 * wordSize); 9588 9589 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 9590 9591 { 9592 // Copy input args, reversing as we go. We use Ra as a 9593 // temporary variable. 9594 reverse(Ra, Pa_base, Rlen, t0, t1); 9595 reverse(Ra, Pn_base, Rlen, t0, t1); 9596 } 9597 9598 // Push all call-saved registers and also Pm_base which we'll need 9599 // at the end. 9600 save_regs(); 9601 9602 mov(Pm_base, Ra); 9603 9604 mov(t0, zr); 9605 mov(t1, zr); 9606 mov(t2, zr); 9607 9608 block_comment("for (int i = 0; i < len; i++) {"); 9609 mov(Ri, zr); { 9610 Label loop, end; 9611 bind(loop); 9612 cmp(Ri, Rlen); 9613 br(Assembler::GE, end); 9614 9615 pre1(Ri); 9616 9617 block_comment("for (j = (i+1)/2; j; j--) {"); { 9618 add(Rj, Ri, 1); 9619 lsr(Rj, Rj, 1); 9620 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 9621 } block_comment(" } // j"); 9622 9623 last_squaring(Ri); 9624 9625 block_comment(" for (j = i/2; j; j--) {"); { 9626 lsr(Rj, Ri, 1); 9627 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 9628 } block_comment(" } // j"); 9629 9630 post1_squaring(); 9631 add(Ri, Ri, 1); 9632 cmp(Ri, Rlen); 9633 br(Assembler::LT, loop); 9634 9635 bind(end); 9636 block_comment("} // i"); 9637 } 9638 9639 block_comment("for (int i = len; i < 2*len; i++) {"); 9640 mov(Ri, Rlen); { 9641 Label loop, end; 9642 bind(loop); 9643 cmp(Ri, Rlen, Assembler::LSL, 1); 9644 br(Assembler::GE, end); 9645 9646 pre2(Ri, Rlen); 9647 9648 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 9649 lsl(Rj, Rlen, 1); 9650 sub(Rj, Rj, Ri); 9651 sub(Rj, Rj, 1); 9652 lsr(Rj, Rj, 1); 9653 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 9654 } block_comment(" } // j"); 9655 9656 last_squaring(Ri); 9657 9658 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 9659 lsl(Rj, Rlen, 1); 9660 sub(Rj, Rj, Ri); 9661 lsr(Rj, Rj, 1); 9662 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 9663 } block_comment(" } // j"); 9664 9665 post2(Ri, Rlen); 9666 add(Ri, Ri, 1); 9667 cmp(Ri, Rlen, Assembler::LSL, 1); 9668 9669 br(Assembler::LT, loop); 9670 bind(end); 9671 block_comment("} // i"); 9672 } 9673 9674 normalize(Rlen); 9675 9676 mov(Ra, Pm_base); // Save Pm_base in Ra 9677 
restore_regs(); // Restore caller's Pm_base 9678 9679 // Copy our result into caller's Pm_base 9680 reverse(Pm_base, Ra, Rlen, t0, t1); 9681 9682 leave(); 9683 ret(lr); 9684 9685 return entry; 9686 } 9687 // In C, approximately: 9688 9689 // void 9690 // montgomery_square(julong Pa_base[], julong Pn_base[], 9691 // julong Pm_base[], julong inv, int len) { 9692 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 9693 // julong *Pa, *Pb, *Pn, *Pm; 9694 // julong Ra, Rb, Rn, Rm; 9695 9696 // int i; 9697 9698 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 9699 9700 // for (i = 0; i < len; i++) { 9701 // int j; 9702 9703 // Pa = Pa_base; 9704 // Pb = Pa_base + i; 9705 // Pm = Pm_base; 9706 // Pn = Pn_base + i; 9707 9708 // Ra = *Pa; 9709 // Rb = *Pb; 9710 // Rm = *Pm; 9711 // Rn = *Pn; 9712 9713 // int iters = (i+1)/2; 9714 // for (j = 0; iters--; j++) { 9715 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 9716 // MACC2(Ra, Rb, t0, t1, t2); 9717 // Ra = *++Pa; 9718 // Rb = *--Pb; 9719 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9720 // MACC(Rm, Rn, t0, t1, t2); 9721 // Rm = *++Pm; 9722 // Rn = *--Pn; 9723 // } 9724 // if ((i & 1) == 0) { 9725 // assert(Ra == Pa_base[j], "must be"); 9726 // MACC(Ra, Ra, t0, t1, t2); 9727 // } 9728 // iters = i/2; 9729 // assert(iters == i-j, "must be"); 9730 // for (; iters--; j++) { 9731 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9732 // MACC(Rm, Rn, t0, t1, t2); 9733 // Rm = *++Pm; 9734 // Rn = *--Pn; 9735 // } 9736 9737 // *Pm = Rm = t0 * inv; 9738 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 9739 // MACC(Rm, Rn, t0, t1, t2); 9740 9741 // assert(t0 == 0, "broken Montgomery multiply"); 9742 9743 // t0 = t1; t1 = t2; t2 = 0; 9744 // } 9745 9746 // for (i = len; i < 2*len; i++) { 9747 // int start = i-len+1; 9748 // int end = start + (len - start)/2; 9749 // int j; 9750 9751 // Pa = Pa_base + i-len; 9752 // Pb = Pa_base + len; 9753 // Pm = Pm_base + i-len; 9754 // Pn = Pn_base + len; 9755 9756 // Ra = *++Pa; 9757 // Rb = *--Pb; 9758 // Rm = *++Pm; 9759 // Rn = *--Pn; 9760 9761 // int iters = (2*len-i-1)/2; 9762 // assert(iters == end-start, "must be"); 9763 // for (j = start; iters--; j++) { 9764 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 9765 // MACC2(Ra, Rb, t0, t1, t2); 9766 // Ra = *++Pa; 9767 // Rb = *--Pb; 9768 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9769 // MACC(Rm, Rn, t0, t1, t2); 9770 // Rm = *++Pm; 9771 // Rn = *--Pn; 9772 // } 9773 // if ((i & 1) == 0) { 9774 // assert(Ra == Pa_base[j], "must be"); 9775 // MACC(Ra, Ra, t0, t1, t2); 9776 // } 9777 // iters = (2*len-i)/2; 9778 // assert(iters == len-j, "must be"); 9779 // for (; iters--; j++) { 9780 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 9781 // MACC(Rm, Rn, t0, t1, t2); 9782 // Rm = *++Pm; 9783 // Rn = *--Pn; 9784 // } 9785 // Pm_base[i-len] = t0; 9786 // t0 = t1; t1 = t2; t2 = 0; 9787 // } 9788 9789 // while (t0) 9790 // t0 = sub(Pm_base, Pn_base, t0, len); 9791 // } 9792 }; 9793 9794 void generate_vector_math_stubs() { 9795 // Get native vector math stub routine addresses 9796 void* libsleef = nullptr; 9797 char ebuf[1024]; 9798 char dll_name[JVM_MAXPATHLEN]; 9799 if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) { 9800 libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf); 9801 } 9802 if (libsleef == nullptr) { 9803 log_info(library)("Failed to load native vector math library, %s!", ebuf); 
9804 return; 9805 } 9806 // Method naming convention 9807 // All the methods are named as <OP><T><N>_<U><suffix> 9808 // Where: 9809 // <OP> is the operation name, e.g. sin 9810 // <T> is optional to indicate float/double 9811 // "f/d" for vector float/double operation 9812 // <N> is the number of elements in the vector 9813 // "2/4" for neon, and "x" for sve 9814 // <U> is the precision level 9815 // "u10/u05" represents 1.0/0.5 ULP error bounds 9816 // We use "u10" for all operations by default 9817 // But for those functions do not have u10 support, we use "u05" instead 9818 // <suffix> indicates neon/sve 9819 // "sve/advsimd" for sve/neon implementations 9820 // e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions 9821 // cosd2_u10advsimd is the method for computing 2 elements vector double cos using NEON instructions 9822 // 9823 log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef)); 9824 9825 // Math vector stubs implemented with SVE for scalable vector size. 9826 if (UseSVE > 0) { 9827 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 9828 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 9829 // Skip "tanh" because there is performance regression 9830 if (vop == VectorSupport::VECTOR_OP_TANH) { 9831 continue; 9832 } 9833 9834 // The native library does not support u10 level of "hypot". 9835 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 9836 9837 snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf); 9838 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 9839 9840 snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf); 9841 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 9842 } 9843 } 9844 9845 // Math vector stubs implemented with NEON for 64/128 bits vector size. 9846 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 9847 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 9848 // Skip "tanh" because there is performance regression 9849 if (vop == VectorSupport::VECTOR_OP_TANH) { 9850 continue; 9851 } 9852 9853 // The native library does not support u10 level of "hypot". 9854 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 9855 9856 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 9857 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf); 9858 9859 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 9860 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 9861 9862 snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf); 9863 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 9864 } 9865 } 9866 9867 // Call here from the interpreter or compiled code to either load 9868 // multiple returned values from the inline type instance being 9869 // returned to registers or to store returned values to a newly 9870 // allocated inline type instance. 9871 address generate_return_value_stub(address destination, const char* name, bool has_res) { 9872 // We need to save all registers the calling convention may use so 9873 // the runtime calls read or update those registers. 
This needs to 9874 // be in sync with SharedRuntime::java_return_convention(). 9875 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 9876 enum layout { 9877 j_rarg7_off = 0, j_rarg7_2, // j_rarg7 is r0 9878 j_rarg6_off, j_rarg6_2, 9879 j_rarg5_off, j_rarg5_2, 9880 j_rarg4_off, j_rarg4_2, 9881 j_rarg3_off, j_rarg3_2, 9882 j_rarg2_off, j_rarg2_2, 9883 j_rarg1_off, j_rarg1_2, 9884 j_rarg0_off, j_rarg0_2, 9885 9886 j_farg7_off, j_farg7_2, 9887 j_farg6_off, j_farg6_2, 9888 j_farg5_off, j_farg5_2, 9889 j_farg4_off, j_farg4_2, 9890 j_farg3_off, j_farg3_2, 9891 j_farg2_off, j_farg2_2, 9892 j_farg1_off, j_farg1_2, 9893 j_farg0_off, j_farg0_2, 9894 9895 rfp_off, rfp_off2, 9896 return_off, return_off2, 9897 9898 framesize // inclusive of return address 9899 }; 9900 9901 CodeBuffer code(name, 512, 64); 9902 MacroAssembler* masm = new MacroAssembler(&code); 9903 9904 int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16); 9905 assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned"); 9906 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; 9907 int frame_size_in_words = frame_size_in_bytes / wordSize; 9908 9909 OopMapSet* oop_maps = new OopMapSet(); 9910 OopMap* map = new OopMap(frame_size_in_slots, 0); 9911 9912 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg()); 9913 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg()); 9914 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg()); 9915 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg()); 9916 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg()); 9917 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg()); 9918 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg()); 9919 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg()); 9920 9921 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg()); 9922 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg()); 9923 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg()); 9924 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg()); 9925 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg()); 9926 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg()); 9927 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg()); 9928 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg()); 9929 9930 address start = __ pc(); 9931 9932 __ enter(); // Save FP and LR before call 9933 9934 __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize))); 9935 __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize))); 9936 __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize))); 9937 __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize))); 9938 9939 __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize))); 9940 __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize))); 9941 __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize))); 9942 __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize))); 9943 9944 int frame_complete = __ offset(); 9945 9946 // Set up last_Java_sp and last_Java_fp 9947 address the_pc = __ pc(); 9948 __ set_last_Java_frame(sp, noreg, the_pc, rscratch1); 9949 9950 // Call runtime 9951 __ mov(c_rarg1, r0); 9952 __ mov(c_rarg0, rthread); 9953 9954 __ mov(rscratch1, destination); 
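    // About to call into the runtime: c_rarg0 holds the current thread and
    // c_rarg1 the return value that was in r0; for the two current callers
    // (see generate_initial_stubs) destination is either
    // SharedRuntime::load_inline_type_fields_in_regs or
    // SharedRuntime::store_inline_type_fields_to_buf.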
9955 __ blr(rscratch1); 9956 9957 oop_maps->add_gc_map(the_pc - start, map); 9958 9959 __ reset_last_Java_frame(false); 9960 9961 __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize))); 9962 __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize))); 9963 __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize))); 9964 __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize))); 9965 9966 __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize))); 9967 __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize))); 9968 __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize))); 9969 __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize))); 9970 9971 __ leave(); 9972 9973 // check for pending exceptions 9974 Label pending; 9975 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 9976 __ cbnz(rscratch1, pending); 9977 9978 if (has_res) { 9979 __ get_vm_result(r0, rthread); 9980 } 9981 9982 __ ret(lr); 9983 9984 __ bind(pending); 9985 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 9986 9987 // ------------- 9988 // make sure all code is generated 9989 masm->flush(); 9990 9991 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false); 9992 return stub->entry_point(); 9993 } 9994 9995 // Initialization 9996 void generate_initial_stubs() { 9997 // Generates the initial stubs and initializes the entry points 9998 9999 // Entry points that exist on all platforms. Note: This is code 10000 // that could be shared among different platforms - however the 10001 // benefit seems to be smaller than the disadvantage of having a 10002 // much more complicated generator structure. See also the comment in 10003 // stubRoutines.hpp. 10004 10005 StubRoutines::_forward_exception_entry = generate_forward_exception(); 10006 10007 StubRoutines::_call_stub_entry = 10008 generate_call_stub(StubRoutines::_call_stub_return_address); 10009 10010 // is referenced by megamorphic call 10011 StubRoutines::_catch_exception_entry = generate_catch_exception(); 10012 10013 // Initialize table for copy memory (arraycopy) check.
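    // (Roughly: the table records code ranges of stubs that perform
    // user-requested unsafe memory accesses, so that a fault taken inside
    // one of them can be redirected to a safe exit instead of crashing.)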
10014 if (UnsafeMemoryAccess::_table == nullptr) { 10015 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 10016 } 10017 10018 if (UseCRC32Intrinsics) { 10019 // Set the table address before generating the stubs which use it 10020 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 10021 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 10022 } 10023 10024 if (UseCRC32CIntrinsics) { 10025 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 10026 } 10027 10028 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 10029 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 10030 } 10031 10032 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 10033 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 10034 } 10035 10036 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 10037 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 10038 StubRoutines::_hf2f = generate_float16ToFloat(); 10039 StubRoutines::_f2hf = generate_floatToFloat16(); 10040 } 10041 10042 if (InlineTypeReturnedAsFields) { 10043 StubRoutines::_load_inline_type_fields_in_regs = 10044 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false); 10045 StubRoutines::_store_inline_type_fields_to_buf = 10046 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true); 10047 } 10048 10049 } 10050 10051 void generate_continuation_stubs() { 10052 // Continuation stubs: 10053 StubRoutines::_cont_thaw = generate_cont_thaw(); 10054 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 10055 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 10056 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub(); 10057 } 10058 10059 void generate_final_stubs() { 10060 // support for verify_oop (must happen after universe_init) 10061 if (VerifyOops) { 10062 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 10063 } 10064 10065 // arraycopy stubs used by compilers 10066 generate_arraycopy_stubs(); 10067 10068 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 10069 10070 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 10071 10072 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 10073 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 10074 10075 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 10076 10077 generate_atomic_entry_points(); 10078 10079 #endif // LINUX 10080 10081 #ifdef COMPILER2 10082 if (UseSecondarySupersTable) { 10083 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 10084 if (!InlineSecondarySupersTest) { 10085 generate_lookup_secondary_supers_table_stub(); 10086 } 10087 } 10088 #endif 10089 10090 StubRoutines::aarch64::set_completed(); // Indicate that the arraycopy and zero_blocks stubs have been generated 10091 } 10092 10093 void generate_compiler_stubs() { 10094 #if COMPILER2_OR_JVMCI 10095 10096 if (UseSVE == 0) { 10097 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubGenStubId::vector_iota_indices_id); 10098 } 10099 10100 // array equals stub for large arrays.
10101 if (!UseSimpleArrayEquals) { 10102 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 10103 } 10104 10105 // arrays_hashcode stubs for large arrays. 10106 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); 10107 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); 10108 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); 10109 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); 10110 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); 10111 10112 // byte_array_inflate stub for large arrays. 10113 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 10114 10115 // countPositives stub for large arrays. 10116 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 10117 10118 generate_compare_long_strings(); 10119 10120 generate_string_indexof_stubs(); 10121 10122 #ifdef COMPILER2 10123 if (UseMultiplyToLenIntrinsic) { 10124 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 10125 } 10126 10127 if (UseSquareToLenIntrinsic) { 10128 StubRoutines::_squareToLen = generate_squareToLen(); 10129 } 10130 10131 if (UseMulAddIntrinsic) { 10132 StubRoutines::_mulAdd = generate_mulAdd(); 10133 } 10134 10135 if (UseSIMDForBigIntegerShiftIntrinsics) { 10136 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 10137 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 10138 } 10139 10140 if (UseMontgomeryMultiplyIntrinsic) { 10141 StubGenStubId stub_id = StubGenStubId::montgomeryMultiply_id; 10142 StubCodeMark mark(this, stub_id); 10143 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 10144 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 10145 } 10146 10147 if (UseMontgomerySquareIntrinsic) { 10148 StubGenStubId stub_id = StubGenStubId::montgomerySquare_id; 10149 StubCodeMark mark(this, stub_id); 10150 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 10151 // We use generate_multiply() rather than generate_square() 10152 // because it's faster for the sizes of modulus we care about.
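      // (In effect the square is computed as a Montgomery multiply with
      // both operands taken from the same array, rather than with the
      // dedicated squaring schedule sketched in montgomery_square() above.)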
10153 StubRoutines::_montgomerySquare = g.generate_multiply(); 10154 } 10155 10156 generate_vector_math_stubs(); 10157 10158 #endif // COMPILER2 10159 10160 if (UseChaCha20Intrinsics) { 10161 StubRoutines::_chacha20Block = generate_chacha20Block_qrpar(); 10162 } 10163 10164 if (UseDilithiumIntrinsics) { 10165 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt(); 10166 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt(); 10167 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult(); 10168 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant(); 10169 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly(); 10170 } 10171 10172 if (UseBASE64Intrinsics) { 10173 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 10174 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 10175 } 10176 10177 // data cache line writeback 10178 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 10179 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 10180 10181 if (UseAESIntrinsics) { 10182 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 10183 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 10184 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 10185 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 10186 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 10187 } 10188 if (UseGHASHIntrinsics) { 10189 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 10190 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 10191 } 10192 if (UseAESIntrinsics && UseGHASHIntrinsics) { 10193 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 10194 } 10195 10196 if (UseMD5Intrinsics) { 10197 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubGenStubId::md5_implCompress_id); 10198 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubGenStubId::md5_implCompressMB_id); 10199 } 10200 if (UseSHA1Intrinsics) { 10201 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubGenStubId::sha1_implCompress_id); 10202 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubGenStubId::sha1_implCompressMB_id); 10203 } 10204 if (UseSHA256Intrinsics) { 10205 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubGenStubId::sha256_implCompress_id); 10206 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubGenStubId::sha256_implCompressMB_id); 10207 } 10208 if (UseSHA512Intrinsics) { 10209 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubGenStubId::sha512_implCompress_id); 10210 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubGenStubId::sha512_implCompressMB_id); 10211 } 10212 if (UseSHA3Intrinsics) { 10213 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubGenStubId::sha3_implCompress_id); 10214 StubRoutines::_double_keccak = generate_double_keccak(); 10215 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubGenStubId::sha3_implCompressMB_id); 10216 } 10217 10218 if (UsePoly1305Intrinsics) { 10219 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 10220 } 10221 10222 // generate Adler32 intrinsics code 10223 if (UseAdler32Intrinsics) { 10224 
StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 10225 } 10226 10227 #endif // COMPILER2_OR_JVMCI 10228 } 10229 10230 public: 10231 StubGenerator(CodeBuffer* code, StubGenBlobId blob_id) : StubCodeGenerator(code, blob_id) { 10232 switch(blob_id) { 10233 case initial_id: 10234 generate_initial_stubs(); 10235 break; 10236 case continuation_id: 10237 generate_continuation_stubs(); 10238 break; 10239 case compiler_id: 10240 generate_compiler_stubs(); 10241 break; 10242 case final_id: 10243 generate_final_stubs(); 10244 break; 10245 default: 10246 fatal("unexpected blob id: %d", blob_id); 10247 break; 10248 }; 10249 } 10250 }; // end class declaration 10251 10252 void StubGenerator_generate(CodeBuffer* code, StubGenBlobId blob_id) { 10253 StubGenerator g(code, blob_id); 10254 } 10255 10256 10257 #if defined (LINUX) 10258 10259 // Define pointers to atomic stubs and initialize them to point to the 10260 // code in atomic_aarch64.S. 10261 10262 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 10263 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 10264 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 10265 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 10266 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 10267 10268 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 10269 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 10270 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 10271 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 10272 DEFAULT_ATOMIC_OP(xchg, 4, ) 10273 DEFAULT_ATOMIC_OP(xchg, 8, ) 10274 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 10275 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 10276 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 10277 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 10278 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 10279 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 10280 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 10281 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 10282 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 10283 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 10284 10285 #undef DEFAULT_ATOMIC_OP 10286 10287 #endif // LINUX
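// For reference, each DEFAULT_ATOMIC_OP instantiation above expands to
// roughly the following (shown for DEFAULT_ATOMIC_OP(fetch_add, 4, )):
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// i.e. a declaration of the hand-written fallback in atomic_aarch64.S plus
// a function pointer, initialized to that fallback, which can later be
// repointed at a stub generated by generate_atomic_entry_points().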