1 /* 2 * Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2024, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "asm/register.hpp" 30 #include "atomic_aarch64.hpp" 31 #include "compiler/oopMap.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/barrierSetAssembler.hpp" 34 #include "gc/shared/gc_globals.hpp" 35 #include "gc/shared/tlab_globals.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "memory/universe.hpp" 38 #include "nativeInst_aarch64.hpp" 39 #include "oops/instanceOop.hpp" 40 #include "oops/method.hpp" 41 #include "oops/objArrayKlass.hpp" 42 #include "oops/oop.inline.hpp" 43 #include "prims/methodHandles.hpp" 44 #include "prims/upcallLinker.hpp" 45 #include "runtime/arguments.hpp" 46 #include "runtime/atomic.hpp" 47 #include "runtime/continuation.hpp" 48 #include "runtime/continuationEntry.inline.hpp" 49 #include "runtime/frame.inline.hpp" 50 #include "runtime/handles.inline.hpp" 51 #include "runtime/javaThread.hpp" 52 #include "runtime/sharedRuntime.hpp" 53 #include "runtime/stubCodeGenerator.hpp" 54 #include "runtime/stubRoutines.hpp" 55 #include "utilities/align.hpp" 56 #include "utilities/checkedCast.hpp" 57 #include "utilities/debug.hpp" 58 #include "utilities/globalDefinitions.hpp" 59 #include "utilities/intpow.hpp" 60 #include "utilities/powerOfTwo.hpp" 61 #ifdef COMPILER2 62 #include "opto/runtime.hpp" 63 #endif 64 #if INCLUDE_ZGC 65 #include "gc/z/zThreadLocalData.hpp" 66 #endif 67 68 // Declaration and definition of StubGenerator (no .hpp file). 
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

// Shorthand used throughout this file: "__ insn(...)" expands to
// "_masm->insn(...)".
#undef __
#define __ _masm->

// BLOCK_COMMENT(str) forwards str to block_comment() on _masm; in
// PRODUCT builds it expands to nothing.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

// BIND(label) is written as "__ BIND(label)". It binds the label and
// then emits a block comment consisting of the label's spelling
// followed by ':'. Because the label name is stringified, renaming a
// label that is bound with BIND changes the emitted comment text.
// n.b. the expansion is two statements, not one.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

  // inc_counter_np(counter):
  //   PRODUCT builds:     expands to a no-op expression.
  //   non-PRODUCT builds: emits a block comment naming the counter and
  //                       then calls inc_counter_np_(), which emits an
  //                       incrementw on the counter's address.
  // NOTE(review): the non-PRODUCT expansion is two statements and
  // already ends in ';', so it is not safe as the sole body of an
  // unbraced if/else -- confirm all call sites are plain statements.
#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
//
// we don't need to save r9-r15 which both C and Java treat as
// volatile
//
// we don't need to save r16-18 because Java does not use them
//
// we save r19-r28 which Java uses as scratch registers and C
// expects to be callee-save
//
// we save the bottom 64 bits of each value stored in v8-v15; it is
// the responsibility of the caller to preserve larger values.
//
// so the stub frame looks like this when we enter Java code
//
//     [ return_from_Java     ] <--- sp
//     [ argument word n      ]
//      ...
// -29 [ argument word 1      ]
// -28 [ saved Floating-point Control Register ] <--- sp_after_call
// -26 [ saved v15            ]
// -25 [ saved v14            ]
// -24 [ saved v13            ]
// -23 [ saved v12            ]
// -22 [ saved v11            ]
// -21 [ saved v10            ]
// -20 [ saved v9             ]
// -19 [ saved v8             ]
// -18 [ saved r28            ]
// -17 [ saved r27            ]
// -16 [ saved r26            ]
// -15 [ saved r25            ]
// -14 [ saved r24            ]
// -13 [ saved r23            ]
// -12 [ saved r22            ]
// -11 [ saved r21            ]
// -10 [ saved r20            ]
//  -9 [ saved r19            ]
//  -8 [ call wrapper    (r0) ]
//  -7 [ result          (r1) ]
//  -6 [ result type     (r2) ]
//  -5 [ method          (r3) ]
//  -4 [ entry point     (r4) ]
//  -3 [ parameters      (r5) ]
//  -2 [ parameter size  (r6) ]
//  -1 [ thread (r7)          ]
//   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
//   1 [ saved lr       (r30) ]

// Call stub stack layout word offsets from fp
//
// The values are word offsets; generate_call_stub() multiplies them
// by wordSize when it builds the rfp-relative Address objects.
// Registers that are saved in pairs (stp/stpd) only have the
// lower-addressed slot named here, e.g. r20_off addresses r20 and the
// following word holds r19.
// sp_after_call_off is the lowest slot of the save area and shares
// its value with fpcr_off.
enum call_stub_layout {
  sp_after_call_off  = -28,

  fpcr_off           = sp_after_call_off,
  d15_off            = -26,
  d13_off            = -24,
  d11_off            = -22,
  d9_off             = -20,

  r28_off            = -18,
  r26_off            = -16,
  r24_off            = -14,
  r22_off            = -12,
  r20_off            = -10,
  call_wrapper_off   =  -8,
  result_off         =  -7,
  result_type_off    =  -6,
  method_off         =  -5,
  entry_point_off    =  -4,
  parameter_size_off =  -2,
  thread_off         =  -1,
  fp_f               =   0,
  retaddr_off        =   1,
};

// Emits the call stub described above.
//
// return_address: out-parameter; receives the pc of the instruction
//                 that follows the blr into the Java entry point.
// Returns the pc at which the stub starts.
address generate_call_stub(address& return_address) {
  // The layout above is also known to the frame code; fail early if
  // the two disagree.
  assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
         (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
         "adjust this code");

  StubCodeMark mark(this, "StubRoutines", "call_stub");
  address start = __ pc();

  // rfp-relative addresses of the save slots.
  // NOTE(review): sp_after_call is declared but not referenced in this
  // function; the code uses sp_after_call_off directly.
  const Address sp_after_call (rfp, sp_after_call_off * wordSize);

  const Address fpcr_save     (rfp, fpcr_off           * wordSize);
  const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  const Address result        (rfp, result_off         * wordSize);
  const Address result_type   (rfp, result_type_off    * wordSize);
  const Address method        (rfp, method_off         * wordSize);
  const Address entry_point   (rfp, entry_point_off    * wordSize);
  const Address parameter_size(rfp, parameter_size_off * wordSize);

  const Address thread        (rfp, thread_off         * wordSize);

  const Address d15_save      (rfp, d15_off * wordSize);
  const Address d13_save      (rfp, d13_off * wordSize);
  const Address d11_save      (rfp, d11_off * wordSize);
  const Address d9_save       (rfp, d9_off * wordSize);

  const Address r28_save      (rfp, r28_off * wordSize);
  const Address r26_save      (rfp, r26_off * wordSize);
  const Address r24_save      (rfp, r24_off * wordSize);
  const Address r22_save      (rfp, r22_off * wordSize);
  const Address r20_save      (rfp, r20_off * wordSize);

  // stub code

  // NOTE(review): aarch64_entry is not referenced again in this function.
  address aarch64_entry = __ pc();

  // set up frame and move sp to end of save area
  __ enter();
  __ sub(sp, rfp, -sp_after_call_off * wordSize);

  // save register parameters and Java scratch/global registers
  // n.b. we save thread even though it gets installed in
  // rthread because we want to sanity check rthread later
  __ str(c_rarg7,  thread);
  __ strw(c_rarg6, parameter_size);
  __ stp(c_rarg4, c_rarg5,  entry_point);
  __ stp(c_rarg2, c_rarg3,  result_type);
  __ stp(c_rarg0, c_rarg1,  call_wrapper);

  __ stp(r20, r19,   r20_save);
  __ stp(r22, r21,   r22_save);
  __ stp(r24, r23,   r24_save);
  __ stp(r26, r25,   r26_save);
  __ stp(r28, r27,   r28_save);

  __ stpd(v9,  v8,   d9_save);
  __ stpd(v11, v10,  d11_save);
  __ stpd(v13, v12,  d13_save);
  __ stpd(v15, v14,  d15_save);

  // The caller's FPCR is saved before it is modified and restored on
  // the way out (see "restore fpcr" below).
  __ get_fpcr(rscratch1);
  __ str(rscratch1, fpcr_save);
  // Set FPCR to the state we need. We do want Round to Nearest. We
  // don't want non-IEEE rounding modes or floating-point traps.
  __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
  __ bfi(rscratch1, zr, 8, 5);  // Clear exception-control bits (8-12)
  __ set_fpcr(rscratch1);

  // install Java thread in global register now we have saved
  // whatever value it held
  __ mov(rthread, c_rarg7);
  // And method
  __ mov(rmethod, c_rarg3);

  // set up the heapbase register
  __ reinit_heapbase();

#ifdef ASSERT
  // make sure we have no pending exceptions
  {
    Label L;
    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
    __ cmp(rscratch1, (u1)NULL_WORD);
    __ br(Assembler::EQ, L);
    __ stop("StubRoutines::call_stub: entered with pending exception");
    __ BIND(L);
  }
#endif
  // pass parameters if any
  //
  // esp keeps the current sp; sp itself is lowered by c_rarg6 words
  // and then rounded down to a multiple of 2 * wordSize.
  __ mov(esp, sp);
  __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  __ andr(sp, rscratch1, -2 * wordSize);

  BLOCK_COMMENT("pass parameters if any");
  Label parameters_done;
  // parameter count is still in c_rarg6
  // and parameter pointer identifying param 1 is in c_rarg5
  __ cbzw(c_rarg6, parameters_done);

  // Copy loop: load the next word from the parameter array
  // (post-incrementing c_rarg5), push it, and repeat until the count
  // in c_rarg6 reaches zero. Parameter 1 is therefore pushed first.
  address loop = __ pc();
  __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  __ subsw(c_rarg6, c_rarg6, 1);
  __ push(rscratch1);
  __ br(Assembler::GT, loop);

  __ BIND(parameters_done);

  // call Java entry -- passing Method*, and current sp
  //      rmethod: Method*
  //      r19_sender_sp: sender sp
  BLOCK_COMMENT("call Java function");
  __ mov(r19_sender_sp, sp);
  __ blr(c_rarg4);

  // we do this here because the notify will already have been done
  // if we get to the next instruction via an exception
  //
  // n.b. adding this instruction here affects the calculation of
  // whether or not a routine returns to the call stub (used when
  // doing stack walks) since the normal test is to check the return
  // pc against the address saved below. so we may need to allow for
  // this extra instruction in the check.

  // save current address for use by exception handling code

  return_address = __ pc();

  // store result depending on type (everything that is not
  // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  // n.b. this assumes Java returns an integral result in r0
  // and a floating result in j_farg0
  //
  // T_OBJECT deliberately shares the is_long path: both store the
  // whole of r0 with str.
  __ ldr(j_rarg2, result);
  Label is_long, is_float, is_double, exit;
  __ ldr(j_rarg1, result_type);
  __ cmp(j_rarg1, (u1)T_OBJECT);
  __ br(Assembler::EQ, is_long);
  __ cmp(j_rarg1, (u1)T_LONG);
  __ br(Assembler::EQ, is_long);
  __ cmp(j_rarg1, (u1)T_FLOAT);
  __ br(Assembler::EQ, is_float);
  __ cmp(j_rarg1, (u1)T_DOUBLE);
  __ br(Assembler::EQ, is_double);

  // handle T_INT case
  __ strw(r0, Address(j_rarg2));

  // Common exit; the is_long/is_float/is_double tails below branch
  // back here after storing their result.
  __ BIND(exit);

  // pop parameters
  __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
  // verify that threads correspond
  {
    Label L, S;
    __ ldr(rscratch1, thread);
    __ cmp(rthread, rscratch1);
    __ br(Assembler::NE, S);
    __ get_thread(rscratch1);
    __ cmp(rthread, rscratch1);
    __ br(Assembler::EQ, L);
    __ BIND(S);
    __ stop("StubRoutines::call_stub: threads must correspond");
    __ BIND(L);
  }
#endif

  __ pop_cont_fastpath(rthread);

  // restore callee-save registers
  __ ldpd(v15, v14,  d15_save);
  __ ldpd(v13, v12,  d13_save);
  __ ldpd(v11, v10,  d11_save);
  __ ldpd(v9,  v8,   d9_save);

  __ ldp(r28, r27,   r28_save);
  __ ldp(r26, r25,   r26_save);
  __ ldp(r24, r23,   r24_save);
  __ ldp(r22, r21,   r22_save);
  __ ldp(r20, r19,   r20_save);

  // restore fpcr
  __ ldr(rscratch1,  fpcr_save);
  __ set_fpcr(rscratch1);

  // Reload the saved C arguments.
  // NOTE(review): parameter_size was written with strw above but is
  // read back here as one half of an ldp pair, so presumably only the
  // low half of c_rarg6 is meaningful afterwards -- confirm.
  __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  __ ldrw(c_rarg2, result_type);
  __ ldr(c_rarg3,  method);
  __ ldp(c_rarg4, c_rarg5,  entry_point);
  __ ldp(c_rarg6, c_rarg7,  parameter_size);

  // leave frame and return to caller
  __ leave();
  __ ret(lr);

  // handle return types different from T_INT

  __ BIND(is_long);
  __ str(r0, Address(j_rarg2, 0));
  __ br(Assembler::AL, exit);

  __ BIND(is_float);
  __ strs(j_farg0, Address(j_rarg2, 0));
  __ br(Assembler::AL, exit);

  __ BIND(is_double);
  __ strd(j_farg0, Address(j_rarg2, 0));
  __ br(Assembler::AL, exit);

  return start;
}

// Return point for a Java call if there's an exception thrown in
// Java code. The exception is caught and transformed into a
// pending exception stored in JavaThread that can be tested from
// within the VM.
//
// Note: Usually the parameters are removed by the callee. In case
// of an exception crossing an activation frame boundary, that is
// not the case if the callee is compiled code => need to setup the
// rsp.
//
// r0: exception oop
//
// Returns the pc at which the stub starts. The emitted code ends by
// branching to StubRoutines::_call_stub_return_address, so the call
// stub must have been generated first (asserted below).

address generate_catch_exception() {
  StubCodeMark mark(this, "StubRoutines", "catch_exception");
  address start = __ pc();

  // same as in generate_call_stub():
  // NOTE(review): sp_after_call is not referenced in this function.
  const Address sp_after_call(rfp, sp_after_call_off * wordSize);
  const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
  // verify that threads correspond
  {
    Label L, S;
    __ ldr(rscratch1, thread);
    __ cmp(rthread, rscratch1);
    __ br(Assembler::NE, S);
    __ get_thread(rscratch1);
    __ cmp(rthread, rscratch1);
    __ br(Assembler::EQ, L);
    __ bind(S);
    __ stop("StubRoutines::catch_exception: threads must correspond");
    __ bind(L);
  }
#endif

  // set pending exception
  __ verify_oop(r0);

  // Record the exception oop together with this source file and line
  // as the place where the pending exception was installed.
  __ str(r0, Address(rthread, Thread::pending_exception_offset()));
  __ mov(rscratch1, (address)__FILE__);
  __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
  __ movw(rscratch1, (int)__LINE__);
  __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

  // complete return to VM
  assert(StubRoutines::_call_stub_return_address != nullptr,
         "_call_stub_return_address must have been generated before");
  __ b(StubRoutines::_call_stub_return_address);

  return start;
}

// Continuation point for runtime calls returning
// with a pending
// exception. The pending exception check happened in the runtime
// or native call stub. The pending exception in Thread is
// converted into a Java-level exception.
//
// Contract with Java-level exception handlers:
// r0: exception
// r3: throwing pc
//
// NOTE: At entry of this stub, exception-pc must be in LR !!

// NOTE: this is always used as a jump target within generated code
// so it just needs to be generated code with no x86 prolog
//
// Returns the pc at which the stub starts.

address generate_forward_exception() {
  StubCodeMark mark(this, "StubRoutines", "forward exception");
  address start = __ pc();

  // Upon entry, LR points to the return address returning into
  // Java (interpreted or compiled) code; i.e., the return address
  // becomes the throwing pc.
  //
  // Arguments pushed before the runtime call are still on the stack
  // but the exception handler will reset the stack pointer ->
  // ignore them.  A potential result in registers can be ignored as
  // well.

#ifdef ASSERT
  // make sure this code is only executed if there is a pending exception
  {
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ stop("StubRoutines::forward exception: no pending exception (1)");
    __ bind(L);
  }
#endif

  // compute exception handler into r19

  // call the VM to find the handler address associated with the
  // caller address. pass thread in r0 and caller pc (ret address)
  // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
  // the stack.
  __ mov(c_rarg1, lr);
  // lr will be trashed by the VM call so we move it to R19
  // (callee-saved) because we also need to pass it to the handler
  // returned by this call.
  __ mov(r19, lr);
  BLOCK_COMMENT("call exception_handler_for_return_address");
  __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                       SharedRuntime::exception_handler_for_return_address),
                  rthread, c_rarg1);
  // Reinitialize the ptrue predicate register, in case the external runtime
  // call clobbers ptrue reg, as we may return to SVE compiled code.
  __ reinitialize_ptrue();

  // we should not really care that lr is no longer the callee
  // address. we saved the value the handler needs in r19 so we can
  // just copy it to r3. however, the C2 handler will push its own
  // frame and then calls into the VM and the VM code asserts that
  // the PC for the frame above the handler belongs to a compiled
  // Java method. So, we restore lr here to satisfy that assert.
  __ mov(lr, r19);
  // setup r0 & r3 & clear pending exception
  //
  // Order matters here: r19 still holds the throwing pc when it is
  // copied to r3, and r0 still holds the value left by the VM call
  // when it is moved into r19; only then is r0 overwritten with the
  // pending exception.
  __ mov(r3, r19);
  __ mov(r19, r0);
  __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
  __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
  // make sure exception is set
  {
    Label L;
    __ cbnz(r0, L);
    __ stop("StubRoutines::forward exception: no pending exception (2)");
    __ bind(L);
  }
#endif

  // continue at exception handler
  // r0: exception
  // r3: throwing pc
  // r19: exception handler
  __ verify_oop(r0);
  __ br(r19);

  return start;
}

// Non-destructive plausibility checks for oops
//
// Arguments:
//    r0: oop to verify
//    rscratch1: error message
//
// Stack after saving c_rarg3:
//    [tos + 0]: saved c_rarg3
//    [tos + 1]: saved c_rarg2
//    [tos + 2]: saved lr
//    [tos + 3]: saved rscratch2
//    [tos + 4]: saved r0
//    [tos + 5]: saved rscratch1
//
// Returns the pc at which the stub starts. On the success path the
// stub restores c_rarg2/c_rarg3 and returns through lr; on the error
// path it calls MacroAssembler::debug64 and then executes hlt.
address generate_verify_oop() {

  StubCodeMark mark(this, "StubRoutines", "verify_oop");
  address start = __ pc();

  Label exit, error;

  // save c_rarg2 and c_rarg3
  __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

  // Bump the verify_oop counter using c_rarg2/c_rarg3 as temporaries.
  // NOTE(review): the commented-out original used incrementl, while
  // the replacement uses ldr/str rather than the ldrw/strw forms --
  // confirm the access width matches the counter's declared type.
  // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
  __ ldr(c_rarg3, Address(c_rarg2));
  __ add(c_rarg3, c_rarg3, 1);
  __ str(c_rarg3, Address(c_rarg2));

  // object is in r0
  // make sure object is 'reasonable'
  __ cbz(r0, exit); // if obj is null it is OK

  // The actual check is delegated to the active barrier set's
  // assembler, which branches to 'error' on failure.
  BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
  bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

  // return if everything seems ok
  __ bind(exit);

  __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
  __ ret(lr);

  // handle errors
  __ bind(error);
  __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

  // Dump r0..r29 onto the stack so their address can be handed to
  // the debug routine as the regs[] argument.
  __ push(RegSet::range(r0, r29), sp);
  // debug(char* msg, int64_t pc, int64_t regs[])
  __ mov(c_rarg0, rscratch1);      // pass address of error message
  __ mov(c_rarg1, lr);             // pass return address
  __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
  assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
  BLOCK_COMMENT("call MacroAssembler::debug");
  __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  __ blr(rscratch1);
  __ hlt(0);

  return start;
}

// Generate indices for iota vector.
618 address generate_iota_indices(const char *stub_name) { 619 __ align(CodeEntryAlignment); 620 StubCodeMark mark(this, "StubRoutines", stub_name); 621 address start = __ pc(); 622 // B 623 __ emit_data64(0x0706050403020100, relocInfo::none); 624 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 625 // H 626 __ emit_data64(0x0003000200010000, relocInfo::none); 627 __ emit_data64(0x0007000600050004, relocInfo::none); 628 // S 629 __ emit_data64(0x0000000100000000, relocInfo::none); 630 __ emit_data64(0x0000000300000002, relocInfo::none); 631 // D 632 __ emit_data64(0x0000000000000000, relocInfo::none); 633 __ emit_data64(0x0000000000000001, relocInfo::none); 634 // S - FP 635 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f 636 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f 637 // D - FP 638 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d 639 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d 640 return start; 641 } 642 643 // The inner part of zero_words(). This is the bulk operation, 644 // zeroing words in blocks, possibly using DC ZVA to do it. The 645 // caller is responsible for zeroing the last few words. 646 // 647 // Inputs: 648 // r10: the HeapWord-aligned base address of an array to zero. 649 // r11: the count in HeapWords, r11 > 0. 650 // 651 // Returns r10 and r11, adjusted for the caller to clear. 652 // r10: the base address of the tail of words left to clear. 653 // r11: the number of words in the tail. 654 // r11 < MacroAssembler::zero_words_block_size. 655 656 address generate_zero_blocks() { 657 Label done; 658 Label base_aligned; 659 660 Register base = r10, cnt = r11; 661 662 __ align(CodeEntryAlignment); 663 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 664 address start = __ pc(); 665 666 if (UseBlockZeroing) { 667 int zva_length = VM_Version::zva_length(); 668 669 // Ensure ZVA length can be divided by 16. This is required by 670 // the subsequent operations. 
671 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 672 673 __ tbz(base, 3, base_aligned); 674 __ str(zr, Address(__ post(base, 8))); 675 __ sub(cnt, cnt, 1); 676 __ bind(base_aligned); 677 678 // Ensure count >= zva_length * 2 so that it still deserves a zva after 679 // alignment. 680 Label small; 681 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 682 __ subs(rscratch1, cnt, low_limit >> 3); 683 __ br(Assembler::LT, small); 684 __ zero_dcache_blocks(base, cnt); 685 __ bind(small); 686 } 687 688 { 689 // Number of stp instructions we'll unroll 690 const int unroll = 691 MacroAssembler::zero_words_block_size / 2; 692 // Clear the remaining blocks. 693 Label loop; 694 __ subs(cnt, cnt, unroll * 2); 695 __ br(Assembler::LT, done); 696 __ bind(loop); 697 for (int i = 0; i < unroll; i++) 698 __ stp(zr, zr, __ post(base, 16)); 699 __ subs(cnt, cnt, unroll * 2); 700 __ br(Assembler::GE, loop); 701 __ bind(done); 702 __ add(cnt, cnt, unroll * 2); 703 } 704 705 __ ret(lr); 706 707 return start; 708 } 709 710 711 typedef enum { 712 copy_forwards = 1, 713 copy_backwards = -1 714 } copy_direction; 715 716 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores 717 // for arraycopy stubs. 
718 class ArrayCopyBarrierSetHelper : StackObj { 719 BarrierSetAssembler* _bs_asm; 720 MacroAssembler* _masm; 721 DecoratorSet _decorators; 722 BasicType _type; 723 Register _gct1; 724 Register _gct2; 725 Register _gct3; 726 FloatRegister _gcvt1; 727 FloatRegister _gcvt2; 728 FloatRegister _gcvt3; 729 730 public: 731 ArrayCopyBarrierSetHelper(MacroAssembler* masm, 732 DecoratorSet decorators, 733 BasicType type, 734 Register gct1, 735 Register gct2, 736 Register gct3, 737 FloatRegister gcvt1, 738 FloatRegister gcvt2, 739 FloatRegister gcvt3) 740 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()), 741 _masm(masm), 742 _decorators(decorators), 743 _type(type), 744 _gct1(gct1), 745 _gct2(gct2), 746 _gct3(gct3), 747 _gcvt1(gcvt1), 748 _gcvt2(gcvt2), 749 _gcvt3(gcvt3) { 750 } 751 752 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) { 753 _bs_asm->copy_load_at(_masm, _decorators, _type, 32, 754 dst1, dst2, src, 755 _gct1, _gct2, _gcvt1); 756 } 757 758 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) { 759 _bs_asm->copy_store_at(_masm, _decorators, _type, 32, 760 dst, src1, src2, 761 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3); 762 } 763 764 void copy_load_at_16(Register dst1, Register dst2, Address src) { 765 _bs_asm->copy_load_at(_masm, _decorators, _type, 16, 766 dst1, dst2, src, 767 _gct1); 768 } 769 770 void copy_store_at_16(Address dst, Register src1, Register src2) { 771 _bs_asm->copy_store_at(_masm, _decorators, _type, 16, 772 dst, src1, src2, 773 _gct1, _gct2, _gct3); 774 } 775 776 void copy_load_at_8(Register dst, Address src) { 777 _bs_asm->copy_load_at(_masm, _decorators, _type, 8, 778 dst, noreg, src, 779 _gct1); 780 } 781 782 void copy_store_at_8(Address dst, Register src) { 783 _bs_asm->copy_store_at(_masm, _decorators, _type, 8, 784 dst, src, noreg, 785 _gct1, _gct2, _gct3); 786 } 787 }; 788 789 // Bulk copy of blocks of 8 words. 790 // 791 // count is a count of words. 
792 // 793 // Precondition: count >= 8 794 // 795 // Postconditions: 796 // 797 // The least significant bit of count contains the remaining count 798 // of words to copy. The rest of count is trash. 799 // 800 // s and d are adjusted to point to the remaining words to copy 801 // 802 void generate_copy_longs(DecoratorSet decorators, BasicType type, Label &start, Register s, Register d, Register count, 803 copy_direction direction) { 804 int unit = wordSize * direction; 805 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; 806 807 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 808 t4 = r7, t5 = r11, t6 = r12, t7 = r13; 809 const Register stride = r14; 810 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 811 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 812 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 813 814 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7); 815 assert_different_registers(s, d, count, rscratch1, rscratch2); 816 817 Label again, drain; 818 const char *stub_name; 819 if (direction == copy_forwards) 820 stub_name = "forward_copy_longs"; 821 else 822 stub_name = "backward_copy_longs"; 823 824 __ align(CodeEntryAlignment); 825 826 StubCodeMark mark(this, "StubRoutines", stub_name); 827 828 __ bind(start); 829 830 Label unaligned_copy_long; 831 if (AvoidUnalignedAccesses) { 832 __ tbnz(d, 3, unaligned_copy_long); 833 } 834 835 if (direction == copy_forwards) { 836 __ sub(s, s, bias); 837 __ sub(d, d, bias); 838 } 839 840 #ifdef ASSERT 841 // Make sure we are never given < 8 words 842 { 843 Label L; 844 __ cmp(count, (u1)8); 845 __ br(Assembler::GE, L); 846 __ stop("genrate_copy_longs called with < 8 words"); 847 __ bind(L); 848 } 849 #endif 850 851 // Fill 8 registers 852 if (UseSIMDForMemoryOps) { 853 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 854 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * 
unit))); 855 } else { 856 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 857 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 858 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 859 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 860 } 861 862 __ subs(count, count, 16); 863 __ br(Assembler::LO, drain); 864 865 int prefetch = PrefetchCopyIntervalInBytes; 866 bool use_stride = false; 867 if (direction == copy_backwards) { 868 use_stride = prefetch > 256; 869 prefetch = -prefetch; 870 if (use_stride) __ mov(stride, prefetch); 871 } 872 873 __ bind(again); 874 875 if (PrefetchCopyIntervalInBytes > 0) 876 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 877 878 if (UseSIMDForMemoryOps) { 879 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 880 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit)); 881 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 882 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit))); 883 } else { 884 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 885 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 886 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 887 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 888 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 889 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 890 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 891 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 892 } 893 894 __ subs(count, count, 8); 895 __ br(Assembler::HS, again); 896 897 // Drain 898 __ bind(drain); 899 if (UseSIMDForMemoryOps) { 900 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1); 901 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3); 902 } else { 903 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 904 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3); 905 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5); 906 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7); 907 } 908 909 { 910 Label L1, L2; 911 __ tbz(count, 
exact_log2(4), L1); 912 if (UseSIMDForMemoryOps) { 913 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit))); 914 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1); 915 } else { 916 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 917 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 918 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1); 919 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3); 920 } 921 __ bind(L1); 922 923 if (direction == copy_forwards) { 924 __ add(s, s, bias); 925 __ add(d, d, bias); 926 } 927 928 __ tbz(count, 1, L2); 929 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 930 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1); 931 __ bind(L2); 932 } 933 934 __ ret(lr); 935 936 if (AvoidUnalignedAccesses) { 937 Label drain, again; 938 // Register order for storing. Order is different for backward copy. 939 940 __ bind(unaligned_copy_long); 941 942 // source address is even aligned, target odd aligned 943 // 944 // when forward copying word pairs we read long pairs at offsets 945 // {0, 2, 4, 6} (in long words). when backwards copying we read 946 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 947 // address by -2 in the forwards case so we can compute the 948 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 949 // or -1. 950 // 951 // when forward copying we need to store 1 word, 3 pairs and 952 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a 953 // zero offset We adjust the destination by -1 which means we 954 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 955 // 956 // When backwards copyng we need to store 1 word, 3 pairs and 957 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 958 // offsets {1, 3, 5, 7, 8} * unit. 
959 960 if (direction == copy_forwards) { 961 __ sub(s, s, 16); 962 __ sub(d, d, 8); 963 } 964 965 // Fill 8 registers 966 // 967 // for forwards copy s was offset by -16 from the original input 968 // value of s so the register contents are at these offsets 969 // relative to the 64 bit block addressed by that original input 970 // and so on for each successive 64 byte block when s is updated 971 // 972 // t0 at offset 0, t1 at offset 8 973 // t2 at offset 16, t3 at offset 24 974 // t4 at offset 32, t5 at offset 40 975 // t6 at offset 48, t7 at offset 56 976 977 // for backwards copy s was not offset so the register contents 978 // are at these offsets into the preceding 64 byte block 979 // relative to that original input and so on for each successive 980 // preceding 64 byte block when s is updated. this explains the 981 // slightly counter-intuitive looking pattern of register usage 982 // in the stp instructions for backwards copy. 983 // 984 // t0 at offset -16, t1 at offset -8 985 // t2 at offset -32, t3 at offset -24 986 // t4 at offset -48, t5 at offset -40 987 // t6 at offset -64, t7 at offset -56 988 989 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 990 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 991 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 992 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 993 994 __ subs(count, count, 16); 995 __ br(Assembler::LO, drain); 996 997 int prefetch = PrefetchCopyIntervalInBytes; 998 bool use_stride = false; 999 if (direction == copy_backwards) { 1000 use_stride = prefetch > 256; 1001 prefetch = -prefetch; 1002 if (use_stride) __ mov(stride, prefetch); 1003 } 1004 1005 __ bind(again); 1006 1007 if (PrefetchCopyIntervalInBytes > 0) 1008 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1009 1010 if (direction == copy_forwards) { 1011 // allowing for the offset of -8 the store instructions place 1012 // registers into the target 64 bit block at the following 1013 // offsets 1014 // 1015 // t0 at offset 0 1016 // t1 at offset 8, t2 at offset 16 1017 // t3 at offset 24, t4 at offset 32 1018 // t5 at offset 40, t6 at offset 48 1019 // t7 at offset 56 1020 1021 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1022 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1023 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1024 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1025 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1026 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1027 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1028 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1029 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1030 } else { 1031 // d was not offset when we started so the registers are 1032 // written into the 64 bit block preceding d with the following 1033 // offsets 1034 // 1035 // t1 at offset -8 1036 // t3 at offset -24, t0 at offset -16 1037 // t5 at offset -48, t2 at offset -32 1038 // t7 at offset -56, t4 at offset -48 1039 // t6 at offset -64 1040 // 1041 // note that this matches the offsets previously noted for the 1042 // loads 1043 1044 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1045 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1046 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1047 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1048 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit)); 1049 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1050 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit)); 1051 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1052 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit))); 1053 } 1054 1055 __ subs(count, count, 8); 1056 __ br(Assembler::HS, again); 1057 1058 // Drain 1059 // 1060 // this 
uses the same pattern of offsets and register arguments 1061 // as above 1062 __ bind(drain); 1063 if (direction == copy_forwards) { 1064 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1065 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1066 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4); 1067 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6); 1068 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7); 1069 } else { 1070 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1071 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1072 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2); 1073 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4); 1074 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6); 1075 } 1076 // now we need to copy any remaining part block which may 1077 // include a 4 word block subblock and/or a 2 word subblock. 1078 // bits 2 and 1 in the count are the tell-tale for whether we 1079 // have each such subblock 1080 { 1081 Label L1, L2; 1082 __ tbz(count, exact_log2(4), L1); 1083 // this is the same as above but copying only 4 longs hence 1084 // with only one intervening stp between the str instructions 1085 // but note that the offsets and registers still follow the 1086 // same pattern 1087 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit)); 1088 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit))); 1089 if (direction == copy_forwards) { 1090 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1091 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2); 1092 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3); 1093 } else { 1094 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1095 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0); 1096 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2); 1097 } 1098 __ bind(L1); 1099 1100 __ tbz(count, 1, L2); 1101 // this is the same as above but copying only 2 longs hence 1102 // there is no intervening stp between the str instructions 1103 // but note that the offset and register patterns are still 1104 // the same 1105 
bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit))); 1106 if (direction == copy_forwards) { 1107 bs.copy_store_at_8(Address(d, 1 * unit), t0); 1108 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1); 1109 } else { 1110 bs.copy_store_at_8(Address(d, 1 * unit), t1); 1111 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0); 1112 } 1113 __ bind(L2); 1114 1115 // for forwards copy we need to re-adjust the offsets we 1116 // applied so that s and d are follow the last words written 1117 1118 if (direction == copy_forwards) { 1119 __ add(s, s, 16); 1120 __ add(d, d, 8); 1121 } 1122 1123 } 1124 1125 __ ret(lr); 1126 } 1127 } 1128 1129 // Small copy: less than 16 bytes. 1130 // 1131 // NB: Ignores all of the bits of count which represent more than 15 1132 // bytes, so a caller doesn't have to mask them. 1133 1134 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) { 1135 bool is_backwards = step < 0; 1136 size_t granularity = uabs(step); 1137 int direction = is_backwards ? -1 : 1; 1138 1139 Label Lword, Lint, Lshort, Lbyte; 1140 1141 assert(granularity 1142 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1143 1144 const Register t0 = r3; 1145 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1146 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg); 1147 1148 // ??? I don't know if this bit-test-and-branch is the right thing 1149 // to do. It does a lot of jumping, resulting in several 1150 // mispredicted branches. It might make more sense to do this 1151 // with something like Duff's device with a single computed branch. 
1152 1153 __ tbz(count, 3 - exact_log2(granularity), Lword); 1154 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1155 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1156 __ bind(Lword); 1157 1158 if (granularity <= sizeof (jint)) { 1159 __ tbz(count, 2 - exact_log2(granularity), Lint); 1160 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1161 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1162 __ bind(Lint); 1163 } 1164 1165 if (granularity <= sizeof (jshort)) { 1166 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1167 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1168 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1169 __ bind(Lshort); 1170 } 1171 1172 if (granularity <= sizeof (jbyte)) { 1173 __ tbz(count, 0, Lbyte); 1174 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1175 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1176 __ bind(Lbyte); 1177 } 1178 } 1179 1180 Label copy_f, copy_b; 1181 Label copy_obj_f, copy_obj_b; 1182 Label copy_obj_uninit_f, copy_obj_uninit_b; 1183 1184 // All-singing all-dancing memory copy. 1185 // 1186 // Copy count units of memory from s to d. The size of a unit is 1187 // step, which can be positive or negative depending on the direction 1188 // of copy. If is_aligned is false, we align the source address. 1189 // 1190 1191 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned, 1192 Register s, Register d, Register count, int step) { 1193 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1194 bool is_backwards = step < 0; 1195 unsigned int granularity = uabs(step); 1196 const Register t0 = r3, t1 = r4; 1197 1198 // <= 80 (or 96 for SIMD) bytes do inline. 
Direction doesn't matter because we always 1199 // load all the data before writing anything 1200 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1201 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11; 1202 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15; 1203 const Register send = r17, dend = r16; 1204 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10; 1205 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved 1206 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3); 1207 1208 if (PrefetchCopyIntervalInBytes > 0) 1209 __ prfm(Address(s, 0), PLDL1KEEP); 1210 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity)); 1211 __ br(Assembler::HI, copy_big); 1212 1213 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); 1214 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); 1215 1216 __ cmp(count, u1(16/granularity)); 1217 __ br(Assembler::LS, copy16); 1218 1219 __ cmp(count, u1(64/granularity)); 1220 __ br(Assembler::HI, copy80); 1221 1222 __ cmp(count, u1(32/granularity)); 1223 __ br(Assembler::LS, copy32); 1224 1225 // 33..64 bytes 1226 if (UseSIMDForMemoryOps) { 1227 bs.copy_load_at_32(v0, v1, Address(s, 0)); 1228 bs.copy_load_at_32(v2, v3, Address(send, -32)); 1229 bs.copy_store_at_32(Address(d, 0), v0, v1); 1230 bs.copy_store_at_32(Address(dend, -32), v2, v3); 1231 } else { 1232 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1233 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1234 bs.copy_load_at_16(t4, t5, Address(send, -32)); 1235 bs.copy_load_at_16(t6, t7, Address(send, -16)); 1236 1237 bs.copy_store_at_16(Address(d, 0), t0, t1); 1238 bs.copy_store_at_16(Address(d, 16), t2, t3); 1239 bs.copy_store_at_16(Address(dend, -32), t4, t5); 1240 bs.copy_store_at_16(Address(dend, -16), t6, t7); 1241 } 1242 __ b(finish); 1243 1244 // 17..32 bytes 1245 __ bind(copy32); 1246 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1247 
bs.copy_load_at_16(t6, t7, Address(send, -16)); 1248 1249 bs.copy_store_at_16(Address(d, 0), t0, t1); 1250 bs.copy_store_at_16(Address(dend, -16), t6, t7); 1251 __ b(finish); 1252 1253 // 65..80/96 bytes 1254 // (96 bytes if SIMD because we do 32 byes per instruction) 1255 __ bind(copy80); 1256 if (UseSIMDForMemoryOps) { 1257 bs.copy_load_at_32(v0, v1, Address(s, 0)); 1258 bs.copy_load_at_32(v2, v3, Address(s, 32)); 1259 // Unaligned pointers can be an issue for copying. 1260 // The issue has more chances to happen when granularity of data is 1261 // less than 4(sizeof(jint)). Pointers for arrays of jint are at least 1262 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned. 1263 // The most performance drop has been seen for the range 65-80 bytes. 1264 // For such cases using the pair of ldp/stp instead of the third pair of 1265 // ldpq/stpq fixes the performance issue. 1266 if (granularity < sizeof (jint)) { 1267 Label copy96; 1268 __ cmp(count, u1(80/granularity)); 1269 __ br(Assembler::HI, copy96); 1270 bs.copy_load_at_16(t0, t1, Address(send, -16)); 1271 1272 bs.copy_store_at_32(Address(d, 0), v0, v1); 1273 bs.copy_store_at_32(Address(d, 32), v2, v3); 1274 1275 bs.copy_store_at_16(Address(dend, -16), t0, t1); 1276 __ b(finish); 1277 1278 __ bind(copy96); 1279 } 1280 bs.copy_load_at_32(v4, v5, Address(send, -32)); 1281 1282 bs.copy_store_at_32(Address(d, 0), v0, v1); 1283 bs.copy_store_at_32(Address(d, 32), v2, v3); 1284 1285 bs.copy_store_at_32(Address(dend, -32), v4, v5); 1286 } else { 1287 bs.copy_load_at_16(t0, t1, Address(s, 0)); 1288 bs.copy_load_at_16(t2, t3, Address(s, 16)); 1289 bs.copy_load_at_16(t4, t5, Address(s, 32)); 1290 bs.copy_load_at_16(t6, t7, Address(s, 48)); 1291 bs.copy_load_at_16(t8, t9, Address(send, -16)); 1292 1293 bs.copy_store_at_16(Address(d, 0), t0, t1); 1294 bs.copy_store_at_16(Address(d, 16), t2, t3); 1295 bs.copy_store_at_16(Address(d, 32), t4, t5); 1296 bs.copy_store_at_16(Address(d, 48), t6, t7); 1297 
bs.copy_store_at_16(Address(dend, -16), t8, t9); 1298 } 1299 __ b(finish); 1300 1301 // 0..16 bytes 1302 __ bind(copy16); 1303 __ cmp(count, u1(8/granularity)); 1304 __ br(Assembler::LO, copy8); 1305 1306 // 8..16 bytes 1307 bs.copy_load_at_8(t0, Address(s, 0)); 1308 bs.copy_load_at_8(t1, Address(send, -8)); 1309 bs.copy_store_at_8(Address(d, 0), t0); 1310 bs.copy_store_at_8(Address(dend, -8), t1); 1311 __ b(finish); 1312 1313 if (granularity < 8) { 1314 // 4..7 bytes 1315 __ bind(copy8); 1316 __ tbz(count, 2 - exact_log2(granularity), copy4); 1317 __ ldrw(t0, Address(s, 0)); 1318 __ ldrw(t1, Address(send, -4)); 1319 __ strw(t0, Address(d, 0)); 1320 __ strw(t1, Address(dend, -4)); 1321 __ b(finish); 1322 if (granularity < 4) { 1323 // 0..3 bytes 1324 __ bind(copy4); 1325 __ cbz(count, finish); // get rid of 0 case 1326 if (granularity == 2) { 1327 __ ldrh(t0, Address(s, 0)); 1328 __ strh(t0, Address(d, 0)); 1329 } else { // granularity == 1 1330 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1331 // the first and last byte. 1332 // Handle the 3 byte case by loading and storing base + count/2 1333 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1334 // This does means in the 1 byte case we load/store the same 1335 // byte 3 times. 1336 __ lsr(count, count, 1); 1337 __ ldrb(t0, Address(s, 0)); 1338 __ ldrb(t1, Address(send, -1)); 1339 __ ldrb(t2, Address(s, count)); 1340 __ strb(t0, Address(d, 0)); 1341 __ strb(t1, Address(dend, -1)); 1342 __ strb(t2, Address(d, count)); 1343 } 1344 __ b(finish); 1345 } 1346 } 1347 1348 __ bind(copy_big); 1349 if (is_backwards) { 1350 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1351 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1352 } 1353 1354 // Now we've got the small case out of the way we can align the 1355 // source address on a 2-word boundary. 
1356 1357 // Here we will materialize a count in r15, which is used by copy_memory_small 1358 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes. 1359 // Up until here, we have used t9, which aliases r15, but from here on, that register 1360 // can not be used as a temp register, as it contains the count. 1361 1362 Label aligned; 1363 1364 if (is_aligned) { 1365 // We may have to adjust by 1 word to get s 2-word-aligned. 1366 __ tbz(s, exact_log2(wordSize), aligned); 1367 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards))); 1368 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0); 1369 __ sub(count, count, wordSize/granularity); 1370 } else { 1371 if (is_backwards) { 1372 __ andr(r15, s, 2 * wordSize - 1); 1373 } else { 1374 __ neg(r15, s); 1375 __ andr(r15, r15, 2 * wordSize - 1); 1376 } 1377 // r15 is the byte adjustment needed to align s. 1378 __ cbz(r15, aligned); 1379 int shift = exact_log2(granularity); 1380 if (shift > 0) { 1381 __ lsr(r15, r15, shift); 1382 } 1383 __ sub(count, count, r15); 1384 1385 #if 0 1386 // ?? This code is only correct for a disjoint copy. It may or 1387 // may not make sense to use it in that case. 1388 1389 // Copy the first pair; s and d may not be aligned. 1390 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1391 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1392 1393 // Align s and d, adjust count 1394 if (is_backwards) { 1395 __ sub(s, s, r15); 1396 __ sub(d, d, r15); 1397 } else { 1398 __ add(s, s, r15); 1399 __ add(d, d, r15); 1400 } 1401 #else 1402 copy_memory_small(decorators, type, s, d, r15, step); 1403 #endif 1404 } 1405 1406 __ bind(aligned); 1407 1408 // s is now 2-word-aligned. 1409 1410 // We have a count of units and some trailing bytes. Adjust the 1411 // count and do a bulk copy of words. If the shift is zero 1412 // perform a move instead to benefit from zero latency moves. 
1413 int shift = exact_log2(wordSize/granularity); 1414 if (shift > 0) { 1415 __ lsr(r15, count, shift); 1416 } else { 1417 __ mov(r15, count); 1418 } 1419 if (direction == copy_forwards) { 1420 if (type != T_OBJECT) { 1421 __ bl(copy_f); 1422 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1423 __ bl(copy_obj_uninit_f); 1424 } else { 1425 __ bl(copy_obj_f); 1426 } 1427 } else { 1428 if (type != T_OBJECT) { 1429 __ bl(copy_b); 1430 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) { 1431 __ bl(copy_obj_uninit_b); 1432 } else { 1433 __ bl(copy_obj_b); 1434 } 1435 } 1436 1437 // And the tail. 1438 copy_memory_small(decorators, type, s, d, count, step); 1439 1440 if (granularity >= 8) __ bind(copy8); 1441 if (granularity >= 4) __ bind(copy4); 1442 __ bind(finish); 1443 } 1444 1445 1446 void clobber_registers() { 1447 #ifdef ASSERT 1448 RegSet clobbered 1449 = MacroAssembler::call_clobbered_gp_registers() - rscratch1; 1450 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1451 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1452 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) { 1453 __ mov(*it, rscratch1); 1454 } 1455 #endif 1456 1457 } 1458 1459 // Scan over array at a for count oops, verifying each one. 1460 // Preserves a and count, clobbers rscratch1 and rscratch2. 
1461 void verify_oop_array (int size, Register a, Register count, Register temp) { 1462 Label loop, end; 1463 __ mov(rscratch1, a); 1464 __ mov(rscratch2, zr); 1465 __ bind(loop); 1466 __ cmp(rscratch2, count); 1467 __ br(Assembler::HS, end); 1468 if (size == wordSize) { 1469 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1470 __ verify_oop(temp); 1471 } else { 1472 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1473 __ decode_heap_oop(temp); // calls verify_oop 1474 } 1475 __ add(rscratch2, rscratch2, 1); 1476 __ b(loop); 1477 __ bind(end); 1478 } 1479 1480 // Arguments: 1481 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1482 // ignored 1483 // is_oop - true => oop array, so generate store check code 1484 // name - stub name string 1485 // 1486 // Inputs: 1487 // c_rarg0 - source array address 1488 // c_rarg1 - destination array address 1489 // c_rarg2 - element count, treated as ssize_t, can be zero 1490 // 1491 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1492 // the hardware handle it. The two dwords within qwords that span 1493 // cache line boundaries will still be loaded and stored atomically. 1494 // 1495 // Side Effects: 1496 // disjoint_int_copy_entry is set to the no-overlap entry point 1497 // used by generate_conjoint_int_oop_copy(). 
1498 // 1499 address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, 1500 const char *name, bool dest_uninitialized = false) { 1501 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1502 RegSet saved_reg = RegSet::of(s, d, count); 1503 __ align(CodeEntryAlignment); 1504 StubCodeMark mark(this, "StubRoutines", name); 1505 address start = __ pc(); 1506 __ enter(); 1507 1508 if (entry != nullptr) { 1509 *entry = __ pc(); 1510 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1511 BLOCK_COMMENT("Entry:"); 1512 } 1513 1514 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1515 if (dest_uninitialized) { 1516 decorators |= IS_DEST_UNINITIALIZED; 1517 } 1518 if (aligned) { 1519 decorators |= ARRAYCOPY_ALIGNED; 1520 } 1521 1522 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1523 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1524 1525 if (is_oop) { 1526 // save regs before copy_memory 1527 __ push(RegSet::of(d, count), sp); 1528 } 1529 { 1530 // UnsafeMemoryAccess page error: continue after unsafe access 1531 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1532 UnsafeMemoryAccessMark umam(this, add_entry, true); 1533 copy_memory(decorators, is_oop ? 
T_OBJECT : T_BYTE, aligned, s, d, count, size); 1534 } 1535 1536 if (is_oop) { 1537 __ pop(RegSet::of(d, count), sp); 1538 if (VerifyOops) 1539 verify_oop_array(size, d, count, r16); 1540 } 1541 1542 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1543 1544 __ leave(); 1545 __ mov(r0, zr); // return 0 1546 __ ret(lr); 1547 return start; 1548 } 1549 1550 // Arguments: 1551 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1552 // ignored 1553 // is_oop - true => oop array, so generate store check code 1554 // name - stub name string 1555 // 1556 // Inputs: 1557 // c_rarg0 - source array address 1558 // c_rarg1 - destination array address 1559 // c_rarg2 - element count, treated as ssize_t, can be zero 1560 // 1561 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1562 // the hardware handle it. The two dwords within qwords that span 1563 // cache line boundaries will still be loaded and stored atomically. 1564 // 1565 address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target, 1566 address *entry, const char *name, 1567 bool dest_uninitialized = false) { 1568 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1569 RegSet saved_regs = RegSet::of(s, d, count); 1570 StubCodeMark mark(this, "StubRoutines", name); 1571 address start = __ pc(); 1572 __ enter(); 1573 1574 if (entry != nullptr) { 1575 *entry = __ pc(); 1576 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1577 BLOCK_COMMENT("Entry:"); 1578 } 1579 1580 // use fwd copy when (d-s) above_equal (count*size) 1581 __ sub(rscratch1, d, s); 1582 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1583 __ br(Assembler::HS, nooverlap_target); 1584 1585 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1586 if (dest_uninitialized) { 1587 decorators |= IS_DEST_UNINITIALIZED; 1588 } 1589 if (aligned) { 1590 decorators |= ARRAYCOPY_ALIGNED; 1591 } 1592 1593 BarrierSetAssembler *bs = 
BarrierSet::barrier_set()->barrier_set_assembler(); 1594 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1595 1596 if (is_oop) { 1597 // save regs before copy_memory 1598 __ push(RegSet::of(d, count), sp); 1599 } 1600 { 1601 // UnsafeMemoryAccess page error: continue after unsafe access 1602 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1603 UnsafeMemoryAccessMark umam(this, add_entry, true); 1604 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size); 1605 } 1606 if (is_oop) { 1607 __ pop(RegSet::of(d, count), sp); 1608 if (VerifyOops) 1609 verify_oop_array(size, d, count, r16); 1610 } 1611 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1612 __ leave(); 1613 __ mov(r0, zr); // return 0 1614 __ ret(lr); 1615 return start; 1616 } 1617 1618 // Arguments: 1619 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1620 // ignored 1621 // name - stub name string 1622 // 1623 // Inputs: 1624 // c_rarg0 - source array address 1625 // c_rarg1 - destination array address 1626 // c_rarg2 - element count, treated as ssize_t, can be zero 1627 // 1628 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1629 // we let the hardware handle it. The one to eight bytes within words, 1630 // dwords or qwords that span cache line boundaries will still be loaded 1631 // and stored atomically. 1632 // 1633 // Side Effects: 1634 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1635 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1636 // we let the hardware handle it. The one to eight bytes within words, 1637 // dwords or qwords that span cache line boundaries will still be loaded 1638 // and stored atomically. 1639 // 1640 // Side Effects: 1641 // disjoint_byte_copy_entry is set to the no-overlap entry point 1642 // used by generate_conjoint_byte_copy(). 
1643 // 1644 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1645 const bool not_oop = false; 1646 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1647 } 1648 1649 // Arguments: 1650 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1651 // ignored 1652 // name - stub name string 1653 // 1654 // Inputs: 1655 // c_rarg0 - source array address 1656 // c_rarg1 - destination array address 1657 // c_rarg2 - element count, treated as ssize_t, can be zero 1658 // 1659 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1660 // we let the hardware handle it. The one to eight bytes within words, 1661 // dwords or qwords that span cache line boundaries will still be loaded 1662 // and stored atomically. 1663 // 1664 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1665 address* entry, const char *name) { 1666 const bool not_oop = false; 1667 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1668 } 1669 1670 // Arguments: 1671 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1672 // ignored 1673 // name - stub name string 1674 // 1675 // Inputs: 1676 // c_rarg0 - source array address 1677 // c_rarg1 - destination array address 1678 // c_rarg2 - element count, treated as ssize_t, can be zero 1679 // 1680 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1681 // let the hardware handle it. The two or four words within dwords 1682 // or qwords that span cache line boundaries will still be loaded 1683 // and stored atomically. 1684 // 1685 // Side Effects: 1686 // disjoint_short_copy_entry is set to the no-overlap entry point 1687 // used by generate_conjoint_short_copy(). 
1688 // 1689 address generate_disjoint_short_copy(bool aligned, 1690 address* entry, const char *name) { 1691 const bool not_oop = false; 1692 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1693 } 1694 1695 // Arguments: 1696 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1697 // ignored 1698 // name - stub name string 1699 // 1700 // Inputs: 1701 // c_rarg0 - source array address 1702 // c_rarg1 - destination array address 1703 // c_rarg2 - element count, treated as ssize_t, can be zero 1704 // 1705 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1706 // let the hardware handle it. The two or four words within dwords 1707 // or qwords that span cache line boundaries will still be loaded 1708 // and stored atomically. 1709 // 1710 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1711 address *entry, const char *name) { 1712 const bool not_oop = false; 1713 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1714 1715 } 1716 // Arguments: 1717 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1718 // ignored 1719 // name - stub name string 1720 // 1721 // Inputs: 1722 // c_rarg0 - source array address 1723 // c_rarg1 - destination array address 1724 // c_rarg2 - element count, treated as ssize_t, can be zero 1725 // 1726 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1727 // the hardware handle it. The two dwords within qwords that span 1728 // cache line boundaries will still be loaded and stored atomically. 1729 // 1730 // Side Effects: 1731 // disjoint_int_copy_entry is set to the no-overlap entry point 1732 // used by generate_conjoint_int_oop_copy(). 
1733 // 1734 address generate_disjoint_int_copy(bool aligned, address *entry, 1735 const char *name, bool dest_uninitialized = false) { 1736 const bool not_oop = false; 1737 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1738 } 1739 1740 // Arguments: 1741 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1742 // ignored 1743 // name - stub name string 1744 // 1745 // Inputs: 1746 // c_rarg0 - source array address 1747 // c_rarg1 - destination array address 1748 // c_rarg2 - element count, treated as ssize_t, can be zero 1749 // 1750 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1751 // the hardware handle it. The two dwords within qwords that span 1752 // cache line boundaries will still be loaded and stored atomically. 1753 // 1754 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1755 address *entry, const char *name, 1756 bool dest_uninitialized = false) { 1757 const bool not_oop = false; 1758 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1759 } 1760 1761 1762 // Arguments: 1763 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1764 // ignored 1765 // name - stub name string 1766 // 1767 // Inputs: 1768 // c_rarg0 - source array address 1769 // c_rarg1 - destination array address 1770 // c_rarg2 - element count, treated as size_t, can be zero 1771 // 1772 // Side Effects: 1773 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1774 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 
1775 // 1776 address generate_disjoint_long_copy(bool aligned, address *entry, 1777 const char *name, bool dest_uninitialized = false) { 1778 const bool not_oop = false; 1779 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1780 } 1781 1782 // Arguments: 1783 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1784 // ignored 1785 // name - stub name string 1786 // 1787 // Inputs: 1788 // c_rarg0 - source array address 1789 // c_rarg1 - destination array address 1790 // c_rarg2 - element count, treated as size_t, can be zero 1791 // 1792 address generate_conjoint_long_copy(bool aligned, 1793 address nooverlap_target, address *entry, 1794 const char *name, bool dest_uninitialized = false) { 1795 const bool not_oop = false; 1796 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1797 } 1798 1799 // Arguments: 1800 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1801 // ignored 1802 // name - stub name string 1803 // 1804 // Inputs: 1805 // c_rarg0 - source array address 1806 // c_rarg1 - destination array address 1807 // c_rarg2 - element count, treated as size_t, can be zero 1808 // 1809 // Side Effects: 1810 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1811 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1812 // 1813 address generate_disjoint_oop_copy(bool aligned, address *entry, 1814 const char *name, bool dest_uninitialized) { 1815 const bool is_oop = true; 1816 const int size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Thin wrapper around generate_conjoint_copy() for oop arrays: it fixes
  // is_oop = true and picks the element size from UseCompressedOops
  // (sizeof (jint) when set, sizeof (jlong) otherwise).
  //
  // Arguments:
  //   aligned            - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //                        ignored
  //   nooverlap_target   - forwarded unchanged to generate_conjoint_copy()
  //   entry              - forwarded unchanged to generate_conjoint_copy()
  //   name               - stub name string
  //   dest_uninitialized - forwarded unchanged to generate_conjoint_copy()
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1, rscratch2.
  //
  // Emits the fast-path subtype check followed by the slow-path check.
  // Generated code branches to L_success when the check succeeds; on
  // failure it falls through to whatever the caller emits next.
  //
  //   sub_klass          - klass being tested
  //   super_check_offset - passed to the fast path as the super check offset
  //   super_klass        - klass tested against
  //   temp1, temp2       - temporaries handed to the slow path
  //   result             - not referenced by this helper
  //   L_success          - branch target on success
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Register temp1,
                           Register temp2,
                           Register result,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, nullptr,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Copies oops one at a time from low to high addresses, type-checking
  //  every non-null element against ckval before it is stored.
  //
  //  Arguments:
  //    name               - stub name string
  //    entry              - if non-null, receives a secondary entry point
  //                         located after the frame setup emitted by enter()
  //    dest_uninitialized - adds IS_DEST_UNINITIALIZED to the GC decorators
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // element count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
    const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // original element count
    const Register start_to    = r20;       // destination array start address
    const Register r19_klass   = r19;       // oop._klass

    // Registers used as gc temps (r5, r6, r7 are save-on-call)
    const Register gct1 = r5, gct2 = r6, gct3 = r7;

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      __ b(L);                  // conjoint check not yet implemented
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    // Note that this point lies after the enter() above.
    if (entry != nullptr) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

     // Empty array:  Nothing to do.
     // (count is zero here, so L_done returns 0 without touching r19-r22.)
    __ cbz(count, L_done);
    __ push(RegSet::of(r19, r20, r21, r22), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      // start_to is only borrowed as a scratch register here; it is
      // loaded with its real value further down.
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    bool is_oop = true;
    int element_size = UseCompressedOops ? 4 : 8;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
                      __ post(to, element_size), copied_oop, noreg,
                      gct1, gct2, gct3);
    __ sub(count, count, 1);
    // count == 0 here means every element was copied; r0 will be 0.
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
                     copied_oop, noreg, __ post(from, element_size),
                     gct1);
    // A null element is stored without a type check.
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass

    BLOCK_COMMENT("type_check:");
    generate_type_check(/*sub_klass*/r19_klass,
                        /*super_check_offset*/ckoff,
                        /*super_klass*/ckval,
                        /*temp1*/gct1,
                        /*temp2*/gct2,
                        /*result*/r10, L_store_element);

    // Fall through on failure!

    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    // Flags still come from the subs above: EQ means K == 0, i.e. nothing
    // was stored, so the epilogue is skipped.
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r19, r20, r21, r22), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp and rscratch1 (used to hold the array lengths).
  // Also, clean the sign bits of src_pos and dst_pos.
  //
  // Generated code branches to L_failed when src_pos + length or
  // dst_pos + length, computed as 32-bit values and compared unsigned,
  // is higher than the corresponding array length.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
// I'll write them properly when they're called from
  // something that's actually doing something.
  // Placeholder: the body only asserts that count is zero.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Arguments:
  //    name             - stub name string
  //    byte_copy_entry  - branch target when no wider alignment applies
  //    short_copy_entry - branch target for 2-byte aligned operands
  //    int_copy_entry   - branch target for 4-byte aligned operands
  //    long_copy_entry  - branch target for 8-byte aligned operands
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    Label L_long_aligned, L_int_aligned, L_short_aligned;
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    // OR the two addresses and the byte count together so that a single
    // mask test covers the alignment of all three.
    __ orr(rscratch1, s, d);
    __ orr(rscratch1, rscratch1, count);

    __ andr(rscratch1, rscratch1, BytesPerLong-1);
    __ cbz(rscratch1, L_long_aligned);
    __ andr(rscratch1, rscratch1, BytesPerInt-1);
    __ cbz(rscratch1, L_int_aligned);
    __ tbz(rscratch1, 0, L_short_aligned);
    __ b(RuntimeAddress(byte_copy_entry));

    // Each aligned case scales the byte count down to an element count
    // before branching to the matching copy entry.
    __ BIND(L_short_aligned);
    __ lsr(count, count, LogBytesPerShort);  // size => short_count
    __ b(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ lsr(count, count, LogBytesPerInt);    // size => int_count
    __ b(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ lsr(count, count, LogBytesPerLong);   // size => long_count
    __ b(RuntimeAddress(long_copy_entry));

    return start;
  }

  //
  //  Generate generic array copy stubs
  //
  //  Validates the arguments, then branches to one of the copy entries
  //  passed in.  This stub itself only returns on the failure path, where
  //  it sets r0 to -1; every other exit is a branch to another entry.
  //
  //  Arguments:
  //    name                 - stub name string
  //    byte_copy_entry      - target for typeArrays with log2 element size 0
  //    short_copy_entry     - target for typeArrays with log2 element size 1
  //    int_copy_entry       - target for typeArrays with log2 element size 2
  //    long_copy_entry      - target for typeArrays with log2 element size 3
  //    oop_copy_entry       - target for objArrays needing no per-element check
  //    checkcast_copy_entry - target for objArrays needing per-element checks
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  //    c_rarg4    -  element count (32-bits)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src        = c_rarg0;  // source array oop
    const Register src_pos    = c_rarg1;  // source position
    const Register dst        = c_rarg2;  // destination array oop
    const Register dst_pos    = c_rarg3;  // destination position
    const Register length     = c_rarg4;  // element count


    // Registers used as temps
    const Register dst_klass  = c_rarg5;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", name);

    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not null.
    //     (objArrays of different klasses take the checkcast path below
    //     instead of failing.)
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    //  if (src == nullptr) return -1;
    __ cbz(src, L_failed);

    //  if (src_pos < 0) return -1;
    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set

    //  if (dst == nullptr) return -1;
    __ cbz(dst, L_failed);

    //  if (dst_pos < 0) return -1;
    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set

    // registers used as temp
    const Register scratch_length    = r16; // elements count to copy
    const Register scratch_src_klass = r17; // array klass
    const Register lh                = r15; // layout helper

    //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    //  assert(src->klass() != nullptr);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is null
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rscratch1, dst);
      __ cbz(rscratch1, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
    __ movw(rscratch1, objArray_lh);
    __ eorw(rscratch2, lh, rscratch1);
    __ cbzw(rscratch2, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ load_klass(rscratch2, dst);
    __ eor(rscratch2, rscratch2, scratch_src_klass);
    __ cbnz(rscratch2, L_failed);

    //  if (!src->is_Array()) return -1;
    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ cmpw(lh, rscratch2);
      __ br(Assembler::GE, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register rscratch1_offset = rscratch1;    // array offset
    const Register r15_elsize = lh; // element size

    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
           exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
    __ add(src, src, rscratch1_offset);           // src array offset
    __ add(dst, dst, rscratch1_offset);           // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");

    // next registers should be set before the jump to corresponding stub
    const Register from     = c_rarg0;  // source array address
    const Register to       = c_rarg1;  // destination array address
    const Register count    = c_rarg2;  // elements count

    // 'from', 'to', 'count' registers should be set in such order
    // since they are the same as 'src', 'src_pos', 'dst'.

    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

    // The possible values of elsize are 0-3, i.e. exact_log2(element
    // size in bytes).  We do a simple bitwise binary search.
    // Bit 1 separates {byte, short} from {int, long}; bit 0 then picks
    // within the pair.
  __ BIND(L_copy_bytes);
    __ tbnz(r15_elsize, 1, L_copy_ints);
    __ tbnz(r15_elsize, 0, L_copy_shorts);
    __ lea(from, Address(src, src_pos));// src_addr
    __ lea(to,   Address(dst, dst_pos));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
    __ tbnz(r15_elsize, 0, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
      __ cmpw(r15_elsize, LogBytesPerLong);
      __ br(Assembler::EQ, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    //  test array classes for subtyping
    __ load_klass(r15, dst);
    __ cmp(scratch_src_klass, r15); // usual case is exact equality
    __ br(Assembler::NE, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ movw(count, scratch_length); // length
    // L_plain_copy is also the success target of the type check in the
    // checkcast path below, which has set up from/to/count by then.
  __ BIND(L_plain_copy);
    __ b(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
    // live at this point:  scratch_src_klass, scratch_length, r15 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ ldrw(rscratch1, Address(r15, lh_offset));
      __ movw(rscratch2, objArray_lh);
      __ eorw(rscratch1, rscratch1, rscratch2);
      __ cbnzw(rscratch1, L_failed);

      // It is safe to examine both src.length and dst.length.
      // r15 is handed over as the temp here, so the dst klass it held is
      // lost and has to be reloaded afterwards.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             r15, L_failed);

      __ load_klass(dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(dst_klass, sco_offset));

      // Smashes rscratch1, rscratch2
      generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
                          L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ldr(dst_klass, Address(dst_klass, ek_offset));
      __ ldrw(sco_temp, Address(dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
      __ b(RuntimeAddress(checkcast_copy_entry));
    }

  __ BIND(L_failed);
    __ mov(r0, -1);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  //
  // Generate stub for array fill. If "aligned" is true, the
  // "to" address is assumed to be heapword aligned.
2405 // 2406 // Arguments for generated stub: 2407 // to: c_rarg0 2408 // value: c_rarg1 2409 // count: c_rarg2 treated as signed 2410 // 2411 address generate_fill(BasicType t, bool aligned, const char *name) { 2412 __ align(CodeEntryAlignment); 2413 StubCodeMark mark(this, "StubRoutines", name); 2414 address start = __ pc(); 2415 2416 BLOCK_COMMENT("Entry:"); 2417 2418 const Register to = c_rarg0; // source array address 2419 const Register value = c_rarg1; // value 2420 const Register count = c_rarg2; // elements count 2421 2422 const Register bz_base = r10; // base for block_zero routine 2423 const Register cnt_words = r11; // temp register 2424 2425 __ enter(); 2426 2427 Label L_fill_elements, L_exit1; 2428 2429 int shift = -1; 2430 switch (t) { 2431 case T_BYTE: 2432 shift = 0; 2433 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2434 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2435 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2436 __ br(Assembler::LO, L_fill_elements); 2437 break; 2438 case T_SHORT: 2439 shift = 1; 2440 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2441 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2442 __ br(Assembler::LO, L_fill_elements); 2443 break; 2444 case T_INT: 2445 shift = 2; 2446 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2447 __ br(Assembler::LO, L_fill_elements); 2448 break; 2449 default: ShouldNotReachHere(); 2450 } 2451 2452 // Align source address at 8 bytes address boundary. 2453 Label L_skip_align1, L_skip_align2, L_skip_align4; 2454 if (!aligned) { 2455 switch (t) { 2456 case T_BYTE: 2457 // One byte misalignment happens only for byte arrays. 2458 __ tbz(to, 0, L_skip_align1); 2459 __ strb(value, Address(__ post(to, 1))); 2460 __ subw(count, count, 1); 2461 __ bind(L_skip_align1); 2462 // Fallthrough 2463 case T_SHORT: 2464 // Two bytes misalignment happens only for byte and short (char) arrays. 
2465 __ tbz(to, 1, L_skip_align2); 2466 __ strh(value, Address(__ post(to, 2))); 2467 __ subw(count, count, 2 >> shift); 2468 __ bind(L_skip_align2); 2469 // Fallthrough 2470 case T_INT: 2471 // Align to 8 bytes, we know we are 4 byte aligned to start. 2472 __ tbz(to, 2, L_skip_align4); 2473 __ strw(value, Address(__ post(to, 4))); 2474 __ subw(count, count, 4 >> shift); 2475 __ bind(L_skip_align4); 2476 break; 2477 default: ShouldNotReachHere(); 2478 } 2479 } 2480 2481 // 2482 // Fill large chunks 2483 // 2484 __ lsrw(cnt_words, count, 3 - shift); // number of words 2485 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2486 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2487 if (UseBlockZeroing) { 2488 Label non_block_zeroing, rest; 2489 // If the fill value is zero we can use the fast zero_words(). 2490 __ cbnz(value, non_block_zeroing); 2491 __ mov(bz_base, to); 2492 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2493 address tpc = __ zero_words(bz_base, cnt_words); 2494 if (tpc == nullptr) { 2495 fatal("CodeCache is full at generate_fill"); 2496 } 2497 __ b(rest); 2498 __ bind(non_block_zeroing); 2499 __ fill_words(to, cnt_words, value); 2500 __ bind(rest); 2501 } else { 2502 __ fill_words(to, cnt_words, value); 2503 } 2504 2505 // Remaining count is less than 8 bytes. Fill it by a single store. 2506 // Note that the total length is no less than 8 bytes. 2507 if (t == T_BYTE || t == T_SHORT) { 2508 Label L_exit1; 2509 __ cbzw(count, L_exit1); 2510 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2511 __ str(value, Address(to, -8)); // overwrite some elements 2512 __ bind(L_exit1); 2513 __ leave(); 2514 __ ret(lr); 2515 } 2516 2517 // Handle copies less than 8 bytes. 
2518 Label L_fill_2, L_fill_4, L_exit2; 2519 __ bind(L_fill_elements); 2520 switch (t) { 2521 case T_BYTE: 2522 __ tbz(count, 0, L_fill_2); 2523 __ strb(value, Address(__ post(to, 1))); 2524 __ bind(L_fill_2); 2525 __ tbz(count, 1, L_fill_4); 2526 __ strh(value, Address(__ post(to, 2))); 2527 __ bind(L_fill_4); 2528 __ tbz(count, 2, L_exit2); 2529 __ strw(value, Address(to)); 2530 break; 2531 case T_SHORT: 2532 __ tbz(count, 0, L_fill_4); 2533 __ strh(value, Address(__ post(to, 2))); 2534 __ bind(L_fill_4); 2535 __ tbz(count, 1, L_exit2); 2536 __ strw(value, Address(to)); 2537 break; 2538 case T_INT: 2539 __ cbzw(count, L_exit2); 2540 __ strw(value, Address(to)); 2541 break; 2542 default: ShouldNotReachHere(); 2543 } 2544 __ bind(L_exit2); 2545 __ leave(); 2546 __ ret(lr); 2547 return start; 2548 } 2549 2550 address generate_data_cache_writeback() { 2551 const Register line = c_rarg0; // address of line to write back 2552 2553 __ align(CodeEntryAlignment); 2554 2555 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2556 2557 address start = __ pc(); 2558 __ enter(); 2559 __ cache_wb(Address(line, 0)); 2560 __ leave(); 2561 __ ret(lr); 2562 2563 return start; 2564 } 2565 2566 address generate_data_cache_writeback_sync() { 2567 const Register is_pre = c_rarg0; // pre or post sync 2568 2569 __ align(CodeEntryAlignment); 2570 2571 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2572 2573 // pre wbsync is a no-op 2574 // post wbsync translates to an sfence 2575 2576 Label skip; 2577 address start = __ pc(); 2578 __ enter(); 2579 __ cbnz(is_pre, skip); 2580 __ cache_wbsync(false); 2581 __ bind(skip); 2582 __ leave(); 2583 __ ret(lr); 2584 2585 return start; 2586 } 2587 2588 void generate_arraycopy_stubs() { 2589 address entry; 2590 address entry_jbyte_arraycopy; 2591 address entry_jshort_arraycopy; 2592 address entry_jint_arraycopy; 2593 address entry_oop_arraycopy; 2594 address entry_jlong_arraycopy; 2595 address 
entry_checkcast_arraycopy; 2596 2597 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_f, r0, r1, r15, copy_forwards); 2598 generate_copy_longs(IN_HEAP | IS_ARRAY, T_BYTE, copy_b, r0, r1, r15, copy_backwards); 2599 2600 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_f, r0, r1, r15, copy_forwards); 2601 generate_copy_longs(IN_HEAP | IS_ARRAY, T_OBJECT, copy_obj_b, r0, r1, r15, copy_backwards); 2602 2603 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_f, r0, r1, r15, copy_forwards); 2604 generate_copy_longs(IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, T_OBJECT, copy_obj_uninit_b, r0, r1, r15, copy_backwards); 2605 2606 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2607 2608 //*** jbyte 2609 // Always need aligned and unaligned versions 2610 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2611 "jbyte_disjoint_arraycopy"); 2612 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2613 &entry_jbyte_arraycopy, 2614 "jbyte_arraycopy"); 2615 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2616 "arrayof_jbyte_disjoint_arraycopy"); 2617 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, nullptr, 2618 "arrayof_jbyte_arraycopy"); 2619 2620 //*** jshort 2621 // Always need aligned and unaligned versions 2622 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2623 "jshort_disjoint_arraycopy"); 2624 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2625 &entry_jshort_arraycopy, 2626 "jshort_arraycopy"); 2627 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2628 "arrayof_jshort_disjoint_arraycopy"); 2629 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, nullptr, 2630 "arrayof_jshort_arraycopy"); 2631 2632 //*** jint 2633 // Aligned 
versions 2634 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2635 "arrayof_jint_disjoint_arraycopy"); 2636 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2637 "arrayof_jint_arraycopy"); 2638 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2639 // entry_jint_arraycopy always points to the unaligned version 2640 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2641 "jint_disjoint_arraycopy"); 2642 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2643 &entry_jint_arraycopy, 2644 "jint_arraycopy"); 2645 2646 //*** jlong 2647 // It is always aligned 2648 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2649 "arrayof_jlong_disjoint_arraycopy"); 2650 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2651 "arrayof_jlong_arraycopy"); 2652 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2653 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2654 2655 //*** oops 2656 { 2657 // With compressed oops we need unaligned versions; notice that 2658 // we overwrite entry_oop_arraycopy. 
2659 bool aligned = !UseCompressedOops; 2660 2661 StubRoutines::_arrayof_oop_disjoint_arraycopy 2662 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2663 /*dest_uninitialized*/false); 2664 StubRoutines::_arrayof_oop_arraycopy 2665 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2666 /*dest_uninitialized*/false); 2667 // Aligned versions without pre-barriers 2668 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2669 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2670 /*dest_uninitialized*/true); 2671 StubRoutines::_arrayof_oop_arraycopy_uninit 2672 = generate_conjoint_oop_copy(aligned, entry, nullptr, "arrayof_oop_arraycopy_uninit", 2673 /*dest_uninitialized*/true); 2674 } 2675 2676 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2677 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2678 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2679 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2680 2681 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2682 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", nullptr, 2683 /*dest_uninitialized*/true); 2684 2685 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2686 entry_jbyte_arraycopy, 2687 entry_jshort_arraycopy, 2688 entry_jint_arraycopy, 2689 entry_jlong_arraycopy); 2690 2691 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2692 entry_jbyte_arraycopy, 2693 entry_jshort_arraycopy, 2694 entry_jint_arraycopy, 2695 entry_oop_arraycopy, 2696 entry_jlong_arraycopy, 2697 entry_checkcast_arraycopy); 2698 2699 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2700 
StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
  }

  // No math stubs are generated here: the body is just Unimplemented().
  void generate_math_stubs() { Unimplemented(); }

  // Emits the "aescrypt_encryptBlock" stub and returns its entry address.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Output:
  //   r0        - always 0
  //
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter();

    // key points at the first int element, so (length offset - base offset) is
    // the (negative) displacement back to the array's length field.
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    // Both helpers are MacroAssembler routines defined outside this file chunk;
    // keylen is what tells them how much key material there is.
    __ aesenc_loadkeys(key, keylen);
    __ aesecb_encrypt(from, to, keylen);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Emits the "aescrypt_decryptBlock" stub and returns its entry address.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Output:
  //   r0        - always 0
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    // NOTE(review): L_doLast is declared but never bound or branched to in
    // this stub; it looks like a leftover.
    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Read the key array's length field (see the displacement computation).
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    // Unlike the encrypt stub there is no separate load-keys call here; the
    // helper is handed the key register directly.
    __ aesecb_decrypt(from, to, key, keylen);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Emits the "cipherBlockChaining_encryptAESCrypt" stub and returns its entry
  // address. The generated code keeps the chaining value in v0: each 16-byte
  // input block is XORed into v0, v0 is run through the AES rounds, stored to
  // the destination, and carried into the next iteration. The final v0 is
  // written back to rvec.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   x0        - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

      __ enter();

      // len_reg is counted down by the loop, so keep the original length for
      // the return value.
      __ movw(rscratch2, len_reg);

      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

      // v0 = initial chaining value
      __ ld1(v0, __ T16B, rvec);

      // Three-way dispatch on the key array length: below 52 skips to
      // L_loadkeys_44, exactly 52 to L_loadkeys_52, anything larger falls
      // through and loads the two extra leading pairs as well.
      // (Presumably 44/52/60 ints = AES-128/192/256 schedules -- confirm.)
      __ cmpw(keylen, 52);
      __ br(Assembler::CC, L_loadkeys_44);
      __ br(Assembler::EQ, L_loadkeys_52);

      // Round keys are kept in v17..v31 for the whole loop; each is
      // byte-reversed within its 32-bit words after loading.
      __ ld1(v17, v18, __ T16B, __ post(key, 32));
      __ rev32(v17, __ T16B, v17);
      __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
      __ ld1(v19, v20, __ T16B, __ post(key, 32));
      __ rev32(v19, __ T16B, v19);
      __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
      __ rev32(v21, __ T16B, v21);
      __ rev32(v22, __ T16B, v22);
      __ rev32(v23, __ T16B, v23);
      __ rev32(v24, __ T16B, v24);
      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
      __ rev32(v25, __ T16B, v25);
      __ rev32(v26, __ T16B, v26);
      __ rev32(v27, __ T16B, v27);
      __ rev32(v28, __ T16B, v28);
      __ ld1(v29, v30, v31, __ T16B, key);
      __ rev32(v29, __ T16B, v29);
      __ rev32(v30, __ T16B, v30);
      __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
      // v0 ^= next input block
      __ ld1(v1, __ T16B, __ post(from, 16));
      __ eor(v0, __ T16B, v0, v1);

      // No compare is emitted inside the loop: these branches consume the
      // condition flags left by cmpw(keylen, 52) above, so nothing emitted
      // between there and here may set flags (note the plain subw/cbnzw at
      // the bottom of the loop).
      __ br(Assembler::CC, L_rounds_44);
      __ br(Assembler::EQ, L_rounds_52);

      __ aese(v0, v17); __ aesmc(v0, v0);
      __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
      __ aese(v0, v19); __ aesmc(v0, v0);
      __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
      __ aese(v0, v21); __ aesmc(v0, v0);
      __ aese(v0, v22); __ aesmc(v0, v0);
      __ aese(v0, v23); __ aesmc(v0, v0);
      __ aese(v0, v24); __ aesmc(v0, v0);
      __ aese(v0, v25); __ aesmc(v0, v0);
      __ aese(v0, v26); __ aesmc(v0, v0);
      __ aese(v0, v27); __ aesmc(v0, v0);
      __ aese(v0, v28); __ aesmc(v0, v0);
      __ aese(v0, v29); __ aesmc(v0, v0);
      // Last round: no aesmc, and the last key (v31) is applied with eor.
      __ aese(v0, v30);
      __ eor(v0, __ T16B, v0, v31);

      // The block just produced is both the output and the next chaining value.
      __ st1(v0, __ T16B, __ post(to, 16));

      __ subw(len_reg, len_reg, 16);
      __ cbnzw(len_reg, L_aes_loop);

      // Hand the last output block back through rvec.
      __ st1(v0, __ T16B, rvec);

      __ mov(r0, rscratch2);

      __ leave();
      __ ret(lr);

      return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source
//               byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  // Emits the "cipherBlockChaining_decryptAESCrypt" stub and returns its entry
  // address. In the generated code v2 carries the chaining value (initially
  // loaded from rvec): every input block is copied to v1 before it is run
  // through the AES rounds, the result is XORed with v2 and stored, and v1
  // then becomes the new v2. The final v2 is written back to rvec.
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

      __ enter();

      // len_reg is counted down by the loop, so keep the original length for
      // the return value.
      __ movw(rscratch2, len_reg);

      // key points at the first int element, so (length offset - base offset)
      // is the displacement back to the array's length field.
      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

      // v2 = initial chaining value
      __ ld1(v2, __ T16B, rvec);

      // The first 16 bytes of the key array go to v31; in the loop v31 is the
      // key applied last, with a plain eor.
      __ ld1(v31, __ T16B, __ post(key, 16));
      __ rev32(v31, __ T16B, v31);

      // Three-way dispatch on the key array length: below 52 skips to
      // L_loadkeys_44, exactly 52 to L_loadkeys_52, anything larger falls
      // through and loads the two extra leading pairs as well.
      // (Presumably 44/52/60 ints = AES-128/192/256 schedules -- confirm.)
      __ cmpw(keylen, 52);
      __ br(Assembler::CC, L_loadkeys_44);
      __ br(Assembler::EQ, L_loadkeys_52);

      __ ld1(v17, v18, __ T16B, __ post(key, 32));
      __ rev32(v17, __ T16B, v17);
      __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
      __ ld1(v19, v20, __ T16B, __ post(key, 32));
      __ rev32(v19, __ T16B, v19);
      __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
      __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
      __ rev32(v21, __ T16B, v21);
      __ rev32(v22, __ T16B, v22);
      __ rev32(v23, __ T16B, v23);
      __ rev32(v24, __ T16B, v24);
      __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
      __ rev32(v25, __ T16B, v25);
      __ rev32(v26, __ T16B, v26);
      __ rev32(v27, __ T16B, v27);
      __ rev32(v28, __ T16B, v28);
      __ ld1(v29, v30, __ T16B, key);
      __ rev32(v29, __ T16B, v29);
      __ rev32(v30, __ T16B, v30);

    __ BIND(L_aes_loop);
      __ ld1(v0, __ T16B, __ post(from, 16));
      // orr with itself is a vector move: v1 keeps an untouched copy of the
      // input block, which becomes the next chaining value.
      __ orr(v1, __ T16B, v0, v0);

      // No compare is emitted inside the loop: these branches consume the
      // condition flags left by cmpw(keylen, 52) above, so nothing emitted
      // between there and here may set flags (note the plain subw/cbnzw at
      // the bottom of the loop).
      __ br(Assembler::CC, L_rounds_44);
      __ br(Assembler::EQ, L_rounds_52);

      __ aesd(v0, v17); __ aesimc(v0, v0);
      __ aesd(v0, v18); __ aesimc(v0, v0);
    __ BIND(L_rounds_52);
      __ aesd(v0, v19); __ aesimc(v0, v0);
      __ aesd(v0, v20); __ aesimc(v0, v0);
    __ BIND(L_rounds_44);
      __ aesd(v0, v21); __ aesimc(v0, v0);
      __ aesd(v0, v22); __ aesimc(v0, v0);
      __ aesd(v0, v23); __ aesimc(v0, v0);
      __ aesd(v0, v24); __ aesimc(v0, v0);
      __ aesd(v0, v25); __ aesimc(v0, v0);
      __ aesd(v0, v26); __ aesimc(v0, v0);
      __ aesd(v0, v27); __ aesimc(v0, v0);
      __ aesd(v0, v28); __ aesimc(v0, v0);
      __ aesd(v0, v29); __ aesimc(v0, v0);
      // Last round: no aesimc; v31 is applied with eor, then the chaining
      // value v2 is XORed in to produce the output block.
      __ aesd(v0, v30);
      __ eor(v0, __ T16B, v0, v31);
      __ eor(v0, __ T16B, v0, v2);

      __ st1(v0, __ T16B, __ post(to, 16));
      // The saved input block becomes the chaining value for the next block.
      __ orr(v2, __ T16B, v1, v1);

      __ subw(len_reg, len_reg, 16);
      __ cbnzw(len_reg, L_aes_loop);

      // Hand the last input block back through rvec.
      __ st1(v2, __ T16B, rvec);

      __ mov(r0, rscratch2);

      __ leave();
      __ ret(lr);

    return start;
  }

  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: 128-bits. in is preserved.
  // The least-significant 64-bit word is in the upper dword of each vector.
  // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
// Output: result
  // tmp is clobbered. result may be the same register as in (the callers in
  // this file pass the same register for both); result, tmp and inc must be
  // pairwise different.
  void be_add_128_64(FloatRegister result, FloatRegister in,
                     FloatRegister inc, FloatRegister tmp) {
    assert_different_registers(result, tmp, inc);

    __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
                                           // input
    __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
    __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
                                           // MSD == 0 (must be!) to LSD
    __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
  }

  // CTR AES crypt.
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - counter vector byte array address
  //   c_rarg4   - input length
  //   c_rarg5   - saved encryptedCounter start
  //   c_rarg6   - saved used length
  //
  // Output:
  //   r0       - input length
  //
  // Emits the "counterMode_AESCrypt" stub and returns its entry address.
  // The generated code uses r7, r10, r11 and r12 as working registers in
  // addition to rscratch1/rscratch2, and reads and writes the 32-bit "used"
  // value through c_rarg6.
  //
  address generate_counterMode_AESCrypt() {
    const Register in = c_rarg0;
    const Register out = c_rarg1;
    const Register key = c_rarg2;
    const Register counter = c_rarg3;
    const Register saved_len = c_rarg4, len = r10;
    const Register saved_encrypted_ctr = c_rarg5;
    const Register used_ptr = c_rarg6, used = r12;

    const Register offset = r7;
    const Register keylen = r11;

    const unsigned char block_size = 16;
    const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be executed on the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.

    // Algorithm:
    //
    //    if (len == 0) {
    //        goto DONE;
    //    }
    //    int result = len;
    //    do {
    //        if (used >= blockSize) {
    //            if (len >= bulk_width * blockSize) {
    //                CTR_large_block();
    //                if (len == 0)
    //                    goto DONE;
    //            }
    //            for (;;) {
    //                16ByteVector v0 = counter;
    //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
    //                used = 0;
    //                if (len < blockSize)
    //                    break;    /* goto NEXT */
    //                16ByteVector v1 = load16Bytes(in, offset);
    //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(out, offset);
    //                used = blockSize;
    //                offset += blockSize;
    //                len -= blockSize;
    //                if (len == 0)
    //                    goto DONE;
    //            }
    //        }
    //      NEXT:
    //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
    //        len--;
    //    } while (len != 0);
    //  DONE:
    //    return result;
    //
    // CTR_large_block()
    //    Wide bulk encryption of whole blocks.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
    const address start = __ pc();
    __ enter();

    Label DONE, CTR_large_block, large_block_return;
    // used is loaded before the zero-length check because DONE stores it back
    // unconditionally.
    __ ldrw(used, Address(used_ptr));
    __ cbzw(saved_len, DONE);

    // saved_len stays untouched for the return value; len is the countdown.
    __ mov(len, saved_len);
    __ mov(offset, 0);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);

    {
      Label L_CTR_loop, NEXT;

      __ bind(L_CTR_loop);

      // Bytes of the saved encrypted counter are still unconsumed: go use
      // them one byte at a time.
      __ cmp(used, block_size);
      __ br(__ LO, NEXT);

      // Maybe we have a lot of data
      // (rscratch1 is only a sink for the subtraction; the flags are what
      // matter.)
      __ subsw(rscratch1, len, bulk_width * block_size);
      __ br(__ HS, CTR_large_block);
      __ BIND(large_block_return);
      __ cbzw(len, DONE);

      // Setup the counter
      __ movi(v4, __ T4S, 0);
      __ movi(v5, __ T4S, 1);
      __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }

      // 128-bit big-endian increment
      __ ld1(v0, __ T16B, counter);
      __ rev64(v16, __ T16B, v0);
      be_add_128_64(v16, v16, v4, /*tmp*/v5);
      __ rev64(v16, __ T16B, v16);
      __ st1(v16, __ T16B, counter);
      // Previous counter value is in v0
      // v4 contains { 0, 1 }

      {
        // We have fewer than bulk_width blocks of data left. Encrypt
        // them one by one until there is less than a full block
        // remaining, being careful to save both the encrypted counter
        // and the counter.

        Label inner_loop;
        __ bind(inner_loop);
        // Counter to encrypt is in v0
        // (noreg/noreg: no source/destination address is passed; the result
        // is taken from v0 on the next line.)
        __ aesecb_encrypt(noreg, noreg, keylen);
        __ st1(v0, __ T16B, saved_encrypted_ctr);

        // Do we have a remaining full block?

        __ mov(used, 0);
        __ cmp(len, block_size);
        __ br(__ LO, NEXT);

        // Yes, we have a full block
        __ ldrq(v1, Address(in, offset));
        __ eor(v1, __ T16B, v1, v0);
        __ strq(v1, Address(out, offset));
        __ mov(used, block_size);
        __ add(offset, offset, block_size);

        __ subw(len, len, block_size);
        __ cbzw(len, DONE);

        // Increment the counter, store it back
        // (v0 takes the already-incremented counter to encrypt next, then
        // v16 is advanced once more and saved.)
        __ orr(v0, __ T16B, v16, v16);
        __ rev64(v16, __ T16B, v16);
        be_add_128_64(v16, v16, v4, /*tmp*/v5);
        __ rev64(v16, __ T16B, v16);
        __ st1(v16, __ T16B, counter); // Save the incremented counter back

        __ b(inner_loop);
      }

      __ BIND(NEXT);

      // Encrypt a single byte, and loop.
      // We expect this to be a rare event.
      __ ldrb(rscratch1, Address(in, offset));
      __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
      __ eor(rscratch1, rscratch1, rscratch2);
      __ strb(rscratch1, Address(out, offset));
      __ add(offset, offset, 1);
      __ add(used, used, 1);
      __ subw(len, len,1);
      __ cbnzw(len, L_CTR_loop);
    }

    __ bind(DONE);
    __ strw(used, Address(used_ptr));
    __ mov(r0, saved_len);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    // Bulk encryption
    // This section is emitted after the stub's ret; it is reached only by the
    // branch to CTR_large_block and leaves by branching to
    // large_block_return.

    __ BIND (CTR_large_block);
    assert(bulk_width == 4 || bulk_width == 8, "must be");

    // v8..v11 (and v12..v15 when bulk_width == 8) are used below for the
    // input data, so their contents are spilled first and reloaded at the end.
    if (bulk_width == 8) {
      __ sub(sp, sp, 4 * 16);
      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    }
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
    RegSet saved_regs = (RegSet::of(in, out, offset)
                         + RegSet::of(saved_encrypted_ctr, used_ptr, len));
    __ push(saved_regs, sp);
    __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
    // in/out are advanced directly here; the originals come back with the pop
    // of saved_regs below.
    __ add(in, in, offset);
    __ add(out, out, offset);

    // Keys should already be loaded into the correct registers

    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      // (rebuilt on every iteration because v8/v9 are overwritten by the
      // input load below)
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }

      // v0 .. v(bulk_width-1) receive consecutive counter values in memory
      // byte order; v16 stays in byte-reversed form and is advanced each time.
      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        __ rev64(v0_ofs, __ T16B, v16);
        be_add_128_64(v16, v16, v8, /*tmp*/v9);
      }

      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);

      if (bulk_width == 8) {
        __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
      }

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }

      // Write the encrypted data
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      if (bulk_width == 8) {
        __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
      }

      __ subw(len, len, 16 * bulk_width);
      __ cbnzw(len, L_CTR_loop);
    }

    // Save the counter back where it goes
    __ rev64(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    __ pop(saved_regs, sp);

    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    if (bulk_width == 8) {
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
    }

    // len was part of saved_regs, so it is back to its value from before the
    // bulk loop: take off the whole-bulk portion just processed and advance
    // offset by the same amount.
    __ andr(rscratch1, len, -16 * bulk_width);
    __ sub(len, len, rscratch1);
    __ add(offset, offset, rscratch1);
    // Record 16 used bytes (== block_size) both in the register and through
    // used_ptr.
    __ mov(used, 16);
    __ strw(used, Address(used_ptr));
    __ b(large_block_return);

    return start;
  }

  // Vector AES Galois Counter Mode implementation. Parameters:
  //
  // in = c_rarg0
  // len = c_rarg1
  // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
  // out = c_rarg3
  // key = c_rarg4
  // state = c_rarg5 - GHASH.state
  // subkeyHtbl = c_rarg6 - powers of H
  // counter = c_rarg7 - 16 bytes of CTR
  // return - number of processed bytes
  //
  // Emits the "galoisCounterMode_AESCrypt" stub and returns its entry address.
  // The generated code only handles a multiple of 8 blocks (128 bytes): len is
  // rounded down to that, and the rounded value is what is returned in r0.
  address generate_galoisCounterMode_AESCrypt() {
    // These 16 bytes of data are emitted ahead of the aligned stub entry;
    // their address is passed to ghash_processBlocks_wide below.
    address ghash_polynomial = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "galoisCounterMode_AESCrypt");
    address start = __ pc();
    __ enter();

    const Register in = c_rarg0;
    const Register len = c_rarg1;
    const Register ct = c_rarg2;
    const Register out = c_rarg3;
    // and updated with the incremented counter in the end
    // NOTE(review): the comment above looks detached from its subject --
    // presumably it describes `counter` (which is stored back after the CTR
    // loop), not `out`. Confirm.

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    const Register counter = c_rarg7;

    const Register keylen = r10;
    // Save state before entering routine
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));

    // __ andr(len, len, -512);
    __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
    // The rounded length is parked on the stack: it is reloaded for the GHASH
    // pass and finally popped into r0 at DONE.
    __ str(len, __ pre(sp, -2 * wordSize));

    Label DONE;
    __ cbz(len, DONE);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);
    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }

      // v0..v7 receive eight consecutive counter values; only the last 32-bit
      // lane of v16 is incremented (T4S add of { 0, 0, 0, 1 }).
      assert(v0->encoding() < v8->encoding(), "");
      for (int i = v0->encoding(); i < v8->encoding(); i++) {
        FloatRegister f = as_FloatRegister(i);
        __ rev32(f, __ T16B, v16);
        __ addv(v16, __ T4S, v16, v8);
      }

      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);

      __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < 8; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));

      __ subw(len, len, 16 * 8);
      __ cbnzw(len, L_CTR_loop);
    }

    // Store the advanced counter back in its original byte order.
    __ rev32(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    // Reload the rounded byte count that was parked on the stack.
    __ ldr(len, Address(sp));
    __ lsr(len, len, exact_log2(16));  // We want the count of blocks

    // GHASH/CTR loop
    __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
                                len, /*unrolls*/4);

#ifdef ASSERT
    // Debug builds stop here if len is not zero after the GHASH pass.
    { Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
    }
#endif

    __ bind(DONE);
    // Return the number of bytes processed
    __ ldr(r0, __ post(sp, 2 * wordSize));

    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);
    return start;
  }

  // Code-generation helper that keeps one 64-byte chunk in eight 64-bit
  // general-purpose registers and emits code to pull individual 32-bit words
  // out of them.
  class Cached64Bytes {
  private:
    MacroAssembler *_masm;
    // _regs[j] holds bytes [8*j, 8*j + 8) of the cached chunk.
    Register _regs[8];

  public:
    // rs must contain exactly eight registers; they are taken in the
    // iteration order of the RegSet.
    Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
      auto it = rs.begin();
      for (auto &r: _regs) {
        r = *it;
        ++it;
      }
    }

    // Generate four load-pair instructions filling all eight registers from
    // the 64 bytes at base.
    void gen_loads(Register base) {
      for (int i = 0; i < 8; i += 2) {
        __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
      }
    }

    // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
    // Even i selects bits 0..31 of _regs[i / 2], odd i bits 32..63.
    void extract_u32(Register dest, int i) {
      __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
    }
  };

  // Utility routines for md5.
  // Clobbers r10 and r11.
  // Each helper also overwrites rscratch1 and rscratch2. All four emit code
  // computing
  //   r1 = r2 + rotate_left(r1 + f(r2, r3, r4) + X[k] + t, s)
  // where X[k] is the k-th 32-bit word held in reg_cache, the rotate-left by s
  // is emitted as a rotate-right by 32 - s, and f differs per helper.
  //
  // md5_FF: f = ((r3 ^ r4) & r2) ^ r4
  void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);
    __ movw(rscratch2, t);
    __ andw(rscratch3, rscratch3, r2);
    __ addw(rscratch4, r1, rscratch2);
    reg_cache.extract_u32(rscratch1, k);
    __ eorw(rscratch3, rscratch3, r4);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  // md5_GG: f = (r3 & ~r4) + (r2 & r4). The two terms can never have the same
  // bit set (one is masked by ~r4, the other by r4), so adding them gives the
  // same result as OR-ing them.
  void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    reg_cache.extract_u32(rscratch1, k);
    __ movw(rscratch2, t);
    __ addw(rscratch4, r1, rscratch2);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ bicw(rscratch2, r3, r4);
    __ andw(rscratch3, r2, r4);
    __ addw(rscratch2, rscratch2, rscratch4);
    __ addw(rscratch2, rscratch2, rscratch3);
    __ rorw(rscratch2, rscratch2, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  // md5_HH: f = r2 ^ r3 ^ r4
  void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);
    __ movw(rscratch2, t);
    __ addw(rscratch4, r1, rscratch2);
    reg_cache.extract_u32(rscratch1, k);
    __ eorw(rscratch3, rscratch3, r2);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  // md5_II: f = r3 ^ (r2 | ~r4)
  void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ movw(rscratch3, t);
    __ ornw(rscratch2, r2, r4);
    __ addw(rscratch4, r1, rscratch3);
    reg_cache.extract_u32(rscratch1, k);
    __ eorw(rscratch3, rscratch2, r3);
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);
    __ addw(r1, rscratch2, r2);
  }

  // Emits an MD5 compression stub under the given name and returns its entry
  // address. With multi_block the generated code loops over consecutive
  // 64-byte chunks while ofs <= limit and returns the updated ofs in c_rarg0.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //               NOTE(review): "SHA.state" reads like a copy from the SHA
  //               stubs; the code treats it as the four 32-bit MD5 state words.
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_md5_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf       = c_rarg0;
    Register state     = c_rarg1;
    Register ofs       = c_rarg2;
    Register limit     = c_rarg3;
    Register a         = r4;
    Register b         = r5;
    Register c         = r6;
    Register d         = r7;
    // NOTE(review): rscratch3 is not referenced again in this function (the
    // md5_* helpers declare their own); only rscratch4 is used below.
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    // The four state words live packed in two 64-bit registers:
    // { b:a } in state_regs[0] and { d:c } in state_regs[1].
    Register state_regs[2] = { r12, r13 };
    // r16..r22 minus r18_tls is six registers; only these are pushed/popped.
    RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
    Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers

    __ push(saved_regs, sp);

    __ ldp(state_regs[0], state_regs[1], Address(state));
    __ ubfx(a, state_regs[0],  0, 32);
    __ ubfx(b, state_regs[0], 32, 32);
    __ ubfx(c, state_regs[1],  0, 32);
    __ ubfx(d, state_regs[1], 32, 32);

    Label md5_loop;
    __ BIND(md5_loop);

    // Pull the next 64-byte chunk into the register cache.
    reg_cache.gen_loads(buf);

    // Argument columns below: message-word index k, rotate amount s,
    // additive constant t.

    // Round 1
    md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
    md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
    md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
    md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
    md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
    md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
    md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
    md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
    md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
    md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
    md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
    md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
    md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
    md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
    md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
    md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);

    // Round 2
    md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
    md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
    md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
    md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
    md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
    md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
    md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
    md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
    md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
    md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
    md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
    md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
    md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
    md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
    md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
    md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);

    // Round 3
    md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
    md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
    md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
    md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
    md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
    md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
    md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
    md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
    md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
    md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
    md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
    md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
    md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
    md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
    md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
    md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);

    // Round 4
    md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
    md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
    md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
    md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
    md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
    md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
    md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
    md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
    md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
    md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
    md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
    md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
    md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
    md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
    md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
    md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);

    // Add the state from before this chunk back in. The 32-bit adds read a/c
    // straight from the low halves of state_regs; b/d are extracted from the
    // high halves first.
    __ addw(a, state_regs[0], a);
    __ ubfx(rscratch2, state_regs[0], 32, 32);
    __ addw(b, rscratch2, b);
    __ addw(c, state_regs[1], c);
    __ ubfx(rscratch4, state_regs[1], 32, 32);
    __ addw(d, rscratch4, d);

    // Repack { b:a } and { d:c } for the next chunk and for the final store.
    __ orr(state_regs[0], a, b, Assembler::LSL, 32);
    __ orr(state_regs[1], c, d, Assembler::LSL, 32);

    if (multi_block) {
      __ add(buf, buf, 64);
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, md5_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // write hash values back in the correct order
    __ stp(state_regs[0], state_regs[1], Address(state));

    __ pop(saved_regs, sp);

    __ ret(lr);

    return start;
  }

  // Emits a SHA-1 compression stub under the given name and returns its entry
  // address. With multi_block the generated code loops over consecutive
  // 64-byte chunks while ofs <= limit and returns the updated ofs in c_rarg0.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    // keys labels the four constant words emitted after the stub's ret.
    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3
    // (ld4r replicates each of the four words across all lanes of its
    // register)
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load 5 words state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));


    __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19
    // (buf is post-incremented only in the multi-block variant)
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v16, __ T16B, v16);
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    __ addv(v4, __ T4S, v16, v0);
    // v20 is the working copy of the first four state words; v6 keeps the
    // value from before this chunk for the final add.
    __ orr(v20, __ T16B, v6, v6);

    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    // The loop runs at code-generation time and emits 20 unrolled steps. The
    // temporaries alternate between two register sets on odd and even steps,
    // and d0..d3 rotate through v16..v19 at the end of each step.
    for (int round = 0; round < 20; round++) {
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
      __ sha1h(tmp2, __ T4S, v20);
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);
      if (round < 16) __ sha1su1(d0, __ T4S, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    // Fold the chunk's result into the state carried in v6/v7.
    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

    // Constant data read through the keys label at the top of the stub.
    __ bind(keys);
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha256_implCompress(bool multi_block, const char *name) {
    static const uint32_t round_consts[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3745 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3746 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3747 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3748 }; 3749 __ align(CodeEntryAlignment); 3750 StubCodeMark mark(this, "StubRoutines", name); 3751 address start = __ pc(); 3752 3753 Register buf = c_rarg0; 3754 Register state = c_rarg1; 3755 Register ofs = c_rarg2; 3756 Register limit = c_rarg3; 3757 3758 Label sha1_loop; 3759 3760 __ stpd(v8, v9, __ pre(sp, -32)); 3761 __ stpd(v10, v11, Address(sp, 16)); 3762 3763 // dga == v0 3764 // dgb == v1 3765 // dg0 == v2 3766 // dg1 == v3 3767 // dg2 == v4 3768 // t0 == v6 3769 // t1 == v7 3770 3771 // load 16 keys to v16..v31 3772 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3773 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3774 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3775 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3776 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3777 3778 // load 8 words (256 bits) state 3779 __ ldpq(v0, v1, state); 3780 3781 __ BIND(sha1_loop); 3782 // load 64 bytes of data into v8..v11 3783 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3784 __ rev32(v8, __ T16B, v8); 3785 __ rev32(v9, __ T16B, v9); 3786 __ rev32(v10, __ T16B, v10); 3787 __ rev32(v11, __ T16B, v11); 3788 3789 __ addv(v6, __ T4S, v8, v16); 3790 __ orr(v2, __ T16B, v0, v0); 3791 __ orr(v3, __ T16B, v1, v1); 3792 3793 FloatRegister d0 = v8; 3794 FloatRegister d1 = v9; 3795 FloatRegister d2 = v10; 3796 FloatRegister d3 = v11; 3797 3798 3799 for (int round = 0; round < 16; round++) { 3800 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3801 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3802 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3803 FloatRegister tmp4 = (round & 1) ? 
v4 : v2; 3804 3805 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3806 __ orr(v4, __ T16B, v2, v2); 3807 if (round < 15) 3808 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3809 __ sha256h(v2, __ T4S, v3, tmp2); 3810 __ sha256h2(v3, __ T4S, v4, tmp2); 3811 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3812 3813 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3814 } 3815 3816 __ addv(v0, __ T4S, v0, v2); 3817 __ addv(v1, __ T4S, v1, v3); 3818 3819 if (multi_block) { 3820 __ add(ofs, ofs, 64); 3821 __ cmp(ofs, limit); 3822 __ br(Assembler::LE, sha1_loop); 3823 __ mov(c_rarg0, ofs); // return ofs 3824 } 3825 3826 __ ldpd(v10, v11, Address(sp, 16)); 3827 __ ldpd(v8, v9, __ post(sp, 32)); 3828 3829 __ stpq(v0, v1, state); 3830 3831 __ ret(lr); 3832 3833 return start; 3834 } 3835 3836 // Double rounds for sha512. 3837 void sha512_dround(int dr, 3838 FloatRegister vi0, FloatRegister vi1, 3839 FloatRegister vi2, FloatRegister vi3, 3840 FloatRegister vi4, FloatRegister vrc0, 3841 FloatRegister vrc1, FloatRegister vin0, 3842 FloatRegister vin1, FloatRegister vin2, 3843 FloatRegister vin3, FloatRegister vin4) { 3844 if (dr < 36) { 3845 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); 3846 } 3847 __ addv(v5, __ T2D, vrc0, vin0); 3848 __ ext(v6, __ T16B, vi2, vi3, 8); 3849 __ ext(v5, __ T16B, v5, v5, 8); 3850 __ ext(v7, __ T16B, vi1, vi2, 8); 3851 __ addv(vi3, __ T2D, vi3, v5); 3852 if (dr < 32) { 3853 __ ext(v5, __ T16B, vin3, vin4, 8); 3854 __ sha512su0(vin0, __ T2D, vin1); 3855 } 3856 __ sha512h(vi3, __ T2D, v6, v7); 3857 if (dr < 32) { 3858 __ sha512su1(vin0, __ T2D, vin2, v5); 3859 } 3860 __ addv(vi4, __ T2D, vi1, vi3); 3861 __ sha512h2(vi3, __ T2D, vi1, vi0); 3862 } 3863 3864 // Arguments: 3865 // 3866 // Inputs: 3867 // c_rarg0 - byte[] source+offset 3868 // c_rarg1 - int[] SHA.state 3869 // c_rarg2 - int offset 3870 // c_rarg3 - int limit 3871 // 3872 address generate_sha512_implCompress(bool multi_block, const char *name) { 3873 static const uint64_t 
round_consts[80] = { 3874 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3875 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3876 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3877 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3878 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3879 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3880 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3881 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3882 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3883 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3884 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3885 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3886 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3887 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3888 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3889 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3890 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3891 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3892 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3893 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3894 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3895 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3896 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3897 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3898 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3899 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3900 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3901 }; 3902 3903 __ align(CodeEntryAlignment); 3904 StubCodeMark mark(this, "StubRoutines", name); 3905 address start = __ pc(); 3906 3907 Register buf = c_rarg0; 
3908 Register state = c_rarg1; 3909 Register ofs = c_rarg2; 3910 Register limit = c_rarg3; 3911 3912 __ stpd(v8, v9, __ pre(sp, -64)); 3913 __ stpd(v10, v11, Address(sp, 16)); 3914 __ stpd(v12, v13, Address(sp, 32)); 3915 __ stpd(v14, v15, Address(sp, 48)); 3916 3917 Label sha512_loop; 3918 3919 // load state 3920 __ ld1(v8, v9, v10, v11, __ T2D, state); 3921 3922 // load first 4 round constants 3923 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3924 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3925 3926 __ BIND(sha512_loop); 3927 // load 128B of data into v12..v19 3928 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3929 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3930 __ rev64(v12, __ T16B, v12); 3931 __ rev64(v13, __ T16B, v13); 3932 __ rev64(v14, __ T16B, v14); 3933 __ rev64(v15, __ T16B, v15); 3934 __ rev64(v16, __ T16B, v16); 3935 __ rev64(v17, __ T16B, v17); 3936 __ rev64(v18, __ T16B, v18); 3937 __ rev64(v19, __ T16B, v19); 3938 3939 __ mov(rscratch2, rscratch1); 3940 3941 __ mov(v0, __ T16B, v8); 3942 __ mov(v1, __ T16B, v9); 3943 __ mov(v2, __ T16B, v10); 3944 __ mov(v3, __ T16B, v11); 3945 3946 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); 3947 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18); 3948 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19); 3949 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12); 3950 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13); 3951 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14); 3952 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15); 3953 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16); 3954 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17); 3955 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18); 3956 sha512_dround(10, v0, v1, v2, v3, v4, v30, 
v26, v14, v15, v13, v18, v19); 3957 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12); 3958 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13); 3959 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14); 3960 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15); 3961 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16); 3962 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17); 3963 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18); 3964 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19); 3965 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12); 3966 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13); 3967 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14); 3968 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15); 3969 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16); 3970 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17); 3971 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18); 3972 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19); 3973 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12); 3974 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13); 3975 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14); 3976 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15); 3977 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16); 3978 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); 3979 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0); 3980 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0); 3981 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0); 3982 
sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); 3983 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0); 3984 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0); 3985 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0); 3986 3987 __ addv(v8, __ T2D, v8, v0); 3988 __ addv(v9, __ T2D, v9, v1); 3989 __ addv(v10, __ T2D, v10, v2); 3990 __ addv(v11, __ T2D, v11, v3); 3991 3992 if (multi_block) { 3993 __ add(ofs, ofs, 128); 3994 __ cmp(ofs, limit); 3995 __ br(Assembler::LE, sha512_loop); 3996 __ mov(c_rarg0, ofs); // return ofs 3997 } 3998 3999 __ st1(v8, v9, v10, v11, __ T2D, state); 4000 4001 __ ldpd(v14, v15, Address(sp, 48)); 4002 __ ldpd(v12, v13, Address(sp, 32)); 4003 __ ldpd(v10, v11, Address(sp, 16)); 4004 __ ldpd(v8, v9, __ post(sp, 64)); 4005 4006 __ ret(lr); 4007 4008 return start; 4009 } 4010 4011 // Arguments: 4012 // 4013 // Inputs: 4014 // c_rarg0 - byte[] source+offset 4015 // c_rarg1 - byte[] SHA.state 4016 // c_rarg2 - int block_size 4017 // c_rarg3 - int offset 4018 // c_rarg4 - int limit 4019 // 4020 address generate_sha3_implCompress(bool multi_block, const char *name) { 4021 static const uint64_t round_consts[24] = { 4022 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL, 4023 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L, 4024 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL, 4025 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL, 4026 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L, 4027 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L, 4028 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L, 4029 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L 4030 }; 4031 4032 __ align(CodeEntryAlignment); 4033 StubCodeMark mark(this, "StubRoutines", name); 4034 address start = __ pc(); 4035 4036 Register buf = c_rarg0; 4037 Register state = c_rarg1; 4038 Register block_size = c_rarg2; 
4039 Register ofs = c_rarg3; 4040 Register limit = c_rarg4; 4041 4042 Label sha3_loop, rounds24_loop; 4043 Label sha3_512_or_sha3_384, shake128; 4044 4045 __ stpd(v8, v9, __ pre(sp, -64)); 4046 __ stpd(v10, v11, Address(sp, 16)); 4047 __ stpd(v12, v13, Address(sp, 32)); 4048 __ stpd(v14, v15, Address(sp, 48)); 4049 4050 // load state 4051 __ add(rscratch1, state, 32); 4052 __ ld1(v0, v1, v2, v3, __ T1D, state); 4053 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32)); 4054 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32)); 4055 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32)); 4056 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32)); 4057 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32)); 4058 __ ld1(v24, __ T1D, rscratch1); 4059 4060 __ BIND(sha3_loop); 4061 4062 // 24 keccak rounds 4063 __ movw(rscratch2, 24); 4064 4065 // load round_constants base 4066 __ lea(rscratch1, ExternalAddress((address) round_consts)); 4067 4068 // load input 4069 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4070 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4071 __ eor(v0, __ T8B, v0, v25); 4072 __ eor(v1, __ T8B, v1, v26); 4073 __ eor(v2, __ T8B, v2, v27); 4074 __ eor(v3, __ T8B, v3, v28); 4075 __ eor(v4, __ T8B, v4, v29); 4076 __ eor(v5, __ T8B, v5, v30); 4077 __ eor(v6, __ T8B, v6, v31); 4078 4079 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 4080 __ tbz(block_size, 7, sha3_512_or_sha3_384); 4081 4082 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32)); 4083 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24)); 4084 __ eor(v7, __ T8B, v7, v25); 4085 __ eor(v8, __ T8B, v8, v26); 4086 __ eor(v9, __ T8B, v9, v27); 4087 __ eor(v10, __ T8B, v10, v28); 4088 __ eor(v11, __ T8B, v11, v29); 4089 __ eor(v12, __ T8B, v12, v30); 4090 __ eor(v13, __ T8B, v13, v31); 4091 4092 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24)); 4093 __ eor(v14, __ T8B, v14, v25); 4094 __ eor(v15, __ T8B, v15, v26); 4095 __ eor(v16, __ T8B, v16, v27); 4096 4097 
// block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256 4098 __ andw(c_rarg5, block_size, 48); 4099 __ cbzw(c_rarg5, rounds24_loop); 4100 4101 __ tbnz(block_size, 5, shake128); 4102 // block_size == 144, bit5 == 0, SHA3-244 4103 __ ldrd(v28, __ post(buf, 8)); 4104 __ eor(v17, __ T8B, v17, v28); 4105 __ b(rounds24_loop); 4106 4107 __ BIND(shake128); 4108 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32)); 4109 __ eor(v17, __ T8B, v17, v28); 4110 __ eor(v18, __ T8B, v18, v29); 4111 __ eor(v19, __ T8B, v19, v30); 4112 __ eor(v20, __ T8B, v20, v31); 4113 __ b(rounds24_loop); // block_size == 168, SHAKE128 4114 4115 __ BIND(sha3_512_or_sha3_384); 4116 __ ld1(v25, v26, __ T8B, __ post(buf, 16)); 4117 __ eor(v7, __ T8B, v7, v25); 4118 __ eor(v8, __ T8B, v8, v26); 4119 __ tbz(block_size, 5, rounds24_loop); // SHA3-512 4120 4121 // SHA3-384 4122 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32)); 4123 __ eor(v9, __ T8B, v9, v27); 4124 __ eor(v10, __ T8B, v10, v28); 4125 __ eor(v11, __ T8B, v11, v29); 4126 __ eor(v12, __ T8B, v12, v30); 4127 4128 __ BIND(rounds24_loop); 4129 __ subw(rscratch2, rscratch2, 1); 4130 4131 __ eor3(v29, __ T16B, v4, v9, v14); 4132 __ eor3(v26, __ T16B, v1, v6, v11); 4133 __ eor3(v28, __ T16B, v3, v8, v13); 4134 __ eor3(v25, __ T16B, v0, v5, v10); 4135 __ eor3(v27, __ T16B, v2, v7, v12); 4136 __ eor3(v29, __ T16B, v29, v19, v24); 4137 __ eor3(v26, __ T16B, v26, v16, v21); 4138 __ eor3(v28, __ T16B, v28, v18, v23); 4139 __ eor3(v25, __ T16B, v25, v15, v20); 4140 __ eor3(v27, __ T16B, v27, v17, v22); 4141 4142 __ rax1(v30, __ T2D, v29, v26); 4143 __ rax1(v26, __ T2D, v26, v28); 4144 __ rax1(v28, __ T2D, v28, v25); 4145 __ rax1(v25, __ T2D, v25, v27); 4146 __ rax1(v27, __ T2D, v27, v29); 4147 4148 __ eor(v0, __ T16B, v0, v30); 4149 __ xar(v29, __ T2D, v1, v25, (64 - 1)); 4150 __ xar(v1, __ T2D, v6, v25, (64 - 44)); 4151 __ xar(v6, __ T2D, v9, v28, (64 - 20)); 4152 __ xar(v9, __ T2D, v22, v26, (64 - 61)); 4153 __ xar(v22, __ T2D, v14, 
v28, (64 - 39)); 4154 __ xar(v14, __ T2D, v20, v30, (64 - 18)); 4155 __ xar(v31, __ T2D, v2, v26, (64 - 62)); 4156 __ xar(v2, __ T2D, v12, v26, (64 - 43)); 4157 __ xar(v12, __ T2D, v13, v27, (64 - 25)); 4158 __ xar(v13, __ T2D, v19, v28, (64 - 8)); 4159 __ xar(v19, __ T2D, v23, v27, (64 - 56)); 4160 __ xar(v23, __ T2D, v15, v30, (64 - 41)); 4161 __ xar(v15, __ T2D, v4, v28, (64 - 27)); 4162 __ xar(v28, __ T2D, v24, v28, (64 - 14)); 4163 __ xar(v24, __ T2D, v21, v25, (64 - 2)); 4164 __ xar(v8, __ T2D, v8, v27, (64 - 55)); 4165 __ xar(v4, __ T2D, v16, v25, (64 - 45)); 4166 __ xar(v16, __ T2D, v5, v30, (64 - 36)); 4167 __ xar(v5, __ T2D, v3, v27, (64 - 28)); 4168 __ xar(v27, __ T2D, v18, v27, (64 - 21)); 4169 __ xar(v3, __ T2D, v17, v26, (64 - 15)); 4170 __ xar(v25, __ T2D, v11, v25, (64 - 10)); 4171 __ xar(v26, __ T2D, v7, v26, (64 - 6)); 4172 __ xar(v30, __ T2D, v10, v30, (64 - 3)); 4173 4174 __ bcax(v20, __ T16B, v31, v22, v8); 4175 __ bcax(v21, __ T16B, v8, v23, v22); 4176 __ bcax(v22, __ T16B, v22, v24, v23); 4177 __ bcax(v23, __ T16B, v23, v31, v24); 4178 __ bcax(v24, __ T16B, v24, v8, v31); 4179 4180 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); 4181 4182 __ bcax(v17, __ T16B, v25, v19, v3); 4183 __ bcax(v18, __ T16B, v3, v15, v19); 4184 __ bcax(v19, __ T16B, v19, v16, v15); 4185 __ bcax(v15, __ T16B, v15, v25, v16); 4186 __ bcax(v16, __ T16B, v16, v3, v25); 4187 4188 __ bcax(v10, __ T16B, v29, v12, v26); 4189 __ bcax(v11, __ T16B, v26, v13, v12); 4190 __ bcax(v12, __ T16B, v12, v14, v13); 4191 __ bcax(v13, __ T16B, v13, v29, v14); 4192 __ bcax(v14, __ T16B, v14, v26, v29); 4193 4194 __ bcax(v7, __ T16B, v30, v9, v4); 4195 __ bcax(v8, __ T16B, v4, v5, v9); 4196 __ bcax(v9, __ T16B, v9, v6, v5); 4197 __ bcax(v5, __ T16B, v5, v30, v6); 4198 __ bcax(v6, __ T16B, v6, v4, v30); 4199 4200 __ bcax(v3, __ T16B, v27, v0, v28); 4201 __ bcax(v4, __ T16B, v28, v1, v0); 4202 __ bcax(v0, __ T16B, v0, v2, v1); 4203 __ bcax(v1, __ T16B, v1, v27, v2); 4204 __ bcax(v2, __ T16B, 
v2, v28, v27); 4205 4206 __ eor(v0, __ T16B, v0, v31); 4207 4208 __ cbnzw(rscratch2, rounds24_loop); 4209 4210 if (multi_block) { 4211 __ add(ofs, ofs, block_size); 4212 __ cmp(ofs, limit); 4213 __ br(Assembler::LE, sha3_loop); 4214 __ mov(c_rarg0, ofs); // return ofs 4215 } 4216 4217 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); 4218 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32)); 4219 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32)); 4220 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32)); 4221 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32)); 4222 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32)); 4223 __ st1(v24, __ T1D, state); 4224 4225 __ ldpd(v14, v15, Address(sp, 48)); 4226 __ ldpd(v12, v13, Address(sp, 32)); 4227 __ ldpd(v10, v11, Address(sp, 16)); 4228 __ ldpd(v8, v9, __ post(sp, 64)); 4229 4230 __ ret(lr); 4231 4232 return start; 4233 } 4234 4235 /** 4236 * Arguments: 4237 * 4238 * Inputs: 4239 * c_rarg0 - int crc 4240 * c_rarg1 - byte* buf 4241 * c_rarg2 - int length 4242 * 4243 * Output: 4244 * rax - int crc result 4245 */ 4246 address generate_updateBytesCRC32() { 4247 assert(UseCRC32Intrinsics, "what are we doing here?"); 4248 4249 __ align(CodeEntryAlignment); 4250 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 4251 4252 address start = __ pc(); 4253 4254 const Register crc = c_rarg0; // crc 4255 const Register buf = c_rarg1; // source java byte array address 4256 const Register len = c_rarg2; // length 4257 const Register table0 = c_rarg3; // crc_table address 4258 const Register table1 = c_rarg4; 4259 const Register table2 = c_rarg5; 4260 const Register table3 = c_rarg6; 4261 const Register tmp3 = c_rarg7; 4262 4263 BLOCK_COMMENT("Entry:"); 4264 __ enter(); // required for proper stackwalking of RuntimeStub frame 4265 4266 __ kernel_crc32(crc, buf, len, 4267 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4268 4269 __ leave(); // required for proper stackwalking of RuntimeStub frame 4270 __ 
ret(lr); 4271 4272 return start; 4273 } 4274 4275 // ChaCha20 block function. This version parallelizes by loading 4276 // individual 32-bit state elements into vectors for four blocks 4277 // (e.g. all four blocks' worth of state[0] in one register, etc.) 4278 // 4279 // state (int[16]) = c_rarg0 4280 // keystream (byte[1024]) = c_rarg1 4281 // return - number of bytes of keystream (always 256) 4282 address generate_chacha20Block_blockpar() { 4283 Label L_twoRounds, L_cc20_const; 4284 // The constant data is broken into two 128-bit segments to be loaded 4285 // onto FloatRegisters. The first 128 bits are a counter add overlay 4286 // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. 4287 // The second 128-bits is a table constant used for 8-bit left rotations. 4288 __ BIND(L_cc20_const); 4289 __ emit_int64(0x0000000100000000UL); 4290 __ emit_int64(0x0000000300000002UL); 4291 __ emit_int64(0x0605040702010003UL); 4292 __ emit_int64(0x0E0D0C0F0A09080BUL); 4293 4294 __ align(CodeEntryAlignment); 4295 StubCodeMark mark(this, "StubRoutines", "chacha20Block"); 4296 address start = __ pc(); 4297 __ enter(); 4298 4299 int i, j; 4300 const Register state = c_rarg0; 4301 const Register keystream = c_rarg1; 4302 const Register loopCtr = r10; 4303 const Register tmpAddr = r11; 4304 4305 const FloatRegister stateFirst = v0; 4306 const FloatRegister stateSecond = v1; 4307 const FloatRegister stateThird = v2; 4308 const FloatRegister stateFourth = v3; 4309 const FloatRegister origCtrState = v28; 4310 const FloatRegister scratch = v29; 4311 const FloatRegister lrot8Tbl = v30; 4312 4313 // Organize SIMD registers in an array that facilitates 4314 // putting repetitive opcodes into loop structures. It is 4315 // important that each grouping of 4 registers is monotonically 4316 // increasing to support the requirements of multi-register 4317 // instructions (e.g. ld4r, st4, etc.) 
4318 const FloatRegister workSt[16] = { 4319 v4, v5, v6, v7, v16, v17, v18, v19, 4320 v20, v21, v22, v23, v24, v25, v26, v27 4321 }; 4322 4323 // Load from memory and interlace across 16 SIMD registers, 4324 // With each word from memory being broadcast to all lanes of 4325 // each successive SIMD register. 4326 // Addr(0) -> All lanes in workSt[i] 4327 // Addr(4) -> All lanes workSt[i + 1], etc. 4328 __ mov(tmpAddr, state); 4329 for (i = 0; i < 16; i += 4) { 4330 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S, 4331 __ post(tmpAddr, 16)); 4332 } 4333 4334 // Pull in constant data. The first 16 bytes are the add overlay 4335 // which is applied to the vector holding the counter (state[12]). 4336 // The second 16 bytes is the index register for the 8-bit left 4337 // rotation tbl instruction. 4338 __ adr(tmpAddr, L_cc20_const); 4339 __ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr)); 4340 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); 4341 4342 // Set up the 10 iteration loop and perform all 8 quarter round ops 4343 __ mov(loopCtr, 10); 4344 __ BIND(L_twoRounds); 4345 4346 __ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12], 4347 scratch, lrot8Tbl); 4348 __ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13], 4349 scratch, lrot8Tbl); 4350 __ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14], 4351 scratch, lrot8Tbl); 4352 __ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15], 4353 scratch, lrot8Tbl); 4354 4355 __ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15], 4356 scratch, lrot8Tbl); 4357 __ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12], 4358 scratch, lrot8Tbl); 4359 __ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13], 4360 scratch, lrot8Tbl); 4361 __ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14], 4362 scratch, lrot8Tbl); 4363 4364 // Decrement and iterate 4365 __ sub(loopCtr, loopCtr, 1); 4366 __ cbnz(loopCtr, 
L_twoRounds); 4367 4368 __ mov(tmpAddr, state); 4369 4370 // Add the starting state back to the post-loop keystream 4371 // state. We read/interlace the state array from memory into 4372 // 4 registers similar to what we did in the beginning. Then 4373 // add the counter overlay onto workSt[12] at the end. 4374 for (i = 0; i < 16; i += 4) { 4375 __ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S, 4376 __ post(tmpAddr, 16)); 4377 __ addv(workSt[i], __ T4S, workSt[i], stateFirst); 4378 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond); 4379 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird); 4380 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth); 4381 } 4382 __ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask 4383 4384 // Write to key stream, storing the same element out of workSt[0..15] 4385 // to consecutive 4-byte offsets in the key stream buffer, then repeating 4386 // for the next element position. 4387 for (i = 0; i < 4; i++) { 4388 for (j = 0; j < 16; j += 4) { 4389 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i, 4390 __ post(keystream, 16)); 4391 } 4392 } 4393 4394 __ mov(r0, 256); // Return length of output keystream 4395 __ leave(); 4396 __ ret(lr); 4397 4398 return start; 4399 } 4400 4401 /** 4402 * Arguments: 4403 * 4404 * Inputs: 4405 * c_rarg0 - int crc 4406 * c_rarg1 - byte* buf 4407 * c_rarg2 - int length 4408 * c_rarg3 - int* table 4409 * 4410 * Output: 4411 * r0 - int crc result 4412 */ 4413 address generate_updateBytesCRC32C() { 4414 assert(UseCRC32CIntrinsics, "what are we doing here?"); 4415 4416 __ align(CodeEntryAlignment); 4417 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 4418 4419 address start = __ pc(); 4420 4421 const Register crc = c_rarg0; // crc 4422 const Register buf = c_rarg1; // source java byte array address 4423 const Register len = c_rarg2; // length 4424 const Register table0 = c_rarg3; // crc_table address 4425 const Register 
table1 = c_rarg4; 4426 const Register table2 = c_rarg5; 4427 const Register table3 = c_rarg6; 4428 const Register tmp3 = c_rarg7; 4429 4430 BLOCK_COMMENT("Entry:"); 4431 __ enter(); // required for proper stackwalking of RuntimeStub frame 4432 4433 __ kernel_crc32c(crc, buf, len, 4434 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 4435 4436 __ leave(); // required for proper stackwalking of RuntimeStub frame 4437 __ ret(lr); 4438 4439 return start; 4440 } 4441 4442 /*** 4443 * Arguments: 4444 * 4445 * Inputs: 4446 * c_rarg0 - int adler 4447 * c_rarg1 - byte* buff 4448 * c_rarg2 - int len 4449 * 4450 * Output: 4451 * c_rarg0 - int adler result 4452 */ 4453 address generate_updateBytesAdler32() { 4454 __ align(CodeEntryAlignment); 4455 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 4456 address start = __ pc(); 4457 4458 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 4459 4460 // Aliases 4461 Register adler = c_rarg0; 4462 Register s1 = c_rarg0; 4463 Register s2 = c_rarg3; 4464 Register buff = c_rarg1; 4465 Register len = c_rarg2; 4466 Register nmax = r4; 4467 Register base = r5; 4468 Register count = r6; 4469 Register temp0 = rscratch1; 4470 Register temp1 = rscratch2; 4471 FloatRegister vbytes = v0; 4472 FloatRegister vs1acc = v1; 4473 FloatRegister vs2acc = v2; 4474 FloatRegister vtable = v3; 4475 4476 // Max number of bytes we can process before having to take the mod 4477 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 4478 uint64_t BASE = 0xfff1; 4479 uint64_t NMAX = 0x15B0; 4480 4481 __ mov(base, BASE); 4482 __ mov(nmax, NMAX); 4483 4484 // Load accumulation coefficients for the upper 16 bits 4485 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 4486 __ ld1(vtable, __ T16B, Address(temp0)); 4487 4488 // s1 is initialized to the lower 16 bits of adler 4489 // s2 is initialized to the upper 16 bits of 
adler 4490 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 4491 __ uxth(s1, adler); // s1 = (adler & 0xffff) 4492 4493 // The pipelined loop needs at least 16 elements for 1 iteration 4494 // It does check this, but it is more effective to skip to the cleanup loop 4495 __ cmp(len, (u1)16); 4496 __ br(Assembler::HS, L_nmax); 4497 __ cbz(len, L_combine); 4498 4499 __ bind(L_simple_by1_loop); 4500 __ ldrb(temp0, Address(__ post(buff, 1))); 4501 __ add(s1, s1, temp0); 4502 __ add(s2, s2, s1); 4503 __ subs(len, len, 1); 4504 __ br(Assembler::HI, L_simple_by1_loop); 4505 4506 // s1 = s1 % BASE 4507 __ subs(temp0, s1, base); 4508 __ csel(s1, temp0, s1, Assembler::HS); 4509 4510 // s2 = s2 % BASE 4511 __ lsr(temp0, s2, 16); 4512 __ lsl(temp1, temp0, 4); 4513 __ sub(temp1, temp1, temp0); 4514 __ add(s2, temp1, s2, ext::uxth); 4515 4516 __ subs(temp0, s2, base); 4517 __ csel(s2, temp0, s2, Assembler::HS); 4518 4519 __ b(L_combine); 4520 4521 __ bind(L_nmax); 4522 __ subs(len, len, nmax); 4523 __ sub(count, nmax, 16); 4524 __ br(Assembler::LO, L_by16); 4525 4526 __ bind(L_nmax_loop); 4527 4528 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 4529 vbytes, vs1acc, vs2acc, vtable); 4530 4531 __ subs(count, count, 16); 4532 __ br(Assembler::HS, L_nmax_loop); 4533 4534 // s1 = s1 % BASE 4535 __ lsr(temp0, s1, 16); 4536 __ lsl(temp1, temp0, 4); 4537 __ sub(temp1, temp1, temp0); 4538 __ add(temp1, temp1, s1, ext::uxth); 4539 4540 __ lsr(temp0, temp1, 16); 4541 __ lsl(s1, temp0, 4); 4542 __ sub(s1, s1, temp0); 4543 __ add(s1, s1, temp1, ext:: uxth); 4544 4545 __ subs(temp0, s1, base); 4546 __ csel(s1, temp0, s1, Assembler::HS); 4547 4548 // s2 = s2 % BASE 4549 __ lsr(temp0, s2, 16); 4550 __ lsl(temp1, temp0, 4); 4551 __ sub(temp1, temp1, temp0); 4552 __ add(temp1, temp1, s2, ext::uxth); 4553 4554 __ lsr(temp0, temp1, 16); 4555 __ lsl(s2, temp0, 4); 4556 __ sub(s2, s2, temp0); 4557 __ add(s2, s2, temp1, ext:: uxth); 4558 4559 __ subs(temp0, s2, base); 4560 __ 
csel(s2, temp0, s2, Assembler::HS);

    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::HS, L_nmax_loop);

    __ bind(L_by16);
    __ adds(len, len, count);
    __ br(Assembler::LO, L_by1);

    __ bind(L_by16_loop);

    generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
                                      vbytes, vs1acc, vs2acc, vtable);

    __ subs(len, len, 16);
    __ br(Assembler::HS, L_by16_loop);

    __ bind(L_by1);
    __ adds(len, len, 15);
    __ br(Assembler::LO, L_do_mod);

    __ bind(L_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HS, L_by1_loop);

    __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext:: uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext:: uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    // Combine lower bits and higher bits
    __ bind(L_combine);
    __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)

    __ ret(lr);

    return start;
  }

  // Emits the code that folds the next 16 input bytes into the running
  // Adler-32 sums.
  //
  //   s1, s2         - running sums, updated in place
  //   buff           - input pointer, post-incremented by 16
  //   temp0, temp1   - general-purpose scratch registers, overwritten
  //   vbytes         - receives the 16 loaded bytes
  //   vs1acc, vs2acc - vector accumulators, overwritten
  //   vtable         - multiplied element-wise with the loaded bytes; per the
  //                    derivation below it presumably holds the weights
  //                    (16, 15, ..., 1), set up by the caller (not visible here)
  //
  // No modulo reduction is emitted here; the caller reduces s1 and s2.
  void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
          Register temp0, Register temp1, FloatRegister vbytes,
          FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
    // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
    // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + s1
    //   ...
    //   s1 <- s1 + b16
    //   s2 <- s2 + s1
    // Putting above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b16
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
    //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
    //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
    __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));

    // s2 = s2 + s1 * 16
    __ add(s2, s2, s1, Assembler::LSL, 4);

    // vs1acc = b1 + b2 + b3 + ... + b16
    // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
    // NOTE(review): the T8B multiply presumably covers the low eight bytes and
    // the T16B multiply-accumulate the high eight - confirm against the
    // assembler's umullv/umlalv definitions.
    __ umullv(vs2acc, __ T8B, vtable, vbytes);
    __ umlalv(vs2acc, __ T16B, vtable, vbytes);
    __ uaddlv(vs1acc, __ T16B, vbytes);
    __ uaddlv(vs2acc, __ T8H, vs2acc);

    // s1 = s1 + vs1acc, s2 = s2 + vs2acc
    __ fmovd(temp0, vs1acc);
    __ fmovd(temp1, vs2acc);
    __ add(s1, s1, temp0);
    __ add(s2, s2, temp1);
  }

  /**
   *  Generates the "multiplyToLen" stub: a frame-wrapped call to
   *  MacroAssembler::multiply_to_len. Returns the stub's entry address.
   *
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - y address
   *    c_rarg3   - y length
   *    c_rarg4   - z address
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x = r0;
    const Register xlen = r1;
    const Register y = r2;
    const Register ylen = r3;
    const Register z = r4;

    // Scratch registers handed to multiply_to_len.
    const Register tmp0 = r5;
    const Register tmp1 = r10;
    const Register tmp2 = r11;
    const Register tmp3 = r12;
    const Register tmp4 = r13;
    const Register tmp5 = r14;
    const Register tmp6 = r15;
    const Register tmp7 = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  // Generates the "squareToLen" stub. Squaring is implemented by calling
  // multiply_to_len with both operands set to x: r4/r5 are saved, loaded with
  // copies of x/xlen to act as y/ylen, and restored afterwards. Note that z is
  // r2 here, unlike in generate_multiplyToLen. Returns the stub's entry address.
  address generate_squareToLen() {
    // squareToLen algorithm for sizes 1..127 described in java code works
    // faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len shows a bit better overall results
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "squareToLen");
    address start = __ pc();

    const Register x = r0;
    const Register xlen = r1;
    const Register z = r2;
    const Register y = r4; // == x
    const Register ylen = r5; // == xlen

    const Register tmp0 = r3;
    const Register tmp1 = r10;
    const Register tmp2 = r11;
    const Register tmp3 = r12;
    const Register tmp4 = r13;
    const Register tmp5 = r14;
    const Register tmp6 = r15;
    const Register tmp7 = r16;

    // y and ylen are overwritten below, so preserve their incoming values.
    RegSet spilled_regs = RegSet::of(y, ylen);
    BLOCK_COMMENT("Entry:");
    __ enter();
    __ push(spilled_regs, sp);
    __ mov(y, x);
    __ mov(ylen, xlen);
    __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);
    return start;
  }

  // Generates the "mulAdd" stub: a frame-wrapped call to
  // MacroAssembler::mul_add with out=r0, in=r1, offset=r2, len=r3, k=r4.
  // Returns the stub's entry address.
  address generate_mulAdd() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "mulAdd");

    address start = __ pc();

    const Register out = r0;
    const Register in = r1;
    const Register offset = r2;
    const Register len = r3;
    const Register k = r4;

    BLOCK_COMMENT("Entry:");
    __ enter();
    __ mul_add(out, in, offset, len, k);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
// Input:
  //   c_rarg0   - newArr address
  //   c_rarg1   - oldArr address
  //   c_rarg2   - newIdx
  //   c_rarg3   - shiftCount
  //   c_rarg4   - numIter
  //
  // Generates the "bigIntegerRightShiftWorker" stub. For each of the numIter
  // 32-bit words the emitted code stores, walking from the highest index down,
  //   newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)
  //                      | (oldArr[i] << (32 - shiftCount))
  // using 4-word and 2-word vector steps and a scalar path for the remainder.
  // Uses r10-r14, rscratch1, rscratch2 and v0-v4 as temporaries.
  // Returns the stub's entry address.
  address generate_bigIntegerRightShift() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
    address start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr = c_rarg0;
    Register oldArr = c_rarg1;
    Register newIdx = c_rarg2;
    Register shiftCount = c_rarg3;
    Register numIter = c_rarg4;
    Register idx = numIter; // alias: numIter doubles as the running index

    Register newArrCur = rscratch1;
    Register shiftRevCount = rscratch2;
    Register oldArrCur = r13;
    Register oldArrNext = r14;

    FloatRegister oldElem0 = v0;
    FloatRegister oldElem1 = v1;
    FloatRegister newElem = v2;
    FloatRegister shiftVCount = v3;
    FloatRegister shiftVRevCount = v4;

    // Nothing to do for zero iterations.
    __ cbz(idx, Exit);

    // newArr += newIdx * 4, so later offsets are relative to newArr[newIdx]
    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

    // left shift count
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, falling back to the
    // scalar path
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    // shiftVCount is negated so that the ushl below acts as a right shift
    // (presumably relying on USHL's negative-count behaviour - see the ISA
    // reference).
    __ dup(shiftVCount, __ T4S, shiftCount);
    __ dup(shiftVRevCount, __ T4S, shiftRevCount);
    __ negr(shiftVCount, __ T4S, shiftVCount);

    __ BIND(ShiftSIMDLoop);

    // Calculate the load addresses
    __ sub(idx, idx, 4);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur, oldArrNext, 4);

    // Load 4 words and process
    __ ld1(oldElem0, __ T4S, Address(oldArrCur));
    __ ld1(oldElem1, __ T4S, Address(oldArrNext));
    __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T16B, oldElem0, oldElem1);
    __ st1(newElem, __ T4S, Address(newArrCur));

    __ cmp(idx, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    // Fewer than 4 words left after the SIMD loop: handle them two at a time,
    // then a final single word if idx is odd.
    __ BIND(ShiftTwoLoop);
    __ cbz(idx, Exit);
    __ cmp(idx, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // Calculate the load addresses
    __ sub(idx, idx, 2);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur, oldArrNext, 4);

    // Load 2 words and process
    __ ld1(oldElem0, __ T2S, Address(oldArrCur));
    __ ld1(oldElem1, __ T2S, Address(oldArrNext));
    __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T8B, oldElem0, oldElem1);
    __ st1(newElem, __ T2S, Address(newArrCur));
    __ b(ShiftTwoLoop);

    // Scalar path, entered with 1 <= idx <= 3: bit 1 clear means idx == 1,
    // bit 0 clear means idx == 2, otherwise idx == 3. The three cases fall
    // through into one another, handling word 2, then word 1, then word 0.
    __ BIND(ShiftThree);
    __ tbz(idx, 1, ShiftOne);
    __ tbz(idx, 0, ShiftTwo);
    __ ldrw(r10, Address(oldArr, 12));
    __ ldrw(r11, Address(oldArr, 8));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr, 8));

    __ BIND(ShiftTwo);
    __ ldrw(r10, Address(oldArr, 8));
    __ ldrw(r11, Address(oldArr, 4));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr, 4));

    __ BIND(ShiftOne);
    __ ldrw(r10, Address(oldArr, 4));
    __ ldrw(r11, Address(oldArr));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Input:
  //   c_rarg0   - newArr address
  //   c_rarg1   - oldArr address
  //   c_rarg2   - newIdx
  //   c_rarg3   - shiftCount
  //   c_rarg4   - numIter
  //
  address
generate_bigIntegerLeftShift() {
    // Generates the "bigIntegerLeftShiftWorker" stub. For each of the numIter
    // 32-bit words the emitted code stores, walking from the lowest index up,
    //   newArr[newIdx + i] = (oldArr[i] << shiftCount)
    //                      | (oldArr[i + 1] >> (32 - shiftCount))
    // using 4-word and 2-word vector steps and a scalar path for the remainder.
    // Uses r10-r12, rscratch1, rscratch2 and v0-v4 as temporaries; the
    // pointer arguments and numIter are modified. Returns the entry address.
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
    address start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr = c_rarg0;
    Register oldArr = c_rarg1;
    Register newIdx = c_rarg2;
    Register shiftCount = c_rarg3;
    Register numIter = c_rarg4;

    Register shiftRevCount = rscratch1;
    Register oldArrNext = rscratch2;

    FloatRegister oldElem0 = v0;
    FloatRegister oldElem1 = v1;
    FloatRegister newElem = v2;
    FloatRegister shiftVCount = v3;
    FloatRegister shiftVRevCount = v4;

    // Nothing to do for zero iterations.
    __ cbz(numIter, Exit);

    // oldArrNext tracks the word following the one oldArr points at;
    // newArr is advanced to newArr[newIdx].
    __ add(oldArrNext, oldArr, 4);
    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

    // right shift count
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, falling back to the
    // scalar path
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    // shiftVRevCount is negated so that the ushl below acts as a right shift
    // (presumably relying on USHL's negative-count behaviour - see the ISA
    // reference).
    __ dup(shiftVCount, __ T4S, shiftCount);
    __ dup(shiftVRevCount, __ T4S, shiftRevCount);
    __ negr(shiftVRevCount, __ T4S, shiftVRevCount);

    __ BIND(ShiftSIMDLoop);

    // load 4 words and process
    __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
    __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
    __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T16B, oldElem0, oldElem1);
    __ st1(newElem, __ T4S, __ post(newArr, 16));
    __ sub(numIter, numIter, 4);

    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    // Fewer than 4 words left after the SIMD loop: handle them two at a time,
    // then a final single word if numIter is odd.
    __ BIND(ShiftTwoLoop);
    __ cbz(numIter, Exit);
    __ cmp(numIter, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // load 2 words and process
    __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
    __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
    __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T8B, oldElem0, oldElem1);
    __ st1(newElem, __ T2S, __ post(newArr, 8));
    __ sub(numIter, numIter, 2);
    __ b(ShiftTwoLoop);

    // Scalar path, entered with 1 <= numIter <= 3. One word is always
    // processed here; then bit 1 clear (numIter == 1) exits, bit 0 clear
    // (numIter == 2) needs one more word, otherwise (numIter == 3) two more.
    __ BIND(ShiftThree);
    __ ldrw(r10, __ post(oldArr, 4));
    __ ldrw(r11, __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, __ post(newArr, 4));
    __ tbz(numIter, 1, Exit);
    __ tbz(numIter, 0, ShiftOne);

    __ BIND(ShiftTwo);
    __ ldrw(r10, __ post(oldArr, 4));
    __ ldrw(r11, __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, __ post(newArr, 4));

    __ BIND(ShiftOne);
    __ ldrw(r10, Address(oldArr));
    __ ldrw(r11, Address(oldArrNext));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }

  // Generates the "count_positives" stub and returns its primary entry point.
  // A second entry point, which skips the short-array handling and starts at
  // the code for len > 15, is stored into count_positives_long.
  //
  //   ary1   = r1 - array address
  //   len    = r2 - byte count
  //   result = r0 - must already hold a copy of len on entry
  //
  // A byte counts as negative when its top bit (0x80) is set. On the short
  // path result is zeroed if a negative byte is seen and left unchanged
  // otherwise. On the long path result is reduced by the not-yet-verified
  // remainder when a 8/16/64-byte chunk containing a negative byte is found.
  address generate_count_positives(address &count_positives_long) {
    const u1 large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
    int dcache_line = VM_Version::dcache_line_size();

    Register ary1 = r1, len = r2, result = r0;

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "count_positives");

    address entry = __ pc();

    __ enter();
    // precondition: a copy of len is already in result
    // __ mov(result, len);

  Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
        LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

  __ cmp(len, (u1)15);
  __ br(Assembler::GT, LEN_OVER_15);
  // The only case when execution falls into this code is when pointer is near
  // the end of memory page and we have to avoid reading next page
  // The loads below therefore end exactly at ary1 + len and the bytes that
  // precede the array are shifted out before testing.
  __ add(ary1, ary1, len);
  __ subs(len, len, 8);
  __ br(Assembler::GT, LEN_OVER_8);
  __ ldr(rscratch2, Address(ary1, -8));
  __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
  __ lsrv(rscratch2, rscratch2, rscratch1);
  __ tst(rscratch2, UPPER_BIT_MASK);
  __ csel(result, zr, result, Assembler::NE);
  __ leave();
  __ ret(lr);
  __ bind(LEN_OVER_8);
  __ ldp(rscratch1, rscratch2, Address(ary1, -16));
  __ sub(len, len, 8); // no data dep., then sub can be executed while loading
  __ tst(rscratch2, UPPER_BIT_MASK);
  __ br(Assembler::NE, RET_NO_POP);
  __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
  __ lsrv(rscratch1, rscratch1, rscratch2);
  __ tst(rscratch1, UPPER_BIT_MASK);
  __ bind(RET_NO_POP);
  __ csel(result, zr, result, Assembler::NE);
  __ leave();
  __ ret(lr);

  Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
  const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

  count_positives_long = __ pc(); // 2nd entry point

  __ enter();

  __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    // Unaligned start: test the first 16 bytes, then advance ary1 to the next
    // 16-byte boundary (some bytes get re-tested by the aligned code).
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
    __ add(ary1, ary1, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, rscratch1);

  __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform 16-byte load as early return in pre-loop to handle situation
    // when initially aligned large array has negative values at starting bytes,
    // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
    // slower. Cases with negative bytes further ahead won't be affected that
    // much. In fact, it'll be faster due to early loads, less instructions and
    // less branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
  __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue load instructions first, since it can save few CPU/MEM cycles, also
    // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
    // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
    // instructions per cycle and have less branches, but this approach disables
    // early return, thus, all 64 bytes are loaded and checked every time.
    __ ldp(tmp2, tmp3, Address(ary1));
    __ ldp(tmp4, tmp5, Address(ary1, 16));
    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
    __ ldp(tmp6, tmp1, Address(ary1, 48));
    __ add(ary1, ary1, large_loop_size);
    __ sub(len, len, large_loop_size);
    __ orr(tmp2, tmp2, tmp3);
    __ orr(tmp4, tmp4, tmp5);
    __ orr(rscratch1, rscratch1, rscratch2);
    __ orr(tmp6, tmp6, tmp1);
    __ orr(tmp2, tmp2, tmp4);
    __ orr(rscratch1, rscratch1, tmp6);
    __ orr(tmp2, tmp2, rscratch1);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_LONG);
    __ cmp(len, large_loop_size);
    __ br(Assembler::GE, LARGE_LOOP);

  __ bind(CHECK_16); // small 16-byte load pre-loop
    __ cmp(len, (u1)16);
    __ br(Assembler::LT, POST_LOOP16);

  __ bind(LOOP16); // small 16-byte load loop
    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp2, tmp2, tmp3);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, (u1)16);
    __ br(Assembler::GE, LOOP16); // 16-byte load loop end

  __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
    __ cmp(len, (u1)8);
    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
    __ ldr(tmp3, Address(__ post(ary1, 8)));
    __ tst(tmp3, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, 8);

  __ bind(POST_LOOP16_LOAD_TAIL);
    __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
    // Load 8 bytes and shift left by (64 - len * 8) so that only the len
    // remaining bytes are tested.
    __ ldr(tmp1, Address(ary1));
    __ mov(tmp2, 64);
    __ sub(tmp4, tmp2, len, __ LSL, 3);
    __ lslv(tmp1, tmp1, tmp4);
    __ tst(tmp1, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    // Fallthrough

  __ bind(RET_LEN);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);

    // difference result - len is the count of guaranteed to be
    // positive bytes
    // The labels below fall through into each other, adding back to len the
    // size of the chunk in which the negative byte was detected.

  __ bind(RET_ADJUST_LONG);
    __ add(len, len, (u1)(large_loop_size - 16));
  __ bind(RET_ADJUST_16);
    __ add(len, len, 16);
  __ bind(RET_ADJUST);
    __ pop(spilled_regs, sp);
    __ leave();
    __ sub(result, result, len);
    __ ret(lr);

    return entry;
  }

  // Emits the scalar (ldp-based) main loop of the large_array_equals stub.
  // Each iteration compares 64 bytes of a1 against a2, with the loads for the
  // next comparison issued ahead of the current one; 16 bytes are pre-loaded
  // before the loop and compared after it.
  //
  //   loopThreshold - the loop repeats while cnt1 >= loopThreshold
  //   usePrefetch   - when true, a prfm is emitted for both arrays at
  //                   SoftwarePrefetchHintDistance on every iteration
  //   NOT_EQUAL     - branch target taken on the first mismatch
  //
  // Advances a1/a2 and decrements cnt1; uses tmp1-tmp8 as scratch.
  void generate_large_array_equals_loop_nonsimd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label LOOP;

    // Pre-load the first 16 bytes of each array.
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operand.
    __ subs(tmp6, cnt1, loopThreshold);
    __ orr(tmp5, tmp5, tmp7);
    // cbnz leaves the flags set by the subs above intact for the loop branch.
    __ cbnz(tmp5, NOT_EQUAL);
    __ br(__ GE, LOOP);
    // post-loop
    // Compare the 16 bytes that were loaded ahead by the last iteration.
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ orr(tmp1, tmp1, tmp3);
    __ sub(cnt1, cnt1, 2 * wordSize);
    __ cbnz(tmp1, NOT_EQUAL);
  }

  // Emits the SIMD (ld1-based) main loop of the large_array_equals stub.
  // Each iteration loads and compares 64 bytes of a1 against a2 in v0-v7.
  //
  //   loopThreshold - the loop repeats while cnt1 >= loopThreshold
  //   usePrefetch   - when true, a prfm is emitted for both arrays at
  //                   SoftwarePrefetchHintDistance on every iteration
  //   NOT_EQUAL     - branch target taken on the first mismatch
  //
  // Advances a1/a2 and decrements cnt1; uses rscratch1/rscratch2 as scratch.
  void generate_large_array_equals_loop_simd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2;
    Label LOOP;

    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
    // Only the flags of this subs are needed (for the br at the end); tmp1 is
    // overwritten by the umov below.
    __ subs(tmp1, cnt1, loopThreshold);
    __ eor(v0, __ T16B, v0, v4);
    __ eor(v1, __ T16B, v1, v5);
    __ eor(v2, __ T16B, v2, v6);
    __ eor(v3, __ T16B, v3, v7);
    __ orr(v0, __ T16B, v0, v1);
    __ orr(v1, __ T16B, v2, v3);
    __ orr(v0, __ T16B, v0, v1);
    // Fold the 128-bit difference into one general-purpose register.
    __ umov(tmp1, v0, __ D, 0);
    __ umov(tmp2, v0, __ D, 1);
    __ orr(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GE, LOOP);
  }

  // a1 = r1 - array1 address
  // a2 = r2 - array2 address
  // result = r0 - return value.
Already contains "false" 5243 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 5244 // r3-r5 are reserved temporary registers 5245 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 5246 address generate_large_array_equals() { 5247 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 5248 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 5249 tmp7 = r12, tmp8 = r13; 5250 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 5251 SMALL_LOOP, POST_LOOP; 5252 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 5253 // calculate if at least 32 prefetched bytes are used 5254 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 5255 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 5256 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 5257 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 5258 tmp5, tmp6, tmp7, tmp8); 5259 5260 __ align(CodeEntryAlignment); 5261 5262 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 5263 5264 address entry = __ pc(); 5265 __ enter(); 5266 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 5267 // also advance pointers to use post-increment instead of pre-increment 5268 __ add(a1, a1, wordSize); 5269 __ add(a2, a2, wordSize); 5270 if (AvoidUnalignedAccesses) { 5271 // both implementations (SIMD/nonSIMD) are using relatively large load 5272 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 5273 // on some CPUs in case of address is not at least 16-byte aligned. 5274 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 5275 // load if needed at least for 1st address and make if 16-byte aligned. 
5276 Label ALIGNED16; 5277 __ tbz(a1, 3, ALIGNED16); 5278 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5279 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5280 __ sub(cnt1, cnt1, wordSize); 5281 __ eor(tmp1, tmp1, tmp2); 5282 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 5283 __ bind(ALIGNED16); 5284 } 5285 if (UseSIMDForArrayEquals) { 5286 if (SoftwarePrefetchHintDistance >= 0) { 5287 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5288 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5289 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 5290 /* prfm = */ true, NOT_EQUAL); 5291 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5292 __ br(__ LT, TAIL); 5293 } 5294 __ bind(NO_PREFETCH_LARGE_LOOP); 5295 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 5296 /* prfm = */ false, NOT_EQUAL); 5297 } else { 5298 __ push(spilled_regs, sp); 5299 if (SoftwarePrefetchHintDistance >= 0) { 5300 __ subs(tmp1, cnt1, prefetchLoopThreshold); 5301 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 5302 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 5303 /* prfm = */ true, NOT_EQUAL); 5304 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 5305 __ br(__ LT, TAIL); 5306 } 5307 __ bind(NO_PREFETCH_LARGE_LOOP); 5308 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 5309 /* prfm = */ false, NOT_EQUAL); 5310 } 5311 __ bind(TAIL); 5312 __ cbz(cnt1, EQUAL); 5313 __ subs(cnt1, cnt1, wordSize); 5314 __ br(__ LE, POST_LOOP); 5315 __ bind(SMALL_LOOP); 5316 __ ldr(tmp1, Address(__ post(a1, wordSize))); 5317 __ ldr(tmp2, Address(__ post(a2, wordSize))); 5318 __ subs(cnt1, cnt1, wordSize); 5319 __ eor(tmp1, tmp1, tmp2); 5320 __ cbnz(tmp1, NOT_EQUAL); 5321 __ br(__ GT, SMALL_LOOP); 5322 __ bind(POST_LOOP); 5323 __ ldr(tmp1, Address(a1, cnt1)); 5324 __ ldr(tmp2, Address(a2, cnt1)); 5325 __ eor(tmp1, tmp1, tmp2); 5326 __ cbnz(tmp1, NOT_EQUAL); 5327 __ bind(EQUAL); 5328 __ mov(result, true); 5329 __ bind(NOT_EQUAL); 5330 if (!UseSIMDForArrayEquals) { 5331 __ pop(spilled_regs, sp); 5332 } 5333 
__ bind(NOT_EQUAL_NO_POP); 5334 __ leave(); 5335 __ ret(lr); 5336 return entry; 5337 } 5338 5339 // result = r0 - return value. Contains initial hashcode value on entry. 5340 // ary = r1 - array address 5341 // cnt = r2 - elements count 5342 // Clobbers: v0-v13, rscratch1, rscratch2 5343 address generate_large_arrays_hashcode(BasicType eltype) { 5344 const Register result = r0, ary = r1, cnt = r2; 5345 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0; 5346 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7; 5347 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0> 5348 const FloatRegister vpowm = v13; 5349 5350 ARRAYS_HASHCODE_REGISTERS; 5351 5352 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE; 5353 5354 unsigned int vf; // vectorization factor 5355 bool multiply_by_halves; 5356 Assembler::SIMD_Arrangement load_arrangement; 5357 switch (eltype) { 5358 case T_BOOLEAN: 5359 case T_BYTE: 5360 load_arrangement = Assembler::T8B; 5361 multiply_by_halves = true; 5362 vf = 8; 5363 break; 5364 case T_CHAR: 5365 case T_SHORT: 5366 load_arrangement = Assembler::T8H; 5367 multiply_by_halves = true; 5368 vf = 8; 5369 break; 5370 case T_INT: 5371 load_arrangement = Assembler::T4S; 5372 multiply_by_halves = false; 5373 vf = 4; 5374 break; 5375 default: 5376 ShouldNotReachHere(); 5377 } 5378 5379 // Unroll factor 5380 const unsigned uf = 4; 5381 5382 // Effective vectorization factor 5383 const unsigned evf = vf * uf; 5384 5385 __ align(CodeEntryAlignment); 5386 5387 const char *mark_name = ""; 5388 switch (eltype) { 5389 case T_BOOLEAN: 5390 mark_name = "_large_arrays_hashcode_boolean"; 5391 break; 5392 case T_BYTE: 5393 mark_name = "_large_arrays_hashcode_byte"; 5394 break; 5395 case T_CHAR: 5396 mark_name = "_large_arrays_hashcode_char"; 5397 break; 5398 case T_SHORT: 5399 mark_name = "_large_arrays_hashcode_short"; 5400 break; 5401 case T_INT: 5402 mark_name = "_large_arrays_hashcode_int"; 
5403 break; 5404 default: 5405 mark_name = "_large_arrays_hashcode_incorrect_type"; 5406 __ should_not_reach_here(); 5407 }; 5408 5409 StubCodeMark mark(this, "StubRoutines", mark_name); 5410 5411 address entry = __ pc(); 5412 __ enter(); 5413 5414 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in 5415 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's 5416 // value shouldn't change throughout both loops. 5417 __ movw(rscratch1, intpow(31U, 3)); 5418 __ mov(vpow, Assembler::S, 0, rscratch1); 5419 __ movw(rscratch1, intpow(31U, 2)); 5420 __ mov(vpow, Assembler::S, 1, rscratch1); 5421 __ movw(rscratch1, intpow(31U, 1)); 5422 __ mov(vpow, Assembler::S, 2, rscratch1); 5423 __ movw(rscratch1, intpow(31U, 0)); 5424 __ mov(vpow, Assembler::S, 3, rscratch1); 5425 5426 __ mov(vmul0, Assembler::T16B, 0); 5427 __ mov(vmul0, Assembler::S, 3, result); 5428 5429 __ andr(rscratch2, cnt, (uf - 1) * vf); 5430 __ cbz(rscratch2, LARGE_LOOP_PREHEADER); 5431 5432 __ movw(rscratch1, intpow(31U, multiply_by_halves ? 
vf / 2 : vf)); 5433 __ mov(vpowm, Assembler::S, 0, rscratch1); 5434 5435 // SMALL LOOP 5436 __ bind(SMALL_LOOP); 5437 5438 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype)))); 5439 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 5440 __ subsw(rscratch2, rscratch2, vf); 5441 5442 if (load_arrangement == Assembler::T8B) { 5443 // Extend 8B to 8H to be able to use vector multiply 5444 // instructions 5445 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 5446 if (is_signed_subword_type(eltype)) { 5447 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5448 } else { 5449 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5450 } 5451 } 5452 5453 switch (load_arrangement) { 5454 case Assembler::T4S: 5455 __ addv(vmul0, load_arrangement, vmul0, vdata0); 5456 break; 5457 case Assembler::T8B: 5458 case Assembler::T8H: 5459 assert(is_subword_type(eltype), "subword type expected"); 5460 if (is_signed_subword_type(eltype)) { 5461 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5462 } else { 5463 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5464 } 5465 break; 5466 default: 5467 __ should_not_reach_here(); 5468 } 5469 5470 // Process the upper half of a vector 5471 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 5472 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 5473 if (is_signed_subword_type(eltype)) { 5474 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5475 } else { 5476 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H); 5477 } 5478 } 5479 5480 __ br(Assembler::HI, SMALL_LOOP); 5481 5482 // SMALL LOOP'S EPILOQUE 5483 __ lsr(rscratch2, cnt, exact_log2(evf)); 5484 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER); 5485 5486 __ mulv(vmul0, Assembler::T4S, vmul0, vpow); 5487 __ addv(vmul0, Assembler::T4S, vmul0); 5488 __ umov(result, vmul0, Assembler::S, 0); 5489 5490 // TAIL 5491 __ bind(TAIL); 5492 5493 // 
The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs 5494 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs. 5495 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC"); 5496 __ andr(rscratch2, cnt, vf - 1); 5497 __ bind(TAIL_SHORTCUT); 5498 __ adr(rscratch1, BR_BASE); 5499 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, 3); 5500 __ movw(rscratch2, 0x1f); 5501 __ br(rscratch1); 5502 5503 for (size_t i = 0; i < vf - 1; ++i) { 5504 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))), 5505 eltype); 5506 __ maddw(result, result, rscratch2, rscratch1); 5507 } 5508 __ bind(BR_BASE); 5509 5510 __ leave(); 5511 __ ret(lr); 5512 5513 // LARGE LOOP 5514 __ bind(LARGE_LOOP_PREHEADER); 5515 5516 __ lsr(rscratch2, cnt, exact_log2(evf)); 5517 5518 if (multiply_by_halves) { 5519 // 31^4 - multiplier between lower and upper parts of a register 5520 __ movw(rscratch1, intpow(31U, vf / 2)); 5521 __ mov(vpowm, Assembler::S, 1, rscratch1); 5522 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4 5523 __ movw(rscratch1, intpow(31U, evf - vf / 2)); 5524 __ mov(vpowm, Assembler::S, 0, rscratch1); 5525 } else { 5526 // 31^16 5527 __ movw(rscratch1, intpow(31U, evf)); 5528 __ mov(vpowm, Assembler::S, 0, rscratch1); 5529 } 5530 5531 __ mov(vmul3, Assembler::T16B, 0); 5532 __ mov(vmul2, Assembler::T16B, 0); 5533 __ mov(vmul1, Assembler::T16B, 0); 5534 5535 __ bind(LARGE_LOOP); 5536 5537 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0); 5538 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0); 5539 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0); 5540 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0); 5541 5542 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement, 5543 Address(__ post(ary, evf * type2aelembytes(eltype)))); 5544 5545 if (load_arrangement == Assembler::T8B) { 5546 // Extend 8B to 8H to be able to use vector multiply 5547 // instructions 5548 
assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H"); 5549 if (is_signed_subword_type(eltype)) { 5550 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 5551 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 5552 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 5553 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5554 } else { 5555 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement); 5556 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement); 5557 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement); 5558 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement); 5559 } 5560 } 5561 5562 switch (load_arrangement) { 5563 case Assembler::T4S: 5564 __ addv(vmul3, load_arrangement, vmul3, vdata3); 5565 __ addv(vmul2, load_arrangement, vmul2, vdata2); 5566 __ addv(vmul1, load_arrangement, vmul1, vdata1); 5567 __ addv(vmul0, load_arrangement, vmul0, vdata0); 5568 break; 5569 case Assembler::T8B: 5570 case Assembler::T8H: 5571 assert(is_subword_type(eltype), "subword type expected"); 5572 if (is_signed_subword_type(eltype)) { 5573 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 5574 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 5575 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 5576 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5577 } else { 5578 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H); 5579 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H); 5580 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H); 5581 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H); 5582 } 5583 break; 5584 default: 5585 __ should_not_reach_here(); 5586 } 5587 5588 // Process the upper half of a vector 5589 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) { 5590 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1); 5591 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1); 
__ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
      __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
      if (is_signed_subword_type(eltype)) {
        __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
        __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
        __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
        __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      } else {
        __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
        __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
        __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
        __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      }
    }

    // One LARGE_LOOP iteration is done: decrement the iteration counter and
    // repeat while the decremented value is still non-zero (unsigned HI).
    __ subsw(rscratch2, rscratch2, 1);
    __ br(Assembler::HI, LARGE_LOOP);

    // Fold the four vector accumulators into the scalar result, vmul3 first
    // and vmul0 last. Each accumulator is multiplied lane-wise by vpow (set up
    // earlier in this function), its lanes are added together, and the sum is
    // moved to a general register. maddw then computes
    //   result = result * 31^vf + partial_sum
    __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
    __ addv(vmul3, Assembler::T4S, vmul3);
    __ umov(result, vmul3, Assembler::S, 0);

    __ mov(rscratch2, intpow(31U, vf));

    __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
    __ addv(vmul2, Assembler::T4S, vmul2);
    __ umov(rscratch1, vmul2, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
    __ addv(vmul1, Assembler::T4S, vmul1);
    __ umov(rscratch1, vmul1, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
    __ addv(vmul0, Assembler::T4S, vmul0);
    __ umov(rscratch1, vmul0, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    // Leftover elements: rscratch2 = cnt % vf. When non-zero, jump to
    // TAIL_SHORTCUT, which is bound right after the same andr in the TAIL
    // code, so rscratch2 is already what that code expects.
    __ andr(rscratch2, cnt, vf - 1);
    __ cbnz(rscratch2, TAIL_SHORTCUT);

    __ leave();
    __ ret(lr);

    return entry;
  }

  // Generates the libm sine or cosine stub. isCos selects cosine and also the
  // stub name ("libmDcos" vs. "libmDsin"). The instructions are emitted by
  // MacroAssembler::generate_dsin_dcos, which is handed the addresses of the
  // StubRoutines::aarch64 data listed below. Returns the stub's start address.
  address generate_dsin_dcos(bool isCos) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
    address start = __ pc();
    __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
        (address)StubRoutines::aarch64::_two_over_pi,
        (address)StubRoutines::aarch64::_pio2,
        (address)StubRoutines::aarch64::_dsin_coef,
        (address)StubRoutines::aarch64::_dcos_coef);
    return start;
  }

  // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
  //
  // Helper for generate_compare_long_string_different_encoding. Fixed register
  // roles (not parameters):
  //   r11 (tmp2) - pointer into the Latin1 string, post-incremented by 16
  //   r2  (cnt1) - pointer into the UTF-16 string, post-incremented by 8 per load
  //   r12 (tmp3) - UTF-16 longword pre-loaded by the caller (or by the previous
  //                invocation); on fall-through it holds the next pre-loaded one
  //   v0 (vtmpZ) - zipped with the Latin1 bytes to widen them; the caller zeroes
  //                it before the first call
  // tmpL/tmpU are scratch registers for the widened Latin1 longword and for
  // every other UTF-16 longword. The 16 characters are checked as four 8-byte
  // groups. On a mismatch, rscratch2 holds the XOR of the two longwords and
  // control goes to
  //   DIFF2 - when the UTF-16 side of the failing compare is in tmp3
  //   DIFF1 - when the UTF-16 side of the failing compare is in tmpU
  // No emitted instruction modifies the condition flags, so callers may keep
  // flags live across this helper.
  void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
      Label &DIFF2) {
    Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
    FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;

    __ ldrq(vtmp, Address(__ post(tmp2, 16)));
    __ ldr(tmpU, Address(__ post(cnt1, 8)));
    __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
    // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3

    // characters 0..3: widened Latin1 (low half of vtmp3) vs. tmp3
    __ fmovd(tmpL, vtmp3);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    // characters 4..7: high half of vtmp3 vs. tmpU; tmp3 is reloaded meanwhile
    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp3, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);

    // characters 8..11: widen the upper 8 Latin1 bytes in place (vtmp)
    __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
    __ ldr(tmpU, Address(__ post(cnt1, 8)));
    __ fmovd(tmpL, vtmp);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    // characters 12..15; the load into tmp3 pre-fetches the longword that the
    // next invocation (or the caller's LOAD_LAST code) will compare
    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);
  }

  // Stub comparing two long strings of different encodings. With isLU, str1 is
  // the Latin1 string and str2 the UTF-16 one; otherwise the roles are swapped
  // (see the pointer adjustments below, which advance the Latin1 pointer by 4
  // bytes and the UTF-16 pointer by 8).
  // r0  = result
  // r1  = str1
  // r2  = cnt1
  // r3  = str2
  // r4  = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_different_encoding(bool isLU) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isLU
        ? "compare_long_string_different_encoding LU"
        : "compare_long_string_different_encoding UL");
    address entry = __ pc();
    // NOTE(review): TAIL_LOAD_16 is declared but not referenced in this function.
    Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
        DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
        LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
    FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
    RegSet spilled_regs = RegSet::of(tmp3, tmp4);

    int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);

    // vtmpZ = 0; zipping with it widens Latin1 bytes to 16-bit characters
    __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == amount of characters left to compare
    // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ add(str1, str1, isLU ? wordSize/2 : wordSize);
    __ add(str2, str2, isLU ? wordSize : wordSize/2);
    __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
    __ eor(rscratch2, tmp1, tmp2);
    // CALCULATE_DIFFERENCE expects its operands in tmp1 and rscratch1
    __ mov(rscratch1, tmp2);
    __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
    Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
             tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
    __ push(spilled_regs, sp);
    __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
    __ mov(cnt1, isLU ?
str2 : str1); // init the pointer to U next load

    // pre-load the first UTF-16 longword, as compare_string_16_x_LU expects
    __ ldr(tmp3, Address(__ post(cnt1, 8)));

    if (SoftwarePrefetchHintDistance >= 0) {
      __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
      __ br(__ LT, NO_PREFETCH);
      // Prefetching loop: 64 characters per iteration, as 2 + 2 invocations of
      // the 16-character helper, with tmp4 as the inner counter.
      __ bind(LARGE_LOOP_PREFETCH);
        __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
        __ mov(tmp4, 2);
        __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
        __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
          __ subs(tmp4, tmp4, 1);
          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
          __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
          __ mov(tmp4, 2);
        __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
          __ subs(tmp4, tmp4, 1);
          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
          __ sub(cnt2, cnt2, 64);
          __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
          __ br(__ GE, LARGE_LOOP_PREFETCH);
    }
    __ cbz(cnt2, LOAD_LAST); // no characters left except last load
    __ bind(NO_PREFETCH);
    __ subs(cnt2, cnt2, 16);
    __ br(__ LT, TAIL);
    __ align(OptoLoopAlignment);
    __ bind(SMALL_LOOP); // smaller loop
      // The flags set by this subs are consumed by the br(GE) after the helper;
      // that works because the helper emits no flag-setting instructions.
      __ subs(cnt2, cnt2, 16);
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
      __ br(__ GE, SMALL_LOOP);
      // cnt2 == -16 means the loop consumed everything but the last 4 characters
      __ cmn(cnt2, (u1)16);
      __ br(__ EQ, LOAD_LAST);
    __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
      // cnt2 is negative here, so both additions move the pointers backwards
      // and the final 16-character compare overlaps already-checked data.
      __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
      __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
      // re-establish the helper's "tmp3 is pre-loaded" precondition
      __ ldr(tmp3, Address(cnt1, -8));
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
      __ b(LOAD_LAST);
    __ bind(DIFF2);
      // the mismatching UTF-16 longword is in tmp3; move it to tmpU so that
      // both mismatch exits leave the operands in tmpU/tmpL
      __ mov(tmpU, tmp3);
    __ bind(DIFF1);
      __ pop(spilled_regs, sp);
      __ b(CALCULATE_DIFFERENCE);
    __ bind(LOAD_LAST);
      // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
      // No need to load it again
      __ mov(tmpU, tmp3);
      __ pop(spilled_regs, sp);

      // tmp2 points to the address of the last 4 Latin1 characters right now
      __ ldrs(vtmp, Address(tmp2));
      __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
      __ fmovd(tmpL, vtmp);

      __ eor(rscratch2, tmpU, tmpL);
      // Equal strings: result is not written on this path.
      // NOTE(review): presumably the caller has already set result - confirm
      // at the call site.
      __ cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    // Entered with rscratch2 = XOR of the two longwords and the longwords
    // themselves in tmp1 and rscratch1 (tmpU/tmpL alias exactly these two).
    __ bind(CALCULATE_DIFFERENCE);
      // byte-reverse + clz gives the bit offset of the first differing byte in
      // memory order; masking with -16 rounds it down to a 16-bit character
      __ rev(rscratch2, rscratch2);
      __ clz(rscratch2, rscratch2);
      __ andr(rscratch2, rscratch2, -16);
      __ lsrv(tmp1, tmp1, rscratch2);
      __ uxthw(tmp1, tmp1);
      __ lsrv(rscratch1, rscratch1, rscratch2);
      __ uxthw(rscratch1, rscratch1);
      __ subw(result, tmp1, rscratch1);
    __ bind(DONE);
      __ ret(lr);
    return entry;
  }

  // Stub converting a float16 value to float; the conversion itself is
  // emitted by MacroAssembler::flt16_to_flt. Returns the stub entry address.
  // r0 = input (float16)
  // v0 = result (float)
  // v1 = temporary float register
  address generate_float16ToFloat() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
    address entry = __ pc();
    BLOCK_COMMENT("Entry:");
    __ flt16_to_flt(v0, r0, v1);
    __ ret(lr);
    return entry;
  }

  // Stub converting a float value to float16; the conversion itself is
  // emitted by MacroAssembler::flt_to_flt16. Returns the stub entry address.
  // v0 = input (float)
  // r0 = result (float16)
  // v1 = temporary float register
  address generate_floatToFloat16() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
    address entry = __ pc();
    BLOCK_COMMENT("Entry:");
    __ flt_to_flt16(r0, v0, v1);
    __ ret(lr);
    return entry;
  }

  // Generates the "nmethod_entry_barrier" stub, which calls
  // BarrierSetNMethod::nmethod_stub_entry_barrier and either returns to the
  // caller or continues at a different {sp, fp, lr, pc} when that call
  // returns non-zero. Returns the stub's start address.
  address generate_method_entry_barrier() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");

    Label deoptimize_label;

    address start = __ pc();

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();

if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
      BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
      // We can get here despite the nmethod being good, if we have not
      // yet applied our cross modification fence (or data fence).
      // Copy the 32-bit value at patching_epoch_addr() into the thread-local
      // word located 4 bytes after the thread's disarmed guard value, then
      // fence the instruction stream (isb) and subsequent loads (LoadLoad).
      Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
      __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
      __ ldrw(rscratch2, rscratch2);
      __ strw(rscratch2, thread_epoch_addr);
      __ isb();
      __ membar(__ LoadLoad);
    }

    __ set_last_Java_frame(sp, rfp, lr, rscratch1);

    __ enter();
    __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr

    __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}

    __ push_call_clobbered_registers();

    // single argument: the address of the saved lr computed above
    __ mov(c_rarg0, rscratch2);
    __ call_VM_leaf
         (CAST_FROM_FN_PTR
          (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);

    __ reset_last_Java_frame(true);

    // keep the call's return value out of r0 before the registers are restored
    __ mov(rscratch1, r0);

    __ pop_call_clobbered_registers();

    // non-zero return value: take the deoptimization exit below
    __ cbnz(rscratch1, deoptimize_label);

    __ leave();
    __ ret(lr);

    __ BIND(deoptimize_label);

    // Load the new {sp, fp, lr, pc} from the four words reserved above, then
    // switch to the new sp and jump to the new pc.
    // NOTE(review): presumably the runtime call filled these words - confirm.
    __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
    __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));

    __ mov(sp, rscratch1);
    __ br(rscratch2);

    return start;
  }

  // Stub comparing two long strings of the same encoding (isLL: both Latin1,
  // otherwise both UTF-16). On entry tmp1/tmp2 already hold the first 8 bytes
  // of str1/str2. cnt2 counts characters, so byte amounts are halved for
  // UTF-16 wherever cnt2 is adjusted.
  // r0  = result
  // r1  = str1
  // r2  = cnt1
  // r3  = str2
  // r4  = cnt2
  // r10 = tmp1
  // r11 = tmp2
  address generate_compare_long_string_same_encoding(bool isLL) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", isLL
        ? "compare_long_string_same_encoding LL"
        : "compare_long_string_same_encoding UU");
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;

    Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;

    // exit from large loop when less than 64 bytes left to read or we're about
    // to prefetch memory behind array border
    int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);

    // before jumping to stub, pre-load 8 bytes already, so do comparison directly
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, CAL_DIFFERENCE);

    // account for the 8 bytes just compared (8 Latin1 or 4 UTF-16 characters)
    __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // update pointers, because of previous read
    __ add(str1, str1, wordSize);
    __ add(str2, str2, wordSize);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ align(OptoLoopAlignment);
      // Prefetching loop: 64 bytes of each string per iteration, as four
      // 16-byte pairs.
      __ bind(LARGE_LOOP_PREFETCH);
        __ prfm(Address(str1, SoftwarePrefetchHintDistance));
        __ prfm(Address(str2, SoftwarePrefetchHintDistance));

        for (int i = 0; i < 4; i++) {
          __ ldp(tmp1, tmp1h, Address(str1, i * 16));
          __ ldp(tmp2, tmp2h, Address(str2, i * 16));
          // cmp + ccmp: the high words are only compared when the low words
          // are equal; otherwise the flags are forced to "not equal"
          __ cmp(tmp1, tmp2);
          __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
          __ br(Assembler::NE, DIFF);
        }
        __ sub(cnt2, cnt2, isLL ? 64 : 32);
        __ add(str1, str1, 64);
        __ add(str2, str2, 64);
        __ subs(rscratch2, cnt2, largeLoopExitCondition);
        __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
        __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
    }

    __ subs(rscratch1, cnt2, isLL ? 16 : 8);
    __ br(Assembler::LE, LESS16);
    __ align(OptoLoopAlignment);
    // 16 bytes per step; the loop body is emitted twice, so one trip through
    // the loop handles up to 32 bytes.
    __ bind(LOOP_COMPARE16);
      __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
      __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
      __ cmp(tmp1, tmp2);
      __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
      __ br(Assembler::NE, DIFF);
      __ sub(cnt2, cnt2, isLL ? 16 : 8);
      __ subs(rscratch2, cnt2, isLL ? 16 : 8);
      __ br(Assembler::LT, LESS16);

      __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
      __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
      __ cmp(tmp1, tmp2);
      __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
      __ br(Assembler::NE, DIFF);
      __ sub(cnt2, cnt2, isLL ? 16 : 8);
      __ subs(rscratch2, cnt2, isLL ? 16 : 8);
      __ br(Assembler::GE, LOOP_COMPARE16);
      __ cbz(cnt2, LENGTH_DIFF);

    __ bind(LESS16);
      // each 8 compare
      __ subs(cnt2, cnt2, isLL ? 8 : 4);
      __ br(Assembler::LE, LESS8);
      __ ldr(tmp1, Address(__ post(str1, 8)));
      __ ldr(tmp2, Address(__ post(str2, 8)));
      __ eor(rscratch2, tmp1, tmp2);
      __ cbnz(rscratch2, CAL_DIFFERENCE);
      __ sub(cnt2, cnt2, isLL ? 8 : 4);

    __ bind(LESS8); // directly load last 8 bytes
      // cnt2 is used as a byte offset from str1/str2 (characters doubled to
      // bytes for UTF-16).
      // NOTE(review): on both incoming paths cnt2 appears to be zero or
      // negative here, making this an overlapping load of the final 8 bytes -
      // confirm.
      if (!isLL) {
        __ add(cnt2, cnt2, cnt2);
      }
      __ ldr(tmp1, Address(str1, cnt2));
      __ ldr(tmp2, Address(str2, cnt2));
      __ eor(rscratch2, tmp1, tmp2);
      __ cbz(rscratch2, LENGTH_DIFF);
      __ b(CAL_DIFFERENCE);

    __ bind(DIFF);
      // A 16-byte pair differed: keep the low words if they differ, otherwise
      // substitute the high words, so that tmp1/tmp2 hold the differing pair.
      __ cmp(tmp1, tmp2);
      __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
      __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
      // reuse rscratch2 register for the result of eor instruction
      __ eor(rscratch2, tmp1, tmp2);

    // Entered with tmp1/tmp2 = differing longwords and rscratch2 = their XOR.
    // Byte-reverse + clz gives the bit offset of the first differing byte in
    // memory order; the mask applied next rounds it down to a character.
    __ bind(CAL_DIFFERENCE);
      __ rev(rscratch2, rscratch2);
      __ clz(rscratch2, rscratch2);
      __ andr(rscratch2, rscratch2, isLL ?
-8 : -16); 5988 __ lsrv(tmp1, tmp1, rscratch2); 5989 __ lsrv(tmp2, tmp2, rscratch2); 5990 if (isLL) { 5991 __ uxtbw(tmp1, tmp1); 5992 __ uxtbw(tmp2, tmp2); 5993 } else { 5994 __ uxthw(tmp1, tmp1); 5995 __ uxthw(tmp2, tmp2); 5996 } 5997 __ subw(result, tmp1, tmp2); 5998 5999 __ bind(LENGTH_DIFF); 6000 __ ret(lr); 6001 return entry; 6002 } 6003 6004 enum string_compare_mode { 6005 LL, 6006 LU, 6007 UL, 6008 UU, 6009 }; 6010 6011 // The following registers are declared in aarch64.ad 6012 // r0 = result 6013 // r1 = str1 6014 // r2 = cnt1 6015 // r3 = str2 6016 // r4 = cnt2 6017 // r10 = tmp1 6018 // r11 = tmp2 6019 // z0 = ztmp1 6020 // z1 = ztmp2 6021 // p0 = pgtmp1 6022 // p1 = pgtmp2 6023 address generate_compare_long_string_sve(string_compare_mode mode) { 6024 __ align(CodeEntryAlignment); 6025 address entry = __ pc(); 6026 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 6027 tmp1 = r10, tmp2 = r11; 6028 6029 Label LOOP, DONE, MISMATCH; 6030 Register vec_len = tmp1; 6031 Register idx = tmp2; 6032 // The minimum of the string lengths has been stored in cnt2. 
6033 Register cnt = cnt2; 6034 FloatRegister ztmp1 = z0, ztmp2 = z1; 6035 PRegister pgtmp1 = p0, pgtmp2 = p1; 6036 6037 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \ 6038 switch (mode) { \ 6039 case LL: \ 6040 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \ 6041 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \ 6042 break; \ 6043 case LU: \ 6044 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \ 6045 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 6046 break; \ 6047 case UL: \ 6048 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 6049 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \ 6050 break; \ 6051 case UU: \ 6052 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \ 6053 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \ 6054 break; \ 6055 default: \ 6056 ShouldNotReachHere(); \ 6057 } 6058 6059 const char* stubname; 6060 switch (mode) { 6061 case LL: stubname = "compare_long_string_same_encoding LL"; break; 6062 case LU: stubname = "compare_long_string_different_encoding LU"; break; 6063 case UL: stubname = "compare_long_string_different_encoding UL"; break; 6064 case UU: stubname = "compare_long_string_same_encoding UU"; break; 6065 default: ShouldNotReachHere(); 6066 } 6067 6068 StubCodeMark mark(this, "StubRoutines", stubname); 6069 6070 __ mov(idx, 0); 6071 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 6072 6073 if (mode == LL) { 6074 __ sve_cntb(vec_len); 6075 } else { 6076 __ sve_cnth(vec_len); 6077 } 6078 6079 __ sub(rscratch1, cnt, vec_len); 6080 6081 __ bind(LOOP); 6082 6083 // main loop 6084 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 6085 __ add(idx, idx, vec_len); 6086 // Compare strings. 6087 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? 
__ B : __ H, pgtmp1, ztmp1, ztmp2); 6088 __ br(__ NE, MISMATCH); 6089 __ cmp(idx, rscratch1); 6090 __ br(__ LT, LOOP); 6091 6092 // post loop, last iteration 6093 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt); 6094 6095 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx); 6096 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2); 6097 __ br(__ EQ, DONE); 6098 6099 __ bind(MISMATCH); 6100 6101 // Crop the vector to find its location. 6102 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */); 6103 // Extract the first different characters of each string. 6104 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1); 6105 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2); 6106 6107 // Compute the difference of the first different characters. 6108 __ sub(result, rscratch1, rscratch2); 6109 6110 __ bind(DONE); 6111 __ ret(lr); 6112 #undef LOAD_PAIR 6113 return entry; 6114 } 6115 6116 void generate_compare_long_strings() { 6117 if (UseSVE == 0) { 6118 StubRoutines::aarch64::_compare_long_string_LL 6119 = generate_compare_long_string_same_encoding(true); 6120 StubRoutines::aarch64::_compare_long_string_UU 6121 = generate_compare_long_string_same_encoding(false); 6122 StubRoutines::aarch64::_compare_long_string_LU 6123 = generate_compare_long_string_different_encoding(true); 6124 StubRoutines::aarch64::_compare_long_string_UL 6125 = generate_compare_long_string_different_encoding(false); 6126 } else { 6127 StubRoutines::aarch64::_compare_long_string_LL 6128 = generate_compare_long_string_sve(LL); 6129 StubRoutines::aarch64::_compare_long_string_UU 6130 = generate_compare_long_string_sve(UU); 6131 StubRoutines::aarch64::_compare_long_string_LU 6132 = generate_compare_long_string_sve(LU); 6133 StubRoutines::aarch64::_compare_long_string_UL 6134 = generate_compare_long_string_sve(UL); 6135 } 6136 } 6137 6138 // R0 = result 6139 // R1 = str2 6140 // R2 = cnt1 6141 // R3 = str1 6142 // R4 = cnt2 6143 // Clobbers: 
rscratch1, rscratch2, v0, v1, rflags 6144 // 6145 // This generic linear code use few additional ideas, which makes it faster: 6146 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 6147 // in order to skip initial loading(help in systems with 1 ld pipeline) 6148 // 2) we can use "fast" algorithm of finding single character to search for 6149 // first symbol with less branches(1 branch per each loaded register instead 6150 // of branch for each symbol), so, this is where constants like 6151 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 6152 // 3) after loading and analyzing 1st register of source string, it can be 6153 // used to search for every 1st character entry, saving few loads in 6154 // comparison with "simplier-but-slower" implementation 6155 // 4) in order to avoid lots of push/pop operations, code below is heavily 6156 // re-using/re-initializing/compressing register values, which makes code 6157 // larger and a bit less readable, however, most of extra operations are 6158 // issued during loads or branches, so, penalty is minimal 6159 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 6160 const char* stubName = str1_isL 6161 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 6162 : "indexof_linear_uu"; 6163 __ align(CodeEntryAlignment); 6164 StubCodeMark mark(this, "StubRoutines", stubName); 6165 address entry = __ pc(); 6166 6167 int str1_chr_size = str1_isL ? 1 : 2; 6168 int str2_chr_size = str2_isL ? 1 : 2; 6169 int str1_chr_shift = str1_isL ? 0 : 1; 6170 int str2_chr_shift = str2_isL ? 
0 : 1; 6171 bool isL = str1_isL && str2_isL; 6172 // parameters 6173 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 6174 // temporary registers 6175 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 6176 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 6177 // redefinitions 6178 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 6179 6180 __ push(spilled_regs, sp); 6181 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 6182 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 6183 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 6184 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 6185 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 6186 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 6187 // Read whole register from str1. It is safe, because length >=8 here 6188 __ ldr(ch1, Address(str1)); 6189 // Read whole register from str2. It is safe, because length >=8 here 6190 __ ldr(ch2, Address(str2)); 6191 __ sub(cnt2, cnt2, cnt1); 6192 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 6193 if (str1_isL != str2_isL) { 6194 __ eor(v0, __ T16B, v0, v0); 6195 } 6196 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 6197 __ mul(first, first, tmp1); 6198 // check if we have less than 1 register to check 6199 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 6200 if (str1_isL != str2_isL) { 6201 __ fmovd(v1, ch1); 6202 } 6203 __ br(__ LE, L_SMALL); 6204 __ eor(ch2, first, ch2); 6205 if (str1_isL != str2_isL) { 6206 __ zip1(v1, __ T16B, v1, v0); 6207 } 6208 __ sub(tmp2, ch2, tmp1); 6209 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6210 __ bics(tmp2, tmp2, ch2); 6211 if (str1_isL != str2_isL) { 6212 __ fmovd(ch1, v1); 6213 } 6214 __ br(__ NE, L_HAS_ZERO); 6215 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 6216 __ add(result, result, wordSize/str2_chr_size); 6217 __ add(str2, str2, wordSize); 6218 __ br(__ LT, L_POST_LOOP); 6219 __ BIND(L_LOOP); 6220 __ ldr(ch2, Address(str2)); 6221 __ eor(ch2, first, ch2); 6222 __ sub(tmp2, ch2, tmp1); 6223 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6224 __ bics(tmp2, tmp2, ch2); 6225 __ br(__ NE, L_HAS_ZERO); 6226 __ BIND(L_LOOP_PROCEED); 6227 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 6228 __ add(str2, str2, wordSize); 6229 __ add(result, result, wordSize/str2_chr_size); 6230 __ br(__ GE, L_LOOP); 6231 __ BIND(L_POST_LOOP); 6232 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 6233 __ br(__ LE, NOMATCH); 6234 __ ldr(ch2, Address(str2)); 6235 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 6236 __ eor(ch2, first, ch2); 6237 __ sub(tmp2, ch2, tmp1); 6238 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6239 __ mov(tmp4, -1); // all bits set 6240 __ b(L_SMALL_PROCEED); 6241 __ align(OptoLoopAlignment); 6242 __ BIND(L_SMALL); 6243 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 6244 __ eor(ch2, first, ch2); 6245 if (str1_isL != str2_isL) { 6246 __ zip1(v1, __ T16B, v1, v0); 6247 } 6248 __ sub(tmp2, ch2, tmp1); 6249 __ mov(tmp4, -1); // all bits set 6250 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 6251 if (str1_isL != str2_isL) { 6252 __ fmovd(ch1, v1); // move converted 4 symbols 6253 } 6254 __ BIND(L_SMALL_PROCEED); 6255 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 
6256 __ bic(tmp2, tmp2, ch2); 6257 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 6258 __ rbit(tmp2, tmp2); 6259 __ br(__ EQ, NOMATCH); 6260 __ BIND(L_SMALL_HAS_ZERO_LOOP); 6261 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 6262 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 6263 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 6264 if (str2_isL) { // LL 6265 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 6266 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 6267 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 6268 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6269 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6270 } else { 6271 __ mov(ch2, 0xE); // all bits in byte set except last one 6272 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6273 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6274 __ lslv(tmp2, tmp2, tmp4); 6275 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6276 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6277 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6278 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6279 } 6280 __ cmp(ch1, ch2); 6281 __ mov(tmp4, wordSize/str2_chr_size); 6282 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6283 __ BIND(L_SMALL_CMP_LOOP); 6284 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6285 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6286 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6287 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6288 __ add(tmp4, tmp4, 1); 6289 __ cmp(tmp4, cnt1); 6290 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 6291 __ cmp(first, ch2); 6292 __ br(__ EQ, L_SMALL_CMP_LOOP); 6293 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 6294 __ cbz(tmp2, NOMATCH); // no more matches. 
exit 6295 __ clz(tmp4, tmp2); 6296 __ add(result, result, 1); // advance index 6297 __ add(str2, str2, str2_chr_size); // advance pointer 6298 __ b(L_SMALL_HAS_ZERO_LOOP); 6299 __ align(OptoLoopAlignment); 6300 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 6301 __ cmp(first, ch2); 6302 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6303 __ b(DONE); 6304 __ align(OptoLoopAlignment); 6305 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 6306 if (str2_isL) { // LL 6307 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 6308 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 6309 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 6310 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 6311 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6312 } else { 6313 __ mov(ch2, 0xE); // all bits in byte set except last one 6314 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6315 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6316 __ lslv(tmp2, tmp2, tmp4); 6317 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6318 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6319 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 6320 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6321 } 6322 __ cmp(ch1, ch2); 6323 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 6324 __ b(DONE); 6325 __ align(OptoLoopAlignment); 6326 __ BIND(L_HAS_ZERO); 6327 __ rbit(tmp2, tmp2); 6328 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 6329 // Now, perform compression of counters(cnt2 and cnt1) into one register. 6330 // It's fine because both counters are 32bit and are not changed in this 6331 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 
6332 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 6333 __ sub(result, result, 1); 6334 __ BIND(L_HAS_ZERO_LOOP); 6335 __ mov(cnt1, wordSize/str2_chr_size); 6336 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6337 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 6338 if (str2_isL) { 6339 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6340 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6341 __ lslv(tmp2, tmp2, tmp4); 6342 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6343 __ add(tmp4, tmp4, 1); 6344 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6345 __ lsl(tmp2, tmp2, 1); 6346 __ mov(tmp4, wordSize/str2_chr_size); 6347 } else { 6348 __ mov(ch2, 0xE); 6349 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6350 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6351 __ lslv(tmp2, tmp2, tmp4); 6352 __ add(tmp4, tmp4, 1); 6353 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6354 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6355 __ lsl(tmp2, tmp2, 1); 6356 __ mov(tmp4, wordSize/str2_chr_size); 6357 __ sub(str2, str2, str2_chr_size); 6358 } 6359 __ cmp(ch1, ch2); 6360 __ mov(tmp4, wordSize/str2_chr_size); 6361 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6362 __ BIND(L_CMP_LOOP); 6363 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 6364 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 6365 str2_isL ? 
__ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 6366 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 6367 __ add(tmp4, tmp4, 1); 6368 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 6369 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 6370 __ cmp(cnt1, ch2); 6371 __ br(__ EQ, L_CMP_LOOP); 6372 __ BIND(L_CMP_LOOP_NOMATCH); 6373 // here we're not matched 6374 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 6375 __ clz(tmp4, tmp2); 6376 __ add(str2, str2, str2_chr_size); // advance pointer 6377 __ b(L_HAS_ZERO_LOOP); 6378 __ align(OptoLoopAlignment); 6379 __ BIND(L_CMP_LOOP_LAST_CMP); 6380 __ cmp(cnt1, ch2); 6381 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6382 __ b(DONE); 6383 __ align(OptoLoopAlignment); 6384 __ BIND(L_CMP_LOOP_LAST_CMP2); 6385 if (str2_isL) { 6386 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 6387 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6388 __ lslv(tmp2, tmp2, tmp4); 6389 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6390 __ add(tmp4, tmp4, 1); 6391 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6392 __ lsl(tmp2, tmp2, 1); 6393 } else { 6394 __ mov(ch2, 0xE); 6395 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 6396 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 6397 __ lslv(tmp2, tmp2, tmp4); 6398 __ add(tmp4, tmp4, 1); 6399 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 6400 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 6401 __ lsl(tmp2, tmp2, 1); 6402 __ sub(str2, str2, str2_chr_size); 6403 } 6404 __ cmp(ch1, ch2); 6405 __ br(__ NE, L_CMP_LOOP_NOMATCH); 6406 __ b(DONE); 6407 __ align(OptoLoopAlignment); 6408 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 6409 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 6410 // L_HAS_ZERO block. 
// Byte octet was analyzed in L_HAS_ZERO_LOOP,
  // so, result was increased at max by wordSize/str2_chr_size - 1, so,
  // respective high bit wasn't changed. L_LOOP_PROCEED will increase
  // result by analyzed characters value, so, we can just reset lower bits
  // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
  // 2) restore cnt1 and cnt2 values from "compressed" cnt2
  // 3) advance str2 value to represent next str2 octet. result & 7/3 is
  // index of last analyzed substring inside current octet. So, str2 is at
  // respective start address. We need to advance it to next octet
  __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
  __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
  __ bfm(result, zr, 0, 2 - str2_chr_shift);
  __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
  __ movw(cnt2, cnt2);
  __ b(L_LOOP_PROCEED);
  __ align(OptoLoopAlignment);
  __ BIND(NOMATCH);
  __ mov(result, -1);
  __ BIND(DONE);
  __ pop(spilled_regs, sp);
  __ ret(lr);
  return entry;
}

// Generates the three linear indexOf stub variants and publishes their entry
// points in StubRoutines::aarch64 (_string_indexof_linear_ll / _uu / _ul).
// The two booleans are forwarded positionally to generate_string_indexof_linear;
// NOTE(review): judging by the str1_isL / str2_isL names used inside that
// generator they presumably select Latin-1 encoding for str1 and str2
// respectively - confirm against its signature.
void generate_string_indexof_stubs() {
  StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
  StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
  StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
}

// Emits code that widens the 2 x 16 bytes held in src1 and src2 to 64 bytes
// and stores them at [r1], post-incrementing r1 by 64.
//
// Each source byte is interleaved (zip1/zip2) with the corresponding byte of
// v0; the caller is expected to keep v0 == 0 (see the register contract of
// generate_large_byte_array_inflate), which makes the interleave a
// byte -> 16-bit zero extension.
//
//   generatePrfm - when true, also emit a store prefetch (PSTL1STRM) for
//                  r1 + SoftwarePrefetchHintDistance
//   src1, src2   - vector registers holding the 16 source bytes each
//
// Uses v1-v4 as temporaries and advances r1.
void inflate_and_store_2_fp_registers(bool generatePrfm,
    FloatRegister src1, FloatRegister src2) {
  Register dst = r1;
  // low / high halves of src1, each widened to a full 16-byte register
  __ zip1(v1, __ T16B, src1, v0);
  __ zip2(v2, __ T16B, src1, v0);
  if (generatePrfm) {
    __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
  }
  // same for src2
  __ zip1(v3, __ T16B, src2, v0);
  __ zip2(v4, __ T16B, src2, v0);
  // one 64-byte store for all four widened registers
  __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
}

// Stub that inflates a large byte array to 16-bit elements, 64 source bytes
// per loop iteration.
//
// Register contract on entry:
// R0 = src
// R1 = dst
// R2 = len
// R3 = len >> 3
// V0 = 0
// v1 = loaded 8 bytes
// Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
//
// R3 is used as a counter of remaining 8-byte groups ("octets"). The stub
// returns once fewer than 8 octets remain.
// NOTE(review): the remaining tail is presumably finished by the caller -
// confirm at the call site.
address generate_large_byte_array_inflate() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
  address entry = __ pc();
  Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
  Register src = r0, dst = r1, len = r2, octetCounter = r3;
  // Octet count above which the prefetching loop is used.
  const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;

  // do one more 8-byte read to have address 16-byte aligned in most cases
  // also use single store instruction
  __ ldrd(v2, __ post(src, 8));
  // two octets are consumed here: the preloaded v1 and the one just read
  __ sub(octetCounter, octetCounter, 2);
  __ zip1(v1, __ T16B, v1, v0);
  __ zip1(v2, __ T16B, v2, v0);
  __ st1(v1, v2, __ T16B, __ post(dst, 32));
  // preload the first 64-byte batch for whichever loop is entered
  __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
  __ subs(rscratch1, octetCounter, large_loop_threshold);
  __ br(__ LE, LOOP_START);
  __ b(LOOP_PRFM_START);
  // Loop with software prefetch, used while the counter is above the threshold.
  __ bind(LOOP_PRFM);
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
  __ bind(LOOP_PRFM_START);
    __ prfm(Address(src, SoftwarePrefetchHintDistance));
    __ sub(octetCounter, octetCounter, 8);
    // flags set here are consumed by the br(GT) below
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    inflate_and_store_2_fp_registers(true, v3, v4);
    inflate_and_store_2_fp_registers(true, v5, v6);
    __ br(__ GT, LOOP_PRFM);
    __ cmp(octetCounter, (u1)8);
    __ br(__ LT, DONE);
  // Loop without prefetch for the remaining full 64-byte batches.
  __ bind(LOOP);
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_START);
    __ sub(octetCounter, octetCounter, 8);
    // flags set here are consumed by the br(GE) below
    __ cmp(octetCounter, (u1)8);
    inflate_and_store_2_fp_registers(false, v3, v4);
    inflate_and_store_2_fp_registers(false, v5, v6);
    __ br(__ GE, LOOP);
  __ bind(DONE);
  __ ret(lr);
  return entry;
}

/**
* Arguments:
*
* Input:
*   c_rarg0   - current state address
*   c_rarg1   - H key address
*   c_rarg2   - data address
*   c_rarg3   - number of blocks
*
* Output:
*   Updated state at c_rarg0
*/
address generate_ghash_processBlocks() {
  // Bafflingly, GCM uses little-endian for the byte order, but
  // big-endian for the bit order.  For example, the polynomial 1 is
  // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
  //
  // So, we must either reverse the bytes in each word and do
  // everything big-endian or reverse the bits in each byte and do
  // it little-endian.  On AArch64 it's more idiomatic to reverse
  // the bits in each byte (we have an instruction, RBIT, to do
  // that) and keep the data in little-endian bit order through the
  // calculation, bit-reversing the inputs and outputs.

  StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
  // 16-byte constant placed in the code stream ahead of the entry point;
  // it is loaded with ldrq below.
  __ align(wordSize * 2);
  address p = __ pc();
  __ emit_int64(0x87);  // The low-order bits of the field
                        // polynomial (i.e. p = z^7+z^2+z+1)
                        // repeated in the low and high parts of a
                        // 128-bit vector
  __ emit_int64(0x87);

  __ align(CodeEntryAlignment);
  address start = __ pc();

  Register state   = c_rarg0;
  Register subkeyH = c_rarg1;
  Register data    = c_rarg2;
  Register blocks  = c_rarg3;

  FloatRegister vzr = v30;
  __ eor(vzr, __ T16B, vzr, vzr); // zero register

  __ ldrq(v24, p);    // The field polynomial

  __ ldrq(v0, Address(state));
  __ ldrq(v1, Address(subkeyH));

  __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
  __ rbit(v0, __ T16B, v0);
  __ rev64(v1, __ T16B, v1);
  __ rbit(v1, __ T16B, v1);

  __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v4
  __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

  {
    // One 16-byte block per iteration. The body runs before the count is
    // tested, so a block count of zero is not handled by this loop.
    Label L_ghash_loop;
    __ bind(L_ghash_loop);

    __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                               // reversing each byte
    __ rbit(v2, __ T16B, v2);
    __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

    // Multiply state in v2 by subkey in v1
    __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
                      /*temps*/v6, v3, /*reuse/clobber b*/v2);
    // Reduce v7:v5 by the field polynomial
    __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);

    __ sub(blocks, blocks, 1);
    __ cbnz(blocks, L_ghash_loop);
  }

  // The bit-reversed result is at this point in v0
  __ rev64(v0, __ T16B, v0);
  __ rbit(v0, __ T16B, v0);

  __ st1(v0, __ T16B, state);
  __ ret(lr);

  return start;
}

// Generates the unrolled ("wide") GHASH stub. Uses the same argument
// registers as generate_ghash_processBlocks (c_rarg0 state, c_rarg1 subkeyH,
// c_rarg2 data, c_rarg3 blocks).
//
// A single-block stub is generated first (by calling
// generate_ghash_processBlocks()) and used as the fallback: the wide stub
// branches to it when fewer than unroll * 2 blocks are requested, and again
// after the wide routine if blocks is still greater than zero.
address generate_ghash_processBlocks_wide() {
  address small = generate_ghash_processBlocks();

  StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
  __ align(wordSize * 2);
  address p = __ pc();
  __ emit_int64(0x87);  // The low-order bits of the field
                        // polynomial (i.e. p = z^7+z^2+z+1)
                        // repeated in the low and high parts of a
                        // 128-bit vector
  __ emit_int64(0x87);

  __ align(CodeEntryAlignment);
  address start = __ pc();

  Register state   = c_rarg0;
  Register subkeyH = c_rarg1;
  Register data    = c_rarg2;
  Register blocks  = c_rarg3;

  const int unroll = 4;

  // Too few blocks for the unrolled code: tail-branch to the simple stub.
  __ cmp(blocks, (unsigned char)(unroll * 2));
  __ br(__ LT, small);

  if (unroll > 1) {
    // Save state before entering routine
    // (v8-v15 are spilled to the stack as two 64-byte groups)
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
  }

  __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);

  if (unroll > 1) {
    // And restore state
    // (popped in the reverse order of the pushes above)
    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
  }

  // Any blocks left over are handed to the simple stub.
  __ cmp(blocks, (unsigned char)0);
  __ br(__ GT, small);

  __ ret(lr);

  return start;
}

// Emits one SIMD Base64 encoding round.
//
//   src   - source pointer, post-incremented by 3 * size
//   dst   - destination pointer, post-incremented by 4 * size
//   codec - first of four consecutive vector registers holding the 64-entry
//           encoding table (used as a 4-register tbl table)
//   size  - 16 selects the T16B arrangement, anything else T8B
//
// Uses v4-v6 for input (v4 and v5 are shifted in place), v16-v19 for output
// and v20-v23 for the 6-bit table indices.
void generate_base64_encode_simdround(Register src, Register dst,
      FloatRegister codec, u8 size) {

  FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
  FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
  FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;

  Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

  // De-interleaving load: byte 0 of every 3-byte group goes to in0,
  // byte 1 to in1, byte 2 to in2.
  __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));

  // ind0 = in0 >> 2
  __ ushr(ind0, arrangement, in0,  2);

  // ind1 = ((in0 & 0x3) << 4) | (in1 >> 4), computed per byte lane
  __ ushr(ind1, arrangement, in1,  2);
  __ shl(in0,   arrangement, in0,  6);
  __ orr(ind1,  arrangement, ind1, in0);
  __ ushr(ind1, arrangement, ind1, 2);

  // ind2 = ((in1 & 0xf) << 2) | (in2 >> 6), computed per byte lane
  __ ushr(ind2, arrangement, in2,  4);
  __ shl(in1,   arrangement, in1,  4);
  __ orr(ind2,  arrangement, in1,  ind2);
  __ ushr(ind2, arrangement, ind2, 2);

  // ind3 = in2 & 0x3f
  __ shl(ind3,  arrangement, in2,  2);
  __ ushr(ind3, arrangement, ind3, 2);

  // Translate the indices through the 64-byte table.
  __ tbl(out0,  arrangement, codec,  4, ind0);
  __ tbl(out1,  arrangement, codec,  4, ind1);
  __ tbl(out2,  arrangement, codec,  4, ind2);
  __ tbl(out3,  arrangement, codec,  4, ind3);

  // Interleaving store of the four output characters per group.
  __ st4(out0,  out1, out2, out3, arrangement, __ post(dst, 4 * size));
}

 /**
 *  Arguments:
 *
 *  Input:
 *  c_rarg0   - src_start
 *  c_rarg1   - src_offset
 *  c_rarg2   - src_length
 *  c_rarg3   - dest_start
 *  c_rarg4   - dest_offset
 *  c_rarg5   - isURL
 *
 *  The number of bytes processed is c_rarg2 - c_rarg1 (the code below
 *  treats c_rarg2 as the source end offset).
 *
 *  NOTE(review): the scalar tail loop only terminates when the remaining
 *  length reaches exactly zero, so the stub appears to rely on the caller
 *  passing a non-zero length that is a multiple of 3 - confirm at the
 *  call site.
 */
address generate_base64_encodeBlock() {

  static const char toBase64[64] = {
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
  };

  static const char toBase64URL[64] = {
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
  };

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "encodeBlock");
  address start = __ pc();

  Register src   = c_rarg0;  // source array
  Register soff  = c_rarg1;  // source start offset
  Register send  = c_rarg2;  // source end offset
  Register dst   = c_rarg3;  // dest array
  Register doff  = c_rarg4;  // position for writing to dest array
  Register isURL = c_rarg5;  // Base64 or URL character set

  // c_rarg6 and c_rarg7 are free to use as temps
  Register codec  = c_rarg6;
  Register length = c_rarg7;

  Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;

  __ add(src, src, soff);
  __ add(dst, dst, doff);
  __ sub(length, send, soff);

  // load the codec base address
  __ lea(codec, ExternalAddress((address) toBase64));
  __ cbz(isURL, ProcessData);
  __ lea(codec, ExternalAddress((address) toBase64URL));

  __ BIND(ProcessData);

  // too short to form a SIMD loop, fall back to the scalar loop
  __ cmp(length, (u1)24);
  __ br(Assembler::LT, Process3B);

  // the 64-byte encoding table lives in v0-v3 for the SIMD rounds
  __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));

  // 48 source bytes -> 64 output bytes per iteration
  __ BIND(Process48B);
  __ cmp(length, (u1)48);
  __ br(Assembler::LT, Process24B);
  generate_base64_encode_simdround(src, dst, v0, 16);
  __ sub(length, length, 48);
  __ b(Process48B);

  // at most one 24-byte round (length < 48 here)
  __ BIND(Process24B);
  __ cmp(length, (u1)24);
  __ br(Assembler::LT, SIMDExit);
  generate_base64_encode_simdround(src, dst, v0, 8);
  __ sub(length, length, 24);

  __ BIND(SIMDExit);
  __ cbz(length, Exit);

  // scalar loop: 3 source bytes -> 4 output characters
  __ BIND(Process3B);
  //  3 src bytes, 24 bits
  __ ldrb(r10, __ post(src, 1));
  __ ldrb(r11, __ post(src, 1));
  __ ldrb(r12, __ post(src, 1));
  __ orrw(r11, r11, r10, Assembler::LSL, 8);
  __ orrw(r12, r12, r11, Assembler::LSL, 8);
  // codec index
  __ ubfmw(r15, r12, 18, 23);
  __ ubfmw(r14, r12, 12, 17);
  __ ubfmw(r13, r12, 6,  11);
  __ andw(r12,  r12, 63);
  // get the code based on the codec
  __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
  __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
  __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
  __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
  __ strb(r15, __ post(dst, 1));
  __ strb(r14, __ post(dst, 1));
  __ strb(r13, __ post(dst, 1));
  __ strb(r12, __ post(dst, 1));
  __ sub(length, length, 3);
  __ cbnz(length, Process3B);

  __ BIND(Exit);
  __ ret(lr);

  return start;
}

// Emits one SIMD Base64 decoding round.
//
//   src    - source pointer, post-incremented by 4 * size
//   dst    - destination pointer, advanced by the number of bytes stored
//   codecL - first of four vector registers holding table entries 0..63
//   codecH - first of four vector registers holding table entries 64..127
//   size   - 16 selects the T16B arrangement, anything else T8B
//   Exit   - branch target taken as soon as an illegal character is found
//
// Expects v27 to hold 63 in every byte lane (generate_base64_decodeBlock
// sets this up before calling). Uses v16-v26 and v28-v31 as temporaries and
// clobbers rscratch2 and r10-r13.
//
// If a round contains an illegal character, the output bytes of the complete
// 4-character groups that precede it are still stored before branching to
// Exit.
void generate_base64_decode_simdround(Register src, Register dst,
      FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {

  FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
  FloatRegister out0 = v20, out1 = v21, out2 = v22;

  FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
  FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;

  Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;

  Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

  // De-interleaving load: character k of every 4-character group goes to in<k>.
  __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));

  // we need unsigned saturating subtract, to make sure all input values
  // in range [0, 63] will have 0U value in the higher half lookup
  // (note: these four always operate on the full 16-byte register)
  __ uqsubv(decH0, __ T16B, in0, v27);
  __ uqsubv(decH1, __ T16B, in1, v27);
  __ uqsubv(decH2, __ T16B, in2, v27);
  __ uqsubv(decH3, __ T16B, in3, v27);

  // lower half lookup
  __ tbl(decL0, arrangement, codecL, 4, in0);
  __ tbl(decL1, arrangement, codecL, 4, in1);
  __ tbl(decL2, arrangement, codecL, 4, in2);
  __ tbl(decL3, arrangement, codecL, 4, in3);

  // higher half lookup
  __ tbx(decH0, arrangement, codecH, 4, decH0);
  __ tbx(decH1, arrangement, codecH, 4, decH1);
  __ tbx(decH2, arrangement, codecH, 4, decH2);
  __ tbx(decH3, arrangement, codecH, 4, decH3);

  // combine lower and higher
  __ orr(decL0, arrangement, decL0, decH0);
  __ orr(decL1, arrangement, decL1, decH1);
  __ orr(decL2, arrangement, decL2, decH2);
  __ orr(decL3, arrangement, decL3, decH3);

  // check illegal inputs, value larger than 63 (maximum of 6 bits)
  __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
  __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
  __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
  __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
  // in2 = per-group "contains an illegal character" mask;
  // rscratch2 = non-zero if any group is illegal
  __ orr(in0, arrangement, decH0, decH1);
  __ orr(in1, arrangement, decH2, decH3);
  __ orr(in2, arrangement, in0,   in1);
  __ umaxv(in3, arrangement, in2);
  __ umov(rscratch2, in3, __ B, 0);

  // get the data to output
  __ shl(out0,  arrangement, decL0, 2);
  __ ushr(out1, arrangement, decL1, 4);
  __ orr(out0,  arrangement, out0,  out1);
  __ shl(out1,  arrangement, decL1, 4);
  __ ushr(out2, arrangement, decL2, 2);
  __ orr(out1,  arrangement, out1,  out2);
  __ shl(out2,  arrangement, decL2, 6);
  __ orr(out2,  arrangement, out2,  decL3);

  __ cbz(rscratch2, NoIllegalData);

  // handle illegal input
  // r10 = mask for the low eight groups
  __ umov(r10, in2, __ D, 0);
  if (size == 16) {
    __ cbnz(r10, ErrorInLowerHalf);

    // illegal input is in higher half, store the lower half now.
    __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));

    // continue with the mask and output bytes of the high eight groups
    __ umov(r10, in2,  __ D, 1);
    __ umov(r11, out0, __ D, 1);
    __ umov(r12, out1, __ D, 1);
    __ umov(r13, out2, __ D, 1);
    __ b(StoreLegalData);

    __ BIND(ErrorInLowerHalf);
  }
  __ umov(r11, out0, __ D, 0);
  __ umov(r12, out1, __ D, 0);
  __ umov(r13, out2, __ D, 0);

  // Store the output of one group at a time (lowest byte of r11/r12/r13)
  // until the first group whose mask byte is set, then leave through Exit.
  __ BIND(StoreLegalData);
  __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
  __ strb(r11, __ post(dst, 1));
  __ strb(r12, __ post(dst, 1));
  __ strb(r13, __ post(dst, 1));
  __ lsr(r10, r10, 8);
  __ lsr(r11, r11, 8);
  __ lsr(r12, r12, 8);
  __ lsr(r13, r13, 8);
  __ b(StoreLegalData);

  __ BIND(NoIllegalData);
  __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
}


 /**
 *  Arguments:
 *
 *  Input:
 *  c_rarg0   - src_start
 *  c_rarg1   - src_offset
 *  c_rarg2   - src_length
 *  c_rarg3   - dest_start
 *  c_rarg4   - dest_offset
 *  c_rarg5   - isURL
 *  c_rarg6   - isMIME
 *
 *  Output:
 *  c_rarg0   - number of bytes written to the destination (computed at
 *              Exit as the distance dst has advanced)
 *
 *  The number of source bytes considered is c_rarg2 - c_rarg1 rounded down
 *  to a multiple of 4. Decoding stops at the first illegal character.
 *
 *  NOTE(review): the scalar loop decodes one 4-byte group before testing
 *  its counter, so the stub appears to rely on the caller passing at least
 *  4 bytes - confirm at the call site.
 */
address generate_base64_decodeBlock() {

  // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
  // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
  // titled "Base64 decoding".

  // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64,
  // except the trailing character '=' is also treated illegal value in this intrinsic. That
  // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
  static const uint8_t fromBase64ForNoSIMD[256] = {
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
    52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
    15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
    255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
    41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  };

  static const uint8_t fromBase64URLForNoSIMD[256] = {
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
    52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
    15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
    255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
    41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
  };

  // A legal value of base64 code is in range [0, 127].  We need two lookups
  // with tbl/tbx and combine them to get the decode data. The 1st table vector
  // lookup use tbl, out of range indices are set to 0 in destination. The 2nd
  // table vector lookup use tbx, out of range indices are unchanged in
  // destination. Input [64..126] is mapped to index [65, 127] in second lookup.
  // The value of index 64 is set to 0, so that we know that we already get the
  // decoded data with the 1st lookup.
  static const uint8_t fromBase64ForSIMD[128] = {
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
    52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
    0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
    14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
    255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
    40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
  };

  static const uint8_t fromBase64URLForSIMD[128] = {
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
    52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
    0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
    14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
    63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
    40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
  };

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "decodeBlock");
  address start = __ pc();

  Register src    = c_rarg0;  // source array
  Register soff   = c_rarg1;  // source start offset
  Register send   = c_rarg2;  // source end offset
  Register dst    = c_rarg3;  // dest array
  Register doff   = c_rarg4;  // position for writing to dest array
  Register isURL  = c_rarg5;  // Base64 or URL character set
  Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation

  Register length = send;    // reuse send as length of source data to process

  // c_rarg6 is reused as a temp, since isMIME is not read
  Register simd_codec   = c_rarg6;
  Register nosimd_codec = c_rarg7;

  Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;

  __ enter();

  __ add(src, src, soff);
  __ add(dst, dst, doff);

  // remember where output started; used at Exit to compute the byte count
  __ mov(doff, dst);

  __ sub(length, send, soff);
  // clear the two low bits: only whole 4-character groups are processed
  __ bfm(length, zr, 0, 1);

  __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
  __ cbz(isURL, ProcessData);
  __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));

  __ BIND(ProcessData);
  // rscratch1 is the scalar loop counter: the whole length for short
  // inputs, otherwise 79 so that exactly 80 bytes are decoded first.
  __ mov(rscratch1, length);
  __ cmp(length, (u1)144); // 144 = 80 + 64
  __ br(Assembler::LT, Process4B);

  // In the MIME case, the line length cannot be more than 76
  // bytes (see RFC 2045). This is too short a block for SIMD
  // to be worthwhile, so we use non-SIMD here.
  __ movw(rscratch1, 79);

  // scalar loop: 4 source characters -> 3 output bytes
  __ BIND(Process4B);
  __ ldrw(r14, __ post(src, 4));
  __ ubfxw(r10, r14, 0,  8);
  __ ubfxw(r11, r14, 8,  8);
  __ ubfxw(r12, r14, 16, 8);
  __ ubfxw(r13, r14, 24, 8);
  // get the de-code
  __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
  __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
  __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
  __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
  // error detection, 255u indicates an illegal input
  __ orrw(r14, r10, r11);
  __ orrw(r15, r12, r13);
  __ orrw(r14, r14, r15);
  __ tbnz(r14, 7, Exit);
  // recover the data
  __ lslw(r14, r10, 10);
  __ bfiw(r14, r11, 4, 6);
  __ bfmw(r14, r12, 2, 5);
  __ rev16w(r14, r14);
  __ bfiw(r13, r12, 6, 2);
  __ strh(r14, __ post(dst, 2));
  __ strb(r13, __ post(dst, 1));
  // non-simd loop
  __ subsw(rscratch1, rscratch1, 4);
  __ br(Assembler::GT, Process4B);

  // If the loop above was the 80-byte pre-pass (counter started at 79),
  // rscratch1 == -1 here and the SIMD loops follow; otherwise the whole
  // input has been consumed and rscratch1 == 0.
  __ cbzw(rscratch1, Exit);
  __ sub(length, length, 80);

  __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
  __ cbz(isURL, SIMDEnter);
  __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));

  __ BIND(SIMDEnter);
  // v0-v3 = table entries 0..63, v4-v7 = entries 64..127, v27 = 63 per byte
  __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
  __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
  __ mov(rscratch1, 63);
  __ dup(v27, __ T16B, rscratch1);

  // 64 source characters -> 48 output bytes per iteration
  __ BIND(Process64B);
  __ cmp(length, (u1)64);
  __ br(Assembler::LT, Process32B);
  generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
  __ sub(length, length, 64);
  __ b(Process64B);

  // 32 source characters -> 24 output bytes per iteration
  __ BIND(Process32B);
  __ cmp(length, (u1)32);
  __ br(Assembler::LT, SIMDExit);
  generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
  __ sub(length, length, 32);
  __ b(Process32B);

  // fewer than 32 characters left: finish them in the scalar loop
  __ BIND(SIMDExit);
  __ cbz(length, Exit);
  __ movw(rscratch1, length);
  __ b(Process4B);

  __ BIND(Exit);
  // return value: number of bytes written
  __ sub(c_rarg0, dst, doff);

  __ leave();
  __ ret(lr);

  return start;
}

// Support for spin waits.
7086 address generate_spin_wait() { 7087 __ align(CodeEntryAlignment); 7088 StubCodeMark mark(this, "StubRoutines", "spin_wait"); 7089 address start = __ pc(); 7090 7091 __ spin_wait(); 7092 __ ret(lr); 7093 7094 return start; 7095 } 7096 7097 address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) { 7098 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table"); 7099 7100 address start = __ pc(); 7101 const Register 7102 r_super_klass = r0, 7103 r_array_base = r1, 7104 r_array_length = r2, 7105 r_array_index = r3, 7106 r_sub_klass = r4, 7107 r_bitmap = rscratch2, 7108 result = r5; 7109 const FloatRegister 7110 vtemp = v0; 7111 7112 Label L_success; 7113 __ enter(); 7114 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, 7115 r_array_base, r_array_length, r_array_index, 7116 vtemp, result, super_klass_index, 7117 /*stub_is_near*/true); 7118 __ leave(); 7119 __ ret(lr); 7120 7121 return start; 7122 } 7123 7124 // Slow path implementation for UseSecondarySupersTable. 7125 address generate_lookup_secondary_supers_table_slow_path_stub() { 7126 StubCodeMark mark(this, "StubRoutines", "lookup_secondary_supers_table_slow_path"); 7127 7128 address start = __ pc(); 7129 const Register 7130 r_super_klass = r0, // argument 7131 r_array_base = r1, // argument 7132 temp1 = r2, // temp 7133 r_array_index = r3, // argument 7134 r_bitmap = rscratch2, // argument 7135 result = r5; // argument 7136 7137 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result); 7138 __ ret(lr); 7139 7140 return start; 7141 } 7142 7143 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 7144 7145 // ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. 7146 // 7147 // If LSE is in use, generate LSE versions of all the stubs. The 7148 // non-LSE versions are in atomic_aarch64.S. 
7149 7150 // class AtomicStubMark records the entry point of a stub and the 7151 // stub pointer which will point to it. The stub pointer is set to 7152 // the entry point when ~AtomicStubMark() is called, which must be 7153 // after ICache::invalidate_range. This ensures safe publication of 7154 // the generated code. 7155 class AtomicStubMark { 7156 address _entry_point; 7157 aarch64_atomic_stub_t *_stub; 7158 MacroAssembler *_masm; 7159 public: 7160 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) { 7161 _masm = masm; 7162 __ align(32); 7163 _entry_point = __ pc(); 7164 _stub = stub; 7165 } 7166 ~AtomicStubMark() { 7167 *_stub = (aarch64_atomic_stub_t)_entry_point; 7168 } 7169 }; 7170 7171 // NB: For memory_order_conservative we need a trailing membar after 7172 // LSE atomic operations but not a leading membar. 7173 // 7174 // We don't need a leading membar because a clause in the Arm ARM 7175 // says: 7176 // 7177 // Barrier-ordered-before 7178 // 7179 // Barrier instructions order prior Memory effects before subsequent 7180 // Memory effects generated by the same Observer. A read or a write 7181 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same 7182 // Observer if and only if RW1 appears in program order before RW 2 7183 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic 7184 // instruction with both Acquire and Release semantics. 7185 // 7186 // All the atomic instructions {ldaddal, swapal, casal} have Acquire 7187 // and Release semantics, therefore we don't need a leading 7188 // barrier. However, there is no corresponding Barrier-ordered-after 7189 // relationship, therefore we need a trailing membar to prevent a 7190 // later store or load from being reordered with the store in an 7191 // atomic instruction. 
//
// This was checked by using the herd7 consistency model simulator
// (http://diy.inria.fr/) with this test case:
//
// AArch64 LseCas
// { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
// P0           | P1;
// LDR W4, [X2] | MOV W3, #0;
// DMB LD       | MOV W4, #1;
// LDR W3, [X1] | CASAL W3, W4, [X1];
//              | DMB ISH;
//              | STR W4, [X2];
// exists
// (0:X3=0 /\ 0:X4=1)
//
// If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
// with the store to x in P1. Without the DMB in P1 this may happen.
//
// At the time of writing we don't know of any AArch64 hardware that
// reorders stores in this way, but the Reference Manual permits it.

// Emit one compare-and-swap stub.
//
//   size  - operand size handed to lse_cas
//   order - memory_order_relaxed selects neither acquire nor release,
//           memory_order_release selects release only, every other
//           order selects both.  Only memory_order_conservative also
//           gets the trailing StoreStore|StoreLoad membar.
//
// The stub takes ptr in c_rarg0, compare_val in c_rarg1 and
// exchange_val in c_rarg2.  r3 (prev) is seeded with compare_val,
// handed to lse_cas, and then copied to r0 as the stub's result.
void gen_cas_entry(Assembler::operand_size size,
                   atomic_memory_order order) {
  Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
    exchange_val = c_rarg2;
  bool acquire, release;
  switch (order) {
    case memory_order_relaxed:
      acquire = false;
      release = false;
      break;
    case memory_order_release:
      acquire = false;
      release = true;
      break;
    default:
      acquire = true;
      release = true;
      break;
  }
  __ mov(prev, compare_val);
  __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
  if (order == memory_order_conservative) {
    // Trailing barrier only; see the NB comment preceding this function.
    __ membar(Assembler::StoreStore|Assembler::StoreLoad);
  }
  // Move the result into r0 using a move of matching width
  // (mov for xword operands, movw for everything narrower).
  if (size == Assembler::xword) {
    __ mov(r0, prev);
  } else {
    __ movw(r0, prev);
  }
  __ ret(lr);
}

// Emit one fetch-and-add stub.
//
//   size  - operand size of the ldadd/ldaddal instruction
//   order - memory_order_relaxed emits a plain ldadd; any other order
//           emits ldaddal followed by a StoreStore|StoreLoad membar.
//
// The stub takes addr in c_rarg0 and incr in c_rarg1; the value the
// instruction leaves in r2 (prev) is copied to r0.
void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
  Register prev = r2, addr = c_rarg0, incr = c_rarg1;
  // If not relaxed, then default to conservative.  Relaxed is the only
  // case we use enough to be worth specializing.
  if (order == memory_order_relaxed) {
    __ ldadd(size, incr, prev, addr);
  } else {
    __ ldaddal(size, incr, prev, addr);
    __ membar(Assembler::StoreStore|Assembler::StoreLoad);
  }
  if (size == Assembler::xword) {
    __ mov(r0, prev);
  } else {
    __ movw(r0, prev);
  }
  __ ret(lr);
}

// Emit one exchange stub: swpal followed by a StoreStore|StoreLoad
// membar.  The stub takes addr in c_rarg0 and the new value in
// c_rarg1 (the local is named incr, mirroring gen_ldadd_entry); the
// value swpal leaves in r2 (prev) is copied to r0.
void gen_swpal_entry(Assembler::operand_size size) {
  Register prev = r2, addr = c_rarg0, incr = c_rarg1;
  __ swpal(size, incr, prev, addr);
  __ membar(Assembler::StoreStore|Assembler::StoreLoad);
  if (size == Assembler::xword) {
    __ mov(r0, prev);
  } else {
    __ movw(r0, prev);
  }
  __ ret(lr);
}

// Generate all atomic stubs.  Does nothing unless UseLSE is set.
//
// Each AtomicStubMark below is a local object: it records the entry
// point of the stub generated right after it, and its destructor
// stores that entry point into the named aarch64_atomic_*_impl
// pointer.  Because the marks are only destroyed when this function
// returns, every pointer is updated after the
// ICache::invalidate_range call at the bottom.
void generate_atomic_entry_points() {
  if (! UseLSE) {
    return;
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "atomic entry points");
  address first_entry = __ pc();

  // ADD, memory_order_conservative
  AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
  gen_ldadd_entry(Assembler::word, memory_order_conservative);
  AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
  gen_ldadd_entry(Assembler::xword, memory_order_conservative);

  // ADD, memory_order_relaxed
  AtomicStubMark mark_fetch_add_4_relaxed
    (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
  gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
  AtomicStubMark mark_fetch_add_8_relaxed
    (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
  gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);

  // XCHG, memory_order_conservative
  AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
  gen_swpal_entry(Assembler::word);
  AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
  gen_swpal_entry(Assembler::xword);

  // CAS, memory_order_conservative
  AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
  gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
  AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
  gen_cas_entry(MacroAssembler::word, memory_order_conservative);
  AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
  gen_cas_entry(MacroAssembler::xword, memory_order_conservative);

  // CAS, memory_order_relaxed
  AtomicStubMark mark_cmpxchg_1_relaxed
    (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
  gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
  AtomicStubMark mark_cmpxchg_4_relaxed
    (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
  gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
  AtomicStubMark mark_cmpxchg_8_relaxed
    (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
  gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);

  // CAS, memory_order_release
  AtomicStubMark mark_cmpxchg_4_release
    (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
  gen_cas_entry(MacroAssembler::word, memory_order_release);
  AtomicStubMark mark_cmpxchg_8_release
    (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
  gen_cas_entry(MacroAssembler::xword, memory_order_release);

  // CAS, memory_order_seq_cst
  AtomicStubMark mark_cmpxchg_4_seq_cst
    (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
  gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
  AtomicStubMark mark_cmpxchg_8_seq_cst
    (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
  gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);

  // Covers everything emitted since first_entry.
  ICache::invalidate_range(first_entry, __ pc() - first_entry);
}
#endif // LINUX

// Emit the thaw sequence for the given thaw kind and return the
// address of its first instruction.  No StubCodeMark is created here;
// the three wrappers that follow do that.
//
// Outline of the emitted code:
//   1. For the return-barrier kinds, reload sp from the thread's
//      cont_entry field.
//   2. Call Continuation::prepare_thaw(thread, return_barrier) and keep
//      its result in rscratch2.
//   3. If that result is zero, jump to the StackOverflowError stub.
//   4. Lower sp by the result, rounded down to 16 bytes.
//   5. Call Continuation::thaw_entry()(thread, kind) and keep its result
//      in rscratch2.
//   6. Set sp to that result minus two words, copy it to rfp, then
//      either jump to the exception handler (return_barrier_exception)
//      or leave() and return.
// For the return-barrier kinds r0 and v0 are saved on the stack around
// each of the two runtime calls.
address generate_cont_thaw(Continuation::thaw_kind kind) {
  bool return_barrier = Continuation::is_thaw_return_barrier(kind);
  bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

  address start = __ pc();

  if (return_barrier) {
    __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
    __ mov(sp, rscratch1);
  }
  // Whatever the kind, sp must now equal the thread's cont_entry value.
  assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");

  if (return_barrier) {
    // preserve possible return value from a method returning to the return barrier
    __ fmovd(rscratch1, v0);
    __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
  }

  __ movw(c_rarg1, (return_barrier ? 1 : 0));
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
  __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames

  if (return_barrier) {
    // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
    __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
    __ fmovd(v0, rscratch1);
  }
  // The save/restore pair above must have left sp where it was.
  assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");


  Label thaw_success;
  // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
  __ cbnz(rscratch2, thaw_success);
  __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
  __ br(rscratch1);
  __ bind(thaw_success);

  // make room for the thawed frames
  __ sub(rscratch1, sp, rscratch2);
  __ andr(rscratch1, rscratch1, -16); // align
  __ mov(sp, rscratch1);

  if (return_barrier) {
    // save original return value -- again
    __ fmovd(rscratch1, v0);
    __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
  }

  // If we want, we can templatize thaw by kind, and have three different entries
  __ movw(c_rarg1, (uint32_t)kind);

  __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
  __ mov(rscratch2, r0); // r0 is the sp of the yielding frame

  if (return_barrier) {
    // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
    __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
    __ fmovd(v0, rscratch1);
  } else {
    __ mov(r0, zr); // return 0 (success) from doYield
  }

  // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
  __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
  __ mov(rfp, sp);

  if (return_barrier_exception) {
    __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
    __ authenticate_return_address(c_rarg1);
    __ verify_oop(r0);
    // save return value containing the exception oop in callee-saved R19
    __ mov(r19, r0);

    __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);

    // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
    // NOTE(review): the call below is commented out, so ptrue is not
    // actually reinitialized on this path.
    // __ reinitialize_ptrue();

    // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc

    __ mov(r1, r0); // the exception handler
    __ mov(r0, r19); // restore return value containing the exception oop
    __ verify_oop(r0);

    __ leave();
    __ mov(r3, lr);
    __ br(r1); // the exception handler
  } else {
    // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
    __ leave();
    __ ret(lr);
  }

  return start;
}

// Stub "Cont thaw": the thaw sequence for Continuation::thaw_top.
// Returns nullptr when continuations are not enabled.
address generate_cont_thaw() {
  if (!Continuations::enabled()) return nullptr;

  StubCodeMark mark(this, "StubRoutines", "Cont thaw");
  address start = __ pc();
  generate_cont_thaw(Continuation::thaw_top);
  return start;
}

// Stub "cont return barrier": the thaw sequence for
// Continuation::thaw_return_barrier.  Returns nullptr when
// continuations are not enabled.
address generate_cont_returnBarrier() {
  if (!Continuations::enabled()) return nullptr;

  // TODO: will probably need multiple return barriers depending on return type
  StubCodeMark mark(this, "StubRoutines", "cont return barrier");
  address start = __ pc();

  generate_cont_thaw(Continuation::thaw_return_barrier);

  return start;
}

// Stub "cont return barrier exception handler": the thaw sequence for
// Continuation::thaw_return_barrier_exception.  Returns nullptr when
// continuations are not enabled.
address generate_cont_returnBarrier_exception() {
  if (!Continuations::enabled()) return nullptr;

  StubCodeMark mark(this, "StubRoutines", "cont return barrier exception handler");
  address start = __ pc();

  generate_cont_thaw(Continuation::thaw_return_barrier_exception);

  return start;
}

// In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
// are represented as long[5], with BITS_PER_LIMB = 26.
// Pack five 26-bit limbs into three 64-bit registers.
7472 void pack_26(Register dest0, Register dest1, Register dest2, Register src) { 7473 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits 7474 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits 7475 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong))); 7476 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits 7477 7478 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits 7479 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits 7480 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong))); 7481 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits 7482 7483 if (dest2->is_valid()) { 7484 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits 7485 } else { 7486 #ifdef ASSERT 7487 Label OK; 7488 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits 7489 __ br(__ EQ, OK); 7490 __ stop("high bits of Poly1305 integer should be zero"); 7491 __ should_not_reach_here(); 7492 __ bind(OK); 7493 #endif 7494 } 7495 } 7496 7497 // As above, but return only a 128-bit integer, packed into two 7498 // 64-bit registers. 7499 void pack_26(Register dest0, Register dest1, Register src) { 7500 pack_26(dest0, dest1, noreg, src); 7501 } 7502 7503 // Multiply and multiply-accumulate unsigned 64-bit registers. 7504 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) { 7505 __ mul(prod_lo, n, m); 7506 __ umulh(prod_hi, n, m); 7507 } 7508 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) { 7509 wide_mul(rscratch1, rscratch2, n, m); 7510 __ adds(sum_lo, sum_lo, rscratch1); 7511 __ adc(sum_hi, sum_hi, rscratch2); 7512 } 7513 7514 // Poly1305, RFC 7539 7515 7516 // See https://loup-vaillant.fr/tutorials/poly1305-design for a 7517 // description of the tricks used to simplify and accelerate this 7518 // computation. 
7519 7520 address generate_poly1305_processBlocks() { 7521 __ align(CodeEntryAlignment); 7522 StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); 7523 address start = __ pc(); 7524 Label here; 7525 __ enter(); 7526 RegSet callee_saved = RegSet::range(r19, r28); 7527 __ push(callee_saved, sp); 7528 7529 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin(); 7530 7531 // Arguments 7532 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs; 7533 7534 // R_n is the 128-bit randomly-generated key, packed into two 7535 // registers. The caller passes this key to us as long[5], with 7536 // BITS_PER_LIMB = 26. 7537 const Register R_0 = *++regs, R_1 = *++regs; 7538 pack_26(R_0, R_1, r_start); 7539 7540 // RR_n is (R_n >> 2) * 5 7541 const Register RR_0 = *++regs, RR_1 = *++regs; 7542 __ lsr(RR_0, R_0, 2); 7543 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2); 7544 __ lsr(RR_1, R_1, 2); 7545 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2); 7546 7547 // U_n is the current checksum 7548 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs; 7549 pack_26(U_0, U_1, U_2, acc_start); 7550 7551 static constexpr int BLOCK_LENGTH = 16; 7552 Label DONE, LOOP; 7553 7554 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7555 __ br(Assembler::LT, DONE); { 7556 __ bind(LOOP); 7557 7558 // S_n is to be the sum of U_n and the next block of data 7559 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs; 7560 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize)); 7561 __ adds(S_0, U_0, S_0); 7562 __ adcs(S_1, U_1, S_1); 7563 __ adc(S_2, U_2, zr); 7564 __ add(S_2, S_2, 1); 7565 7566 const Register U_0HI = *++regs, U_1HI = *++regs; 7567 7568 // NB: this logic depends on some of the special properties of 7569 // Poly1305 keys. 
In particular, because we know that the top 7570 // four bits of R_0 and R_1 are zero, we can add together 7571 // partial products without any risk of needing to propagate a 7572 // carry out. 7573 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0); 7574 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1); 7575 __ andr(U_2, R_0, 3); 7576 __ mul(U_2, S_2, U_2); 7577 7578 // Recycle registers S_0, S_1, S_2 7579 regs = (regs.remaining() + S_0 + S_1 + S_2).begin(); 7580 7581 // Partial reduction mod 2**130 - 5 7582 __ adds(U_1, U_0HI, U_1); 7583 __ adc(U_2, U_1HI, U_2); 7584 // Sum now in U_2:U_1:U_0. 7585 // Dead: U_0HI, U_1HI. 7586 regs = (regs.remaining() + U_0HI + U_1HI).begin(); 7587 7588 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps 7589 7590 // First, U_2:U_1:U_0 += (U_2 >> 2) 7591 __ lsr(rscratch1, U_2, 2); 7592 __ andr(U_2, U_2, (u8)3); 7593 __ adds(U_0, U_0, rscratch1); 7594 __ adcs(U_1, U_1, zr); 7595 __ adc(U_2, U_2, zr); 7596 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2 7597 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2); 7598 __ adcs(U_1, U_1, zr); 7599 __ adc(U_2, U_2, zr); 7600 7601 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH)); 7602 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH)); 7603 __ br(~ Assembler::LT, LOOP); 7604 } 7605 7606 // Further reduce modulo 2^130 - 5 7607 __ lsr(rscratch1, U_2, 2); 7608 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5 7609 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5 7610 __ adcs(U_1, U_1, zr); 7611 __ andr(U_2, U_2, (u1)3); 7612 __ adc(U_2, U_2, zr); 7613 7614 // Unpack the sum into five 26-bit limbs and write to memory. 
7615 __ ubfiz(rscratch1, U_0, 0, 26); 7616 __ ubfx(rscratch2, U_0, 26, 26); 7617 __ stp(rscratch1, rscratch2, Address(acc_start)); 7618 __ ubfx(rscratch1, U_0, 52, 12); 7619 __ bfi(rscratch1, U_1, 12, 14); 7620 __ ubfx(rscratch2, U_1, 14, 26); 7621 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong))); 7622 __ ubfx(rscratch1, U_1, 40, 24); 7623 __ bfi(rscratch1, U_2, 24, 3); 7624 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong))); 7625 7626 __ bind(DONE); 7627 __ pop(callee_saved, sp); 7628 __ leave(); 7629 __ ret(lr); 7630 7631 return start; 7632 } 7633 7634 // exception handler for upcall stubs 7635 address generate_upcall_stub_exception_handler() { 7636 StubCodeMark mark(this, "StubRoutines", "upcall stub exception handler"); 7637 address start = __ pc(); 7638 7639 // Native caller has no idea how to handle exceptions, 7640 // so we just crash here. Up to callee to catch exceptions. 7641 __ verify_oop(r0); 7642 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception)); 7643 __ blr(rscratch1); 7644 __ should_not_reach_here(); 7645 7646 return start; 7647 } 7648 7649 // load Method* target of MethodHandle 7650 // j_rarg0 = jobject receiver 7651 // rmethod = result 7652 address generate_upcall_stub_load_target() { 7653 StubCodeMark mark(this, "StubRoutines", "upcall_stub_load_target"); 7654 address start = __ pc(); 7655 7656 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2); 7657 // Load target method from receiver 7658 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2); 7659 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2); 7660 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2); 7661 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod, 7662 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()), 7663 noreg, 
noreg); 7664 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized 7665 7666 __ ret(lr); 7667 7668 return start; 7669 } 7670 7671 #undef __ 7672 #define __ masm-> 7673 7674 class MontgomeryMultiplyGenerator : public MacroAssembler { 7675 7676 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 7677 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 7678 7679 RegSet _toSave; 7680 bool _squaring; 7681 7682 public: 7683 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 7684 : MacroAssembler(as->code()), _squaring(squaring) { 7685 7686 // Register allocation 7687 7688 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin(); 7689 Pa_base = *regs; // Argument registers 7690 if (squaring) 7691 Pb_base = Pa_base; 7692 else 7693 Pb_base = *++regs; 7694 Pn_base = *++regs; 7695 Rlen= *++regs; 7696 inv = *++regs; 7697 Pm_base = *++regs; 7698 7699 // Working registers: 7700 Ra = *++regs; // The current digit of a, b, n, and m. 7701 Rb = *++regs; 7702 Rm = *++regs; 7703 Rn = *++regs; 7704 7705 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 7706 Pb = *++regs; 7707 Pm = *++regs; 7708 Pn = *++regs; 7709 7710 t0 = *++regs; // Three registers which form a 7711 t1 = *++regs; // triple-precision accumuator. 7712 t2 = *++regs; 7713 7714 Ri = *++regs; // Inner and outer loop indexes. 7715 Rj = *++regs; 7716 7717 Rhi_ab = *++regs; // Product registers: low and high parts 7718 Rlo_ab = *++regs; // of a*b and m*n. 7719 Rhi_mn = *++regs; 7720 Rlo_mn = *++regs; 7721 7722 // r19 and up are callee-saved. 
7723 _toSave = RegSet::range(r19, *regs) + Pm_base; 7724 } 7725 7726 private: 7727 void save_regs() { 7728 push(_toSave, sp); 7729 } 7730 7731 void restore_regs() { 7732 pop(_toSave, sp); 7733 } 7734 7735 template <typename T> 7736 void unroll_2(Register count, T block) { 7737 Label loop, end, odd; 7738 tbnz(count, 0, odd); 7739 cbz(count, end); 7740 align(16); 7741 bind(loop); 7742 (this->*block)(); 7743 bind(odd); 7744 (this->*block)(); 7745 subs(count, count, 2); 7746 br(Assembler::GT, loop); 7747 bind(end); 7748 } 7749 7750 template <typename T> 7751 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 7752 Label loop, end, odd; 7753 tbnz(count, 0, odd); 7754 cbz(count, end); 7755 align(16); 7756 bind(loop); 7757 (this->*block)(d, s, tmp); 7758 bind(odd); 7759 (this->*block)(d, s, tmp); 7760 subs(count, count, 2); 7761 br(Assembler::GT, loop); 7762 bind(end); 7763 } 7764 7765 void pre1(RegisterOrConstant i) { 7766 block_comment("pre1"); 7767 // Pa = Pa_base; 7768 // Pb = Pb_base + i; 7769 // Pm = Pm_base; 7770 // Pn = Pn_base + i; 7771 // Ra = *Pa; 7772 // Rb = *Pb; 7773 // Rm = *Pm; 7774 // Rn = *Pn; 7775 ldr(Ra, Address(Pa_base)); 7776 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7777 ldr(Rm, Address(Pm_base)); 7778 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7779 lea(Pa, Address(Pa_base)); 7780 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 7781 lea(Pm, Address(Pm_base)); 7782 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 7783 7784 // Zero the m*n result. 7785 mov(Rhi_mn, zr); 7786 mov(Rlo_mn, zr); 7787 } 7788 7789 // The core multiply-accumulate step of a Montgomery 7790 // multiplication. The idea is to schedule operations as a 7791 // pipeline so that instructions with long latencies (loads and 7792 // multiplies) have time to complete before their results are 7793 // used. 
// This most benefits in-order implementations of the
// architecture but out-of-order ones also benefit.
//
// Each call starts the a*b product, accumulates the m*n product left
// pending by the previous call (or zeroed by pre1/pre2), then starts a
// new m*n product and accumulates a*b.  On exit an m*n product is
// therefore still pending in Rhi_mn:Rlo_mn.
void step() {
  block_comment("step");
  // MACC(Ra, Rb, t0, t1, t2);
  // Ra = *++Pa;
  // Rb = *--Pb;
  umulh(Rhi_ab, Ra, Rb);
  mul(Rlo_ab, Ra, Rb);
  ldr(Ra, pre(Pa, wordSize));
  ldr(Rb, pre(Pb, -wordSize));
  acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                   // previous iteration.
  // MACC(Rm, Rn, t0, t1, t2);
  // Rm = *++Pm;
  // Rn = *--Pn;
  umulh(Rhi_mn, Rm, Rn);
  mul(Rlo_mn, Rm, Rn);
  ldr(Rm, pre(Pm, wordSize));
  ldr(Rn, pre(Pn, -wordSize));
  acc(Rhi_ab, Rlo_ab, t0, t1, t2);
}

// Finish one outer iteration of the first loop: accumulate the final
// a*b product and the pending m*n, compute and store the new digit of
// m, fold in m[i]*n[0], and shift the accumulator down by one digit.
void post1() {
  block_comment("post1");

  // MACC(Ra, Rb, t0, t1, t2);
  // (Unlike step(), no further digits of a and b are loaded here.)
  umulh(Rhi_ab, Ra, Rb);
  mul(Rlo_ab, Ra, Rb);
  acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
  acc(Rhi_ab, Rlo_ab, t0, t1, t2);

  // *Pm = Rm = t0 * inv;
  mul(Rm, t0, inv);
  str(Rm, Address(Pm));

  // MACC(Rm, Rn, t0, t1, t2);
  // t0 = t1; t1 = t2; t2 = 0;
  umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
  // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
  {
    mul(Rlo_mn, Rm, Rn);
    add(Rlo_mn, t0, Rlo_mn);
    Label ok;
    cbz(Rlo_mn, ok); {
      stop("broken Montgomery multiply");
    } bind(ok);
  }
#endif
  // We have very carefully set things up so that
  // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
  // the lower half of Rm * Rn because we know the result already:
  // it must be -t0.  t0 + (-t0) must generate a carry iff
  // t0 != 0.  So, rather than do a mul and an adds we just set
  // the carry flag iff t0 is nonzero.
  //
  // mul(Rlo_mn, Rm, Rn);
  // adds(zr, t0, Rlo_mn);
  subs(zr, t0, 1); // Set carry iff t0 is nonzero
  adcs(t0, t1, Rhi_mn);
  adc(t1, t2, zr);
  mov(t2, zr);
}

// Set up the digit pointers and load the first digits for outer
// iteration i of the second loop; also clears the pending m*n product.
// Rj is left holding i - len.
void pre2(RegisterOrConstant i, RegisterOrConstant len) {
  block_comment("pre2");
  // Pa = Pa_base + i-len;
  // Pb = Pb_base + len;
  // Pm = Pm_base + i-len;
  // Pn = Pn_base + len;

  if (i.is_register()) {
    sub(Rj, i.as_register(), len);
  } else {
    mov(Rj, i.as_constant());
    sub(Rj, Rj, len);
  }
  // Rj == i-len

  lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
  lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
  lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
  lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

  // Ra = *++Pa;
  // Rb = *--Pb;
  // Rm = *++Pm;
  // Rn = *--Pn;
  ldr(Ra, pre(Pa, wordSize));
  ldr(Rb, pre(Pb, -wordSize));
  ldr(Rm, pre(Pm, wordSize));
  ldr(Rn, pre(Pn, -wordSize));

  mov(Rhi_mn, zr);
  mov(Rlo_mn, zr);
}

// Finish one outer iteration of the second loop: accumulate the
// pending m*n, store the low accumulator digit to Pm_base[i-len], and
// shift the accumulator down by one digit.
// Note: when i is a constant, len is read with as_constant() as well,
// so it must be a constant too.
void post2(RegisterOrConstant i, RegisterOrConstant len) {
  block_comment("post2");
  if (i.is_constant()) {
    mov(Rj, i.as_constant()-len.as_constant());
  } else {
    sub(Rj, i.as_register(), len);
  }

  adds(t0, t0, Rlo_mn); // The pending m*n, low part

  // As soon as we know the least significant digit of our result,
  // store it.
  // Pm_base[i-len] = t0;
  str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

  // t0 = t1; t1 = t2; t2 = 0;
  adcs(t0, t1, Rhi_mn); // The pending m*n, high part
  adc(t1, t2, zr);
  mov(t2, zr);
}

// A carry in t0 after Montgomery multiplication means that we
// should subtract multiples of n from our result in m.  We'll
// keep doing that until there is no carry.
// While t0 is nonzero: subtract the len-digit number at Pn_base from
// the one at Pm_base in place, then subtract the final borrow from t0.
// Clobbers Rm, Rn, t1 and t2.
void normalize(RegisterOrConstant len) {
  block_comment("normalize");
  // while (t0)
  //   t0 = sub(Pm_base, Pn_base, t0, len);
  Label loop, post, again;
  Register cnt = t1, i = t2; // Re-use registers; we're done with them now
  cbz(t0, post); {
    bind(again); {
      mov(i, zr);
      mov(cnt, len);
      ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      subs(zr, zr, zr); // set carry flag, i.e. no borrow
      align(16);
      bind(loop); {
        sbcs(Rm, Rm, Rn);
        str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
        add(i, i, 1);
        // The next digits are loaded before the count is tested, so the
        // final trip also loads the word at index len of each array.
        ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
        ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
        // sub (not subs) and cbnz leave the borrow in the carry flag
        // intact for the next sbcs.
        sub(cnt, cnt, 1);
      } cbnz(cnt, loop);
      sbc(t0, t0, zr);
    } cbnz(t0, again);
  } bind(post);
}

// Move memory at s to d, reversing words.
//    Increments d to end of copied memory
//    Destroys tmp1, tmp2
//    Preserves len
//    Leaves s pointing to the address which was in d at start
//
// len counts 64-bit words.  The two asserts run when the code is
// generated and check that the temporaries are below r19.
void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
  assert(tmp1->encoding() < r19->encoding(), "register corruption");
  assert(tmp2->encoding() < r19->encoding(), "register corruption");

  // Start one past the last source word and walk backwards.
  lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
  mov(tmp1, len);
  unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
  // d is now past the copy; step back len words to its starting value.
  sub(s, d, len, ext::uxtw, LogBytesPerWord);
}
// where
// reverse1 copies one 64-bit word from just below s to d, swapping its
// two 32-bit halves (rotate by 32) on the way.
void reverse1(Register d, Register s, Register tmp) {
  ldr(tmp, pre(s, -wordSize));
  ror(tmp, tmp, 32);
  str(tmp, post(d, wordSize));
}

// step() followed by a second accumulation of the a*b product that
// step() left in Rhi_ab:Rlo_ab, so each a*b term is counted twice.
void step_squaring() {
  // An extra ACC
  step();
  acc(Rhi_ab, Rlo_ab, t0, t1, t2);
}

// When bit 0 of i is clear, accumulate one more Ra*Rb product; no
// further digits are loaded.  i must be a register: as_register() is
// called unconditionally.
void last_squaring(RegisterOrConstant i) {
  Label dont;
  // if ((i & 1) == 0) {
  tbnz(i.as_register(), 0, dont); {
    // MACC(Ra, Rb, t0, t1, t2);
    umulh(Rhi_ab, Ra, Rb);
    mul(Rlo_ab, Ra, Rb);
    acc(Rhi_ab, Rlo_ab, t0, t1, t2);
  } bind(dont);
}

// The m*n half of step(): accumulate the pending m*n product, start
// the next one, and advance Pm / Pn.
void extra_step_squaring() {
  acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

  // MACC(Rm, Rn, t0, t1, t2);
  // Rm = *++Pm;
  // Rn = *--Pn;
  umulh(Rhi_mn, Rm, Rn);
  mul(Rlo_mn, Rm, Rn);
  ldr(Rm, pre(Pm, wordSize));
  ldr(Rn, pre(Pn, -wordSize));
}

// Squaring counterpart of post1(): the same sequence, but without the
// final a*b multiply-accumulate.
void post1_squaring() {
  acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

  // *Pm = Rm = t0 * inv;
  mul(Rm, t0, inv);
  str(Rm, Address(Pm));

  // MACC(Rm, Rn, t0, t1, t2);
  // t0 = t1; t1 = t2; t2 = 0;
  umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
  // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
  {
    mul(Rlo_mn, Rm, Rn);
    add(Rlo_mn, t0, Rlo_mn);
    Label ok;
    cbz(Rlo_mn, ok); {
      stop("broken Montgomery multiply");
    } bind(ok);
  }
#endif
  // We have very carefully set things up so that
  // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
  // the lower half of Rm * Rn because we know the result already:
  // it must be -t0.  t0 + (-t0) must generate a carry iff
  // t0 != 0.  So, rather than do a mul and an adds we just set
  // the carry flag iff t0 is nonzero.
  //
  // mul(Rlo_mn, Rm, Rn);
  // adds(zr, t0, Rlo_mn);
  subs(zr, t0, 1); // Set carry iff t0 is nonzero
  adcs(t0, t1, Rhi_mn);
  adc(t1, t2, zr);
  mov(t2, zr);
}

// Triple-precision accumulate: t2:t1:t0 += Rhi:Rlo.
// The t0/t1/t2 parameters shadow the members of the same names.
void acc(Register Rhi, Register Rlo,
         Register t0, Register t1, Register t2) {
  adds(t0, t0, Rlo);
  adcs(t1, t1, Rhi);
  adc(t2, t2, zr);
}

public:
/**
 * Fast Montgomery multiplication.  The derivation of the
 * algorithm is in  A Cryptographic Library for the Motorola
 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
*
* Arguments:
*
* Inputs for multiplication:
*   c_rarg0   - int array elements a
*   c_rarg1   - int array elements b
*   c_rarg2   - int array elements n (the modulus)
*   c_rarg3   - int length
*   c_rarg4   - int inv
*   c_rarg5   - int array elements m (the result)
*
* Inputs for squaring:
*   c_rarg0   - int array elements a
*   c_rarg1   - int array elements n (the modulus)
*   c_rarg2   - int length
*   c_rarg3   - int inv
*   c_rarg4   - int array elements m (the result)
*
* Returns the entry point of the generated code.  The "argh" stop
* sequence is emitted first, ahead of the aligned entry point, and is
* branched to when the length exceeds 512.  A zero length returns
* immediately without building a frame.
*/
address generate_multiply() {
  Label argh, nothing;
  bind(argh);
  stop("MontgomeryMultiply total_allocation must be <= 8192");

  align(CodeEntryAlignment);
  address entry = pc();

  cbzw(Rlen, nothing);

  enter();

  // Make room.
  // Ra = sp - Rlen * 4 * sizeof (jint), i.e. 16 bytes per int of length
  // assuming a 4-byte jint; with Rlen <= 512 that is at most the 8192
  // bytes named in the stop message.  sp is then set to Ra rounded down
  // to a two-word boundary.
  cmpw(Rlen, 512);
  br(Assembler::HI, argh);
  sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
  andr(sp, Ra, -2 * wordSize);

  lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

  {
    // Copy input args, reversing as we go.  We use Ra as a
    // temporary variable.
    // Each reverse() advances Ra past the copy and leaves the source
    // base register pointing at the copy it just made.
    reverse(Ra, Pa_base, Rlen, t0, t1);
    if (!_squaring)
      reverse(Ra, Pb_base, Rlen, t0, t1);
    reverse(Ra, Pn_base, Rlen, t0, t1);
  }

  // Push all call-saved registers and also Pm_base which we'll need
  // at the end.
  save_regs();

#ifndef PRODUCT
  // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
  {
    ldr(Rn, Address(Pn_base, 0));
    mul(Rlo_mn, Rn, inv);
    subs(zr, Rlo_mn, -1);
    Label ok;
    br(EQ, ok); {
      stop("broken inverse in Montgomery multiply");
    } bind(ok);
  }
#endif

  // From here on Pm_base addresses the scratch result area that
  // follows the reversed inputs; the caller's value was pushed by
  // save_regs().
  mov(Pm_base, Ra);

  // Clear the triple-precision accumulator.
  mov(t0, zr);
  mov(t1, zr);
  mov(t2, zr);

  block_comment("for (int i = 0; i < len; i++) {");
  mov(Ri, zr); {
    Label loop, end;
    cmpw(Ri, Rlen);
    br(Assembler::GE, end);

    bind(loop);
    pre1(Ri);

    block_comment(" for (j = i; j; j--) {"); {
      movw(Rj, Ri);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
    } block_comment(" } // j");

    post1();
    addw(Ri, Ri, 1);
    cmpw(Ri, Rlen);
    br(Assembler::LT, loop);
    bind(end);
    block_comment("} // i");
  }

  block_comment("for (int i = len; i < 2*len; i++) {");
  mov(Ri, Rlen); {
    Label loop, end;
    cmpw(Ri, Rlen, Assembler::LSL, 1);
    br(Assembler::GE, end);

    bind(loop);
    pre2(Ri, Rlen);

    block_comment(" for (j = len*2-i-1; j; j--) {"); {
      lslw(Rj, Rlen, 1);
      subw(Rj, Rj, Ri);
      subw(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
    } block_comment(" } // j");

    post2(Ri, Rlen);
    addw(Ri, Ri, 1);
    cmpw(Ri, Rlen, Assembler::LSL, 1);
    br(Assembler::LT, loop);
    bind(end);
  }
  block_comment("} // i");

  normalize(Rlen);

  mov(Ra, Pm_base);  // Save Pm_base in Ra
  restore_regs();  // Restore caller's Pm_base

  // Copy our result into caller's Pm_base
  reverse(Pm_base, Ra, Rlen, t0, t1);

  leave();
  bind(nothing);
  ret(lr);

  return entry;
}
// In C, approximately:

// void
// montgomery_multiply(julong Pa_base[], julong Pb_base[],
//                     julong Pn_base[], julong Pm_base[],
//                     julong inv, int len) {
//   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
//   julong *Pa, *Pb, *Pn, *Pm;
//   julong Ra, Rb, Rn, Rm;

//   int i;

//   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

//   for (i = 0; i < len; i++) {
//     int j;

//     Pa = Pa_base;
//     Pb = Pb_base + i;
//     Pm = Pm_base;
//     Pn = Pn_base + i;

//     Ra = *Pa;
//     Rb = *Pb;
//     Rm = *Pm;
//     Rn = *Pn;

//     int iters = i;
//     for (j = 0; iters--; j++) {
//       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
//       MACC(Ra, Rb, t0, t1, t2);
//       Ra = *++Pa;
//       Rb = *--Pb;
//       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
//       MACC(Rm, Rn, t0, t1, t2);
//       Rm = *++Pm;
//       Rn = *--Pn;
//     }

//     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
//     MACC(Ra, Rb, t0, t1, t2);
//     *Pm = Rm = t0 * inv;
//     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
//     MACC(Rm, Rn, t0, t1, t2);

//     assert(t0 == 0, "broken Montgomery multiply");

//     t0 = t1; t1 = t2; t2 = 0;
//   }

//   for (i = len; i < 2*len; i++) {
//     int j;

//     Pa = Pa_base + i-len;
//     Pb = Pb_base + len;
//     Pm = Pm_base + i-len;
//     Pn = Pn_base + len;

//     Ra = *++Pa;
//     Rb = *--Pb;
//     Rm = *++Pm;
//     Rn = *--Pn;

//     int iters = len*2-i-1;
//     for (j = i-len+1; iters--; j++) {
//       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
//       MACC(Ra, Rb, t0, t1, t2);
//       Ra = *++Pa;
//       Rb = *--Pb;
//       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
//       MACC(Rm, Rn, t0, t1, t2);
//       Rm = *++Pm;
//       Rn = *--Pn;
//     }

//     Pm_base[i-len] = t0;
//     t0 = t1; t1 = t2; t2 = 0;
//   }

//   while (t0)
//     t0 = sub(Pm_base, Pn_base, t0, len);
// }

/**
 * Fast Montgomery squaring. This uses asymptotically 25% fewer
 * multiplies than Montgomery multiplication so it should be up to
 * 25% faster. However, its loop control is more complex and it
 * may actually run slower on some machines.
 *
 * Arguments:
 *
 * Inputs:
 *   c_rarg0 - int array elements a
 *   c_rarg1 - int array elements n (the modulus)
 *   c_rarg2 - int length
 *   c_rarg3 - int inv
 *   c_rarg4 - int array elements m (the result)
 *
 * Returns the address captured by pc() immediately after the
 * align(CodeEntryAlignment) call, i.e. the entry point of the emitted
 * stub. The error path bound to `argh` is emitted before that entry
 * point and is only reached through the length check below.
 *
 * The two outer loops emitted here follow the block_comment() strings
 * and the "In C, approximately" sketch that follows this function.
 */
address generate_square() {
  // Error path: emitted ahead of the entry point, reached only by the
  // branch from the length check in "Make room" below.
  Label argh;
  bind(argh);
  stop("MontgomeryMultiply total_allocation must be <= 8192");

  align(CodeEntryAlignment);
  address entry = pc();

  enter();

  // Make room.
  // Rlen values above 512 (unsigned comparison, condition HI) go to argh.
  cmpw(Rlen, 512);
  br(Assembler::HI, argh);
  // Ra = sp - (zero-extended Rlen << log2(4 * sizeof(jint))), i.e.
  // 4 * sizeof(jint) bytes are reserved per unit of Rlen.
  // NOTE(review): with a 4-byte jint, 512 * 16 gives the 8192 quoted in
  // the stop() message -- confirm sizeof(jint).
  sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
  // Clear the low bits so sp becomes a multiple of 2 * wordSize.
  andr(sp, Ra, -2 * wordSize);

  lsrw(Rlen, Rlen, 1); // length in longwords = len/2

  {
    // Copy input args, reversing as we go. We use Ra as a
    // temporary variable.
    reverse(Ra, Pa_base, Rlen, t0, t1);
    reverse(Ra, Pn_base, Rlen, t0, t1);
  }

  // Push all call-saved registers and also Pm_base which we'll need
  // at the end.
  save_regs();

  // From here on Pm_base holds the value left in Ra after the reverse()
  // calls above; the caller's Pm_base is recovered by restore_regs().
  mov(Pm_base, Ra);

  // Zero t0, t1 and t2 (the "triple-precision accumulator" of the C
  // sketch below).
  mov(t0, zr);
  mov(t1, zr);
  mov(t2, zr);

  block_comment("for (int i = 0; i < len; i++) {");
  mov(Ri, zr); {
    Label loop, end;
    bind(loop);
    // Exit test at the top of the loop: leave once Ri >= Rlen.
    cmp(Ri, Rlen);
    br(Assembler::GE, end);

    pre1(Ri);

    block_comment("for (j = (i+1)/2; j; j--) {"); {
      // Rj = (Ri + 1) >> 1, the trip count handed to unroll_2().
      add(Rj, Ri, 1);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
    } block_comment(" } // j");

    last_squaring(Ri);

    block_comment(" for (j = i/2; j; j--) {"); {
      // Rj = Ri >> 1.
      lsr(Rj, Ri, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
    } block_comment(" } // j");

    post1_squaring();
    // i++ and loop back while Ri < Rlen.
    add(Ri, Ri, 1);
    cmp(Ri, Rlen);
    br(Assembler::LT, loop);

    bind(end);
    block_comment("} // i");
  }

  block_comment("for (int i = len; i < 2*len; i++) {");
  mov(Ri, Rlen); {
    Label loop, end;
    bind(loop);
    // Compare Ri against Rlen << 1 (2*len); leave once Ri >= 2*len.
    cmp(Ri, Rlen, Assembler::LSL, 1);
    br(Assembler::GE, end);

    pre2(Ri, Rlen);

    block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
      // Rj = ((Rlen << 1) - Ri - 1) >> 1.
      lsl(Rj, Rlen, 1);
      sub(Rj, Rj, Ri);
      sub(Rj, Rj, 1);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
    } block_comment(" } // j");

    last_squaring(Ri);

    block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
      // Rj = ((Rlen << 1) - Ri) >> 1.
      lsl(Rj, Rlen, 1);
      sub(Rj, Rj, Ri);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
    } block_comment(" } // j");

    post2(Ri, Rlen);
    // i++ and loop back while Ri < 2*len.
    add(Ri, Ri, 1);
    cmp(Ri, Rlen, Assembler::LSL, 1);

    br(Assembler::LT, loop);
    bind(end);
    block_comment("} // i");
  }

  normalize(Rlen);

  mov(Ra, Pm_base); // Save Pm_base in Ra
  restore_regs(); // Restore caller's Pm_base

  // Copy our result into caller's Pm_base
  reverse(Pm_base, Ra, Rlen, t0, t1);

  leave();
  ret(lr);

  return entry;
}
// In C, approximately:

// void
// montgomery_square(julong Pa_base[], julong Pn_base[],
//                   julong Pm_base[], julong inv, int len) {
//   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
//   julong *Pa, *Pb, *Pn, *Pm;
//   julong Ra, Rb, Rn, Rm;

//   int i;

//   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

//   for (i = 0; i < len; i++) {
//     int j;

//     Pa = Pa_base;
//     Pb = Pa_base + i;
//     Pm = Pm_base;
//     Pn = Pn_base + i;

//     Ra = *Pa;
//     Rb = *Pb;
//     Rm = *Pm;
//     Rn = *Pn;

//     int iters = (i+1)/2;
//     for (j = 0; iters--; j++) {
//       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
//       MACC2(Ra, Rb, t0, t1, t2);
//       Ra = *++Pa;
//       Rb = *--Pb;
//       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
//       MACC(Rm, Rn, t0, t1, t2);
//       Rm = *++Pm;
//       Rn = *--Pn;
//     }
//     if ((i & 1) == 0) {
//       assert(Ra == Pa_base[j], "must be");
//       MACC(Ra, Ra, t0, t1, t2);
//     }
//     iters = i/2;
//     assert(iters == i-j, "must be");
//     for (; iters--; j++) {
//       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
//       MACC(Rm, Rn, t0, t1, t2);
//       Rm = *++Pm;
//       Rn = *--Pn;
//     }

//     *Pm = Rm = t0 * inv;
//     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
//     MACC(Rm, Rn, t0, t1, t2);

//     assert(t0 == 0, "broken Montgomery multiply");

//     t0 = t1; t1 = t2; t2 = 0;
//   }

//   for (i = len; i < 2*len; i++) {
//     int start = i-len+1;
//     int end = start + (len - start)/2;
//     int j;

//     Pa = Pa_base + i-len;
//     Pb = Pa_base + len;
//     Pm = Pm_base + i-len;
//     Pn = Pn_base + len;

//     Ra = *++Pa;
//     Rb = *--Pb;
//     Rm = *++Pm;
//     Rn = *--Pn;

//
int iters = (2*len-i-1)/2; 8465 // assert(iters == end-start, "must be"); 8466 // for (j = start; iters--; j++) { 8467 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 8468 // MACC2(Ra, Rb, t0, t1, t2); 8469 // Ra = *++Pa; 8470 // Rb = *--Pb; 8471 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8472 // MACC(Rm, Rn, t0, t1, t2); 8473 // Rm = *++Pm; 8474 // Rn = *--Pn; 8475 // } 8476 // if ((i & 1) == 0) { 8477 // assert(Ra == Pa_base[j], "must be"); 8478 // MACC(Ra, Ra, t0, t1, t2); 8479 // } 8480 // iters = (2*len-i)/2; 8481 // assert(iters == len-j, "must be"); 8482 // for (; iters--; j++) { 8483 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 8484 // MACC(Rm, Rn, t0, t1, t2); 8485 // Rm = *++Pm; 8486 // Rn = *--Pn; 8487 // } 8488 // Pm_base[i-len] = t0; 8489 // t0 = t1; t1 = t2; t2 = 0; 8490 // } 8491 8492 // while (t0) 8493 // t0 = sub(Pm_base, Pn_base, t0, len); 8494 // } 8495 }; 8496 8497 void generate_vector_math_stubs() { 8498 // Get native vector math stub routine addresses 8499 void* libsleef = nullptr; 8500 char ebuf[1024]; 8501 char dll_name[JVM_MAXPATHLEN]; 8502 if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "sleef")) { 8503 libsleef = os::dll_load(dll_name, ebuf, sizeof ebuf); 8504 } 8505 if (libsleef == nullptr) { 8506 log_info(library)("Failed to load native vector math library, %s!", ebuf); 8507 return; 8508 } 8509 // Method naming convention 8510 // All the methods are named as <OP><T><N>_<U><suffix> 8511 // Where: 8512 // <OP> is the operation name, e.g. 
sin 8513 // <T> is optional to indicate float/double 8514 // "f/d" for vector float/double operation 8515 // <N> is the number of elements in the vector 8516 // "2/4" for neon, and "x" for sve 8517 // <U> is the precision level 8518 // "u10/u05" represents 1.0/0.5 ULP error bounds 8519 // We use "u10" for all operations by default 8520 // But for those functions do not have u10 support, we use "u05" instead 8521 // <suffix> indicates neon/sve 8522 // "sve/advsimd" for sve/neon implementations 8523 // e.g. sinfx_u10sve is the method for computing vector float sin using SVE instructions 8524 // cosd2_u10advsimd is the method for computing 2 elements vector double cos using NEON instructions 8525 // 8526 log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "sleef" JNI_LIB_SUFFIX, p2i(libsleef)); 8527 8528 // Math vector stubs implemented with SVE for scalable vector size. 8529 if (UseSVE > 0) { 8530 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 8531 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 8532 // Skip "tanh" because there is performance regression 8533 if (vop == VectorSupport::VECTOR_OP_TANH) { 8534 continue; 8535 } 8536 8537 // The native library does not support u10 level of "hypot". 8538 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 8539 8540 snprintf(ebuf, sizeof(ebuf), "%sfx_%ssve", VectorSupport::mathname[op], ulf); 8541 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 8542 8543 snprintf(ebuf, sizeof(ebuf), "%sdx_%ssve", VectorSupport::mathname[op], ulf); 8544 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_SCALABLE][op] = (address)os::dll_lookup(libsleef, ebuf); 8545 } 8546 } 8547 8548 // Math vector stubs implemented with NEON for 64/128 bits vector size. 
8549 for (int op = 0; op < VectorSupport::NUM_VECTOR_OP_MATH; op++) { 8550 int vop = VectorSupport::VECTOR_OP_MATH_START + op; 8551 // Skip "tanh" because there is performance regression 8552 if (vop == VectorSupport::VECTOR_OP_TANH) { 8553 continue; 8554 } 8555 8556 // The native library does not support u10 level of "hypot". 8557 const char* ulf = (vop == VectorSupport::VECTOR_OP_HYPOT) ? "u05" : "u10"; 8558 8559 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 8560 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_64][op] = (address)os::dll_lookup(libsleef, ebuf); 8561 8562 snprintf(ebuf, sizeof(ebuf), "%sf4_%sadvsimd", VectorSupport::mathname[op], ulf); 8563 StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 8564 8565 snprintf(ebuf, sizeof(ebuf), "%sd2_%sadvsimd", VectorSupport::mathname[op], ulf); 8566 StubRoutines::_vector_d_math[VectorSupport::VEC_SIZE_128][op] = (address)os::dll_lookup(libsleef, ebuf); 8567 } 8568 } 8569 8570 // Initialization 8571 void generate_initial_stubs() { 8572 // Generate initial stubs and initializes the entry points 8573 8574 // entry points that exist in all platforms Note: This is code 8575 // that could be shared among different platforms - however the 8576 // benefit seems to be smaller than the disadvantage of having a 8577 // much more complicated generator structure. See also comment in 8578 // stubRoutines.hpp. 8579 8580 StubRoutines::_forward_exception_entry = generate_forward_exception(); 8581 8582 StubRoutines::_call_stub_entry = 8583 generate_call_stub(StubRoutines::_call_stub_return_address); 8584 8585 // is referenced by megamorphic call 8586 StubRoutines::_catch_exception_entry = generate_catch_exception(); 8587 8588 // Initialize table for copy memory (arraycopy) check. 
8589 if (UnsafeMemoryAccess::_table == nullptr) { 8590 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory 8591 } 8592 8593 if (UseCRC32Intrinsics) { 8594 // set table address before stub generation which use it 8595 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 8596 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 8597 } 8598 8599 if (UseCRC32CIntrinsics) { 8600 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 8601 } 8602 8603 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 8604 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 8605 } 8606 8607 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 8608 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 8609 } 8610 8611 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) && 8612 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) { 8613 StubRoutines::_hf2f = generate_float16ToFloat(); 8614 StubRoutines::_f2hf = generate_floatToFloat16(); 8615 } 8616 } 8617 8618 void generate_continuation_stubs() { 8619 // Continuation stubs: 8620 StubRoutines::_cont_thaw = generate_cont_thaw(); 8621 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier(); 8622 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception(); 8623 } 8624 8625 void generate_final_stubs() { 8626 // support for verify_oop (must happen after universe_init) 8627 if (VerifyOops) { 8628 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 8629 } 8630 8631 // arraycopy stubs used by compilers 8632 generate_arraycopy_stubs(); 8633 8634 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 8635 if (bs_nm != nullptr) { 8636 StubRoutines::_method_entry_barrier = generate_method_entry_barrier(); 8637 } 8638 8639 StubRoutines::aarch64::_spin_wait = generate_spin_wait(); 8640 8641 if (UsePoly1305Intrinsics) { 8642 
StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); 8643 } 8644 8645 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS) 8646 8647 generate_atomic_entry_points(); 8648 8649 #endif // LINUX 8650 8651 #ifdef COMPILER2 8652 if (UseSecondarySupersTable) { 8653 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub(); 8654 if (! InlineSecondarySupersTest) { 8655 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) { 8656 StubRoutines::_lookup_secondary_supers_table_stubs[slot] 8657 = generate_lookup_secondary_supers_table_stub(slot); 8658 } 8659 } 8660 } 8661 #endif 8662 8663 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler(); 8664 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target(); 8665 8666 StubRoutines::aarch64::set_completed(); // Inidicate that arraycopy and zero_blocks stubs are generated 8667 } 8668 8669 void generate_compiler_stubs() { 8670 #if COMPILER2_OR_JVMCI 8671 8672 if (UseSVE == 0) { 8673 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices"); 8674 } 8675 8676 // array equals stub for large arrays. 8677 if (!UseSimpleArrayEquals) { 8678 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 8679 } 8680 8681 // arrays_hascode stub for large arrays. 8682 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN); 8683 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE); 8684 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR); 8685 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT); 8686 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT); 8687 8688 // byte_array_inflate stub for large arrays. 
8689 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 8690 8691 // countPositives stub for large arrays. 8692 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long); 8693 8694 generate_compare_long_strings(); 8695 8696 generate_string_indexof_stubs(); 8697 8698 #ifdef COMPILER2 8699 if (UseMultiplyToLenIntrinsic) { 8700 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 8701 } 8702 8703 if (UseSquareToLenIntrinsic) { 8704 StubRoutines::_squareToLen = generate_squareToLen(); 8705 } 8706 8707 if (UseMulAddIntrinsic) { 8708 StubRoutines::_mulAdd = generate_mulAdd(); 8709 } 8710 8711 if (UseSIMDForBigIntegerShiftIntrinsics) { 8712 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); 8713 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); 8714 } 8715 8716 if (UseMontgomeryMultiplyIntrinsic) { 8717 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 8718 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 8719 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 8720 } 8721 8722 if (UseMontgomerySquareIntrinsic) { 8723 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 8724 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 8725 // We use generate_multiply() rather than generate_square() 8726 // because it's faster for the sizes of modulus we care about. 
8727 StubRoutines::_montgomerySquare = g.generate_multiply(); 8728 } 8729 8730 generate_vector_math_stubs(); 8731 8732 #endif // COMPILER2 8733 8734 if (UseChaCha20Intrinsics) { 8735 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar(); 8736 } 8737 8738 if (UseBASE64Intrinsics) { 8739 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock(); 8740 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock(); 8741 } 8742 8743 // data cache line writeback 8744 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 8745 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 8746 8747 if (UseAESIntrinsics) { 8748 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 8749 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 8750 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 8751 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 8752 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); 8753 } 8754 if (UseGHASHIntrinsics) { 8755 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 8756 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); 8757 } 8758 if (UseAESIntrinsics && UseGHASHIntrinsics) { 8759 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt(); 8760 } 8761 8762 if (UseMD5Intrinsics) { 8763 StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); 8764 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); 8765 } 8766 if (UseSHA1Intrinsics) { 8767 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 8768 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 8769 } 8770 if (UseSHA256Intrinsics) { 8771 
StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 8772 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 8773 } 8774 if (UseSHA512Intrinsics) { 8775 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 8776 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 8777 } 8778 if (UseSHA3Intrinsics) { 8779 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(false, "sha3_implCompress"); 8780 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(true, "sha3_implCompressMB"); 8781 } 8782 8783 // generate Adler32 intrinsics code 8784 if (UseAdler32Intrinsics) { 8785 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 8786 } 8787 8788 #endif // COMPILER2_OR_JVMCI 8789 } 8790 8791 public: 8792 StubGenerator(CodeBuffer* code, StubsKind kind) : StubCodeGenerator(code) { 8793 switch(kind) { 8794 case Initial_stubs: 8795 generate_initial_stubs(); 8796 break; 8797 case Continuation_stubs: 8798 generate_continuation_stubs(); 8799 break; 8800 case Compiler_stubs: 8801 generate_compiler_stubs(); 8802 break; 8803 case Final_stubs: 8804 generate_final_stubs(); 8805 break; 8806 default: 8807 fatal("unexpected stubs kind: %d", kind); 8808 break; 8809 }; 8810 } 8811 }; // end class declaration 8812 8813 void StubGenerator_generate(CodeBuffer* code, StubCodeGenerator::StubsKind kind) { 8814 StubGenerator g(code, kind); 8815 } 8816 8817 8818 #if defined (LINUX) 8819 8820 // Define pointers to atomic stubs and initialize them to point to the 8821 // code in atomic_aarch64.S. 
8822 8823 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \ 8824 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \ 8825 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \ 8826 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \ 8827 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl; 8828 8829 DEFAULT_ATOMIC_OP(fetch_add, 4, ) 8830 DEFAULT_ATOMIC_OP(fetch_add, 8, ) 8831 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) 8832 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed) 8833 DEFAULT_ATOMIC_OP(xchg, 4, ) 8834 DEFAULT_ATOMIC_OP(xchg, 8, ) 8835 DEFAULT_ATOMIC_OP(cmpxchg, 1, ) 8836 DEFAULT_ATOMIC_OP(cmpxchg, 4, ) 8837 DEFAULT_ATOMIC_OP(cmpxchg, 8, ) 8838 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed) 8839 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) 8840 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed) 8841 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release) 8842 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release) 8843 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst) 8844 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst) 8845 8846 #undef DEFAULT_ATOMIC_OP 8847 8848 #endif // LINUX