1 /* 2 * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "asm/assembler.inline.hpp" 28 #include "opto/c2_MacroAssembler.hpp" 29 #include "opto/compile.hpp" 30 #include "opto/intrinsicnode.hpp" 31 #include "opto/matcher.hpp" 32 #include "opto/output.hpp" 33 #include "opto/subnode.hpp" 34 #include "runtime/stubRoutines.hpp" 35 #include "utilities/globalDefinitions.hpp" 36 #include "utilities/powerOfTwo.hpp" 37 38 #ifdef PRODUCT 39 #define BLOCK_COMMENT(str) /* nothing */ 40 #define STOP(error) stop(error) 41 #else 42 #define BLOCK_COMMENT(str) block_comment(str) 43 #define STOP(error) block_comment(error); stop(error) 44 #endif 45 46 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 47 48 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 49 50 void C2_MacroAssembler::entry_barrier() { 51 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 52 if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) { 53 // Dummy labels for just measuring the code size 54 Label dummy_slow_path; 55 Label dummy_continuation; 56 Label dummy_guard; 57 Label* slow_path = &dummy_slow_path; 58 Label* continuation = &dummy_continuation; 59 Label* guard = &dummy_guard; 60 if (!Compile::current()->output()->in_scratch_emit_size()) { 61 // Use real labels from actual stub when not emitting code for the purpose of measuring its size 62 C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub(); 63 Compile::current()->output()->add_stub(stub); 64 slow_path = &stub->entry(); 65 continuation = &stub->continuation(); 66 guard = &stub->guard(); 67 } 68 // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub. 69 bs->nmethod_entry_barrier(this, slow_path, continuation, guard); 70 } 71 } 72 73 // jdk.internal.util.ArraysSupport.vectorizedHashCode 74 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result, 75 FloatRegister vdata0, FloatRegister vdata1, 76 FloatRegister vdata2, FloatRegister vdata3, 77 FloatRegister vmul0, FloatRegister vmul1, 78 FloatRegister vmul2, FloatRegister vmul3, 79 FloatRegister vpow, FloatRegister vpowm, 80 BasicType eltype) { 81 ARRAYS_HASHCODE_REGISTERS; 82 83 Register tmp1 = rscratch1, tmp2 = rscratch2; 84 85 Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE; 86 87 // Vectorization factor. 
Number of array elements loaded to one SIMD&FP registers by the stubs. We 88 // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to 89 // use 4H for chars and shorts instead, but using 8H gives better performance. 90 const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8 91 : eltype == T_CHAR || eltype == T_SHORT ? 8 92 : eltype == T_INT ? 4 93 : 0; 94 guarantee(vf, "unsupported eltype"); 95 96 // Unroll factor for the scalar loop below. The value is chosen based on performance analysis. 97 const size_t unroll_factor = 4; 98 99 switch (eltype) { 100 case T_BOOLEAN: 101 BLOCK_COMMENT("arrays_hashcode(unsigned byte) {"); 102 break; 103 case T_CHAR: 104 BLOCK_COMMENT("arrays_hashcode(char) {"); 105 break; 106 case T_BYTE: 107 BLOCK_COMMENT("arrays_hashcode(byte) {"); 108 break; 109 case T_SHORT: 110 BLOCK_COMMENT("arrays_hashcode(short) {"); 111 break; 112 case T_INT: 113 BLOCK_COMMENT("arrays_hashcode(int) {"); 114 break; 115 default: 116 ShouldNotReachHere(); 117 } 118 119 // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop 120 // implemented by the stub executes just once. Call the stub only if at least two iterations will 121 // be executed. 122 const size_t large_threshold = eltype == T_INT ? vf * 2 : vf; 123 cmpw(cnt, large_threshold); 124 br(Assembler::HS, LARGE); 125 126 bind(TAIL); 127 128 // The andr performs cnt % uf where uf = unroll_factor. The subtract shifted by 3 offsets past 129 // uf - (cnt % uf) pairs of load + madd insns i.e. it only executes cnt % uf load + madd pairs. 130 // Iteration eats up the remainder, uf elements at a time. 131 assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC"); 132 andr(tmp2, cnt, unroll_factor - 1); 133 adr(tmp1, BR_BASE); 134 sub(tmp1, tmp1, tmp2, ext::sxtw, 3); 135 movw(tmp2, 0x1f); 136 br(tmp1); 137 138 bind(LOOP); 139 for (size_t i = 0; i < unroll_factor; ++i) { 140 load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype); 141 maddw(result, result, tmp2, tmp1); 142 } 143 bind(BR_BASE); 144 subsw(cnt, cnt, unroll_factor); 145 br(Assembler::HS, LOOP); 146 147 b(DONE); 148 149 bind(LARGE); 150 151 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype)); 152 assert(stub.target() != nullptr, "array_hashcode stub has not been generated"); 153 address tpc = trampoline_call(stub); 154 if (tpc == nullptr) { 155 DEBUG_ONLY(reset_labels(TAIL, BR_BASE)); 156 postcond(pc() == badAddress); 157 return nullptr; 158 } 159 160 bind(DONE); 161 162 BLOCK_COMMENT("} // arrays_hashcode"); 163 164 postcond(pc() != badAddress); 165 return pc(); 166 } 167 168 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg, 169 Register tmp2Reg, Register tmp3Reg) { 170 Register oop = objectReg; 171 Register box = boxReg; 172 Register disp_hdr = tmpReg; 173 Register tmp = tmp2Reg; 174 Label cont; 175 Label object_has_monitor; 176 Label count, no_count; 177 178 assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight"); 179 assert_different_registers(oop, box, tmp, disp_hdr); 180 181 // Load markWord from object into displaced_header. 
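  // The mark word drives the whole fast path. A rough sketch of the LM_LEGACY
  // fast-lock protocol emitted below (illustrative pseudocode only; bit names
  // follow markWord.hpp and the recursive-ownership test is simplified):
  //
  //   mark = obj->mark();                           // low bits: 0b01 unlocked,
  //                                                 // 0b00 stack-locked, 0b10 monitor
  //   if (mark has the monitor bit) goto object_has_monitor;
  //   box->displaced_header = mark | unlocked_value;
  //   if (CAS(&obj->mark, mark | unlocked_value, box)) goto cont;   // locked it
  //   // CAS failed: if the mark is an address within our own stack page(s),
  //   // this thread already owns the lock; record a recursive lock by storing
  //   // 0 as the displaced header, otherwise fall through to the slow path.
  //   recursive = ((failed_mark - sp) & ~(page_size - 1)) == 0;
  //   box->displaced_header = 0;   // only meaningful when recursive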
  ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp, oop);
    ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
    tst(tmp, KlassFlags::_misc_is_value_based_class);
    br(Assembler::NE, cont);
  }

  // Check for existing monitor
  tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Set tmp to be (markWord of object | UNLOCK_VALUE).
    orr(tmp, disp_hdr, markWord::unlocked_value);

    if (EnableValhalla) {
      // Mask inline_type bit such that we go to the slow path if object is an inline type
      andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
    }

    // Initialize the box. (Must happen before we update the object mark!)
    str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // Compare object markWord with an unlocked value (tmp) and if
    // equal exchange the stack address of our box with object markWord.
    // On failure disp_hdr contains the possibly locked markWord.
    cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
            /*release*/ true, /*weak*/ false, disp_hdr);
    br(Assembler::EQ, cont);

    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.

    // Check if the owner is self by comparing the value in the
    // markWord of object (disp_hdr) with the stack pointer.
    mov(rscratch1, sp);
    sub(disp_hdr, disp_hdr, rscratch1);
    mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition holds, this is a recursive lock and hence we can store 0 as the
    // displaced header in the box, which indicates that it is a recursive lock.
    ands(tmp/*==0?*/, disp_hdr, tmp); // Sets flags for result
    str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
    b(cont);
  }

  // Handle existing monitor.
  bind(object_has_monitor);

  // The object's monitor m is unlocked iff m->owner == nullptr,
  // otherwise m->owner may contain a thread or a stack address.
  //
  // Try to CAS m->owner from null to current thread.
  add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
  cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
          /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result

  // Store a non-null value into the box to avoid looking like a re-entrant
  // lock. The fast-path monitor unlock code checks for
  // markWord::monitor_value so use markWord::unused_mark which has the
  // relevant bit set, and also matches ObjectSynchronizer::enter.
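  // markWord::unused_mark() is the "marked" encoding (both low bits set), so it
  // both carries the monitor bit the unlock fast path tests for and can never be
  // confused with the 0 displaced header that denotes a recursive stack lock.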
  mov(tmp, (address)markWord::unused_mark().value());
  str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));

  br(Assembler::EQ, cont); // CAS success means locking succeeded

  cmp(tmp3Reg, rthread);
  br(Assembler::NE, cont); // Check for recursive locking

  // Recursive lock case
  increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
  // flag == EQ still from the cmp above, checking if this is a reentrant lock

  bind(cont);
  // flag == EQ indicates success
  // flag == NE indicates failure
  br(Assembler::NE, no_count);

  bind(count);
  increment(Address(rthread, JavaThread::held_monitor_count_offset()));

  bind(no_count);
}

void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
                                    Register tmp2Reg) {
  Register oop = objectReg;
  Register box = boxReg;
  Register disp_hdr = tmpReg;
  Register owner_addr = tmpReg;
  Register tmp = tmp2Reg;
  Label cont;
  Label object_has_monitor;
  Label count, no_count;
  Label unlocked;

  assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
  assert_different_registers(oop, box, tmp, disp_hdr);

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));

    // If the displaced header is 0, we have a recursive unlock.
    cmp(disp_hdr, zr);
    br(Assembler::EQ, cont);
  }

  // Handle existing monitor.
  ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
  tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
    b(cont);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock, this is true if we
    // see the stack address of the basicLock in the markWord of the
    // object.

    cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
            /*release*/ true, /*weak*/ false, tmp);
    b(cont);
  }

  assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  add(tmp, tmp, -(int)markWord::monitor_value); // monitor

  ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));

  Label notRecursive;
  cbz(disp_hdr, notRecursive);

  // Recursive lock
  sub(disp_hdr, disp_hdr, 1u);
  str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
  cmp(disp_hdr, disp_hdr); // Sets flags for result
  b(cont);

  bind(notRecursive);

  // Compute owner address.
  lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));

  // Set owner to null.
  // Release to satisfy the JMM
  stlr(zr, owner_addr);
  // We need a full fence after clearing owner to avoid stranding.
  // StoreLoad achieves this.
  membar(StoreLoad);

  // Check if the entry lists are empty (EntryList first - by convention).
  ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
  ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
  orr(rscratch1, rscratch1, tmpReg);
  cmp(rscratch1, zr);
  br(Assembler::EQ, cont); // If so we are done.
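  // Together with the owner release and StoreLoad fence above, the remaining
  // checks below implement, roughly (illustrative sketch only; names approximate
  // the runtime fields):
  //
  //   m->owner = nullptr;                          // release store (stlr)
  //   full fence;                                  // StoreLoad
  //   if (m->EntryList == nullptr && m->cxq == nullptr) return;   // no waiters
  //   if (m->succ != nullptr) return;              // a successor will take the lock
  //   // Otherwise record the monitor and take the slow path, which can
  //   // re-acquire the lock and wake a waiter (SharedRuntime::monitor_exit_helper).
  //   current->_unlocked_inflated_monitor = m;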
350 351 // Check if there is a successor. 352 ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset())); 353 cmp(rscratch1, zr); 354 br(Assembler::NE, unlocked); // If so we are done. 355 356 // Save the monitor pointer in the current thread, so we can try to 357 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 358 str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 359 360 cmp(zr, rthread); // Set Flag to NE => slow path 361 b(cont); 362 363 bind(unlocked); 364 cmp(zr, zr); // Set Flag to EQ => fast path 365 366 // Intentional fall-through 367 368 bind(cont); 369 // flag == EQ indicates success 370 // flag == NE indicates failure 371 br(Assembler::NE, no_count); 372 373 bind(count); 374 decrement(Address(rthread, JavaThread::held_monitor_count_offset())); 375 376 bind(no_count); 377 } 378 379 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1, 380 Register t2, Register t3) { 381 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 382 assert_different_registers(obj, box, t1, t2, t3); 383 384 // Handle inflated monitor. 385 Label inflated; 386 // Finish fast lock successfully. MUST branch to with flag == EQ 387 Label locked; 388 // Finish fast lock unsuccessfully. MUST branch to with flag == NE 389 Label slow_path; 390 391 if (UseObjectMonitorTable) { 392 // Clear cache in case fast locking succeeds. 393 str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 394 } 395 396 if (DiagnoseSyncOnValueBasedClasses != 0) { 397 load_klass(t1, obj); 398 ldrb(t1, Address(t1, Klass::misc_flags_offset())); 399 tst(t1, KlassFlags::_misc_is_value_based_class); 400 br(Assembler::NE, slow_path); 401 } 402 403 const Register t1_mark = t1; 404 const Register t3_t = t3; 405 406 { // Lightweight locking 407 408 // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ 409 Label push; 410 411 const Register t2_top = t2; 412 413 // Check if lock-stack is full. 414 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 415 cmpw(t2_top, (unsigned)LockStack::end_offset() - 1); 416 br(Assembler::GT, slow_path); 417 418 // Check if recursive. 419 subw(t3_t, t2_top, oopSize); 420 ldr(t3_t, Address(rthread, t3_t)); 421 cmp(obj, t3_t); 422 br(Assembler::EQ, push); 423 424 // Relaxed normal load to check for monitor. Optimization for monitor case. 425 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 426 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 427 428 // Not inflated 429 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 430 431 // Try to lock. Transition lock-bits 0b01 => 0b00 432 orr(t1_mark, t1_mark, markWord::unlocked_value); 433 eor(t3_t, t1_mark, markWord::unlocked_value); 434 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 435 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg); 436 br(Assembler::NE, slow_path); 437 438 bind(push); 439 // After successful lock, push object on lock-stack. 440 str(obj, Address(rthread, t2_top)); 441 addw(t2_top, t2_top, oopSize); 442 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 443 b(locked); 444 } 445 446 { // Handle inflated monitor. 
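    // With UseObjectMonitorTable the mark word does not embed the ObjectMonitor*,
    // so the code below first searches a small per-thread cache of recently used
    // (oop, monitor) pairs. Roughly (an illustrative sketch; the real layout is
    // defined by OMCache and the cache is terminated by a null sentinel):
    //
    //   entry = &thread->om_cache[0];
    //   // the first two entries are checked with straight-line code, then a loop
    //   for (;;) {
    //     if (entry->oop == obj) { monitor = entry->monitor; break; }
    //     if (entry->oop == nullptr) goto slow_path;   // cache miss
    //     entry++;
    //   }
    //
    // Without the table, the monitor pointer is just the mark word with the low
    // tag bits (markWord::monitor_value) stripped.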
447 bind(inflated); 448 449 const Register t1_monitor = t1; 450 451 if (!UseObjectMonitorTable) { 452 assert(t1_monitor == t1_mark, "should be the same here"); 453 } else { 454 Label monitor_found; 455 456 // Load cache address 457 lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset())); 458 459 const int num_unrolled = 2; 460 for (int i = 0; i < num_unrolled; i++) { 461 ldr(t1, Address(t3_t)); 462 cmp(obj, t1); 463 br(Assembler::EQ, monitor_found); 464 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 465 } 466 467 Label loop; 468 469 // Search for obj in cache. 470 bind(loop); 471 472 // Check for match. 473 ldr(t1, Address(t3_t)); 474 cmp(obj, t1); 475 br(Assembler::EQ, monitor_found); 476 477 // Search until null encountered, guaranteed _null_sentinel at end. 478 increment(t3_t, in_bytes(OMCache::oop_to_oop_difference())); 479 cbnz(t1, loop); 480 // Cache Miss, NE set from cmp above, cbnz does not set flags 481 b(slow_path); 482 483 bind(monitor_found); 484 ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference())); 485 } 486 487 const Register t2_owner_addr = t2; 488 const Register t3_owner = t3; 489 const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value)); 490 const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag); 491 const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag); 492 493 Label monitor_locked; 494 495 // Compute owner address. 496 lea(t2_owner_addr, owner_address); 497 498 // CAS owner (null => current thread). 499 cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true, 500 /*release*/ false, /*weak*/ false, t3_owner); 501 br(Assembler::EQ, monitor_locked); 502 503 // Check if recursive. 504 cmp(t3_owner, rthread); 505 br(Assembler::NE, slow_path); 506 507 // Recursive. 508 increment(recursions_address, 1); 509 510 bind(monitor_locked); 511 if (UseObjectMonitorTable) { 512 str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 513 } 514 } 515 516 bind(locked); 517 increment(Address(rthread, JavaThread::held_monitor_count_offset())); 518 519 #ifdef ASSERT 520 // Check that locked label is reached with Flags == EQ. 521 Label flag_correct; 522 br(Assembler::EQ, flag_correct); 523 stop("Fast Lock Flag != EQ"); 524 #endif 525 526 bind(slow_path); 527 #ifdef ASSERT 528 // Check that slow_path label is reached with Flags == NE. 529 br(Assembler::NE, flag_correct); 530 stop("Fast Lock Flag != NE"); 531 bind(flag_correct); 532 #endif 533 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 534 } 535 536 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1, 537 Register t2, Register t3) { 538 assert(LockingMode == LM_LIGHTWEIGHT, "must be"); 539 assert_different_registers(obj, box, t1, t2, t3); 540 541 // Handle inflated monitor. 542 Label inflated, inflated_load_mark; 543 // Finish fast unlock successfully. MUST branch to with flag == EQ 544 Label unlocked; 545 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE 546 Label slow_path; 547 548 const Register t1_mark = t1; 549 const Register t2_top = t2; 550 const Register t3_t = t3; 551 552 { // Lightweight unlock 553 554 Label push_and_slow_path; 555 556 // Check if obj is top of lock-stack. 
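    // Outline of the lightweight-unlock fast path emitted below (a rough sketch,
    // not the exact generated code; LockStack bookkeeping is simplified):
    //
    //   top = thread->lock_stack_top;
    //   if (lock_stack[top - 1] != obj) goto inflated_load_mark;  // not ours -> monitor
    //   thread->lock_stack_top = --top;                           // pop
    //   if (lock_stack[top - 1] == obj) goto unlocked;            // recursive entry remains
    //   mark = obj->mark();
    //   if (mark has the monitor bit) goto inflated;              // (push back first if
    //                                                             //  UseObjectMonitorTable)
    //   if (CAS(&obj->mark, mark, mark | unlocked_value)) goto unlocked;
    //   // CAS failed: restore the lock-stack entry and take the slow path.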
557 ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 558 subw(t2_top, t2_top, oopSize); 559 ldr(t3_t, Address(rthread, t2_top)); 560 cmp(obj, t3_t); 561 // Top of lock stack was not obj. Must be monitor. 562 br(Assembler::NE, inflated_load_mark); 563 564 // Pop lock-stack. 565 DEBUG_ONLY(str(zr, Address(rthread, t2_top));) 566 strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 567 568 // Check if recursive. 569 subw(t3_t, t2_top, oopSize); 570 ldr(t3_t, Address(rthread, t3_t)); 571 cmp(obj, t3_t); 572 br(Assembler::EQ, unlocked); 573 574 // Not recursive. 575 // Load Mark. 576 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 577 578 // Check header for monitor (0b10). 579 // Because we got here by popping (meaning we pushed in locked) 580 // there will be no monitor in the box. So we need to push back the obj 581 // so that the runtime can fix any potential anonymous owner. 582 tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated); 583 584 // Try to unlock. Transition lock bits 0b00 => 0b01 585 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea"); 586 orr(t3_t, t1_mark, markWord::unlocked_value); 587 cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword, 588 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg); 589 br(Assembler::EQ, unlocked); 590 591 bind(push_and_slow_path); 592 // Compare and exchange failed. 593 // Restore lock-stack and handle the unlock in runtime. 594 DEBUG_ONLY(str(obj, Address(rthread, t2_top));) 595 addw(t2_top, t2_top, oopSize); 596 str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset())); 597 b(slow_path); 598 } 599 600 601 { // Handle inflated monitor. 602 bind(inflated_load_mark); 603 ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes())); 604 #ifdef ASSERT 605 tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated); 606 stop("Fast Unlock not monitor"); 607 #endif 608 609 bind(inflated); 610 611 #ifdef ASSERT 612 Label check_done; 613 subw(t2_top, t2_top, oopSize); 614 cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset())); 615 br(Assembler::LT, check_done); 616 ldr(t3_t, Address(rthread, t2_top)); 617 cmp(obj, t3_t); 618 br(Assembler::NE, inflated); 619 stop("Fast Unlock lock on stack"); 620 bind(check_done); 621 #endif 622 623 const Register t1_monitor = t1; 624 625 if (!UseObjectMonitorTable) { 626 assert(t1_monitor == t1_mark, "should be the same here"); 627 628 // Untag the monitor. 629 add(t1_monitor, t1_mark, -(int)markWord::monitor_value); 630 } else { 631 ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes())); 632 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*) 633 cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*))); 634 br(Assembler::LO, slow_path); 635 } 636 637 const Register t2_recursions = t2; 638 Label not_recursive; 639 640 // Check if recursive. 641 ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 642 cbz(t2_recursions, not_recursive); 643 644 // Recursive unlock. 645 sub(t2_recursions, t2_recursions, 1u); 646 str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset())); 647 // Set flag == EQ 648 cmp(t2_recursions, t2_recursions); 649 b(unlocked); 650 651 bind(not_recursive); 652 653 const Register t2_owner_addr = t2; 654 655 // Compute owner address. 656 lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset())); 657 658 // Set owner to null. 
659 // Release to satisfy the JMM 660 stlr(zr, t2_owner_addr); 661 // We need a full fence after clearing owner to avoid stranding. 662 // StoreLoad achieves this. 663 membar(StoreLoad); 664 665 // Check if the entry lists are empty (EntryList first - by convention). 666 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset())); 667 ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset())); 668 orr(rscratch1, rscratch1, t3_t); 669 cmp(rscratch1, zr); 670 br(Assembler::EQ, unlocked); // If so we are done. 671 672 // Check if there is a successor. 673 ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset())); 674 cmp(rscratch1, zr); 675 br(Assembler::NE, unlocked); // If so we are done. 676 677 // Save the monitor pointer in the current thread, so we can try to 678 // reacquire the lock in SharedRuntime::monitor_exit_helper(). 679 str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset())); 680 681 cmp(zr, rthread); // Set Flag to NE => slow path 682 b(slow_path); 683 } 684 685 bind(unlocked); 686 decrement(Address(rthread, JavaThread::held_monitor_count_offset())); 687 cmp(zr, zr); // Set Flags to EQ => fast path 688 689 #ifdef ASSERT 690 // Check that unlocked label is reached with Flags == EQ. 691 Label flag_correct; 692 br(Assembler::EQ, flag_correct); 693 stop("Fast Unlock Flag != EQ"); 694 #endif 695 696 bind(slow_path); 697 #ifdef ASSERT 698 // Check that slow_path label is reached with Flags == NE. 699 br(Assembler::NE, flag_correct); 700 stop("Fast Unlock Flag != NE"); 701 bind(flag_correct); 702 #endif 703 // C2 uses the value of Flags (NE vs EQ) to determine the continuation. 704 } 705 706 // Search for str1 in str2 and return index or -1 707 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 708 void C2_MacroAssembler::string_indexof(Register str2, Register str1, 709 Register cnt2, Register cnt1, 710 Register tmp1, Register tmp2, 711 Register tmp3, Register tmp4, 712 Register tmp5, Register tmp6, 713 int icnt1, Register result, int ae) { 714 // NOTE: tmp5, tmp6 can be zr depending on specific method version 715 Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH; 716 717 Register ch1 = rscratch1; 718 Register ch2 = rscratch2; 719 Register cnt1tmp = tmp1; 720 Register cnt2tmp = tmp2; 721 Register cnt1_neg = cnt1; 722 Register cnt2_neg = cnt2; 723 Register result_tmp = tmp4; 724 725 bool isL = ae == StrIntrinsicNode::LL; 726 727 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 728 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 729 int str1_chr_shift = str1_isL ? 0:1; 730 int str2_chr_shift = str2_isL ? 0:1; 731 int str1_chr_size = str1_isL ? 1:2; 732 int str2_chr_size = str2_isL ? 1:2; 733 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 734 (chr_insn)&MacroAssembler::ldrh; 735 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 736 (chr_insn)&MacroAssembler::ldrh; 737 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 738 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 739 740 // Note, inline_string_indexOf() generates checks: 741 // if (substr.count > string.count) return -1; 742 // if (substr.count == 0) return 0; 743 744 // We have two strings, a source string in str2, cnt2 and a pattern string 745 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1. 
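  // When the pattern length is not known at compile time (icnt1 == -1), the
  // cmp/ccmp sequence below selects one of three strategies. Roughly (an
  // informal summary of the dispatch logic, not generated code):
  //
  //   if (cnt1 < 8)                             use the linear search below
  //   else if (cnt1 >= 256 || cnt1 >= cnt2 / 4) call the out-of-line linear stub
  //   else                                      use Boyer-Moore-Horspool below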

  // For larger pattern and source we use a simplified Boyer Moore algorithm.
  // With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    sub(result_tmp, cnt2, cnt1);
    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1);          // done in separate FPU pipeline. Almost no penalty
    subs(zr, cnt1, 256);
    lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
    br(GE, LINEARSTUB);
  }

  // The Boyer Moore algorithm is based on the description here:-
  //
  // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
  //
  // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
  // and the 'Good Suffix' rule.
  //
  // These rules are essentially heuristics for how far we can shift the
  // pattern along the search string.
  //
  // The implementation here uses the 'Bad Character' rule only because of the
  // complexity of initialisation for the 'Good Suffix' rule.
  //
  // This is also known as the Boyer-Moore-Horspool algorithm:-
  //
  // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
  //
  // This particular implementation has few java-specific optimizations.
  //
  // #define ASIZE 256
  //
  //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
  //       int i, j;
  //       unsigned c;
  //       unsigned char bc[ASIZE];
  //
  //       /* Preprocessing */
  //       for (i = 0; i < ASIZE; ++i)
  //          bc[i] = m;
  //       for (i = 0; i < m - 1; ) {
  //          c = x[i];
  //          ++i;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef PATTERN_STRING_IS_LATIN1
  //          bc[c] = m - i;
  //          #else
  //          if (c < ASIZE) bc[c] = m - i;
  //          #endif
  //       }
  //
  //       /* Searching */
  //       j = 0;
  //       while (j <= n - m) {
  //          c = y[j+m-1];
  //          if (x[m-1] == c)
  //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
  //          if (i < 0) return j;
  //          // c < 256 for Latin1 string, so, no need for branch
  //          #ifdef SOURCE_STRING_IS_LATIN1
  //          // LL case: (c< 256) always true. Remove branch
  //          j += bc[y[j+m-1]];
  //          #endif
  //          #ifndef PATTERN_STRING_IS_UTF
  //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += 1
  //          #endif
  //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
  //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
  //          if (c < ASIZE)
  //            j += bc[y[j+m-1]];
  //          else
  //            j += m
  //          #endif
  //       }
  //    }

  if (icnt1 == -1) {
    Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
        BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // str1 length is >=8, so, we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
    // UL case. We'll re-read last character in inner pre-loop code to have
    // single outer pre-loop load
    const int firstStep = isL ?
7 : 3; 842 843 const int ASIZE = 256; 844 const int STORED_BYTES = 32; // amount of bytes stored per instruction 845 sub(sp, sp, ASIZE); 846 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 847 mov(ch1, sp); 848 BIND(BM_INIT_LOOP); 849 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 850 subs(tmp5, tmp5, 1); 851 br(GT, BM_INIT_LOOP); 852 853 sub(cnt1tmp, cnt1, 1); 854 mov(tmp5, str2); 855 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 856 sub(ch2, cnt1, 1); 857 mov(tmp3, str1); 858 BIND(BCLOOP); 859 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 860 if (!str1_isL) { 861 subs(zr, ch1, ASIZE); 862 br(HS, BCSKIP); 863 } 864 strb(ch2, Address(sp, ch1)); 865 BIND(BCSKIP); 866 subs(ch2, ch2, 1); 867 br(GT, BCLOOP); 868 869 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 870 if (str1_isL == str2_isL) { 871 // load last 8 bytes (8LL/4UU symbols) 872 ldr(tmp6, Address(tmp6, -wordSize)); 873 } else { 874 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 875 // convert Latin1 to UTF. We'll have to wait until load completed, but 876 // it's still faster than per-character loads+checks 877 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 878 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 879 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 880 andr(tmp6, tmp6, 0xFF); // str1[N-4] 881 orr(ch2, ch1, ch2, LSL, 16); 882 orr(tmp6, tmp6, tmp3, LSL, 48); 883 orr(tmp6, tmp6, ch2, LSL, 16); 884 } 885 BIND(BMLOOPSTR2); 886 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 887 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 888 if (str1_isL == str2_isL) { 889 // re-init tmp3. It's for free because it's executed in parallel with 890 // load above. Alternative is to initialize it before loop, but it'll 891 // affect performance on in-order systems with 2 or more ld/st pipelines 892 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 893 } 894 if (!isL) { // UU/UL case 895 lsl(ch2, cnt1tmp, 1); // offset in bytes 896 } 897 cmp(tmp3, skipch); 898 br(NE, BMSKIP); 899 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 900 mov(ch1, tmp6); 901 if (isL) { 902 b(BMLOOPSTR1_AFTER_LOAD); 903 } else { 904 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. 
cnt1 >= 8 905 b(BMLOOPSTR1_CMP); 906 } 907 BIND(BMLOOPSTR1); 908 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 909 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 910 BIND(BMLOOPSTR1_AFTER_LOAD); 911 subs(cnt1tmp, cnt1tmp, 1); 912 br(LT, BMLOOPSTR1_LASTCMP); 913 BIND(BMLOOPSTR1_CMP); 914 cmp(ch1, ch2); 915 br(EQ, BMLOOPSTR1); 916 BIND(BMSKIP); 917 if (!isL) { 918 // if we've met UTF symbol while searching Latin1 pattern, then we can 919 // skip cnt1 symbols 920 if (str1_isL != str2_isL) { 921 mov(result_tmp, cnt1); 922 } else { 923 mov(result_tmp, 1); 924 } 925 subs(zr, skipch, ASIZE); 926 br(HS, BMADV); 927 } 928 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 929 BIND(BMADV); 930 sub(cnt1tmp, cnt1, 1); 931 add(str2, str2, result_tmp, LSL, str2_chr_shift); 932 cmp(str2, str2end); 933 br(LE, BMLOOPSTR2); 934 add(sp, sp, ASIZE); 935 b(NOMATCH); 936 BIND(BMLOOPSTR1_LASTCMP); 937 cmp(ch1, ch2); 938 br(NE, BMSKIP); 939 BIND(BMMATCH); 940 sub(result, str2, tmp5); 941 if (!str2_isL) lsr(result, result, 1); 942 add(sp, sp, ASIZE); 943 b(DONE); 944 945 BIND(LINEARSTUB); 946 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 947 br(LT, LINEAR_MEDIUM); 948 mov(result, zr); 949 RuntimeAddress stub = nullptr; 950 if (isL) { 951 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 952 assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated"); 953 } else if (str1_isL) { 954 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 955 assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated"); 956 } else { 957 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 958 assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated"); 959 } 960 address call = trampoline_call(stub); 961 if (call == nullptr) { 962 DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH)); 963 ciEnv::current()->record_failure("CodeCache is full"); 964 return; 965 } 966 b(DONE); 967 } 968 969 BIND(LINEARSEARCH); 970 { 971 Label DO1, DO2, DO3; 972 973 Register str2tmp = tmp2; 974 Register first = tmp3; 975 976 if (icnt1 == -1) 977 { 978 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 979 980 cmp(cnt1, u1(str1_isL == str2_isL ? 
4 : 2)); 981 br(LT, DOSHORT); 982 BIND(LINEAR_MEDIUM); 983 (this->*str1_load_1chr)(first, Address(str1)); 984 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 985 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 986 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 987 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 988 989 BIND(FIRST_LOOP); 990 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 991 cmp(first, ch2); 992 br(EQ, STR1_LOOP); 993 BIND(STR2_NEXT); 994 adds(cnt2_neg, cnt2_neg, str2_chr_size); 995 br(LE, FIRST_LOOP); 996 b(NOMATCH); 997 998 BIND(STR1_LOOP); 999 adds(cnt1tmp, cnt1_neg, str1_chr_size); 1000 add(cnt2tmp, cnt2_neg, str2_chr_size); 1001 br(GE, MATCH); 1002 1003 BIND(STR1_NEXT); 1004 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 1005 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 1006 cmp(ch1, ch2); 1007 br(NE, STR2_NEXT); 1008 adds(cnt1tmp, cnt1tmp, str1_chr_size); 1009 add(cnt2tmp, cnt2tmp, str2_chr_size); 1010 br(LT, STR1_NEXT); 1011 b(MATCH); 1012 1013 BIND(DOSHORT); 1014 if (str1_isL == str2_isL) { 1015 cmp(cnt1, (u1)2); 1016 br(LT, DO1); 1017 br(GT, DO3); 1018 } 1019 } 1020 1021 if (icnt1 == 4) { 1022 Label CH1_LOOP; 1023 1024 (this->*load_4chr)(ch1, str1); 1025 sub(result_tmp, cnt2, 4); 1026 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1027 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1028 1029 BIND(CH1_LOOP); 1030 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 1031 cmp(ch1, ch2); 1032 br(EQ, MATCH); 1033 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1034 br(LE, CH1_LOOP); 1035 b(NOMATCH); 1036 } 1037 1038 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 1039 Label CH1_LOOP; 1040 1041 BIND(DO2); 1042 (this->*load_2chr)(ch1, str1); 1043 if (icnt1 == 2) { 1044 sub(result_tmp, cnt2, 2); 1045 } 1046 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1047 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1048 BIND(CH1_LOOP); 1049 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1050 cmp(ch1, ch2); 1051 br(EQ, MATCH); 1052 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1053 br(LE, CH1_LOOP); 1054 b(NOMATCH); 1055 } 1056 1057 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 1058 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 1059 1060 BIND(DO3); 1061 (this->*load_2chr)(first, str1); 1062 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 1063 if (icnt1 == 3) { 1064 sub(result_tmp, cnt2, 3); 1065 } 1066 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1067 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1068 BIND(FIRST_LOOP); 1069 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 1070 cmpw(first, ch2); 1071 br(EQ, STR1_LOOP); 1072 BIND(STR2_NEXT); 1073 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1074 br(LE, FIRST_LOOP); 1075 b(NOMATCH); 1076 1077 BIND(STR1_LOOP); 1078 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 1079 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 1080 cmp(ch1, ch2); 1081 br(NE, STR2_NEXT); 1082 b(MATCH); 1083 } 1084 1085 if (icnt1 == -1 || icnt1 == 1) { 1086 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP; 1087 1088 BIND(DO1); 1089 (this->*str1_load_1chr)(ch1, str1); 1090 cmp(cnt2, (u1)8); 1091 br(LT, DO1_SHORT); 1092 1093 sub(result_tmp, cnt2, 8/str2_chr_size); 1094 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 1095 mov(tmp3, str2_isL ? 
0x0101010101010101 : 0x0001000100010001); 1096 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 1097 1098 if (str2_isL) { 1099 orr(ch1, ch1, ch1, LSL, 8); 1100 } 1101 orr(ch1, ch1, ch1, LSL, 16); 1102 orr(ch1, ch1, ch1, LSL, 32); 1103 BIND(CH1_LOOP); 1104 ldr(ch2, Address(str2, cnt2_neg)); 1105 eor(ch2, ch1, ch2); 1106 sub(tmp1, ch2, tmp3); 1107 orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 1108 bics(tmp1, tmp1, tmp2); 1109 br(NE, HAS_ZERO); 1110 adds(cnt2_neg, cnt2_neg, 8); 1111 br(LT, CH1_LOOP); 1112 1113 cmp(cnt2_neg, (u1)8); 1114 mov(cnt2_neg, 0); 1115 br(LT, CH1_LOOP); 1116 b(NOMATCH); 1117 1118 BIND(HAS_ZERO); 1119 rev(tmp1, tmp1); 1120 clz(tmp1, tmp1); 1121 add(cnt2_neg, cnt2_neg, tmp1, LSR, 3); 1122 b(MATCH); 1123 1124 BIND(DO1_SHORT); 1125 mov(result_tmp, cnt2); 1126 lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift))); 1127 sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift); 1128 BIND(DO1_LOOP); 1129 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 1130 cmpw(ch1, ch2); 1131 br(EQ, MATCH); 1132 adds(cnt2_neg, cnt2_neg, str2_chr_size); 1133 br(LT, DO1_LOOP); 1134 } 1135 } 1136 BIND(NOMATCH); 1137 mov(result, -1); 1138 b(DONE); 1139 BIND(MATCH); 1140 add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift); 1141 BIND(DONE); 1142 } 1143 1144 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); 1145 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn); 1146 1147 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, 1148 Register ch, Register result, 1149 Register tmp1, Register tmp2, Register tmp3) 1150 { 1151 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1152 Register cnt1_neg = cnt1; 1153 Register ch1 = rscratch1; 1154 Register result_tmp = rscratch2; 1155 1156 cbz(cnt1, NOMATCH); 1157 1158 cmp(cnt1, (u1)4); 1159 br(LT, DO1_SHORT); 1160 1161 orr(ch, ch, ch, LSL, 16); 1162 orr(ch, ch, ch, LSL, 32); 1163 1164 sub(cnt1, cnt1, 4); 1165 mov(result_tmp, cnt1); 1166 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1167 sub(cnt1_neg, zr, cnt1, LSL, 1); 1168 1169 mov(tmp3, 0x0001000100010001); 1170 1171 BIND(CH1_LOOP); 1172 ldr(ch1, Address(str1, cnt1_neg)); 1173 eor(ch1, ch, ch1); 1174 sub(tmp1, ch1, tmp3); 1175 orr(tmp2, ch1, 0x7fff7fff7fff7fff); 1176 bics(tmp1, tmp1, tmp2); 1177 br(NE, HAS_ZERO); 1178 adds(cnt1_neg, cnt1_neg, 8); 1179 br(LT, CH1_LOOP); 1180 1181 cmp(cnt1_neg, (u1)8); 1182 mov(cnt1_neg, 0); 1183 br(LT, CH1_LOOP); 1184 b(NOMATCH); 1185 1186 BIND(HAS_ZERO); 1187 rev(tmp1, tmp1); 1188 clz(tmp1, tmp1); 1189 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1190 b(MATCH); 1191 1192 BIND(DO1_SHORT); 1193 mov(result_tmp, cnt1); 1194 lea(str1, Address(str1, cnt1, Address::uxtw(1))); 1195 sub(cnt1_neg, zr, cnt1, LSL, 1); 1196 BIND(DO1_LOOP); 1197 ldrh(ch1, Address(str1, cnt1_neg)); 1198 cmpw(ch, ch1); 1199 br(EQ, MATCH); 1200 adds(cnt1_neg, cnt1_neg, 2); 1201 br(LT, DO1_LOOP); 1202 BIND(NOMATCH); 1203 mov(result, -1); 1204 b(DONE); 1205 BIND(MATCH); 1206 add(result, result_tmp, cnt1_neg, ASR, 1); 1207 BIND(DONE); 1208 } 1209 1210 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1, 1211 Register ch, Register result, 1212 FloatRegister ztmp1, 1213 FloatRegister ztmp2, 1214 PRegister tmp_pg, 1215 PRegister tmp_pdn, bool isL) 1216 { 1217 // Note that `tmp_pdn` should *NOT* be used as governing predicate register. 
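  //
  // In outline, the SVE loop below does the following (an informal sketch;
  // predicate handling is simplified and the lane count depends on the
  // hardware vector length):
  //
  //   broadcast ch into every lane of ztmp2;
  //   pg = whilelt(0, cnt1);
  //   for (idx = 0; pg has active lanes; pg = whilelt(idx, cnt1)) {
  //     ztmp1 = load of str1[idx ...] governed by pg;
  //     idx += lanes_per_vector;
  //     pdn = compare-equal(pg, ztmp1, ztmp2);       // also sets condition flags
  //     if (any lane matched) break;
  //   }
  //   result = (no match) ? -1
  //          : (idx - lanes_per_vector) - 1
  //            + number of lanes up to and including the first match (BRKA + INCP);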
1218 assert(tmp_pg->is_governing(), 1219 "this register has to be a governing predicate register"); 1220 1221 Label LOOP, MATCH, DONE, NOMATCH; 1222 Register vec_len = rscratch1; 1223 Register idx = rscratch2; 1224 1225 SIMD_RegVariant T = (isL == true) ? B : H; 1226 1227 cbz(cnt1, NOMATCH); 1228 1229 // Assign the particular char throughout the vector. 1230 sve_dup(ztmp2, T, ch); 1231 if (isL) { 1232 sve_cntb(vec_len); 1233 } else { 1234 sve_cnth(vec_len); 1235 } 1236 mov(idx, 0); 1237 1238 // Generate a predicate to control the reading of input string. 1239 sve_whilelt(tmp_pg, T, idx, cnt1); 1240 1241 BIND(LOOP); 1242 // Read a vector of 8- or 16-bit data depending on the string type. Note 1243 // that inactive elements indicated by the predicate register won't cause 1244 // a data read from memory to the destination vector. 1245 if (isL) { 1246 sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx)); 1247 } else { 1248 sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1))); 1249 } 1250 add(idx, idx, vec_len); 1251 1252 // Perform the comparison. An element of the destination predicate is set 1253 // to active if the particular char is matched. 1254 sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2); 1255 1256 // Branch if the particular char is found. 1257 br(NE, MATCH); 1258 1259 sve_whilelt(tmp_pg, T, idx, cnt1); 1260 1261 // Loop back if the particular char not found. 1262 br(MI, LOOP); 1263 1264 BIND(NOMATCH); 1265 mov(result, -1); 1266 b(DONE); 1267 1268 BIND(MATCH); 1269 // Undo the index increment. 1270 sub(idx, idx, vec_len); 1271 1272 // Crop the vector to find its location. 1273 sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */); 1274 add(result, idx, -1); 1275 sve_incp(result, T, tmp_pdn); 1276 BIND(DONE); 1277 } 1278 1279 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, 1280 Register ch, Register result, 1281 Register tmp1, Register tmp2, Register tmp3) 1282 { 1283 Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE; 1284 Register cnt1_neg = cnt1; 1285 Register ch1 = rscratch1; 1286 Register result_tmp = rscratch2; 1287 1288 cbz(cnt1, NOMATCH); 1289 1290 cmp(cnt1, (u1)8); 1291 br(LT, DO1_SHORT); 1292 1293 orr(ch, ch, ch, LSL, 8); 1294 orr(ch, ch, ch, LSL, 16); 1295 orr(ch, ch, ch, LSL, 32); 1296 1297 sub(cnt1, cnt1, 8); 1298 mov(result_tmp, cnt1); 1299 lea(str1, Address(str1, cnt1)); 1300 sub(cnt1_neg, zr, cnt1); 1301 1302 mov(tmp3, 0x0101010101010101); 1303 1304 BIND(CH1_LOOP); 1305 ldr(ch1, Address(str1, cnt1_neg)); 1306 eor(ch1, ch, ch1); 1307 sub(tmp1, ch1, tmp3); 1308 orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f); 1309 bics(tmp1, tmp1, tmp2); 1310 br(NE, HAS_ZERO); 1311 adds(cnt1_neg, cnt1_neg, 8); 1312 br(LT, CH1_LOOP); 1313 1314 cmp(cnt1_neg, (u1)8); 1315 mov(cnt1_neg, 0); 1316 br(LT, CH1_LOOP); 1317 b(NOMATCH); 1318 1319 BIND(HAS_ZERO); 1320 rev(tmp1, tmp1); 1321 clz(tmp1, tmp1); 1322 add(cnt1_neg, cnt1_neg, tmp1, LSR, 3); 1323 b(MATCH); 1324 1325 BIND(DO1_SHORT); 1326 mov(result_tmp, cnt1); 1327 lea(str1, Address(str1, cnt1)); 1328 sub(cnt1_neg, zr, cnt1); 1329 BIND(DO1_LOOP); 1330 ldrb(ch1, Address(str1, cnt1_neg)); 1331 cmp(ch, ch1); 1332 br(EQ, MATCH); 1333 adds(cnt1_neg, cnt1_neg, 1); 1334 br(LT, DO1_LOOP); 1335 BIND(NOMATCH); 1336 mov(result, -1); 1337 b(DONE); 1338 BIND(MATCH); 1339 add(result, result_tmp, cnt1_neg); 1340 BIND(DONE); 1341 } 1342 1343 // Compare strings. 
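// The generated code is semantically equivalent to the following scalar
// reference (an illustrative sketch only; the real code compares a word or a
// vector at a time and calls out to stubs for long strings):
//
//   int string_compare(jchar* s1, int len1, jchar* s2, int len2) {
//     int n = MIN2(len1, len2);
//     for (int i = 0; i < n; i++) {
//       if (s1[i] != s2[i]) return (int)s1[i] - (int)s2[i];
//     }
//     return len1 - len2;
//   }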
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
                                       FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                       PRegister pgtmp1, PRegister pgtmp2, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  // The stub threshold for LL strings is: 72 (64 + 8) chars
  // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
  // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
  const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
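  // This is done branchlessly: subsw leaves cnt1 - cnt2 in result (the value
  // returned when one string is a prefix of the other) and sets the flags,
  // then cselw uses those flags to put min(cnt1, cnt2) into cnt2.
  // For example, cnt1 = 5, cnt2 = 3 gives result = 2 and leaves cnt2 = 3, so
  // the loops below compare only the first 3 characters.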
1386 subsw(result, cnt1, cnt2); 1387 cselw(cnt2, cnt1, cnt2, Assembler::LE); // min 1388 1389 // A very short string 1390 cmpw(cnt2, minCharsInWord); 1391 br(Assembler::LE, SHORT_STRING); 1392 1393 // Compare longwords 1394 // load first parts of strings and finish initialization while loading 1395 { 1396 if (str1_isL == str2_isL) { // LL or UU 1397 ldr(tmp1, Address(str1)); 1398 cmp(str1, str2); 1399 br(Assembler::EQ, DONE); 1400 ldr(tmp2, Address(str2)); 1401 cmp(cnt2, stub_threshold); 1402 br(GE, STUB); 1403 subsw(cnt2, cnt2, minCharsInWord); 1404 br(EQ, TAIL_CHECK); 1405 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1406 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1407 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1408 } else if (isLU) { 1409 ldrs(vtmp, Address(str1)); 1410 ldr(tmp2, Address(str2)); 1411 cmp(cnt2, stub_threshold); 1412 br(GE, STUB); 1413 subw(cnt2, cnt2, 4); 1414 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1415 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1416 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1417 zip1(vtmp, T8B, vtmp, vtmpZ); 1418 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1419 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1420 add(cnt1, cnt1, 4); 1421 fmovd(tmp1, vtmp); 1422 } else { // UL case 1423 ldr(tmp1, Address(str1)); 1424 ldrs(vtmp, Address(str2)); 1425 cmp(cnt2, stub_threshold); 1426 br(GE, STUB); 1427 subw(cnt2, cnt2, 4); 1428 lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift))); 1429 eor(vtmpZ, T16B, vtmpZ, vtmpZ); 1430 lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift))); 1431 sub(cnt1, zr, cnt2, LSL, str1_chr_shift); 1432 zip1(vtmp, T8B, vtmp, vtmpZ); 1433 sub(cnt2, zr, cnt2, LSL, str2_chr_shift); 1434 add(cnt1, cnt1, 8); 1435 fmovd(tmp2, vtmp); 1436 } 1437 adds(cnt2, cnt2, isUL ? 4 : 8); 1438 br(GE, TAIL); 1439 eor(rscratch2, tmp1, tmp2); 1440 cbnz(rscratch2, DIFF); 1441 // main loop 1442 bind(NEXT_WORD); 1443 if (str1_isL == str2_isL) { 1444 ldr(tmp1, Address(str1, cnt2)); 1445 ldr(tmp2, Address(str2, cnt2)); 1446 adds(cnt2, cnt2, 8); 1447 } else if (isLU) { 1448 ldrs(vtmp, Address(str1, cnt1)); 1449 ldr(tmp2, Address(str2, cnt2)); 1450 add(cnt1, cnt1, 4); 1451 zip1(vtmp, T8B, vtmp, vtmpZ); 1452 fmovd(tmp1, vtmp); 1453 adds(cnt2, cnt2, 8); 1454 } else { // UL 1455 ldrs(vtmp, Address(str2, cnt2)); 1456 ldr(tmp1, Address(str1, cnt1)); 1457 zip1(vtmp, T8B, vtmp, vtmpZ); 1458 add(cnt1, cnt1, 8); 1459 fmovd(tmp2, vtmp); 1460 adds(cnt2, cnt2, 4); 1461 } 1462 br(GE, TAIL); 1463 1464 eor(rscratch2, tmp1, tmp2); 1465 cbz(rscratch2, NEXT_WORD); 1466 b(DIFF); 1467 bind(TAIL); 1468 eor(rscratch2, tmp1, tmp2); 1469 cbnz(rscratch2, DIFF); 1470 // Last longword. In the case where length == 4 we compare the 1471 // same longword twice, but that's still faster than another 1472 // conditional branch. 1473 if (str1_isL == str2_isL) { 1474 ldr(tmp1, Address(str1)); 1475 ldr(tmp2, Address(str2)); 1476 } else if (isLU) { 1477 ldrs(vtmp, Address(str1)); 1478 ldr(tmp2, Address(str2)); 1479 zip1(vtmp, T8B, vtmp, vtmpZ); 1480 fmovd(tmp1, vtmp); 1481 } else { // UL 1482 ldrs(vtmp, Address(str2)); 1483 ldr(tmp1, Address(str1)); 1484 zip1(vtmp, T8B, vtmp, vtmpZ); 1485 fmovd(tmp2, vtmp); 1486 } 1487 bind(TAIL_CHECK); 1488 eor(rscratch2, tmp1, tmp2); 1489 cbz(rscratch2, DONE); 1490 1491 // Find the first different characters in the longwords and 1492 // compute their difference. 1493 bind(DIFF); 1494 rev(rscratch2, rscratch2); 1495 clz(rscratch2, rscratch2); 1496 andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 1497 lsrv(tmp1, tmp1, rscratch2); 1498 (this->*ext_chr)(tmp1, tmp1); 1499 lsrv(tmp2, tmp2, rscratch2); 1500 (this->*ext_chr)(tmp2, tmp2); 1501 subw(result, tmp1, tmp2); 1502 b(DONE); 1503 } 1504 1505 bind(STUB); 1506 RuntimeAddress stub = nullptr; 1507 switch(ae) { 1508 case StrIntrinsicNode::LL: 1509 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL()); 1510 break; 1511 case StrIntrinsicNode::UU: 1512 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU()); 1513 break; 1514 case StrIntrinsicNode::LU: 1515 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU()); 1516 break; 1517 case StrIntrinsicNode::UL: 1518 stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL()); 1519 break; 1520 default: 1521 ShouldNotReachHere(); 1522 } 1523 assert(stub.target() != nullptr, "compare_long_string stub has not been generated"); 1524 address call = trampoline_call(stub); 1525 if (call == nullptr) { 1526 DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START)); 1527 ciEnv::current()->record_failure("CodeCache is full"); 1528 return; 1529 } 1530 b(DONE); 1531 1532 bind(SHORT_STRING); 1533 // Is the minimum length zero? 1534 cbz(cnt2, DONE); 1535 // arrange code to do most branches while loading and loading next characters 1536 // while comparing previous 1537 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1538 subs(cnt2, cnt2, 1); 1539 br(EQ, SHORT_LAST_INIT); 1540 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1541 b(SHORT_LOOP_START); 1542 bind(SHORT_LOOP); 1543 subs(cnt2, cnt2, 1); 1544 br(EQ, SHORT_LAST); 1545 bind(SHORT_LOOP_START); 1546 (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size))); 1547 (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size))); 1548 cmp(tmp1, cnt1); 1549 br(NE, SHORT_LOOP_TAIL); 1550 subs(cnt2, cnt2, 1); 1551 br(EQ, SHORT_LAST2); 1552 (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size))); 1553 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1554 cmp(tmp2, rscratch1); 1555 br(EQ, SHORT_LOOP); 1556 sub(result, tmp2, rscratch1); 1557 b(DONE); 1558 bind(SHORT_LOOP_TAIL); 1559 sub(result, tmp1, cnt1); 1560 b(DONE); 1561 bind(SHORT_LAST2); 1562 cmp(tmp2, rscratch1); 1563 br(EQ, DONE); 1564 sub(result, tmp2, rscratch1); 1565 1566 b(DONE); 1567 bind(SHORT_LAST_INIT); 1568 (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size))); 1569 bind(SHORT_LAST); 1570 cmp(tmp1, cnt1); 1571 br(EQ, DONE); 1572 sub(result, tmp1, cnt1); 1573 1574 bind(DONE); 1575 1576 BLOCK_COMMENT("} string_compare"); 1577 } 1578 1579 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1, 1580 FloatRegister src2, Condition cond, bool isQ) { 1581 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1582 FloatRegister zn = src1, zm = src2; 1583 bool needs_negation = false; 1584 switch (cond) { 1585 case LT: cond = GT; zn = src2; zm = src1; break; 1586 case LE: cond = GE; zn = src2; zm = src1; break; 1587 case LO: cond = HI; zn = src2; zm = src1; break; 1588 case LS: cond = HS; zn = src2; zm = src1; break; 1589 case NE: cond = EQ; needs_negation = true; break; 1590 default: 1591 break; 1592 } 1593 1594 if (is_floating_point_type(bt)) { 1595 fcm(cond, dst, size, zn, zm); 1596 } else { 1597 cm(cond, dst, size, zn, zm); 1598 } 1599 1600 if (needs_negation) { 1601 notr(dst, isQ ? 
T16B : T8B, dst); 1602 } 1603 } 1604 1605 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src, 1606 Condition cond, bool isQ) { 1607 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 1608 if (bt == T_FLOAT || bt == T_DOUBLE) { 1609 if (cond == Assembler::NE) { 1610 fcm(Assembler::EQ, dst, size, src); 1611 notr(dst, isQ ? T16B : T8B, dst); 1612 } else { 1613 fcm(cond, dst, size, src); 1614 } 1615 } else { 1616 if (cond == Assembler::NE) { 1617 cm(Assembler::EQ, dst, size, src); 1618 notr(dst, isQ ? T16B : T8B, dst); 1619 } else { 1620 cm(cond, dst, size, src); 1621 } 1622 } 1623 } 1624 1625 // Compress the least significant bit of each byte to the rightmost and clear 1626 // the higher garbage bits. 1627 void C2_MacroAssembler::bytemask_compress(Register dst) { 1628 // Example input, dst = 0x01 00 00 00 01 01 00 01 1629 // The "??" bytes are garbage. 1630 orr(dst, dst, dst, Assembler::LSR, 7); // dst = 0x?? 02 ?? 00 ?? 03 ?? 01 1631 orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D 1632 orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D 1633 andr(dst, dst, 0xff); // dst = 0x8D 1634 } 1635 1636 // Pack the lowest-numbered bit of each mask element in src into a long value 1637 // in dst, at most the first 64 lane elements. 1638 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM. 1639 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt, 1640 FloatRegister vtmp1, FloatRegister vtmp2) { 1641 assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count"); 1642 assert_different_registers(dst, rscratch1); 1643 assert_different_registers(vtmp1, vtmp2); 1644 1645 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 1646 // Example: src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16 1647 // Expected: dst = 0x658D 1648 1649 // Convert the mask into vector with sequential bytes. 1650 // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001 1651 sve_cpy(vtmp1, size, src, 1, false); 1652 if (bt != T_BYTE) { 1653 sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2); 1654 } 1655 1656 if (UseSVE > 1 && VM_Version::supports_svebitperm()) { 1657 // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea 1658 // is to compress each significant bit of the byte in a cross-lane way. Due 1659 // to the lack of a cross-lane bit-compress instruction, we use BEXT 1660 // (bit-compress in each lane) with the biggest lane size (T = D) then 1661 // concatenate the results. 1662 1663 // The second source input of BEXT, initialized with 0x01 in each byte. 1664 // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101 1665 sve_dup(vtmp2, B, 1); 1666 1667 // BEXT vtmp1.D, vtmp1.D, vtmp2.D 1668 // vtmp1 = 0x0001010000010001 | 0x0100000001010001 1669 // vtmp2 = 0x0101010101010101 | 0x0101010101010101 1670 // --------------------------------------- 1671 // vtmp1 = 0x0000000000000065 | 0x000000000000008D 1672 sve_bext(vtmp1, D, vtmp1, vtmp2); 1673 1674 // Concatenate the lowest significant 8 bits in each 8 bytes, and extract the 1675 // result to dst. 1676 // vtmp1 = 0x0000000000000000 | 0x000000000000658D 1677 // dst = 0x658D 1678 if (lane_cnt <= 8) { 1679 // No need to concatenate. 
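      // lane_cnt <= 8: the result is the single byte produced by BEXT in the
      // lowest lane, so it can be moved straight into dst.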
1680       umov(dst, vtmp1, B, 0);
1681     } else if (lane_cnt <= 16) {
1682       ins(vtmp1, B, vtmp1, 1, 8);
1683       umov(dst, vtmp1, H, 0);
1684     } else {
1685       // As the lane count is 64 at most, the final expected value must be in
1686       // the lowest 64 bits after narrowing vtmp1 from D to B.
1687       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1688       umov(dst, vtmp1, D, 0);
1689     }
1690   } else if (UseSVE > 0) {
1691     // Compress the lowest 8 bytes.
1692     fmovd(dst, vtmp1);
1693     bytemask_compress(dst);
1694     if (lane_cnt <= 8) return;
1695 
1696     // Repeat on higher bytes and join the results.
1697     // Compress 8 bytes in each iteration.
1698     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1699       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1700       bytemask_compress(rscratch1);
1701       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1702     }
1703   } else {
1704     assert(false, "unsupported");
1705     ShouldNotReachHere();
1706   }
1707 }
1708 
1709 // Unpack the mask, a long value in src, into predicate register dst based on the
1710 // corresponding data type. Note that dst can support at most 64 lanes.
1711 // The example below gives the expected dst predicate register for different types, with
1712 // a valid src (0x658D) on a machine with a 1024-bit vector size.
1713 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1714 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1715 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1716 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1717 //
1718 // The number of significant bits of src must be equal to lane_cnt. E.g., 0xFF658D, which
1719 // has 24 significant bits, would be an invalid input if the dst predicate register refers to
1720 // a LONG type 1024-bit vector, which has at most 16 lanes.
1721 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1722                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1723   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1724          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1725   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1726   // Example:  src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1727   // Expected: dst = 0b01100101 10001101
1728 
1729   // Put the long value from the general purpose register into the first lane of the vector.
1730   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1731   sve_dup(vtmp1, B, 0);
1732   mov(vtmp1, D, 0, src);
1733 
1734   // As sve_cmp generates the mask with a minimum granularity of one byte, the value in the
1735   // first lane, which is currently a mask in bits, must be transformed into a mask in bytes.
1736   // This can be done with SVE2's BDEP instruction.
1737 
1738   // The first source input of the BDEP instruction. Deposit each byte in every 8 bytes.
1739   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1740   if (lane_cnt <= 8) {
1741     // Nothing to do, as only one byte exists.
1742   } else if (lane_cnt <= 16) {
1743     ins(vtmp1, B, vtmp1, 8, 1);
1744     mov(vtmp1, B, 1, zr);
1745   } else {
1746     sve_vector_extend(vtmp1, D, vtmp1, B);
1747   }
1748 
1749   // The second source input of the BDEP instruction, initialized with 0x01 for each byte.
1750   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1751   sve_dup(vtmp2, B, 1);
1752 
1753   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1754   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1755   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1756   // ---------------------------------------
1757   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1758   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1759 
1760   if (bt != T_BYTE) {
1761     sve_vector_extend(vtmp1, size, vtmp1, B);
1762   }
1763   // Generate the mask according to the given vector, in which the elements have been
1764   // extended to the expected type.
1765   // dst = 0b01100101 10001101
1766   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1767 }
1768 
1769 // Clobbers: rflags
1770 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1771                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1772   assert(pg->is_governing(), "This register has to be a governing predicate register");
1773   FloatRegister z1 = zn, z2 = zm;
1774   switch (cond) {
1775     case LE: z1 = zm; z2 = zn; cond = GE; break;
1776     case LT: z1 = zm; z2 = zn; cond = GT; break;
1777     case LO: z1 = zm; z2 = zn; cond = HI; break;
1778     case LS: z1 = zm; z2 = zn; cond = HS; break;
1779     default:
1780       break;
1781   }
1782 
1783   SIMD_RegVariant size = elemType_to_regVariant(bt);
1784   if (is_floating_point_type(bt)) {
1785     sve_fcm(cond, pd, size, pg, z1, z2);
1786   } else {
1787     assert(is_integral_type(bt), "unsupported element type");
1788     sve_cmp(cond, pd, size, pg, z1, z2);
1789   }
1790 }
1791 
1792 // Get the index of the last mask lane that is set
1793 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1794   SIMD_RegVariant size = elemType_to_regVariant(bt);
1795   sve_rev(ptmp, size, src);
1796   sve_brkb(ptmp, ptrue, ptmp, false);
1797   sve_cntp(dst, size, ptrue, ptmp);
1798   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1799   subw(dst, rscratch1, dst);
1800 }
1801 
1802 // Extend integer vector src to dst with the same lane count
1803 // but larger element size, e.g. 4B -> 4I
1804 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1805                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1806   if (src_bt == T_BYTE) {
1807     if (dst_bt == T_SHORT) {
1808       // 4B/8B to 4S/8S
1809       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1810     } else {
1811       // 4B to 4I
1812       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1813       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1814       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1815     }
1816   } else if (src_bt == T_SHORT) {
1817     // 4S to 4I
1818     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1819     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1820   } else if (src_bt == T_INT) {
1821     // 2I to 2L
1822     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1823     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1824   } else {
1825     ShouldNotReachHere();
1826   }
1827 }
1828 
1829 // Narrow integer vector src down to dst with the same lane count
1830 // but smaller element size, e.g.
4I -> 4B 1831 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt, 1832 FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) { 1833 if (src_bt == T_SHORT) { 1834 // 4S/8S to 4B/8B 1835 assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported"); 1836 assert(dst_bt == T_BYTE, "unsupported"); 1837 xtn(dst, T8B, src, T8H); 1838 } else if (src_bt == T_INT) { 1839 // 4I to 4B/4S 1840 assert(src_vlen_in_bytes == 16, "unsupported"); 1841 assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported"); 1842 xtn(dst, T4H, src, T4S); 1843 if (dst_bt == T_BYTE) { 1844 xtn(dst, T8B, dst, T8H); 1845 } 1846 } else if (src_bt == T_LONG) { 1847 // 2L to 2I 1848 assert(src_vlen_in_bytes == 16, "unsupported"); 1849 assert(dst_bt == T_INT, "unsupported"); 1850 xtn(dst, T2S, src, T2D); 1851 } else { 1852 ShouldNotReachHere(); 1853 } 1854 } 1855 1856 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size, 1857 FloatRegister src, SIMD_RegVariant src_size, 1858 bool is_unsigned) { 1859 assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size"); 1860 1861 if (src_size == B) { 1862 switch (dst_size) { 1863 case H: 1864 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1865 break; 1866 case S: 1867 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1868 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1869 break; 1870 case D: 1871 _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src); 1872 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst); 1873 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1874 break; 1875 default: 1876 ShouldNotReachHere(); 1877 } 1878 } else if (src_size == H) { 1879 if (dst_size == S) { 1880 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1881 } else { // D 1882 _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src); 1883 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst); 1884 } 1885 } else if (src_size == S) { 1886 _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src); 1887 } 1888 } 1889 1890 // Vector narrow from src to dst with specified element sizes. 1891 // High part of dst vector will be filled with zero. 1892 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size, 1893 FloatRegister src, SIMD_RegVariant src_size, 1894 FloatRegister tmp) { 1895 assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size"); 1896 assert_different_registers(src, tmp); 1897 sve_dup(tmp, src_size, 0); 1898 if (src_size == D) { 1899 switch (dst_size) { 1900 case S: 1901 sve_uzp1(dst, S, src, tmp); 1902 break; 1903 case H: 1904 assert_different_registers(dst, tmp); 1905 sve_uzp1(dst, S, src, tmp); 1906 sve_uzp1(dst, H, dst, tmp); 1907 break; 1908 case B: 1909 assert_different_registers(dst, tmp); 1910 sve_uzp1(dst, S, src, tmp); 1911 sve_uzp1(dst, H, dst, tmp); 1912 sve_uzp1(dst, B, dst, tmp); 1913 break; 1914 default: 1915 ShouldNotReachHere(); 1916 } 1917 } else if (src_size == S) { 1918 if (dst_size == H) { 1919 sve_uzp1(dst, H, src, tmp); 1920 } else { // B 1921 assert_different_registers(dst, tmp); 1922 sve_uzp1(dst, H, src, tmp); 1923 sve_uzp1(dst, B, dst, tmp); 1924 } 1925 } else if (src_size == H) { 1926 sve_uzp1(dst, B, src, tmp); 1927 } 1928 } 1929 1930 // Extend src predicate to dst predicate with the same lane count but larger 1931 // element size, e.g. 
64Byte -> 512Long 1932 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src, 1933 uint dst_element_length_in_bytes, 1934 uint src_element_length_in_bytes) { 1935 if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) { 1936 sve_punpklo(dst, src); 1937 } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) { 1938 sve_punpklo(dst, src); 1939 sve_punpklo(dst, dst); 1940 } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) { 1941 sve_punpklo(dst, src); 1942 sve_punpklo(dst, dst); 1943 sve_punpklo(dst, dst); 1944 } else { 1945 assert(false, "unsupported"); 1946 ShouldNotReachHere(); 1947 } 1948 } 1949 1950 // Narrow src predicate to dst predicate with the same lane count but 1951 // smaller element size, e.g. 512Long -> 64Byte 1952 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp, 1953 uint dst_element_length_in_bytes, uint src_element_length_in_bytes) { 1954 // The insignificant bits in src predicate are expected to be zero. 1955 // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is 1956 // passed as the second argument. An example narrowing operation with a given mask would be - 1957 // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I 1958 // Mask (for 2 Longs) : TF 1959 // Predicate register for the above mask (16 bits) : 00000001 00000000 1960 // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000 1961 // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0) 1962 assert_different_registers(src, ptmp); 1963 assert_different_registers(dst, ptmp); 1964 sve_pfalse(ptmp); 1965 if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) { 1966 sve_uzp1(dst, B, src, ptmp); 1967 } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) { 1968 sve_uzp1(dst, H, src, ptmp); 1969 sve_uzp1(dst, B, dst, ptmp); 1970 } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) { 1971 sve_uzp1(dst, S, src, ptmp); 1972 sve_uzp1(dst, H, dst, ptmp); 1973 sve_uzp1(dst, B, dst, ptmp); 1974 } else { 1975 assert(false, "unsupported"); 1976 ShouldNotReachHere(); 1977 } 1978 } 1979 1980 // Vector reduction add for integral type with ASIMD instructions. 1981 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt, 1982 Register isrc, FloatRegister vsrc, 1983 unsigned vector_length_in_bytes, 1984 FloatRegister vtmp) { 1985 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 1986 assert_different_registers(dst, isrc); 1987 bool isQ = vector_length_in_bytes == 16; 1988 1989 BLOCK_COMMENT("neon_reduce_add_integral {"); 1990 switch(bt) { 1991 case T_BYTE: 1992 addv(vtmp, isQ ? T16B : T8B, vsrc); 1993 smov(dst, vtmp, B, 0); 1994 addw(dst, dst, isrc, ext::sxtb); 1995 break; 1996 case T_SHORT: 1997 addv(vtmp, isQ ? T8H : T4H, vsrc); 1998 smov(dst, vtmp, H, 0); 1999 addw(dst, dst, isrc, ext::sxth); 2000 break; 2001 case T_INT: 2002 isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc); 2003 umov(dst, vtmp, S, 0); 2004 addw(dst, dst, isrc); 2005 break; 2006 case T_LONG: 2007 assert(isQ, "unsupported"); 2008 addpd(vtmp, vsrc); 2009 umov(dst, vtmp, D, 0); 2010 add(dst, dst, isrc); 2011 break; 2012 default: 2013 assert(false, "unsupported"); 2014 ShouldNotReachHere(); 2015 } 2016 BLOCK_COMMENT("} neon_reduce_add_integral"); 2017 } 2018 2019 // Vector reduction multiply for integral type with ASIMD instructions. 
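// The vector is folded in half with MULV until at most two lanes remain; those lanes are
// then moved to general-purpose registers and multiplied with isrc, re-sign-extending
// sub-int results after each scalar multiply. T_LONG has no vector multiply, so both of
// its lanes are combined on the scalar side directly.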
2020 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases. 2021 // Clobbers: rscratch1 2022 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt, 2023 Register isrc, FloatRegister vsrc, 2024 unsigned vector_length_in_bytes, 2025 FloatRegister vtmp1, FloatRegister vtmp2) { 2026 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2027 bool isQ = vector_length_in_bytes == 16; 2028 2029 BLOCK_COMMENT("neon_reduce_mul_integral {"); 2030 switch(bt) { 2031 case T_BYTE: 2032 if (isQ) { 2033 // Multiply the lower half and higher half of vector iteratively. 2034 // vtmp1 = vsrc[8:15] 2035 ins(vtmp1, D, vsrc, 0, 1); 2036 // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7] 2037 mulv(vtmp1, T8B, vtmp1, vsrc); 2038 // vtmp2 = vtmp1[4:7] 2039 ins(vtmp2, S, vtmp1, 0, 1); 2040 // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3] 2041 mulv(vtmp1, T8B, vtmp2, vtmp1); 2042 } else { 2043 ins(vtmp1, S, vsrc, 0, 1); 2044 mulv(vtmp1, T8B, vtmp1, vsrc); 2045 } 2046 // vtmp2 = vtmp1[2:3] 2047 ins(vtmp2, H, vtmp1, 0, 1); 2048 // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1] 2049 mulv(vtmp2, T8B, vtmp2, vtmp1); 2050 // dst = vtmp2[0] * isrc * vtmp2[1] 2051 umov(rscratch1, vtmp2, B, 0); 2052 mulw(dst, rscratch1, isrc); 2053 sxtb(dst, dst); 2054 umov(rscratch1, vtmp2, B, 1); 2055 mulw(dst, rscratch1, dst); 2056 sxtb(dst, dst); 2057 break; 2058 case T_SHORT: 2059 if (isQ) { 2060 ins(vtmp2, D, vsrc, 0, 1); 2061 mulv(vtmp2, T4H, vtmp2, vsrc); 2062 ins(vtmp1, S, vtmp2, 0, 1); 2063 mulv(vtmp1, T4H, vtmp1, vtmp2); 2064 } else { 2065 ins(vtmp1, S, vsrc, 0, 1); 2066 mulv(vtmp1, T4H, vtmp1, vsrc); 2067 } 2068 umov(rscratch1, vtmp1, H, 0); 2069 mulw(dst, rscratch1, isrc); 2070 sxth(dst, dst); 2071 umov(rscratch1, vtmp1, H, 1); 2072 mulw(dst, rscratch1, dst); 2073 sxth(dst, dst); 2074 break; 2075 case T_INT: 2076 if (isQ) { 2077 ins(vtmp1, D, vsrc, 0, 1); 2078 mulv(vtmp1, T2S, vtmp1, vsrc); 2079 } else { 2080 vtmp1 = vsrc; 2081 } 2082 umov(rscratch1, vtmp1, S, 0); 2083 mul(dst, rscratch1, isrc); 2084 umov(rscratch1, vtmp1, S, 1); 2085 mul(dst, rscratch1, dst); 2086 break; 2087 case T_LONG: 2088 umov(rscratch1, vsrc, D, 0); 2089 mul(dst, isrc, rscratch1); 2090 umov(rscratch1, vsrc, D, 1); 2091 mul(dst, dst, rscratch1); 2092 break; 2093 default: 2094 assert(false, "unsupported"); 2095 ShouldNotReachHere(); 2096 } 2097 BLOCK_COMMENT("} neon_reduce_mul_integral"); 2098 } 2099 2100 // Vector reduction multiply for floating-point type with ASIMD instructions. 
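// The lanes are multiplied strictly in lane order (fsrc, then lane 0, lane 1, ...), since
// reassociating the operands could change the floating-point result.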
2101 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt, 2102 FloatRegister fsrc, FloatRegister vsrc, 2103 unsigned vector_length_in_bytes, 2104 FloatRegister vtmp) { 2105 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2106 bool isQ = vector_length_in_bytes == 16; 2107 2108 BLOCK_COMMENT("neon_reduce_mul_fp {"); 2109 switch(bt) { 2110 case T_FLOAT: 2111 fmuls(dst, fsrc, vsrc); 2112 ins(vtmp, S, vsrc, 0, 1); 2113 fmuls(dst, dst, vtmp); 2114 if (isQ) { 2115 ins(vtmp, S, vsrc, 0, 2); 2116 fmuls(dst, dst, vtmp); 2117 ins(vtmp, S, vsrc, 0, 3); 2118 fmuls(dst, dst, vtmp); 2119 } 2120 break; 2121 case T_DOUBLE: 2122 assert(isQ, "unsupported"); 2123 fmuld(dst, fsrc, vsrc); 2124 ins(vtmp, D, vsrc, 0, 1); 2125 fmuld(dst, dst, vtmp); 2126 break; 2127 default: 2128 assert(false, "unsupported"); 2129 ShouldNotReachHere(); 2130 } 2131 BLOCK_COMMENT("} neon_reduce_mul_fp"); 2132 } 2133 2134 // Helper to select logical instruction 2135 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd, 2136 Register Rn, Register Rm, 2137 enum shift_kind kind, unsigned shift) { 2138 switch(opc) { 2139 case Op_AndReductionV: 2140 is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift); 2141 break; 2142 case Op_OrReductionV: 2143 is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift); 2144 break; 2145 case Op_XorReductionV: 2146 is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift); 2147 break; 2148 default: 2149 assert(false, "unsupported"); 2150 ShouldNotReachHere(); 2151 } 2152 } 2153 2154 // Vector reduction logical operations And, Or, Xor 2155 // Clobbers: rscratch1 2156 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt, 2157 Register isrc, FloatRegister vsrc, 2158 unsigned vector_length_in_bytes) { 2159 assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV, 2160 "unsupported"); 2161 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2162 assert_different_registers(dst, isrc); 2163 bool isQ = vector_length_in_bytes == 16; 2164 2165 BLOCK_COMMENT("neon_reduce_logical {"); 2166 umov(rscratch1, vsrc, isQ ? D : S, 0); 2167 umov(dst, vsrc, isQ ? 
D : S, 1); 2168 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1); 2169 switch(bt) { 2170 case T_BYTE: 2171 if (isQ) { 2172 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2173 } 2174 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2175 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8); 2176 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2177 sxtb(dst, dst); 2178 break; 2179 case T_SHORT: 2180 if (isQ) { 2181 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2182 } 2183 neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16); 2184 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2185 sxth(dst, dst); 2186 break; 2187 case T_INT: 2188 if (isQ) { 2189 neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32); 2190 } 2191 neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst); 2192 break; 2193 case T_LONG: 2194 assert(isQ, "unsupported"); 2195 neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst); 2196 break; 2197 default: 2198 assert(false, "unsupported"); 2199 ShouldNotReachHere(); 2200 } 2201 BLOCK_COMMENT("} neon_reduce_logical"); 2202 } 2203 2204 // Vector reduction min/max for integral type with ASIMD instructions. 2205 // Note: vtmp is not used and expected to be fnoreg for T_LONG case. 2206 // Clobbers: rscratch1, rflags 2207 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt, 2208 Register isrc, FloatRegister vsrc, 2209 unsigned vector_length_in_bytes, 2210 FloatRegister vtmp) { 2211 assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported"); 2212 assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported"); 2213 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported"); 2214 assert_different_registers(dst, isrc); 2215 bool isQ = vector_length_in_bytes == 16; 2216 bool is_min = opc == Op_MinReductionV; 2217 2218 BLOCK_COMMENT("neon_reduce_minmax_integral {"); 2219 if (bt == T_LONG) { 2220 assert(vtmp == fnoreg, "should be"); 2221 assert(isQ, "should be"); 2222 umov(rscratch1, vsrc, D, 0); 2223 cmp(isrc, rscratch1); 2224 csel(dst, isrc, rscratch1, is_min ? LT : GT); 2225 umov(rscratch1, vsrc, D, 1); 2226 cmp(dst, rscratch1); 2227 csel(dst, dst, rscratch1, is_min ? LT : GT); 2228 } else { 2229 SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ); 2230 if (size == T2S) { 2231 is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc); 2232 } else { 2233 is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc); 2234 } 2235 if (bt == T_INT) { 2236 umov(dst, vtmp, S, 0); 2237 } else { 2238 smov(dst, vtmp, elemType_to_regVariant(bt), 0); 2239 } 2240 cmpw(dst, isrc); 2241 cselw(dst, dst, isrc, is_min ? LT : GT); 2242 } 2243 BLOCK_COMMENT("} neon_reduce_minmax_integral"); 2244 } 2245 2246 // Vector reduction for integral type with SVE instruction. 2247 // Supported operations are Add, And, Or, Xor, Max, Min. 2248 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV. 
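// The SVE reduction instruction leaves the scalar result in lane 0 of tmp; it is then moved
// to a general-purpose register with umov/smov and combined with src1, sign-extending
// sub-int results where necessary.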
2249 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1, 2250 FloatRegister src2, PRegister pg, FloatRegister tmp) { 2251 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2252 assert(pg->is_governing(), "This register has to be a governing predicate register"); 2253 assert_different_registers(src1, dst); 2254 // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved. 2255 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2256 switch (opc) { 2257 case Op_AddReductionVI: { 2258 sve_uaddv(tmp, size, pg, src2); 2259 if (bt == T_BYTE) { 2260 smov(dst, tmp, size, 0); 2261 addw(dst, src1, dst, ext::sxtb); 2262 } else if (bt == T_SHORT) { 2263 smov(dst, tmp, size, 0); 2264 addw(dst, src1, dst, ext::sxth); 2265 } else { 2266 umov(dst, tmp, size, 0); 2267 addw(dst, dst, src1); 2268 } 2269 break; 2270 } 2271 case Op_AddReductionVL: { 2272 sve_uaddv(tmp, size, pg, src2); 2273 umov(dst, tmp, size, 0); 2274 add(dst, dst, src1); 2275 break; 2276 } 2277 case Op_AndReductionV: { 2278 sve_andv(tmp, size, pg, src2); 2279 if (bt == T_INT || bt == T_LONG) { 2280 umov(dst, tmp, size, 0); 2281 } else { 2282 smov(dst, tmp, size, 0); 2283 } 2284 if (bt == T_LONG) { 2285 andr(dst, dst, src1); 2286 } else { 2287 andw(dst, dst, src1); 2288 } 2289 break; 2290 } 2291 case Op_OrReductionV: { 2292 sve_orv(tmp, size, pg, src2); 2293 if (bt == T_INT || bt == T_LONG) { 2294 umov(dst, tmp, size, 0); 2295 } else { 2296 smov(dst, tmp, size, 0); 2297 } 2298 if (bt == T_LONG) { 2299 orr(dst, dst, src1); 2300 } else { 2301 orrw(dst, dst, src1); 2302 } 2303 break; 2304 } 2305 case Op_XorReductionV: { 2306 sve_eorv(tmp, size, pg, src2); 2307 if (bt == T_INT || bt == T_LONG) { 2308 umov(dst, tmp, size, 0); 2309 } else { 2310 smov(dst, tmp, size, 0); 2311 } 2312 if (bt == T_LONG) { 2313 eor(dst, dst, src1); 2314 } else { 2315 eorw(dst, dst, src1); 2316 } 2317 break; 2318 } 2319 case Op_MaxReductionV: { 2320 sve_smaxv(tmp, size, pg, src2); 2321 if (bt == T_INT || bt == T_LONG) { 2322 umov(dst, tmp, size, 0); 2323 } else { 2324 smov(dst, tmp, size, 0); 2325 } 2326 if (bt == T_LONG) { 2327 cmp(dst, src1); 2328 csel(dst, dst, src1, Assembler::GT); 2329 } else { 2330 cmpw(dst, src1); 2331 cselw(dst, dst, src1, Assembler::GT); 2332 } 2333 break; 2334 } 2335 case Op_MinReductionV: { 2336 sve_sminv(tmp, size, pg, src2); 2337 if (bt == T_INT || bt == T_LONG) { 2338 umov(dst, tmp, size, 0); 2339 } else { 2340 smov(dst, tmp, size, 0); 2341 } 2342 if (bt == T_LONG) { 2343 cmp(dst, src1); 2344 csel(dst, dst, src1, Assembler::LT); 2345 } else { 2346 cmpw(dst, src1); 2347 cselw(dst, dst, src1, Assembler::LT); 2348 } 2349 break; 2350 } 2351 default: 2352 assert(false, "unsupported"); 2353 ShouldNotReachHere(); 2354 } 2355 2356 if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) { 2357 if (bt == T_BYTE) { 2358 sxtb(dst, dst); 2359 } else if (bt == T_SHORT) { 2360 sxth(dst, dst); 2361 } 2362 } 2363 } 2364 2365 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or 2366 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported 2367 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg. 
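// Lane counts that match one of the fixed VL* patterns (or the POW2/MUL4/MUL3 special
// patterns) are encoded directly into a single "ptrue"; any other count falls back to
// "whileltw" with the count materialized in rscratch1.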
2368 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2369   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2370   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2371 
2372   // Set all elements to false if the input "lane_cnt" is zero.
2373   if (lane_cnt == 0) {
2374     sve_pfalse(dst);
2375     return;
2376   }
2377 
2378   SIMD_RegVariant size = elemType_to_regVariant(bt);
2379   assert(size != Q, "invalid size");
2380 
2381   // Set all elements to true if "lane_cnt" equals the max lane count.
2382   if (lane_cnt == max_vector_length) {
2383     sve_ptrue(dst, size, /* ALL */ 0b11111);
2384     return;
2385   }
2386 
2387   // Fixed numbers for "ptrue".
2388   switch(lane_cnt) {
2389     case 1: /* VL1 */
2390     case 2: /* VL2 */
2391     case 3: /* VL3 */
2392     case 4: /* VL4 */
2393     case 5: /* VL5 */
2394     case 6: /* VL6 */
2395     case 7: /* VL7 */
2396     case 8: /* VL8 */
2397       sve_ptrue(dst, size, lane_cnt);
2398       return;
2399     case 16:
2400       sve_ptrue(dst, size, /* VL16 */ 0b01001);
2401       return;
2402     case 32:
2403       sve_ptrue(dst, size, /* VL32 */ 0b01010);
2404       return;
2405     case 64:
2406       sve_ptrue(dst, size, /* VL64 */ 0b01011);
2407       return;
2408     case 128:
2409       sve_ptrue(dst, size, /* VL128 */ 0b01100);
2410       return;
2411     case 256:
2412       sve_ptrue(dst, size, /* VL256 */ 0b01101);
2413       return;
2414     default:
2415       break;
2416   }
2417 
2418   // Special patterns for "ptrue".
2419   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2420     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2421   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2422     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2423   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2424     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2425   } else {
2426     // Encode to "whileltw" for the remaining cases.
2427     mov(rscratch1, lane_cnt);
2428     sve_whileltw(dst, size, zr, rscratch1);
2429   }
2430 }
2431 
2432 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2433 // Any remaining elements of dst will be filled with zero.
2434 // Clobbers: rscratch1
2435 // Preserves: src, mask
2436 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2437                                            FloatRegister vtmp1, FloatRegister vtmp2,
2438                                            PRegister pgtmp) {
2439   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2440   assert_different_registers(dst, src, vtmp1, vtmp2);
2441   assert_different_registers(mask, pgtmp);
2442 
2443   // Example input:   src = 8888 7777 6666 5555 4444 3333 2222 1111
2444   //                  mask = 0001 0000 0000 0001 0001 0000 0001 0001
2445   // Expected result: dst = 0000 0000 0000 8888 5555 4444 2222 1111
2446   sve_dup(vtmp2, H, 0);
2447 
2448   // Extend lowest half to type INT.
2449   // dst = 00004444 00003333 00002222 00001111
2450   sve_uunpklo(dst, S, src);
2451   // pgtmp = 00000001 00000000 00000001 00000001
2452   sve_punpklo(pgtmp, mask);
2453   // Pack the active elements in size of type INT to the right,
2454   // and fill the remaining elements with zero.
2455   // dst = 00000000 00004444 00002222 00001111
2456   sve_compact(dst, S, dst, pgtmp);
2457   // Narrow the result back to type SHORT.
2458   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2459   sve_uzp1(dst, H, dst, vtmp2);
2460   // Count the active elements of the lowest half.
2461   // rscratch1 = 3
2462   sve_cntp(rscratch1, S, ptrue, pgtmp);
2463 
2464   // Repeat to the highest half.
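  // The upper half is compacted with the same unpack/compact/narrow sequence, then shifted
  // left by TRUE_CNT lanes and merged into dst below.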
2465   // pgtmp = 00000001 00000000 00000000 00000001
2466   sve_punpkhi(pgtmp, mask);
2467   // vtmp1 = 00008888 00007777 00006666 00005555
2468   sve_uunpkhi(vtmp1, S, src);
2469   // vtmp1 = 00000000 00000000 00008888 00005555
2470   sve_compact(vtmp1, S, vtmp1, pgtmp);
2471   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2472   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2473 
2474   // Compressed low:  dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2475   // Compressed high: vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2476   // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
2477   // TRUE_CNT is the number of active elements in the compressed low.
2478   neg(rscratch1, rscratch1);
2479   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2480   sve_index(vtmp2, H, rscratch1, 1);
2481   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2482   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2483 
2484   // Combine the compressed high (after the shift) with the compressed low.
2485   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2486   sve_orr(dst, dst, vtmp1);
2487 }
2488 
2489 // Clobbers: rscratch1, rscratch2
2490 // Preserves: src, mask
2491 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2492                                           FloatRegister vtmp1, FloatRegister vtmp2,
2493                                           FloatRegister vtmp3, FloatRegister vtmp4,
2494                                           PRegister ptmp, PRegister pgtmp) {
2495   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2496   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2497   assert_different_registers(mask, ptmp, pgtmp);
2498   // Example input:   src = 88 77 66 55 44 33 22 11
2499   //                  mask = 01 00 00 01 01 00 01 01
2500   // Expected result: dst = 00 00 00 88 55 44 22 11
2501 
2502   sve_dup(vtmp4, B, 0);
2503   // Extend lowest half to type SHORT.
2504   // vtmp1 = 0044 0033 0022 0011
2505   sve_uunpklo(vtmp1, H, src);
2506   // ptmp = 0001 0000 0001 0001
2507   sve_punpklo(ptmp, mask);
2508   // Count the active elements of the lowest half.
2509   // rscratch2 = 3
2510   sve_cntp(rscratch2, H, ptrue, ptmp);
2511   // Pack the active elements in size of type SHORT to the right,
2512   // and fill the remaining elements with zero.
2513   // dst = 0000 0044 0022 0011
2514   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2515   // Narrow the result back to type BYTE.
2516   // dst = 00 00 00 00 00 44 22 11
2517   sve_uzp1(dst, B, dst, vtmp4);
2518 
2519   // Repeat to the highest half.
2520   // ptmp = 0001 0000 0000 0001
2521   sve_punpkhi(ptmp, mask);
2522   // vtmp2 = 0088 0077 0066 0055
2523   sve_uunpkhi(vtmp2, H, src);
2524   // vtmp1 = 0000 0000 0088 0055
2525   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2526 
2527   sve_dup(vtmp4, B, 0);
2528   // vtmp1 = 00 00 00 00 00 00 88 55
2529   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2530 
2531   // Compressed low:  dst   = 00 00 00 00 00 44 22 11
2532   // Compressed high: vtmp1 = 00 00 00 00 00 00 88 55
2533   // Left shift (cross lane) the compressed high by TRUE_CNT lanes, where
2534   // TRUE_CNT is the number of active elements in the compressed low.
2535   neg(rscratch2, rscratch2);
2536   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2537   sve_index(vtmp2, B, rscratch2, 1);
2538   // vtmp1 = 00 00 00 88 55 00 00 00
2539   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2540   // Combine the compressed high (after the shift) with the compressed low.
2541 // dst = 00 00 00 88 55 44 22 11 2542 sve_orr(dst, dst, vtmp1); 2543 } 2544 2545 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2546 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2547 SIMD_Arrangement size = isQ ? T16B : T8B; 2548 if (bt == T_BYTE) { 2549 rbit(dst, size, src); 2550 } else { 2551 neon_reverse_bytes(dst, src, bt, isQ); 2552 rbit(dst, size, dst); 2553 } 2554 } 2555 2556 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) { 2557 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type"); 2558 SIMD_Arrangement size = isQ ? T16B : T8B; 2559 switch (bt) { 2560 case T_BYTE: 2561 if (dst != src) { 2562 orr(dst, size, src, src); 2563 } 2564 break; 2565 case T_SHORT: 2566 rev16(dst, size, src); 2567 break; 2568 case T_INT: 2569 rev32(dst, size, src); 2570 break; 2571 case T_LONG: 2572 rev64(dst, size, src); 2573 break; 2574 default: 2575 assert(false, "unsupported"); 2576 ShouldNotReachHere(); 2577 } 2578 } 2579 2580 // Extract a scalar element from an sve vector at position 'idx'. 2581 // The input elements in src are expected to be of integral type. 2582 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src, 2583 int idx, FloatRegister vtmp) { 2584 assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type"); 2585 Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt); 2586 if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction 2587 if (bt == T_INT || bt == T_LONG) { 2588 umov(dst, src, size, idx); 2589 } else { 2590 smov(dst, src, size, idx); 2591 } 2592 } else { 2593 sve_orr(vtmp, src, src); 2594 sve_ext(vtmp, vtmp, idx << size); 2595 if (bt == T_INT || bt == T_LONG) { 2596 umov(dst, vtmp, size, 0); 2597 } else { 2598 smov(dst, vtmp, size, 0); 2599 } 2600 } 2601 } 2602 2603 // java.lang.Math::round intrinsics 2604 2605 // Clobbers: rscratch1, rflags 2606 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2607 FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) { 2608 assert_different_registers(tmp1, tmp2, tmp3, src, dst); 2609 switch (T) { 2610 case T2S: 2611 case T4S: 2612 fmovs(tmp1, T, 0.5f); 2613 mov(rscratch1, jint_cast(0x1.0p23f)); 2614 break; 2615 case T2D: 2616 fmovd(tmp1, T, 0.5); 2617 mov(rscratch1, julong_cast(0x1.0p52)); 2618 break; 2619 default: 2620 assert(T == T2S || T == T4S || T == T2D, "invalid arrangement"); 2621 } 2622 fadd(tmp1, T, tmp1, src); 2623 fcvtms(tmp1, T, tmp1); 2624 // tmp1 = floor(src + 0.5, ties to even) 2625 2626 fcvtas(dst, T, src); 2627 // dst = round(src), ties to away 2628 2629 fneg(tmp3, T, src); 2630 dup(tmp2, T, rscratch1); 2631 cm(HS, tmp3, T, tmp3, tmp2); 2632 // tmp3 is now a set of flags 2633 2634 bif(dst, T16B, tmp1, tmp3); 2635 // result in dst 2636 } 2637 2638 // Clobbers: rscratch1, rflags 2639 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1, 2640 FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) { 2641 assert(pgtmp->is_governing(), "This register has to be a governing predicate register"); 2642 assert_different_registers(tmp1, tmp2, src, dst); 2643 2644 switch (T) { 2645 case S: 2646 mov(rscratch1, jint_cast(0x1.0p23f)); 2647 break; 2648 case D: 2649 mov(rscratch1, julong_cast(0x1.0p52)); 2650 break; 2651 
    default:
2652       assert(T == S || T == D, "invalid register variant");
2653   }
2654 
2655   sve_frinta(dst, T, ptrue, src);
2656   // dst = round(src), ties to away
2657 
2658   Label none;
2659 
2660   sve_fneg(tmp1, T, ptrue, src);
2661   sve_dup(tmp2, T, rscratch1);
2662   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
2663   br(EQ, none);
2664   {
2665     sve_cpy(tmp1, T, pgtmp, 0.5);
2666     sve_fadd(tmp1, T, pgtmp, src);
2667     sve_frintm(dst, T, pgtmp, tmp1);
2668     // dst = floor(src + 0.5, ties to even)
2669   }
2670   bind(none);
2671 
2672   sve_fcvtzs(dst, T, ptrue, dst, T);
2673   // result in dst
2674 }
2675 
2676 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2677                                            FloatRegister one, SIMD_Arrangement T) {
2678   assert_different_registers(dst, src, zero, one);
2679   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2680 
2681   facgt(dst, T, src, zero);
2682   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
2683   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2684 }
2685 
2686 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2687                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2688   assert_different_registers(dst, src, zero, one, vtmp);
2689   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2690 
2691   sve_orr(vtmp, src, src);
2692   sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pgtmp=0 for +-0.0 and NaN. 0x1 otherwise
2693   switch (T) {
2694     case S:
2695       sve_and(vtmp, T, min_jint); // Extract the sign bit of the float value in every lane of src
2696       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2697                                         // on the sign of the float value
2698       break;
2699     case D:
2700       sve_and(vtmp, T, min_jlong);
2701       sve_orr(vtmp, T, jlong_cast(1.0));
2702       break;
2703     default:
2704       assert(false, "unsupported");
2705       ShouldNotReachHere();
2706   }
2707   sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2708   // Result in dst
2709 }
2710 
2711 bool C2_MacroAssembler::in_scratch_emit_size() {
2712   if (ciEnv::current()->task() != nullptr) {
2713     PhaseOutput* phase_output = Compile::current()->output();
2714     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2715       return true;
2716     }
2717   }
2718   return MacroAssembler::in_scratch_emit_size();
2719 }