1 /*
   2  * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "opto/c2_MacroAssembler.hpp"
  29 #include "opto/compile.hpp"
  30 #include "opto/intrinsicnode.hpp"
  31 #include "opto/matcher.hpp"
  32 #include "opto/output.hpp"
  33 #include "opto/subnode.hpp"
  34 #include "runtime/stubRoutines.hpp"
  35 #include "utilities/globalDefinitions.hpp"
  36 #include "utilities/powerOfTwo.hpp"
  37 
  38 #ifdef PRODUCT
  39 #define BLOCK_COMMENT(str) /* nothing */
  40 #define STOP(error) stop(error)
  41 #else
  42 #define BLOCK_COMMENT(str) block_comment(str)
  43 #define STOP(error) block_comment(error); stop(error)
  44 #endif
  45 
  46 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  47 
  48 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
  49 
  50 void C2_MacroAssembler::entry_barrier() {
  51   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  52   if (BarrierSet::barrier_set()->barrier_set_nmethod() != nullptr) {
  53     // Dummy labels for just measuring the code size
  54     Label dummy_slow_path;
  55     Label dummy_continuation;
  56     Label dummy_guard;
  57     Label* slow_path = &dummy_slow_path;
  58     Label* continuation = &dummy_continuation;
  59     Label* guard = &dummy_guard;
  60     if (!Compile::current()->output()->in_scratch_emit_size()) {
      // Use the real labels from the actual stub when we are not emitting code just to measure its size
  62       C2EntryBarrierStub* stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  63       Compile::current()->output()->add_stub(stub);
  64       slow_path = &stub->entry();
  65       continuation = &stub->continuation();
  66       guard = &stub->guard();
  67     }
  68     // In the C2 code, we move the non-hot part of nmethod entry barriers out-of-line to a stub.
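    // The check emitted here is roughly the following (a sketch only; the exact
    // sequence is chosen by the BarrierSetAssembler of the active GC, and the
    // names below are illustrative):
    //
    //   if (nmethod_guard_value != expected_disarm_value) goto slow_path;  // nmethod was disarmed
    //   continuation: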
  69     bs->nmethod_entry_barrier(this, slow_path, continuation, guard);
  70   }
  71 }
  72 
  73 // jdk.internal.util.ArraysSupport.vectorizedHashCode
  74 address C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register result,
  75                                            FloatRegister vdata0, FloatRegister vdata1,
  76                                            FloatRegister vdata2, FloatRegister vdata3,
  77                                            FloatRegister vmul0, FloatRegister vmul1,
  78                                            FloatRegister vmul2, FloatRegister vmul3,
  79                                            FloatRegister vpow, FloatRegister vpowm,
  80                                            BasicType eltype) {
  81   ARRAYS_HASHCODE_REGISTERS;
  82 
  83   Register tmp1 = rscratch1, tmp2 = rscratch2;
  84 
  85   Label TAIL, STUB_SWITCH, STUB_SWITCH_OUT, LOOP, BR_BASE, LARGE, DONE;
  86 
  // Vectorization factor. Number of array elements loaded into one SIMD&FP register by the stubs. We
  // use 8H load arrangements for chars and shorts and 8B for booleans and bytes. It's possible to
  // use 4H for chars and shorts instead, but using 8H gives better performance.
  90   const size_t vf = eltype == T_BOOLEAN || eltype == T_BYTE ? 8
  91                     : eltype == T_CHAR || eltype == T_SHORT ? 8
  92                     : eltype == T_INT                       ? 4
  93                                                             : 0;
  94   guarantee(vf, "unsupported eltype");
  95 
  96   // Unroll factor for the scalar loop below. The value is chosen based on performance analysis.
  97   const size_t unroll_factor = 4;
  98 
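  // For reference, the result has to match the scalar polynomial hash used by
  // ArraysSupport (a C-like sketch, not the emitted code):
  //
  //   int h = result;                      // initial value passed in 'result'
  //   for (size_t i = 0; i < cnt; i++)
  //     h = 31 * h + (int)ary[i];          // element widened according to 'eltype'
  //   return h;
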
  99   switch (eltype) {
 100   case T_BOOLEAN:
 101     BLOCK_COMMENT("arrays_hashcode(unsigned byte) {");
 102     break;
 103   case T_CHAR:
 104     BLOCK_COMMENT("arrays_hashcode(char) {");
 105     break;
 106   case T_BYTE:
 107     BLOCK_COMMENT("arrays_hashcode(byte) {");
 108     break;
 109   case T_SHORT:
 110     BLOCK_COMMENT("arrays_hashcode(short) {");
 111     break;
 112   case T_INT:
 113     BLOCK_COMMENT("arrays_hashcode(int) {");
 114     break;
 115   default:
 116     ShouldNotReachHere();
 117   }
 118 
 119   // large_arrays_hashcode(T_INT) performs worse than the scalar loop below when the Neon loop
 120   // implemented by the stub executes just once. Call the stub only if at least two iterations will
 121   // be executed.
 122   const size_t large_threshold = eltype == T_INT ? vf * 2 : vf;
 123   cmpw(cnt, large_threshold);
 124   br(Assembler::HS, LARGE);
 125 
 126   bind(TAIL);
 127 
  // The andr computes cnt % uf where uf = unroll_factor. The subtract, shifted by 3, offsets the
  // branch target past uf - (cnt % uf) pairs of load + madd insns, i.e. only cnt % uf load + madd
  // pairs execute on the first pass. The loop below then eats up the remainder, uf elements at a time.
 131   assert(is_power_of_2(unroll_factor), "can't use this value to calculate the jump target PC");
 132   andr(tmp2, cnt, unroll_factor - 1);
 133   adr(tmp1, BR_BASE);
 134   sub(tmp1, tmp1, tmp2, ext::sxtw, 3);
 135   movw(tmp2, 0x1f);
 136   br(tmp1);
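  // For example, with unroll_factor == 4 and cnt == 7: cnt % uf == 3, each load + madd
  // pair is two instructions (8 bytes), so tmp1 ends up 3 * 8 bytes before BR_BASE and
  // exactly the 3 leftover elements are processed before the loop below takes over.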
 137 
 138   bind(LOOP);
 139   for (size_t i = 0; i < unroll_factor; ++i) {
 140     load(tmp1, Address(post(ary, type2aelembytes(eltype))), eltype);
 141     maddw(result, result, tmp2, tmp1);
 142   }
 143   bind(BR_BASE);
 144   subsw(cnt, cnt, unroll_factor);
 145   br(Assembler::HS, LOOP);
 146 
 147   b(DONE);
 148 
 149   bind(LARGE);
 150 
 151   RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_arrays_hashcode(eltype));
 152   assert(stub.target() != nullptr, "array_hashcode stub has not been generated");
 153   address tpc = trampoline_call(stub);
 154   if (tpc == nullptr) {
 155     DEBUG_ONLY(reset_labels(TAIL, BR_BASE));
 156     postcond(pc() == badAddress);
 157     return nullptr;
 158   }
 159 
 160   bind(DONE);
 161 
 162   BLOCK_COMMENT("} // arrays_hashcode");
 163 
 164   postcond(pc() != badAddress);
 165   return pc();
 166 }
 167 
 168 void C2_MacroAssembler::fast_lock(Register objectReg, Register boxReg, Register tmpReg,
 169                                   Register tmp2Reg, Register tmp3Reg) {
 170   Register oop = objectReg;
 171   Register box = boxReg;
 172   Register disp_hdr = tmpReg;
 173   Register tmp = tmp2Reg;
 174   Label cont;
 175   Label object_has_monitor;
 176   Label count, no_count;
 177 
 178   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_lock_lightweight");
 179   assert_different_registers(oop, box, tmp, disp_hdr);
 180 
 181   // Load markWord from object into displaced_header.
 182   ldr(disp_hdr, Address(oop, oopDesc::mark_offset_in_bytes()));
 183 
 184   if (DiagnoseSyncOnValueBasedClasses != 0) {
 185     load_klass(tmp, oop);
 186     ldrb(tmp, Address(tmp, Klass::misc_flags_offset()));
 187     tst(tmp, KlassFlags::_misc_is_value_based_class);
 188     br(Assembler::NE, cont);
 189   }
 190 
 191   // Check for existing monitor
 192   tbnz(disp_hdr, exact_log2(markWord::monitor_value), object_has_monitor);
 193 
 194   if (LockingMode == LM_MONITOR) {
 195     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 196     b(cont);
 197   } else {
 198     assert(LockingMode == LM_LEGACY, "must be");
 199     // Set tmp to be (markWord of object | UNLOCK_VALUE).
 200     orr(tmp, disp_hdr, markWord::unlocked_value);
 201 
 202     if (EnableValhalla) {
 203       // Mask inline_type bit such that we go to the slow path if object is an inline type
 204       andr(tmp, tmp, ~((int) markWord::inline_type_bit_in_place));
 205     }
 206 
 207     // Initialize the box. (Must happen before we update the object mark!)
 208     str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 209 
 210     // Compare object markWord with an unlocked value (tmp) and if
 211     // equal exchange the stack address of our box with object markWord.
 212     // On failure disp_hdr contains the possibly locked markWord.
 213     cmpxchg(oop, tmp, box, Assembler::xword, /*acquire*/ true,
 214             /*release*/ true, /*weak*/ false, disp_hdr);
 215     br(Assembler::EQ, cont);
 216 
 217     assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 218 
    // If the compare-and-exchange succeeded, then we found an unlocked
    // object, have now locked it, and will continue at label cont.
 221 
 222     // Check if the owner is self by comparing the value in the
 223     // markWord of object (disp_hdr) with the stack pointer.
 224     mov(rscratch1, sp);
 225     sub(disp_hdr, disp_hdr, rscratch1);
 226     mov(tmp, (address) (~(os::vm_page_size()-1) | markWord::lock_mask_in_place));
    // If the condition is true the lock is already owned by the current thread, and hence we can
    // store 0 as the displaced header in the box, which indicates that it is a recursive lock.
 229     ands(tmp/*==0?*/, disp_hdr, tmp);   // Sets flags for result
 230     str(tmp/*==0, perhaps*/, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 231     b(cont);
 232   }
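
  // The legacy stack-locking path above corresponds roughly to this sketch (not the
  // exact emitted code):
  //
  //   mark = obj->mark() | unlocked_value;
  //   box->displaced_header = mark;
  //   if (CAS(&obj->mark(), mark, box)) goto cont;                   // locked, flags == EQ
  //   if (((observed_mark - sp) & (~(page_size - 1) | lock_mask)) != 0) goto slow;  // not our stack
  //   box->displaced_header = 0;                                     // recursive stack lock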
 233 
 234   // Handle existing monitor.
 235   bind(object_has_monitor);
 236 
 237   // The object's monitor m is unlocked iff m->owner == nullptr,
 238   // otherwise m->owner may contain a thread or a stack address.
 239   //
 240   // Try to CAS m->owner from null to current thread.
 241   add(tmp, disp_hdr, (in_bytes(ObjectMonitor::owner_offset())-markWord::monitor_value));
 242   cmpxchg(tmp, zr, rthread, Assembler::xword, /*acquire*/ true,
 243           /*release*/ true, /*weak*/ false, tmp3Reg); // Sets flags for result
 244 
 245   // Store a non-null value into the box to avoid looking like a re-entrant
 246   // lock. The fast-path monitor unlock code checks for
 247   // markWord::monitor_value so use markWord::unused_mark which has the
 248   // relevant bit set, and also matches ObjectSynchronizer::enter.
 249   mov(tmp, (address)markWord::unused_mark().value());
 250   str(tmp, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 251 
 252   br(Assembler::EQ, cont); // CAS success means locking succeeded
 253 
 254   cmp(tmp3Reg, rthread);
 255   br(Assembler::NE, cont); // Check for recursive locking
 256 
 257   // Recursive lock case
 258   increment(Address(disp_hdr, in_bytes(ObjectMonitor::recursions_offset()) - markWord::monitor_value), 1);
 259   // flag == EQ still from the cmp above, checking if this is a reentrant lock
 260 
 261   bind(cont);
 262   // flag == EQ indicates success
 263   // flag == NE indicates failure
 264   br(Assembler::NE, no_count);
 265 
 266   bind(count);
 267   increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 268 
 269   bind(no_count);
 270 }
 271 
 272 void C2_MacroAssembler::fast_unlock(Register objectReg, Register boxReg, Register tmpReg,
 273                                     Register tmp2Reg) {
 274   Register oop = objectReg;
 275   Register box = boxReg;
 276   Register disp_hdr = tmpReg;
 277   Register owner_addr = tmpReg;
 278   Register tmp = tmp2Reg;
 279   Label cont;
 280   Label object_has_monitor;
 281   Label count, no_count;
 282   Label unlocked;
 283 
 284   assert(LockingMode != LM_LIGHTWEIGHT, "lightweight locking should use fast_unlock_lightweight");
 285   assert_different_registers(oop, box, tmp, disp_hdr);
 286 
 287   if (LockingMode == LM_LEGACY) {
 288     // Find the lock address and load the displaced header from the stack.
 289     ldr(disp_hdr, Address(box, BasicLock::displaced_header_offset_in_bytes()));
 290 
 291     // If the displaced header is 0, we have a recursive unlock.
 292     cmp(disp_hdr, zr);
 293     br(Assembler::EQ, cont);
 294   }
 295 
 296   // Handle existing monitor.
 297   ldr(tmp, Address(oop, oopDesc::mark_offset_in_bytes()));
 298   tbnz(tmp, exact_log2(markWord::monitor_value), object_has_monitor);
 299 
 300   if (LockingMode == LM_MONITOR) {
 301     tst(oop, oop); // Set NE to indicate 'failure' -> take slow-path. We know that oop != 0.
 302     b(cont);
 303   } else {
 304     assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we
    // see the stack address of the BasicLock in the markWord of the
    // object.
 308 
 309     cmpxchg(oop, box, disp_hdr, Assembler::xword, /*acquire*/ false,
 310             /*release*/ true, /*weak*/ false, tmp);
 311     b(cont);
 312   }
 313 
 314   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 315 
 316   // Handle existing monitor.
 317   bind(object_has_monitor);
 318   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
 319   add(tmp, tmp, -(int)markWord::monitor_value); // monitor
 320 
 321   ldr(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 322 
 323   Label notRecursive;
 324   cbz(disp_hdr, notRecursive);
 325 
 326   // Recursive lock
 327   sub(disp_hdr, disp_hdr, 1u);
 328   str(disp_hdr, Address(tmp, ObjectMonitor::recursions_offset()));
 329   cmp(disp_hdr, disp_hdr); // Sets flags for result
 330   b(cont);
 331 
 332   bind(notRecursive);
 333 
 334   // Compute owner address.
 335   lea(owner_addr, Address(tmp, ObjectMonitor::owner_offset()));
 336 
 337   // Set owner to null.
 338   // Release to satisfy the JMM
 339   stlr(zr, owner_addr);
 340   // We need a full fence after clearing owner to avoid stranding.
 341   // StoreLoad achieves this.
 342   membar(StoreLoad);
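  // Together with the checks below, this implements roughly the following exit
  // protocol (a sketch, not the exact emitted code):
  //
  //   m->owner = nullptr;                                            // release store
  //   full_fence();                                                  // StoreLoad
  //   if (m->EntryList == nullptr && m->cxq == nullptr) goto done;   // nobody is waiting
  //   if (m->succ != nullptr) goto done;                             // a successor will take over
  //   // otherwise take the slow path so the runtime can re-acquire the monitor
  //   // and wake a waiter that would otherwise be stranded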
 343 
 344   // Check if the entry lists are empty (EntryList first - by convention).
 345   ldr(rscratch1, Address(tmp, ObjectMonitor::EntryList_offset()));
 346   ldr(tmpReg, Address(tmp, ObjectMonitor::cxq_offset()));
 347   orr(rscratch1, rscratch1, tmpReg);
 348   cmp(rscratch1, zr);
 349   br(Assembler::EQ, cont);     // If so we are done.
 350 
 351   // Check if there is a successor.
 352   ldr(rscratch1, Address(tmp, ObjectMonitor::succ_offset()));
 353   cmp(rscratch1, zr);
 354   br(Assembler::NE, unlocked); // If so we are done.
 355 
 356   // Save the monitor pointer in the current thread, so we can try to
 357   // reacquire the lock in SharedRuntime::monitor_exit_helper().
 358   str(tmp, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 359 
 360   cmp(zr, rthread); // Set Flag to NE => slow path
 361   b(cont);
 362 
 363   bind(unlocked);
 364   cmp(zr, zr); // Set Flag to EQ => fast path
 365 
 366   // Intentional fall-through
 367 
 368   bind(cont);
 369   // flag == EQ indicates success
 370   // flag == NE indicates failure
 371   br(Assembler::NE, no_count);
 372 
 373   bind(count);
 374   decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 375 
 376   bind(no_count);
 377 }
 378 
 379 void C2_MacroAssembler::fast_lock_lightweight(Register obj, Register box, Register t1,
 380                                               Register t2, Register t3) {
 381   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 382   assert_different_registers(obj, box, t1, t2, t3);
 383 
 384   // Handle inflated monitor.
 385   Label inflated;
  // Finish fast lock successfully. MUST branch to this label with flag == EQ
 387   Label locked;
  // Finish fast lock unsuccessfully. MUST branch to this label with flag == NE
 389   Label slow_path;
 390 
 391   if (UseObjectMonitorTable) {
 392     // Clear cache in case fast locking succeeds.
 393     str(zr, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 394   }
 395 
 396   if (DiagnoseSyncOnValueBasedClasses != 0) {
 397     load_klass(t1, obj);
 398     ldrb(t1, Address(t1, Klass::misc_flags_offset()));
 399     tst(t1, KlassFlags::_misc_is_value_based_class);
 400     br(Assembler::NE, slow_path);
 401   }
 402 
 403   const Register t1_mark = t1;
 404   const Register t3_t = t3;
 405 
 406   { // Lightweight locking
 407 
 408     // Push lock to the lock stack and finish successfully. MUST branch to with flag == EQ
 409     Label push;
 410 
 411     const Register t2_top = t2;
 412 
 413     // Check if lock-stack is full.
 414     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 415     cmpw(t2_top, (unsigned)LockStack::end_offset() - 1);
 416     br(Assembler::GT, slow_path);
 417 
 418     // Check if recursive.
 419     subw(t3_t, t2_top, oopSize);
 420     ldr(t3_t, Address(rthread, t3_t));
 421     cmp(obj, t3_t);
 422     br(Assembler::EQ, push);
 423 
 424     // Relaxed normal load to check for monitor. Optimization for monitor case.
 425     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 426     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 427 
 428     // Not inflated
 429     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
 430 
 431     // Try to lock. Transition lock-bits 0b01 => 0b00
 432     orr(t1_mark, t1_mark, markWord::unlocked_value);
 433     eor(t3_t, t1_mark, markWord::unlocked_value);
 434     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 435             /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
 436     br(Assembler::NE, slow_path);
 437 
 438     bind(push);
 439     // After successful lock, push object on lock-stack.
 440     str(obj, Address(rthread, t2_top));
 441     addw(t2_top, t2_top, oopSize);
 442     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 443     b(locked);
 444   }
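
  // The fast path above corresponds roughly to this sketch (not the exact emitted code):
  //
  //   if (lock_stack.top is at the end)      goto slow;      // lock-stack full
  //   if (lock_stack[top - 1] == obj)        goto push;      // recursive case
  //   mark = obj->mark();
  //   if (mark has the monitor bit set)      goto inflated;
  //   if (!CAS(&obj->mark(), mark | unlocked_value, mark & ~unlocked_value)) goto slow;
  //  push:
  //   lock_stack[top++] = obj;                               // flags == EQ on success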
 445 
 446   { // Handle inflated monitor.
 447     bind(inflated);
 448 
 449     const Register t1_monitor = t1;
 450 
 451     if (!UseObjectMonitorTable) {
 452       assert(t1_monitor == t1_mark, "should be the same here");
 453     } else {
 454       Label monitor_found;
 455 
 456       // Load cache address
 457       lea(t3_t, Address(rthread, JavaThread::om_cache_oops_offset()));
 458 
 459       const int num_unrolled = 2;
 460       for (int i = 0; i < num_unrolled; i++) {
 461         ldr(t1, Address(t3_t));
 462         cmp(obj, t1);
 463         br(Assembler::EQ, monitor_found);
 464         increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 465       }
 466 
 467       Label loop;
 468 
 469       // Search for obj in cache.
 470       bind(loop);
 471 
 472       // Check for match.
 473       ldr(t1, Address(t3_t));
 474       cmp(obj, t1);
 475       br(Assembler::EQ, monitor_found);
 476 
 477       // Search until null encountered, guaranteed _null_sentinel at end.
 478       increment(t3_t, in_bytes(OMCache::oop_to_oop_difference()));
 479       cbnz(t1, loop);
 480       // Cache Miss, NE set from cmp above, cbnz does not set flags
 481       b(slow_path);
 482 
 483       bind(monitor_found);
 484       ldr(t1_monitor, Address(t3_t, OMCache::oop_to_monitor_difference()));
 485     }
 486 
 487     const Register t2_owner_addr = t2;
 488     const Register t3_owner = t3;
 489     const ByteSize monitor_tag = in_ByteSize(UseObjectMonitorTable ? 0 : checked_cast<int>(markWord::monitor_value));
 490     const Address owner_address(t1_monitor, ObjectMonitor::owner_offset() - monitor_tag);
 491     const Address recursions_address(t1_monitor, ObjectMonitor::recursions_offset() - monitor_tag);
 492 
 493     Label monitor_locked;
 494 
 495     // Compute owner address.
 496     lea(t2_owner_addr, owner_address);
 497 
 498     // CAS owner (null => current thread).
 499     cmpxchg(t2_owner_addr, zr, rthread, Assembler::xword, /*acquire*/ true,
 500             /*release*/ false, /*weak*/ false, t3_owner);
 501     br(Assembler::EQ, monitor_locked);
 502 
 503     // Check if recursive.
 504     cmp(t3_owner, rthread);
 505     br(Assembler::NE, slow_path);
 506 
 507     // Recursive.
 508     increment(recursions_address, 1);
 509 
 510     bind(monitor_locked);
 511     if (UseObjectMonitorTable) {
 512       str(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 513     }
 514   }
 515 
 516   bind(locked);
 517   increment(Address(rthread, JavaThread::held_monitor_count_offset()));
 518 
 519 #ifdef ASSERT
 520   // Check that locked label is reached with Flags == EQ.
 521   Label flag_correct;
 522   br(Assembler::EQ, flag_correct);
 523   stop("Fast Lock Flag != EQ");
 524 #endif
 525 
 526   bind(slow_path);
 527 #ifdef ASSERT
 528   // Check that slow_path label is reached with Flags == NE.
 529   br(Assembler::NE, flag_correct);
 530   stop("Fast Lock Flag != NE");
 531   bind(flag_correct);
 532 #endif
 533   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 534 }
 535 
 536 void C2_MacroAssembler::fast_unlock_lightweight(Register obj, Register box, Register t1,
 537                                                 Register t2, Register t3) {
 538   assert(LockingMode == LM_LIGHTWEIGHT, "must be");
 539   assert_different_registers(obj, box, t1, t2, t3);
 540 
 541   // Handle inflated monitor.
 542   Label inflated, inflated_load_mark;
  // Finish fast unlock successfully. MUST branch to this label with flag == EQ
 544   Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to this label with flag == NE
 546   Label slow_path;
 547 
 548   const Register t1_mark = t1;
 549   const Register t2_top = t2;
 550   const Register t3_t = t3;
 551 
 552   { // Lightweight unlock
 553 
 554     Label push_and_slow_path;
 555 
 556     // Check if obj is top of lock-stack.
 557     ldrw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 558     subw(t2_top, t2_top, oopSize);
 559     ldr(t3_t, Address(rthread, t2_top));
 560     cmp(obj, t3_t);
 561     // Top of lock stack was not obj. Must be monitor.
 562     br(Assembler::NE, inflated_load_mark);
 563 
 564     // Pop lock-stack.
 565     DEBUG_ONLY(str(zr, Address(rthread, t2_top));)
 566     strw(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 567 
 568     // Check if recursive.
 569     subw(t3_t, t2_top, oopSize);
 570     ldr(t3_t, Address(rthread, t3_t));
 571     cmp(obj, t3_t);
 572     br(Assembler::EQ, unlocked);
 573 
 574     // Not recursive.
 575     // Load Mark.
 576     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 577 
 578     // Check header for monitor (0b10).
    // Because we got here by popping (meaning the obj was pushed when it was locked),
    // there is no monitor in the box. So we need to push the obj back
    // so that the runtime can fix any potential anonymous owner.
 582     tbnz(t1_mark, exact_log2(markWord::monitor_value), UseObjectMonitorTable ? push_and_slow_path : inflated);
 583 
 584     // Try to unlock. Transition lock bits 0b00 => 0b01
 585     assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
 586     orr(t3_t, t1_mark, markWord::unlocked_value);
 587     cmpxchg(/*addr*/ obj, /*expected*/ t1_mark, /*new*/ t3_t, Assembler::xword,
 588             /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
 589     br(Assembler::EQ, unlocked);
 590 
 591     bind(push_and_slow_path);
 592     // Compare and exchange failed.
 593     // Restore lock-stack and handle the unlock in runtime.
 594     DEBUG_ONLY(str(obj, Address(rthread, t2_top));)
 595     addw(t2_top, t2_top, oopSize);
 596     str(t2_top, Address(rthread, JavaThread::lock_stack_top_offset()));
 597     b(slow_path);
 598   }
 599 
 600 
 601   { // Handle inflated monitor.
 602     bind(inflated_load_mark);
 603     ldr(t1_mark, Address(obj, oopDesc::mark_offset_in_bytes()));
 604 #ifdef ASSERT
 605     tbnz(t1_mark, exact_log2(markWord::monitor_value), inflated);
 606     stop("Fast Unlock not monitor");
 607 #endif
 608 
 609     bind(inflated);
 610 
 611 #ifdef ASSERT
 612     Label check_done;
 613     subw(t2_top, t2_top, oopSize);
 614     cmpw(t2_top, in_bytes(JavaThread::lock_stack_base_offset()));
 615     br(Assembler::LT, check_done);
 616     ldr(t3_t, Address(rthread, t2_top));
 617     cmp(obj, t3_t);
 618     br(Assembler::NE, inflated);
 619     stop("Fast Unlock lock on stack");
 620     bind(check_done);
 621 #endif
 622 
 623     const Register t1_monitor = t1;
 624 
 625     if (!UseObjectMonitorTable) {
 626       assert(t1_monitor == t1_mark, "should be the same here");
 627 
 628       // Untag the monitor.
 629       add(t1_monitor, t1_mark, -(int)markWord::monitor_value);
 630     } else {
 631       ldr(t1_monitor, Address(box, BasicLock::object_monitor_cache_offset_in_bytes()));
 632       // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
 633       cmp(t1_monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
 634       br(Assembler::LO, slow_path);
 635     }
 636 
 637     const Register t2_recursions = t2;
 638     Label not_recursive;
 639 
 640     // Check if recursive.
 641     ldr(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 642     cbz(t2_recursions, not_recursive);
 643 
 644     // Recursive unlock.
 645     sub(t2_recursions, t2_recursions, 1u);
 646     str(t2_recursions, Address(t1_monitor, ObjectMonitor::recursions_offset()));
 647     // Set flag == EQ
 648     cmp(t2_recursions, t2_recursions);
 649     b(unlocked);
 650 
 651     bind(not_recursive);
 652 
 653     const Register t2_owner_addr = t2;
 654 
 655     // Compute owner address.
 656     lea(t2_owner_addr, Address(t1_monitor, ObjectMonitor::owner_offset()));
 657 
 658     // Set owner to null.
 659     // Release to satisfy the JMM
 660     stlr(zr, t2_owner_addr);
 661     // We need a full fence after clearing owner to avoid stranding.
 662     // StoreLoad achieves this.
 663     membar(StoreLoad);
 664 
 665     // Check if the entry lists are empty (EntryList first - by convention).
 666     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::EntryList_offset()));
 667     ldr(t3_t, Address(t1_monitor, ObjectMonitor::cxq_offset()));
 668     orr(rscratch1, rscratch1, t3_t);
 669     cmp(rscratch1, zr);
 670     br(Assembler::EQ, unlocked);  // If so we are done.
 671 
 672     // Check if there is a successor.
 673     ldr(rscratch1, Address(t1_monitor, ObjectMonitor::succ_offset()));
 674     cmp(rscratch1, zr);
 675     br(Assembler::NE, unlocked);  // If so we are done.
 676 
 677     // Save the monitor pointer in the current thread, so we can try to
 678     // reacquire the lock in SharedRuntime::monitor_exit_helper().
 679     str(t1_monitor, Address(rthread, JavaThread::unlocked_inflated_monitor_offset()));
 680 
 681     cmp(zr, rthread); // Set Flag to NE => slow path
 682     b(slow_path);
 683   }
 684 
 685   bind(unlocked);
 686   decrement(Address(rthread, JavaThread::held_monitor_count_offset()));
 687   cmp(zr, zr); // Set Flags to EQ => fast path
 688 
 689 #ifdef ASSERT
 690   // Check that unlocked label is reached with Flags == EQ.
 691   Label flag_correct;
 692   br(Assembler::EQ, flag_correct);
 693   stop("Fast Unlock Flag != EQ");
 694 #endif
 695 
 696   bind(slow_path);
 697 #ifdef ASSERT
 698   // Check that slow_path label is reached with Flags == NE.
 699   br(Assembler::NE, flag_correct);
 700   stop("Fast Unlock Flag != NE");
 701   bind(flag_correct);
 702 #endif
 703   // C2 uses the value of Flags (NE vs EQ) to determine the continuation.
 704 }
 705 
 706 // Search for str1 in str2 and return index or -1
 707 // Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1.
 708 void C2_MacroAssembler::string_indexof(Register str2, Register str1,
 709                                        Register cnt2, Register cnt1,
 710                                        Register tmp1, Register tmp2,
 711                                        Register tmp3, Register tmp4,
 712                                        Register tmp5, Register tmp6,
 713                                        int icnt1, Register result, int ae) {
 714   // NOTE: tmp5, tmp6 can be zr depending on specific method version
 715   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
 716 
 717   Register ch1 = rscratch1;
 718   Register ch2 = rscratch2;
 719   Register cnt1tmp = tmp1;
 720   Register cnt2tmp = tmp2;
 721   Register cnt1_neg = cnt1;
 722   Register cnt2_neg = cnt2;
 723   Register result_tmp = tmp4;
 724 
 725   bool isL = ae == StrIntrinsicNode::LL;
 726 
 727   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
 728   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
 729   int str1_chr_shift = str1_isL ? 0:1;
 730   int str2_chr_shift = str2_isL ? 0:1;
 731   int str1_chr_size = str1_isL ? 1:2;
 732   int str2_chr_size = str2_isL ? 1:2;
 733   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
 734                                       (chr_insn)&MacroAssembler::ldrh;
 735   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
 736                                       (chr_insn)&MacroAssembler::ldrh;
 737   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
 738   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
 739 
 740   // Note, inline_string_indexOf() generates checks:
 741   // if (substr.count > string.count) return -1;
 742   // if (substr.count == 0) return 0;
 743 
 744   // We have two strings, a source string in str2, cnt2 and a pattern string
 745   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
 746 
  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
 748   // With a small pattern and source we use linear scan.
 749 
 750   if (icnt1 == -1) {
 751     sub(result_tmp, cnt2, cnt1);
 752     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
 753     br(LT, LINEARSEARCH);
 754     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
 755     subs(zr, cnt1, 256);
 756     lsr(tmp1, cnt2, 2);
 757     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
 758     br(GE, LINEARSTUB);
 759   }
 760 
// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
 767 //
 768 // These rules are essentially heuristics for how far we can shift the
 769 // pattern along the search string.
 770 //
 771 // The implementation here uses the 'Bad Character' rule only because of the
 772 // complexity of initialisation for the 'Good Suffix' rule.
 773 //
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
 779 //
 780 // #define ASIZE 256
 781 //
 782 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
 783 //       int i, j;
 784 //       unsigned c;
 785 //       unsigned char bc[ASIZE];
 786 //
 787 //       /* Preprocessing */
 788 //       for (i = 0; i < ASIZE; ++i)
 789 //          bc[i] = m;
 790 //       for (i = 0; i < m - 1; ) {
 791 //          c = x[i];
 792 //          ++i;
 793 //          // c < 256 for Latin1 string, so, no need for branch
 794 //          #ifdef PATTERN_STRING_IS_LATIN1
 795 //          bc[c] = m - i;
 796 //          #else
 797 //          if (c < ASIZE) bc[c] = m - i;
 798 //          #endif
 799 //       }
 800 //
 801 //       /* Searching */
 802 //       j = 0;
 803 //       while (j <= n - m) {
//          c = y[j+m-1];
//          i = m - 1;
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          // c < 256 for Latin1 string, so no need for a branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
 811 //          j += bc[y[j+m-1]];
 812 //          #endif
 813 //          #ifndef PATTERN_STRING_IS_UTF
 814 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
 815 //          if (c < ASIZE)
 816 //            j += bc[y[j+m-1]];
 817 //          else
 818 //            j += 1
 819 //          #endif
 820 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
 821 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
 822 //          if (c < ASIZE)
 823 //            j += bc[y[j+m-1]];
 824 //          else
 825 //            j += m
 826 //          #endif
 827 //       }
 828 //    }
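//
// As a worked example of the preprocessing above: for the pattern "needle"
// (m == 6) the table ends up with bc['n'] == 5, bc['e'] == 3, bc['d'] == 2,
// bc['l'] == 1 and bc[c] == 6 for every other character c, so on a mismatch
// the window can be shifted by bc[c] positions, where c is the source
// character aligned with the last pattern position.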
 829 
 830   if (icnt1 == -1) {
 831     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
 832         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
 833     Register cnt1end = tmp2;
 834     Register str2end = cnt2;
 835     Register skipch = tmp2;
 836 
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a register for
    // the UL case. We'll re-read the last character in the inner pre-loop code to keep the
    // outer pre-loop down to a single load.
 841     const int firstStep = isL ? 7 : 3;
 842 
 843     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
 845     sub(sp, sp, ASIZE);
 846     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
 847     mov(ch1, sp);
 848     BIND(BM_INIT_LOOP);
 849       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
 850       subs(tmp5, tmp5, 1);
 851       br(GT, BM_INIT_LOOP);
 852 
 853       sub(cnt1tmp, cnt1, 1);
 854       mov(tmp5, str2);
 855       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
 856       sub(ch2, cnt1, 1);
 857       mov(tmp3, str1);
 858     BIND(BCLOOP);
 859       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
 860       if (!str1_isL) {
 861         subs(zr, ch1, ASIZE);
 862         br(HS, BCSKIP);
 863       }
 864       strb(ch2, Address(sp, ch1));
 865     BIND(BCSKIP);
 866       subs(ch2, ch2, 1);
 867       br(GT, BCLOOP);
 868 
 869       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
 870       if (str1_isL == str2_isL) {
 871         // load last 8 bytes (8LL/4UU symbols)
 872         ldr(tmp6, Address(tmp6, -wordSize));
 873       } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes, but
        // it's still faster than per-character loads + checks
 877         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
 878         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
 879         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
 880         andr(tmp6, tmp6, 0xFF); // str1[N-4]
 881         orr(ch2, ch1, ch2, LSL, 16);
 882         orr(tmp6, tmp6, tmp3, LSL, 48);
 883         orr(tmp6, tmp6, ch2, LSL, 16);
 884       }
 885     BIND(BMLOOPSTR2);
 886       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 887       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
 888       if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop, but that would
        // hurt performance on in-order systems with 2 or more ld/st pipelines
 892         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
 893       }
 894       if (!isL) { // UU/UL case
 895         lsl(ch2, cnt1tmp, 1); // offset in bytes
 896       }
 897       cmp(tmp3, skipch);
 898       br(NE, BMSKIP);
 899       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
 900       mov(ch1, tmp6);
 901       if (isL) {
 902         b(BMLOOPSTR1_AFTER_LOAD);
 903       } else {
 904         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
 905         b(BMLOOPSTR1_CMP);
 906       }
 907     BIND(BMLOOPSTR1);
 908       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
 909       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
 910     BIND(BMLOOPSTR1_AFTER_LOAD);
 911       subs(cnt1tmp, cnt1tmp, 1);
 912       br(LT, BMLOOPSTR1_LASTCMP);
 913     BIND(BMLOOPSTR1_CMP);
 914       cmp(ch1, ch2);
 915       br(EQ, BMLOOPSTR1);
 916     BIND(BMSKIP);
 917       if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern, then we can
        // skip cnt1 symbols
 920         if (str1_isL != str2_isL) {
 921           mov(result_tmp, cnt1);
 922         } else {
 923           mov(result_tmp, 1);
 924         }
 925         subs(zr, skipch, ASIZE);
 926         br(HS, BMADV);
 927       }
 928       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
 929     BIND(BMADV);
 930       sub(cnt1tmp, cnt1, 1);
 931       add(str2, str2, result_tmp, LSL, str2_chr_shift);
 932       cmp(str2, str2end);
 933       br(LE, BMLOOPSTR2);
 934       add(sp, sp, ASIZE);
 935       b(NOMATCH);
 936     BIND(BMLOOPSTR1_LASTCMP);
 937       cmp(ch1, ch2);
 938       br(NE, BMSKIP);
 939     BIND(BMMATCH);
 940       sub(result, str2, tmp5);
 941       if (!str2_isL) lsr(result, result, 1);
 942       add(sp, sp, ASIZE);
 943       b(DONE);
 944 
 945     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
 947     br(LT, LINEAR_MEDIUM);
 948     mov(result, zr);
 949     RuntimeAddress stub = nullptr;
 950     if (isL) {
 951       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
 952       assert(stub.target() != nullptr, "string_indexof_linear_ll stub has not been generated");
 953     } else if (str1_isL) {
 954       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != nullptr, "string_indexof_linear_ul stub has not been generated");
 956     } else {
 957       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
 958       assert(stub.target() != nullptr, "string_indexof_linear_uu stub has not been generated");
 959     }
 960     address call = trampoline_call(stub);
 961     if (call == nullptr) {
 962       DEBUG_ONLY(reset_labels(LINEARSEARCH, LINEAR_MEDIUM, DONE, NOMATCH, MATCH));
 963       ciEnv::current()->record_failure("CodeCache is full");
 964       return;
 965     }
 966     b(DONE);
 967   }
 968 
 969   BIND(LINEARSEARCH);
 970   {
 971     Label DO1, DO2, DO3;
 972 
 973     Register str2tmp = tmp2;
 974     Register first = tmp3;
 975 
 976     if (icnt1 == -1)
 977     {
 978         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 979 
 980         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
 981         br(LT, DOSHORT);
 982       BIND(LINEAR_MEDIUM);
 983         (this->*str1_load_1chr)(first, Address(str1));
 984         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
 985         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
 986         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
 987         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
 988 
 989       BIND(FIRST_LOOP);
 990         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
 991         cmp(first, ch2);
 992         br(EQ, STR1_LOOP);
 993       BIND(STR2_NEXT);
 994         adds(cnt2_neg, cnt2_neg, str2_chr_size);
 995         br(LE, FIRST_LOOP);
 996         b(NOMATCH);
 997 
 998       BIND(STR1_LOOP);
 999         adds(cnt1tmp, cnt1_neg, str1_chr_size);
1000         add(cnt2tmp, cnt2_neg, str2_chr_size);
1001         br(GE, MATCH);
1002 
1003       BIND(STR1_NEXT);
1004         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
1005         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
1006         cmp(ch1, ch2);
1007         br(NE, STR2_NEXT);
1008         adds(cnt1tmp, cnt1tmp, str1_chr_size);
1009         add(cnt2tmp, cnt2tmp, str2_chr_size);
1010         br(LT, STR1_NEXT);
1011         b(MATCH);
1012 
1013       BIND(DOSHORT);
1014       if (str1_isL == str2_isL) {
1015         cmp(cnt1, (u1)2);
1016         br(LT, DO1);
1017         br(GT, DO3);
1018       }
1019     }
1020 
1021     if (icnt1 == 4) {
1022       Label CH1_LOOP;
1023 
1024         (this->*load_4chr)(ch1, str1);
1025         sub(result_tmp, cnt2, 4);
1026         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1027         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1028 
1029       BIND(CH1_LOOP);
1030         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
1031         cmp(ch1, ch2);
1032         br(EQ, MATCH);
1033         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1034         br(LE, CH1_LOOP);
1035         b(NOMATCH);
1036       }
1037 
1038     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
1039       Label CH1_LOOP;
1040 
1041       BIND(DO2);
1042         (this->*load_2chr)(ch1, str1);
1043         if (icnt1 == 2) {
1044           sub(result_tmp, cnt2, 2);
1045         }
1046         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1047         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1048       BIND(CH1_LOOP);
1049         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1050         cmp(ch1, ch2);
1051         br(EQ, MATCH);
1052         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1053         br(LE, CH1_LOOP);
1054         b(NOMATCH);
1055     }
1056 
1057     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
1058       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
1059 
1060       BIND(DO3);
1061         (this->*load_2chr)(first, str1);
1062         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
1063         if (icnt1 == 3) {
1064           sub(result_tmp, cnt2, 3);
1065         }
1066         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1067         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1068       BIND(FIRST_LOOP);
1069         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
1070         cmpw(first, ch2);
1071         br(EQ, STR1_LOOP);
1072       BIND(STR2_NEXT);
1073         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1074         br(LE, FIRST_LOOP);
1075         b(NOMATCH);
1076 
1077       BIND(STR1_LOOP);
1078         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
1079         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
1080         cmp(ch1, ch2);
1081         br(NE, STR2_NEXT);
1082         b(MATCH);
1083     }
1084 
1085     if (icnt1 == -1 || icnt1 == 1) {
1086       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
1087 
1088       BIND(DO1);
1089         (this->*str1_load_1chr)(ch1, str1);
1090         cmp(cnt2, (u1)8);
1091         br(LT, DO1_SHORT);
1092 
1093         sub(result_tmp, cnt2, 8/str2_chr_size);
1094         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
1095         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
1096         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
1097 
1098         if (str2_isL) {
1099           orr(ch1, ch1, ch1, LSL, 8);
1100         }
1101         orr(ch1, ch1, ch1, LSL, 16);
1102         orr(ch1, ch1, ch1, LSL, 32);
1103       BIND(CH1_LOOP);
1104         ldr(ch2, Address(str2, cnt2_neg));
1105         eor(ch2, ch1, ch2);
1106         sub(tmp1, ch2, tmp3);
1107         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
1108         bics(tmp1, tmp1, tmp2);
1109         br(NE, HAS_ZERO);
1110         adds(cnt2_neg, cnt2_neg, 8);
1111         br(LT, CH1_LOOP);
1112 
1113         cmp(cnt2_neg, (u1)8);
1114         mov(cnt2_neg, 0);
1115         br(LT, CH1_LOOP);
1116         b(NOMATCH);
1117 
1118       BIND(HAS_ZERO);
1119         rev(tmp1, tmp1);
1120         clz(tmp1, tmp1);
1121         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
1122         b(MATCH);
1123 
1124       BIND(DO1_SHORT);
1125         mov(result_tmp, cnt2);
1126         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
1127         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
1128       BIND(DO1_LOOP);
1129         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
1130         cmpw(ch1, ch2);
1131         br(EQ, MATCH);
1132         adds(cnt2_neg, cnt2_neg, str2_chr_size);
1133         br(LT, DO1_LOOP);
1134     }
1135   }
1136   BIND(NOMATCH);
1137     mov(result, -1);
1138     b(DONE);
1139   BIND(MATCH);
1140     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
1141   BIND(DONE);
1142 }
1143 
1144 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
1145 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
1146 
1147 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1,
1148                                             Register ch, Register result,
1149                                             Register tmp1, Register tmp2, Register tmp3)
1150 {
1151   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1152   Register cnt1_neg = cnt1;
1153   Register ch1 = rscratch1;
1154   Register result_tmp = rscratch2;
1155 
1156   cbz(cnt1, NOMATCH);
1157 
1158   cmp(cnt1, (u1)4);
1159   br(LT, DO1_SHORT);
1160 
1161   orr(ch, ch, ch, LSL, 16);
1162   orr(ch, ch, ch, LSL, 32);
1163 
1164   sub(cnt1, cnt1, 4);
1165   mov(result_tmp, cnt1);
1166   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1167   sub(cnt1_neg, zr, cnt1, LSL, 1);
1168 
1169   mov(tmp3, 0x0001000100010001);
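  // The loop below relies on the usual SWAR zero-detection trick: after the eor, a zero
  // 16-bit lane in ch1 marks a match, and
  //   (ch1 - 0x0001000100010001) & ~ch1 & 0x8000800080008000
  // (computed via the orr with 0x7fff7fff7fff7fff and the bics) has a lane's high bit set
  // for the first matching lane, which rev + clz then turn into a byte offset.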
1170 
1171   BIND(CH1_LOOP);
1172     ldr(ch1, Address(str1, cnt1_neg));
1173     eor(ch1, ch, ch1);
1174     sub(tmp1, ch1, tmp3);
1175     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
1176     bics(tmp1, tmp1, tmp2);
1177     br(NE, HAS_ZERO);
1178     adds(cnt1_neg, cnt1_neg, 8);
1179     br(LT, CH1_LOOP);
1180 
1181     cmp(cnt1_neg, (u1)8);
1182     mov(cnt1_neg, 0);
1183     br(LT, CH1_LOOP);
1184     b(NOMATCH);
1185 
1186   BIND(HAS_ZERO);
1187     rev(tmp1, tmp1);
1188     clz(tmp1, tmp1);
1189     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1190     b(MATCH);
1191 
1192   BIND(DO1_SHORT);
1193     mov(result_tmp, cnt1);
1194     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
1195     sub(cnt1_neg, zr, cnt1, LSL, 1);
1196   BIND(DO1_LOOP);
1197     ldrh(ch1, Address(str1, cnt1_neg));
1198     cmpw(ch, ch1);
1199     br(EQ, MATCH);
1200     adds(cnt1_neg, cnt1_neg, 2);
1201     br(LT, DO1_LOOP);
1202   BIND(NOMATCH);
1203     mov(result, -1);
1204     b(DONE);
1205   BIND(MATCH);
1206     add(result, result_tmp, cnt1_neg, ASR, 1);
1207   BIND(DONE);
1208 }
1209 
1210 void C2_MacroAssembler::string_indexof_char_sve(Register str1, Register cnt1,
1211                                                 Register ch, Register result,
1212                                                 FloatRegister ztmp1,
1213                                                 FloatRegister ztmp2,
1214                                                 PRegister tmp_pg,
1215                                                 PRegister tmp_pdn, bool isL)
1216 {
1217   // Note that `tmp_pdn` should *NOT* be used as governing predicate register.
1218   assert(tmp_pg->is_governing(),
1219          "this register has to be a governing predicate register");
1220 
1221   Label LOOP, MATCH, DONE, NOMATCH;
1222   Register vec_len = rscratch1;
1223   Register idx = rscratch2;
1224 
  SIMD_RegVariant T = isL ? B : H;
1226 
1227   cbz(cnt1, NOMATCH);
1228 
1229   // Assign the particular char throughout the vector.
1230   sve_dup(ztmp2, T, ch);
1231   if (isL) {
1232     sve_cntb(vec_len);
1233   } else {
1234     sve_cnth(vec_len);
1235   }
1236   mov(idx, 0);
1237 
1238   // Generate a predicate to control the reading of input string.
1239   sve_whilelt(tmp_pg, T, idx, cnt1);
1240 
1241   BIND(LOOP);
1242     // Read a vector of 8- or 16-bit data depending on the string type. Note
1243     // that inactive elements indicated by the predicate register won't cause
1244     // a data read from memory to the destination vector.
1245     if (isL) {
1246       sve_ld1b(ztmp1, T, tmp_pg, Address(str1, idx));
1247     } else {
1248       sve_ld1h(ztmp1, T, tmp_pg, Address(str1, idx, Address::lsl(1)));
1249     }
1250     add(idx, idx, vec_len);
1251 
1252     // Perform the comparison. An element of the destination predicate is set
1253     // to active if the particular char is matched.
1254     sve_cmp(Assembler::EQ, tmp_pdn, T, tmp_pg, ztmp1, ztmp2);
1255 
1256     // Branch if the particular char is found.
1257     br(NE, MATCH);
1258 
1259     sve_whilelt(tmp_pg, T, idx, cnt1);
1260 
    // Loop back if the particular char is not found.
1262     br(MI, LOOP);
1263 
1264   BIND(NOMATCH);
1265     mov(result, -1);
1266     b(DONE);
1267 
1268   BIND(MATCH);
1269     // Undo the index increment.
1270     sub(idx, idx, vec_len);
1271 
1272     // Crop the vector to find its location.
1273     sve_brka(tmp_pdn, tmp_pg, tmp_pdn, false /* isMerge */);
1274     add(result, idx, -1);
1275     sve_incp(result, T, tmp_pdn);
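    // For example, with 16-byte vectors and a Latin1 string: if the first matching
    // element is lane 3 of the second iteration, idx is 32 at MATCH, the sub above
    // rewinds it to 16, brka leaves lanes 0..3 active, and sve_incp adds 4, giving
    // result == 16 - 1 + 4 == 19, the index of the match.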
1276   BIND(DONE);
1277 }
1278 
1279 void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1,
1280                                             Register ch, Register result,
1281                                             Register tmp1, Register tmp2, Register tmp3)
1282 {
1283   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
1284   Register cnt1_neg = cnt1;
1285   Register ch1 = rscratch1;
1286   Register result_tmp = rscratch2;
1287 
1288   cbz(cnt1, NOMATCH);
1289 
1290   cmp(cnt1, (u1)8);
1291   br(LT, DO1_SHORT);
1292 
1293   orr(ch, ch, ch, LSL, 8);
1294   orr(ch, ch, ch, LSL, 16);
1295   orr(ch, ch, ch, LSL, 32);
1296 
1297   sub(cnt1, cnt1, 8);
1298   mov(result_tmp, cnt1);
1299   lea(str1, Address(str1, cnt1));
1300   sub(cnt1_neg, zr, cnt1);
1301 
1302   mov(tmp3, 0x0101010101010101);
1303 
1304   BIND(CH1_LOOP);
1305     ldr(ch1, Address(str1, cnt1_neg));
1306     eor(ch1, ch, ch1);
1307     sub(tmp1, ch1, tmp3);
1308     orr(tmp2, ch1, 0x7f7f7f7f7f7f7f7f);
1309     bics(tmp1, tmp1, tmp2);
1310     br(NE, HAS_ZERO);
1311     adds(cnt1_neg, cnt1_neg, 8);
1312     br(LT, CH1_LOOP);
1313 
1314     cmp(cnt1_neg, (u1)8);
1315     mov(cnt1_neg, 0);
1316     br(LT, CH1_LOOP);
1317     b(NOMATCH);
1318 
1319   BIND(HAS_ZERO);
1320     rev(tmp1, tmp1);
1321     clz(tmp1, tmp1);
1322     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
1323     b(MATCH);
1324 
1325   BIND(DO1_SHORT);
1326     mov(result_tmp, cnt1);
1327     lea(str1, Address(str1, cnt1));
1328     sub(cnt1_neg, zr, cnt1);
1329   BIND(DO1_LOOP);
1330     ldrb(ch1, Address(str1, cnt1_neg));
1331     cmp(ch, ch1);
1332     br(EQ, MATCH);
1333     adds(cnt1_neg, cnt1_neg, 1);
1334     br(LT, DO1_LOOP);
1335   BIND(NOMATCH);
1336     mov(result, -1);
1337     b(DONE);
1338   BIND(MATCH);
1339     add(result, result_tmp, cnt1_neg);
1340   BIND(DONE);
1341 }
1342 
1343 // Compare strings.
1344 void C2_MacroAssembler::string_compare(Register str1, Register str2,
1345     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
1346     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
1347     PRegister pgtmp1, PRegister pgtmp2, int ae) {
1348   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
1349       DIFF, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
1350       SHORT_LOOP_START, TAIL_CHECK;
1351 
1352   bool isLL = ae == StrIntrinsicNode::LL;
1353   bool isLU = ae == StrIntrinsicNode::LU;
1354   bool isUL = ae == StrIntrinsicNode::UL;
1355 
1356   // The stub threshold for LL strings is: 72 (64 + 8) chars
1357   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
1358   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
1359   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
1360 
1361   bool str1_isL = isLL || isLU;
1362   bool str2_isL = isLL || isUL;
1363 
1364   int str1_chr_shift = str1_isL ? 0 : 1;
1365   int str2_chr_shift = str2_isL ? 0 : 1;
1366   int str1_chr_size = str1_isL ? 1 : 2;
1367   int str2_chr_size = str2_isL ? 1 : 2;
1368   int minCharsInWord = isLL ? wordSize : wordSize/2;
1369 
1370   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
1371   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
1372                                       (chr_insn)&MacroAssembler::ldrh;
1373   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
1374                                       (chr_insn)&MacroAssembler::ldrh;
1375   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
1376                             (uxt_insn)&MacroAssembler::uxthw;
1377 
1378   BLOCK_COMMENT("string_compare {");
1379 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
1382   if (!str1_isL) asrw(cnt1, cnt1, 1);
1383   if (!str2_isL) asrw(cnt2, cnt2, 1);
1384 
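  // The result follows String.compareTo semantics. A sketch of what is computed
  // (not the emitted code):
  //
  //   for (i = 0; i < min(cnt1, cnt2); i++)
  //     if (str1[i] != str2[i]) return str1[i] - str2[i];
  //   return cnt1 - cnt2;
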
1385   // Compute the minimum of the string lengths and save the difference.
1386   subsw(result, cnt1, cnt2);
1387   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
1388 
1389   // A very short string
1390   cmpw(cnt2, minCharsInWord);
1391   br(Assembler::LE, SHORT_STRING);
1392 
1393   // Compare longwords
1394   // load first parts of strings and finish initialization while loading
1395   {
1396     if (str1_isL == str2_isL) { // LL or UU
1397       ldr(tmp1, Address(str1));
1398       cmp(str1, str2);
1399       br(Assembler::EQ, DONE);
1400       ldr(tmp2, Address(str2));
1401       cmp(cnt2, stub_threshold);
1402       br(GE, STUB);
1403       subsw(cnt2, cnt2, minCharsInWord);
1404       br(EQ, TAIL_CHECK);
1405       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1406       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1407       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1408     } else if (isLU) {
1409       ldrs(vtmp, Address(str1));
1410       ldr(tmp2, Address(str2));
1411       cmp(cnt2, stub_threshold);
1412       br(GE, STUB);
1413       subw(cnt2, cnt2, 4);
1414       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1415       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1416       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1417       zip1(vtmp, T8B, vtmp, vtmpZ);
1418       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1419       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1420       add(cnt1, cnt1, 4);
1421       fmovd(tmp1, vtmp);
1422     } else { // UL case
1423       ldr(tmp1, Address(str1));
1424       ldrs(vtmp, Address(str2));
1425       cmp(cnt2, stub_threshold);
1426       br(GE, STUB);
1427       subw(cnt2, cnt2, 4);
1428       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
1429       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
1430       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
1431       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
1432       zip1(vtmp, T8B, vtmp, vtmpZ);
1433       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
1434       add(cnt1, cnt1, 8);
1435       fmovd(tmp2, vtmp);
1436     }
1437     adds(cnt2, cnt2, isUL ? 4 : 8);
1438     br(GE, TAIL);
1439     eor(rscratch2, tmp1, tmp2);
1440     cbnz(rscratch2, DIFF);
1441     // main loop
1442     bind(NEXT_WORD);
1443     if (str1_isL == str2_isL) {
1444       ldr(tmp1, Address(str1, cnt2));
1445       ldr(tmp2, Address(str2, cnt2));
1446       adds(cnt2, cnt2, 8);
1447     } else if (isLU) {
1448       ldrs(vtmp, Address(str1, cnt1));
1449       ldr(tmp2, Address(str2, cnt2));
1450       add(cnt1, cnt1, 4);
1451       zip1(vtmp, T8B, vtmp, vtmpZ);
1452       fmovd(tmp1, vtmp);
1453       adds(cnt2, cnt2, 8);
1454     } else { // UL
1455       ldrs(vtmp, Address(str2, cnt2));
1456       ldr(tmp1, Address(str1, cnt1));
1457       zip1(vtmp, T8B, vtmp, vtmpZ);
1458       add(cnt1, cnt1, 8);
1459       fmovd(tmp2, vtmp);
1460       adds(cnt2, cnt2, 4);
1461     }
1462     br(GE, TAIL);
1463 
1464     eor(rscratch2, tmp1, tmp2);
1465     cbz(rscratch2, NEXT_WORD);
1466     b(DIFF);
1467     bind(TAIL);
1468     eor(rscratch2, tmp1, tmp2);
1469     cbnz(rscratch2, DIFF);
1470     // Last longword.  In the case where length == 4 we compare the
1471     // same longword twice, but that's still faster than another
1472     // conditional branch.
1473     if (str1_isL == str2_isL) {
1474       ldr(tmp1, Address(str1));
1475       ldr(tmp2, Address(str2));
1476     } else if (isLU) {
1477       ldrs(vtmp, Address(str1));
1478       ldr(tmp2, Address(str2));
1479       zip1(vtmp, T8B, vtmp, vtmpZ);
1480       fmovd(tmp1, vtmp);
1481     } else { // UL
1482       ldrs(vtmp, Address(str2));
1483       ldr(tmp1, Address(str1));
1484       zip1(vtmp, T8B, vtmp, vtmpZ);
1485       fmovd(tmp2, vtmp);
1486     }
1487     bind(TAIL_CHECK);
1488     eor(rscratch2, tmp1, tmp2);
1489     cbz(rscratch2, DONE);
1490 
1491     // Find the first different characters in the longwords and
1492     // compute their difference.
1493     bind(DIFF);
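         // rscratch2 = tmp1 ^ tmp2, so its lowest non-zero character marks the first
         // difference. Byte-reverse it so that character becomes the most significant,
         // then CLZ gives the character's bit offset from the least significant end of
         // the original words. Round the offset down to a character boundary (8 or 16
         // bits), shift both words right by it and zero-extend the two characters
         // before subtracting them.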
1494     rev(rscratch2, rscratch2);
1495     clz(rscratch2, rscratch2);
1496     andr(rscratch2, rscratch2, isLL ? -8 : -16);
1497     lsrv(tmp1, tmp1, rscratch2);
1498     (this->*ext_chr)(tmp1, tmp1);
1499     lsrv(tmp2, tmp2, rscratch2);
1500     (this->*ext_chr)(tmp2, tmp2);
1501     subw(result, tmp1, tmp2);
1502     b(DONE);
1503   }
1504 
1505   bind(STUB);
1506     RuntimeAddress stub = nullptr;
1507     switch(ae) {
1508       case StrIntrinsicNode::LL:
1509         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
1510         break;
1511       case StrIntrinsicNode::UU:
1512         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
1513         break;
1514       case StrIntrinsicNode::LU:
1515         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
1516         break;
1517       case StrIntrinsicNode::UL:
1518         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
1519         break;
1520       default:
1521         ShouldNotReachHere();
1522     }
1523     assert(stub.target() != nullptr, "compare_long_string stub has not been generated");
1524     address call = trampoline_call(stub);
1525     if (call == nullptr) {
1526       DEBUG_ONLY(reset_labels(DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT, SHORT_LOOP_START));
1527       ciEnv::current()->record_failure("CodeCache is full");
1528       return;
1529     }
1530     b(DONE);
1531 
1532   bind(SHORT_STRING);
1533   // Is the minimum length zero?
1534   cbz(cnt2, DONE);
1535   // Arrange the code so that most branches are taken while characters are being
1536   // loaded, and the next characters are loaded while the previous ones are compared.
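       // The characters live in two register pairs, (tmp1, cnt1) and (tmp2, rscratch1),
       // which the loop alternates between: one pair is compared while the other is
       // being loaded.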
1537   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1538   subs(cnt2, cnt2, 1);
1539   br(EQ, SHORT_LAST_INIT);
1540   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1541   b(SHORT_LOOP_START);
1542   bind(SHORT_LOOP);
1543   subs(cnt2, cnt2, 1);
1544   br(EQ, SHORT_LAST);
1545   bind(SHORT_LOOP_START);
1546   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
1547   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
1548   cmp(tmp1, cnt1);
1549   br(NE, SHORT_LOOP_TAIL);
1550   subs(cnt2, cnt2, 1);
1551   br(EQ, SHORT_LAST2);
1552   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
1553   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1554   cmp(tmp2, rscratch1);
1555   br(EQ, SHORT_LOOP);
1556   sub(result, tmp2, rscratch1);
1557   b(DONE);
1558   bind(SHORT_LOOP_TAIL);
1559   sub(result, tmp1, cnt1);
1560   b(DONE);
1561   bind(SHORT_LAST2);
1562   cmp(tmp2, rscratch1);
1563   br(EQ, DONE);
1564   sub(result, tmp2, rscratch1);
1565 
1566   b(DONE);
1567   bind(SHORT_LAST_INIT);
1568   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
1569   bind(SHORT_LAST);
1570   cmp(tmp1, cnt1);
1571   br(EQ, DONE);
1572   sub(result, tmp1, cnt1);
1573 
1574   bind(DONE);
1575 
1576   BLOCK_COMMENT("} string_compare");
1577 }
1578 
1579 void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
1580                                      FloatRegister src2, Condition cond, bool isQ) {
1581   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1582   FloatRegister zn = src1, zm = src2;
1583   bool needs_negation = false;
1584   switch (cond) {
1585     case LT: cond = GT; zn = src2; zm = src1; break;
1586     case LE: cond = GE; zn = src2; zm = src1; break;
1587     case LO: cond = HI; zn = src2; zm = src1; break;
1588     case LS: cond = HS; zn = src2; zm = src1; break;
1589     case NE: cond = EQ; needs_negation = true; break;
1590     default:
1591       break;
1592   }
1593 
1594   if (is_floating_point_type(bt)) {
1595     fcm(cond, dst, size, zn, zm);
1596   } else {
1597     cm(cond, dst, size, zn, zm);
1598   }
1599 
1600   if (needs_negation) {
1601     notr(dst, isQ ? T16B : T8B, dst);
1602   }
1603 }
1604 
1605 void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
1606                                           Condition cond, bool isQ) {
1607   SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
1608   if (bt == T_FLOAT || bt == T_DOUBLE) {
1609     if (cond == Assembler::NE) {
1610       fcm(Assembler::EQ, dst, size, src);
1611       notr(dst, isQ ? T16B : T8B, dst);
1612     } else {
1613       fcm(cond, dst, size, src);
1614     }
1615   } else {
1616     if (cond == Assembler::NE) {
1617       cm(Assembler::EQ, dst, size, src);
1618       notr(dst, isQ ? T16B : T8B, dst);
1619     } else {
1620       cm(cond, dst, size, src);
1621     }
1622   }
1623 }
1624 
1625 // Compress the least significant bit of each byte to the rightmost and clear
1626 // the higher garbage bits.
1627 void C2_MacroAssembler::bytemask_compress(Register dst) {
1628   // Example input, dst = 0x01 00 00 00 01 01 00 01
1629   // The "??" bytes are garbage.
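       // Each OR with a shifted copy doubles the number of adjacent mask bits gathered
       // together: after the first step every second byte holds 2 bits, after the
       // second every fourth byte holds 4 bits, and after the third the low byte holds
       // all 8.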
1630   orr(dst, dst, dst, Assembler::LSR, 7);  // dst = 0x?? 02 ?? 00 ?? 03 ?? 01
1631   orr(dst, dst, dst, Assembler::LSR, 14); // dst = 0x????????08 ??????0D
1632   orr(dst, dst, dst, Assembler::LSR, 28); // dst = 0x????????????????8D
1633   andr(dst, dst, 0xff);                   // dst = 0x8D
1634 }
1635 
1636 // Pack the lowest-numbered bit of each mask element in src into a long value
1637 // in dst, at most the first 64 lane elements.
1638 // Clobbers: rscratch1, if UseSVE=1 or the hardware doesn't support FEAT_BITPERM.
1639 void C2_MacroAssembler::sve_vmask_tolong(Register dst, PRegister src, BasicType bt, int lane_cnt,
1640                                          FloatRegister vtmp1, FloatRegister vtmp2) {
1641   assert(lane_cnt <= 64 && is_power_of_2(lane_cnt), "Unsupported lane count");
1642   assert_different_registers(dst, rscratch1);
1643   assert_different_registers(vtmp1, vtmp2);
1644 
1645   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1646   // Example:   src = 0b01100101 10001101, bt = T_BYTE, lane_cnt = 16
1647   // Expected:  dst = 0x658D
1648 
1649   // Convert the mask into a byte vector: 0x01 for active lanes, 0x00 otherwise.
1650   // vtmp1 = 0x00010100 0x00010001 0x01000000 0x01010001
1651   sve_cpy(vtmp1, size, src, 1, false);
1652   if (bt != T_BYTE) {
1653     sve_vector_narrow(vtmp1, B, vtmp1, size, vtmp2);
1654   }
1655 
1656   if (UseSVE > 1 && VM_Version::supports_svebitperm()) {
1657     // Given a vector with the value 0x00 or 0x01 in each byte, the basic idea
1658     // is to compress each significant bit of the byte in a cross-lane way. Due
1659     // to the lack of a cross-lane bit-compress instruction, we use BEXT
1660     // (bit-compress in each lane) with the biggest lane size (T = D) then
1661     // concatenate the results.
1662 
1663     // The second source input of BEXT, initialized with 0x01 in each byte.
1664     // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1665     sve_dup(vtmp2, B, 1);
1666 
1667     // BEXT vtmp1.D, vtmp1.D, vtmp2.D
1668     // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1669     // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1670     //         ---------------------------------------
1671     // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1672     sve_bext(vtmp1, D, vtmp1, vtmp2);
1673 
1674     // Concatenate the least significant 8 bits of each doubleword, and extract the
1675     // result to dst.
1676     // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1677     // dst   = 0x658D
1678     if (lane_cnt <= 8) {
1679       // No need to concatenate.
1680       umov(dst, vtmp1, B, 0);
1681     } else if (lane_cnt <= 16) {
1682       ins(vtmp1, B, vtmp1, 1, 8);
1683       umov(dst, vtmp1, H, 0);
1684     } else {
1685       // As the lane count is 64 at most, the final expected value must be in
1686       // the lowest 64 bits after narrowing vtmp1 from D to B.
1687       sve_vector_narrow(vtmp1, B, vtmp1, D, vtmp2);
1688       umov(dst, vtmp1, D, 0);
1689     }
1690   } else if (UseSVE > 0) {
1691     // Compress the lowest 8 bytes.
1692     fmovd(dst, vtmp1);
1693     bytemask_compress(dst);
1694     if (lane_cnt <= 8) return;
1695 
1696     // Repeat on higher bytes and join the results.
1697     // Compress 8 bytes in each iteration.
1698     for (int idx = 1; idx < (lane_cnt / 8); idx++) {
1699       sve_extract_integral(rscratch1, T_LONG, vtmp1, idx, vtmp2);
1700       bytemask_compress(rscratch1);
1701       orr(dst, dst, rscratch1, Assembler::LSL, idx << 3);
1702     }
1703   } else {
1704     assert(false, "unsupported");
1705     ShouldNotReachHere();
1706   }
1707 }
1708 
1709 // Unpack the mask, a long value in src, into predicate register dst based on the
1710 // corresponding data type. Note that dst can support at most 64 lanes.
1711 // Below example gives the expected dst predicate register in different types, with
1712 // a valid src(0x658D) on a 1024-bit vector size machine.
1713 // BYTE:  dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 00 00 65 8D
1714 // SHORT: dst = 0x00 00 00 00 00 00 00 00 00 00 00 00 14 11 40 51
1715 // INT:   dst = 0x00 00 00 00 00 00 00 00 01 10 01 01 10 00 11 01
1716 // LONG:  dst = 0x00 01 01 00 00 01 00 01 01 00 00 00 01 01 00 01
1717 //
1718 // The number of significant bits of src must not exceed lane_cnt. E.g., 0xFF658D, which
1719 // has 24 significant bits, would be an invalid input if the dst predicate register refers
1720 // to a LONG type 1024-bit vector, which has at most 16 lanes.
1721 void C2_MacroAssembler::sve_vmask_fromlong(PRegister dst, Register src, BasicType bt, int lane_cnt,
1722                                            FloatRegister vtmp1, FloatRegister vtmp2) {
1723   assert(UseSVE == 2 && VM_Version::supports_svebitperm() &&
1724          lane_cnt <= 64 && is_power_of_2(lane_cnt), "unsupported");
1725   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
1726   // Example:   src = 0x658D, bt = T_BYTE, size = B, lane_cnt = 16
1727   // Expected:  dst = 0b01100101 10001101
1728 
1729   // Put long value from general purpose register into the first lane of vector.
1730   // vtmp1 = 0x0000000000000000 | 0x000000000000658D
1731   sve_dup(vtmp1, B, 0);
1732   mov(vtmp1, D, 0, src);
1733 
1734   // As sve_cmp generates the mask with a minimum granularity of one byte, we need
1735   // to transform the bit mask now held in the first lane into a byte mask, which
1736   // can be done with SVE2's BDEP instruction.
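       // BDEP scatters the low-order bits of each doubleword of the first source into
       // the bit positions selected by the set bits of the corresponding doubleword of
       // the second source. With 0x01 in every byte of the second source, each mask
       // bit therefore lands in bit 0 of its own byte.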
1737 
1738   // The first source input of the BDEP instruction: place one mask byte in every 8 bytes.
1739   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1740   if (lane_cnt <= 8) {
1741     // Nothing to do, as only one byte exists.
1742   } else if (lane_cnt <= 16) {
1743     ins(vtmp1, B, vtmp1, 8, 1);
1744     mov(vtmp1, B, 1, zr);
1745   } else {
1746     sve_vector_extend(vtmp1, D, vtmp1, B);
1747   }
1748 
1749   // The second source input of BDEP instruction, initialized with 0x01 for each byte.
1750   // vtmp2 = 0x01010101 0x01010101 0x01010101 0x01010101
1751   sve_dup(vtmp2, B, 1);
1752 
1753   // BDEP vtmp1.D, vtmp1.D, vtmp2.D
1754   // vtmp1 = 0x0000000000000065 | 0x000000000000008D
1755   // vtmp2 = 0x0101010101010101 | 0x0101010101010101
1756   //         ---------------------------------------
1757   // vtmp1 = 0x0001010000010001 | 0x0100000001010001
1758   sve_bdep(vtmp1, D, vtmp1, vtmp2);
1759 
1760   if (bt != T_BYTE) {
1761     sve_vector_extend(vtmp1, size, vtmp1, B);
1762   }
1763   // Generate mask according to the given vector, in which the elements have been
1764   // extended to expected type.
1765   // dst = 0b01100101 10001101
1766   sve_cmp(Assembler::NE, dst, size, ptrue, vtmp1, 0);
1767 }
1768 
1769 // Clobbers: rflags
1770 void C2_MacroAssembler::sve_compare(PRegister pd, BasicType bt, PRegister pg,
1771                                     FloatRegister zn, FloatRegister zm, Condition cond) {
1772   assert(pg->is_governing(), "This register has to be a governing predicate register");
1773   FloatRegister z1 = zn, z2 = zm;
1774   switch (cond) {
1775     case LE: z1 = zm; z2 = zn; cond = GE; break;
1776     case LT: z1 = zm; z2 = zn; cond = GT; break;
1777     case LO: z1 = zm; z2 = zn; cond = HI; break;
1778     case LS: z1 = zm; z2 = zn; cond = HS; break;
1779     default:
1780       break;
1781   }
1782 
1783   SIMD_RegVariant size = elemType_to_regVariant(bt);
1784   if (is_floating_point_type(bt)) {
1785     sve_fcm(cond, pd, size, pg, z1, z2);
1786   } else {
1787     assert(is_integral_type(bt), "unsupported element type");
1788     sve_cmp(cond, pd, size, pg, z1, z2);
1789   }
1790 }
1791 
1792 // Get index of the last mask lane that is set
1793 void C2_MacroAssembler::sve_vmask_lasttrue(Register dst, BasicType bt, PRegister src, PRegister ptmp) {
1794   SIMD_RegVariant size = elemType_to_regVariant(bt);
1795   sve_rev(ptmp, size, src);
1796   sve_brkb(ptmp, ptrue, ptmp, false);
1797   sve_cntp(dst, size, ptrue, ptmp);
1798   movw(rscratch1, MaxVectorSize / type2aelembytes(bt) - 1);
1799   subw(dst, rscratch1, dst);
1800 }
1801 
1802 // Extend integer vector src to dst with the same lane count
1803 // but larger element size, e.g. 4B -> 4I
1804 void C2_MacroAssembler::neon_vector_extend(FloatRegister dst, BasicType dst_bt, unsigned dst_vlen_in_bytes,
1805                                            FloatRegister src, BasicType src_bt, bool is_unsigned) {
1806   if (src_bt == T_BYTE) {
1807     if (dst_bt == T_SHORT) {
1808       // 4B/8B to 4S/8S
1809       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1810     } else {
1811       // 4B to 4I
1812       assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1813       _xshll(is_unsigned, dst, T8H, src, T8B, 0);
1814       _xshll(is_unsigned, dst, T4S, dst, T4H, 0);
1815     }
1816   } else if (src_bt == T_SHORT) {
1817     // 4S to 4I
1818     assert(dst_vlen_in_bytes == 16 && dst_bt == T_INT, "unsupported");
1819     _xshll(is_unsigned, dst, T4S, src, T4H, 0);
1820   } else if (src_bt == T_INT) {
1821     // 2I to 2L
1822     assert(dst_vlen_in_bytes == 16 && dst_bt == T_LONG, "unsupported");
1823     _xshll(is_unsigned, dst, T2D, src, T2S, 0);
1824   } else {
1825     ShouldNotReachHere();
1826   }
1827 }
1828 
1829 // Narrow integer vector src down to dst with the same lane count
1830 // but smaller element size, e.g. 4I -> 4B
1831 void C2_MacroAssembler::neon_vector_narrow(FloatRegister dst, BasicType dst_bt,
1832                                            FloatRegister src, BasicType src_bt, unsigned src_vlen_in_bytes) {
1833   if (src_bt == T_SHORT) {
1834     // 4S/8S to 4B/8B
1835     assert(src_vlen_in_bytes == 8 || src_vlen_in_bytes == 16, "unsupported");
1836     assert(dst_bt == T_BYTE, "unsupported");
1837     xtn(dst, T8B, src, T8H);
1838   } else if (src_bt == T_INT) {
1839     // 4I to 4B/4S
1840     assert(src_vlen_in_bytes == 16, "unsupported");
1841     assert(dst_bt == T_BYTE || dst_bt == T_SHORT, "unsupported");
1842     xtn(dst, T4H, src, T4S);
1843     if (dst_bt == T_BYTE) {
1844       xtn(dst, T8B, dst, T8H);
1845     }
1846   } else if (src_bt == T_LONG) {
1847     // 2L to 2I
1848     assert(src_vlen_in_bytes == 16, "unsupported");
1849     assert(dst_bt == T_INT, "unsupported");
1850     xtn(dst, T2S, src, T2D);
1851   } else {
1852     ShouldNotReachHere();
1853   }
1854 }
1855 
1856 void C2_MacroAssembler::sve_vector_extend(FloatRegister dst, SIMD_RegVariant dst_size,
1857                                           FloatRegister src, SIMD_RegVariant src_size,
1858                                           bool is_unsigned) {
1859   assert(dst_size > src_size && dst_size <= D && src_size <= S, "invalid element size");
1860 
1861   if (src_size == B) {
1862     switch (dst_size) {
1863     case H:
1864       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1865       break;
1866     case S:
1867       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1868       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1869       break;
1870     case D:
1871       _sve_xunpk(is_unsigned, /* is_high */ false, dst, H, src);
1872       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, dst);
1873       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1874       break;
1875     default:
1876       ShouldNotReachHere();
1877     }
1878   } else if (src_size == H) {
1879     if (dst_size == S) {
1880       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1881     } else { // D
1882       _sve_xunpk(is_unsigned, /* is_high */ false, dst, S, src);
1883       _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, dst);
1884     }
1885   } else if (src_size == S) {
1886     _sve_xunpk(is_unsigned, /* is_high */ false, dst, D, src);
1887   }
1888 }
1889 
1890 // Vector narrow from src to dst with specified element sizes.
1891 // High part of dst vector will be filled with zero.
1892 void C2_MacroAssembler::sve_vector_narrow(FloatRegister dst, SIMD_RegVariant dst_size,
1893                                           FloatRegister src, SIMD_RegVariant src_size,
1894                                           FloatRegister tmp) {
1895   assert(dst_size < src_size && dst_size <= S && src_size <= D, "invalid element size");
1896   assert_different_registers(src, tmp);
1897   sve_dup(tmp, src_size, 0);
1898   if (src_size == D) {
1899     switch (dst_size) {
1900     case S:
1901       sve_uzp1(dst, S, src, tmp);
1902       break;
1903     case H:
1904       assert_different_registers(dst, tmp);
1905       sve_uzp1(dst, S, src, tmp);
1906       sve_uzp1(dst, H, dst, tmp);
1907       break;
1908     case B:
1909       assert_different_registers(dst, tmp);
1910       sve_uzp1(dst, S, src, tmp);
1911       sve_uzp1(dst, H, dst, tmp);
1912       sve_uzp1(dst, B, dst, tmp);
1913       break;
1914     default:
1915       ShouldNotReachHere();
1916     }
1917   } else if (src_size == S) {
1918     if (dst_size == H) {
1919       sve_uzp1(dst, H, src, tmp);
1920     } else { // B
1921       assert_different_registers(dst, tmp);
1922       sve_uzp1(dst, H, src, tmp);
1923       sve_uzp1(dst, B, dst, tmp);
1924     }
1925   } else if (src_size == H) {
1926     sve_uzp1(dst, B, src, tmp);
1927   }
1928 }
1929 
1930 // Extend src predicate to dst predicate with the same lane count but larger
1931 // element size, e.g. 64Byte -> 512Long
1932 void C2_MacroAssembler::sve_vmaskcast_extend(PRegister dst, PRegister src,
1933                                              uint dst_element_length_in_bytes,
1934                                              uint src_element_length_in_bytes) {
1935   if (dst_element_length_in_bytes == 2 * src_element_length_in_bytes) {
1936     sve_punpklo(dst, src);
1937   } else if (dst_element_length_in_bytes == 4 * src_element_length_in_bytes) {
1938     sve_punpklo(dst, src);
1939     sve_punpklo(dst, dst);
1940   } else if (dst_element_length_in_bytes == 8 * src_element_length_in_bytes) {
1941     sve_punpklo(dst, src);
1942     sve_punpklo(dst, dst);
1943     sve_punpklo(dst, dst);
1944   } else {
1945     assert(false, "unsupported");
1946     ShouldNotReachHere();
1947   }
1948 }
1949 
1950 // Narrow src predicate to dst predicate with the same lane count but
1951 // smaller element size, e.g. 512Long -> 64Byte
1952 void C2_MacroAssembler::sve_vmaskcast_narrow(PRegister dst, PRegister src, PRegister ptmp,
1953                                              uint dst_element_length_in_bytes, uint src_element_length_in_bytes) {
1954   // The insignificant bits in src predicate are expected to be zero.
1955   // To ensure the higher order bits of the resultant narrowed vector are 0, an all-zero predicate is
1956   // passed as the second argument. An example narrowing operation with a given mask would be -
1957   // 128Long -> 64Int on a 128-bit machine i.e 2L -> 2I
1958   // Mask (for 2 Longs) : TF
1959   // Predicate register for the above mask (16 bits) : 00000001 00000000
1960   // After narrowing (uzp1 dst.b, src.b, ptmp.b) : 0000 0000 0001 0000
1961   // Which translates to mask for 2 integers as : TF (lower half is considered while upper half is 0)
1962   assert_different_registers(src, ptmp);
1963   assert_different_registers(dst, ptmp);
1964   sve_pfalse(ptmp);
1965   if (dst_element_length_in_bytes * 2 == src_element_length_in_bytes) {
1966     sve_uzp1(dst, B, src, ptmp);
1967   } else if (dst_element_length_in_bytes * 4 == src_element_length_in_bytes) {
1968     sve_uzp1(dst, H, src, ptmp);
1969     sve_uzp1(dst, B, dst, ptmp);
1970   } else if (dst_element_length_in_bytes * 8 == src_element_length_in_bytes) {
1971     sve_uzp1(dst, S, src, ptmp);
1972     sve_uzp1(dst, H, dst, ptmp);
1973     sve_uzp1(dst, B, dst, ptmp);
1974   } else {
1975     assert(false, "unsupported");
1976     ShouldNotReachHere();
1977   }
1978 }
1979 
1980 // Vector reduction add for integral type with ASIMD instructions.
1981 void C2_MacroAssembler::neon_reduce_add_integral(Register dst, BasicType bt,
1982                                                  Register isrc, FloatRegister vsrc,
1983                                                  unsigned vector_length_in_bytes,
1984                                                  FloatRegister vtmp) {
1985   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
1986   assert_different_registers(dst, isrc);
1987   bool isQ = vector_length_in_bytes == 16;
1988 
1989   BLOCK_COMMENT("neon_reduce_add_integral {");
1990     switch(bt) {
1991       case T_BYTE:
1992         addv(vtmp, isQ ? T16B : T8B, vsrc);
1993         smov(dst, vtmp, B, 0);
1994         addw(dst, dst, isrc, ext::sxtb);
1995         break;
1996       case T_SHORT:
1997         addv(vtmp, isQ ? T8H : T4H, vsrc);
1998         smov(dst, vtmp, H, 0);
1999         addw(dst, dst, isrc, ext::sxth);
2000         break;
2001       case T_INT:
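             // ADDV does not accept the 2S arrangement, so the 64-bit INT case uses a
             // pairwise add instead.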
2002         isQ ? addv(vtmp, T4S, vsrc) : addpv(vtmp, T2S, vsrc, vsrc);
2003         umov(dst, vtmp, S, 0);
2004         addw(dst, dst, isrc);
2005         break;
2006       case T_LONG:
2007         assert(isQ, "unsupported");
2008         addpd(vtmp, vsrc);
2009         umov(dst, vtmp, D, 0);
2010         add(dst, dst, isrc);
2011         break;
2012       default:
2013         assert(false, "unsupported");
2014         ShouldNotReachHere();
2015     }
2016   BLOCK_COMMENT("} neon_reduce_add_integral");
2017 }
2018 
2019 // Vector reduction multiply for integral type with ASIMD instructions.
2020 // Note: temporary registers vtmp1 and vtmp2 are not used in some cases.
2021 // Clobbers: rscratch1
2022 void C2_MacroAssembler::neon_reduce_mul_integral(Register dst, BasicType bt,
2023                                                  Register isrc, FloatRegister vsrc,
2024                                                  unsigned vector_length_in_bytes,
2025                                                  FloatRegister vtmp1, FloatRegister vtmp2) {
2026   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2027   bool isQ = vector_length_in_bytes == 16;
2028 
2029   BLOCK_COMMENT("neon_reduce_mul_integral {");
2030     switch(bt) {
2031       case T_BYTE:
2032         if (isQ) {
2033           // Multiply the lower half and higher half of vector iteratively.
2034           // vtmp1 = vsrc[8:15]
2035           ins(vtmp1, D, vsrc, 0, 1);
2036           // vtmp1[n] = vsrc[n] * vsrc[n + 8], where n=[0, 7]
2037           mulv(vtmp1, T8B, vtmp1, vsrc);
2038           // vtmp2 = vtmp1[4:7]
2039           ins(vtmp2, S, vtmp1, 0, 1);
2040           // vtmp1[n] = vtmp1[n] * vtmp1[n + 4], where n=[0, 3]
2041           mulv(vtmp1, T8B, vtmp2, vtmp1);
2042         } else {
2043           ins(vtmp1, S, vsrc, 0, 1);
2044           mulv(vtmp1, T8B, vtmp1, vsrc);
2045         }
2046         // vtmp2 = vtmp1[2:3]
2047         ins(vtmp2, H, vtmp1, 0, 1);
2048         // vtmp2[n] = vtmp1[n] * vtmp1[n + 2], where n=[0, 1]
2049         mulv(vtmp2, T8B, vtmp2, vtmp1);
2050         // dst = vtmp2[0] * isrc * vtmp2[1]
2051         umov(rscratch1, vtmp2, B, 0);
2052         mulw(dst, rscratch1, isrc);
2053         sxtb(dst, dst);
2054         umov(rscratch1, vtmp2, B, 1);
2055         mulw(dst, rscratch1, dst);
2056         sxtb(dst, dst);
2057         break;
2058       case T_SHORT:
2059         if (isQ) {
2060           ins(vtmp2, D, vsrc, 0, 1);
2061           mulv(vtmp2, T4H, vtmp2, vsrc);
2062           ins(vtmp1, S, vtmp2, 0, 1);
2063           mulv(vtmp1, T4H, vtmp1, vtmp2);
2064         } else {
2065           ins(vtmp1, S, vsrc, 0, 1);
2066           mulv(vtmp1, T4H, vtmp1, vsrc);
2067         }
2068         umov(rscratch1, vtmp1, H, 0);
2069         mulw(dst, rscratch1, isrc);
2070         sxth(dst, dst);
2071         umov(rscratch1, vtmp1, H, 1);
2072         mulw(dst, rscratch1, dst);
2073         sxth(dst, dst);
2074         break;
2075       case T_INT:
2076         if (isQ) {
2077           ins(vtmp1, D, vsrc, 0, 1);
2078           mulv(vtmp1, T2S, vtmp1, vsrc);
2079         } else {
2080           vtmp1 = vsrc;
2081         }
2082         umov(rscratch1, vtmp1, S, 0);
2083         mul(dst, rscratch1, isrc);
2084         umov(rscratch1, vtmp1, S, 1);
2085         mul(dst, rscratch1, dst);
2086         break;
2087       case T_LONG:
2088         umov(rscratch1, vsrc, D, 0);
2089         mul(dst, isrc, rscratch1);
2090         umov(rscratch1, vsrc, D, 1);
2091         mul(dst, dst, rscratch1);
2092         break;
2093       default:
2094         assert(false, "unsupported");
2095         ShouldNotReachHere();
2096     }
2097   BLOCK_COMMENT("} neon_reduce_mul_integral");
2098 }
2099 
2100 // Vector reduction multiply for floating-point type with ASIMD instructions.
2101 void C2_MacroAssembler::neon_reduce_mul_fp(FloatRegister dst, BasicType bt,
2102                                            FloatRegister fsrc, FloatRegister vsrc,
2103                                            unsigned vector_length_in_bytes,
2104                                            FloatRegister vtmp) {
2105   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2106   bool isQ = vector_length_in_bytes == 16;
2107 
2108   BLOCK_COMMENT("neon_reduce_mul_fp {");
2109     switch(bt) {
2110       case T_FLOAT:
2111         fmuls(dst, fsrc, vsrc);
2112         ins(vtmp, S, vsrc, 0, 1);
2113         fmuls(dst, dst, vtmp);
2114         if (isQ) {
2115           ins(vtmp, S, vsrc, 0, 2);
2116           fmuls(dst, dst, vtmp);
2117           ins(vtmp, S, vsrc, 0, 3);
2118           fmuls(dst, dst, vtmp);
2119         }
2120         break;
2121       case T_DOUBLE:
2122         assert(isQ, "unsupported");
2123         fmuld(dst, fsrc, vsrc);
2124         ins(vtmp, D, vsrc, 0, 1);
2125         fmuld(dst, dst, vtmp);
2126         break;
2127       default:
2128         assert(false, "unsupported");
2129         ShouldNotReachHere();
2130     }
2131   BLOCK_COMMENT("} neon_reduce_mul_fp");
2132 }
2133 
2134 // Helper to select logical instruction
2135 void C2_MacroAssembler::neon_reduce_logical_helper(int opc, bool is64, Register Rd,
2136                                                    Register Rn, Register Rm,
2137                                                    enum shift_kind kind, unsigned shift) {
2138   switch(opc) {
2139     case Op_AndReductionV:
2140       is64 ? andr(Rd, Rn, Rm, kind, shift) : andw(Rd, Rn, Rm, kind, shift);
2141       break;
2142     case Op_OrReductionV:
2143       is64 ? orr(Rd, Rn, Rm, kind, shift) : orrw(Rd, Rn, Rm, kind, shift);
2144       break;
2145     case Op_XorReductionV:
2146       is64 ? eor(Rd, Rn, Rm, kind, shift) : eorw(Rd, Rn, Rm, kind, shift);
2147       break;
2148     default:
2149       assert(false, "unsupported");
2150       ShouldNotReachHere();
2151   }
2152 }
2153 
2154 // Vector reduction logical operations And, Or, Xor
2155 // Clobbers: rscratch1
2156 void C2_MacroAssembler::neon_reduce_logical(int opc, Register dst, BasicType bt,
2157                                             Register isrc, FloatRegister vsrc,
2158                                             unsigned vector_length_in_bytes) {
2159   assert(opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV,
2160          "unsupported");
2161   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2162   assert_different_registers(dst, isrc);
2163   bool isQ = vector_length_in_bytes == 16;
2164 
2165   BLOCK_COMMENT("neon_reduce_logical {");
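         // Move the two halves of the vector into general registers and combine them
         // with one 64-bit logical operation, then keep folding the upper half of the
         // result onto the lower half with shifted logical operations until a single
         // element of the requested type remains.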
2166     umov(rscratch1, vsrc, isQ ? D : S, 0);
2167     umov(dst, vsrc, isQ ? D : S, 1);
2168     neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, rscratch1);
2169     switch(bt) {
2170       case T_BYTE:
2171         if (isQ) {
2172           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2173         }
2174         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2175         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 8);
2176         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2177         sxtb(dst, dst);
2178         break;
2179       case T_SHORT:
2180         if (isQ) {
2181           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2182         }
2183         neon_reduce_logical_helper(opc, /* is64 */ false, dst, dst, dst, Assembler::LSR, 16);
2184         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2185         sxth(dst, dst);
2186         break;
2187       case T_INT:
2188         if (isQ) {
2189           neon_reduce_logical_helper(opc, /* is64 */ true, dst, dst, dst, Assembler::LSR, 32);
2190         }
2191         neon_reduce_logical_helper(opc, /* is64 */ false, dst, isrc, dst);
2192         break;
2193       case T_LONG:
2194         assert(isQ, "unsupported");
2195         neon_reduce_logical_helper(opc, /* is64 */ true, dst, isrc, dst);
2196         break;
2197       default:
2198         assert(false, "unsupported");
2199         ShouldNotReachHere();
2200     }
2201   BLOCK_COMMENT("} neon_reduce_logical");
2202 }
2203 
2204 // Vector reduction min/max for integral type with ASIMD instructions.
2205 // Note: vtmp is not used and expected to be fnoreg for T_LONG case.
2206 // Clobbers: rscratch1, rflags
2207 void C2_MacroAssembler::neon_reduce_minmax_integral(int opc, Register dst, BasicType bt,
2208                                                     Register isrc, FloatRegister vsrc,
2209                                                     unsigned vector_length_in_bytes,
2210                                                     FloatRegister vtmp) {
2211   assert(opc == Op_MinReductionV || opc == Op_MaxReductionV, "unsupported");
2212   assert(vector_length_in_bytes == 8 || vector_length_in_bytes == 16, "unsupported");
2213   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported");
2214   assert_different_registers(dst, isrc);
2215   bool isQ = vector_length_in_bytes == 16;
2216   bool is_min = opc == Op_MinReductionV;
2217 
2218   BLOCK_COMMENT("neon_reduce_minmax_integral {");
2219     if (bt == T_LONG) {
2220       assert(vtmp == fnoreg, "should be");
2221       assert(isQ, "should be");
2222       umov(rscratch1, vsrc, D, 0);
2223       cmp(isrc, rscratch1);
2224       csel(dst, isrc, rscratch1, is_min ? LT : GT);
2225       umov(rscratch1, vsrc, D, 1);
2226       cmp(dst, rscratch1);
2227       csel(dst, dst, rscratch1, is_min ? LT : GT);
2228     } else {
2229       SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
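           // SMINV/SMAXV do not accept the 2S arrangement, so the 64-bit cases use a
           // pairwise min/max instead.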
2230       if (size == T2S) {
2231         is_min ? sminp(vtmp, size, vsrc, vsrc) : smaxp(vtmp, size, vsrc, vsrc);
2232       } else {
2233         is_min ? sminv(vtmp, size, vsrc) : smaxv(vtmp, size, vsrc);
2234       }
2235       if (bt == T_INT) {
2236         umov(dst, vtmp, S, 0);
2237       } else {
2238         smov(dst, vtmp, elemType_to_regVariant(bt), 0);
2239       }
2240       cmpw(dst, isrc);
2241       cselw(dst, dst, isrc, is_min ? LT : GT);
2242     }
2243   BLOCK_COMMENT("} neon_reduce_minmax_integral");
2244 }
2245 
2246 // Vector reduction for integral type with SVE instruction.
2247 // Supported operations are Add, And, Or, Xor, Max, Min.
2248 // rflags would be clobbered if opc is Op_MaxReductionV or Op_MinReductionV.
2249 void C2_MacroAssembler::sve_reduce_integral(int opc, Register dst, BasicType bt, Register src1,
2250                                             FloatRegister src2, PRegister pg, FloatRegister tmp) {
2251   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2252   assert(pg->is_governing(), "This register has to be a governing predicate register");
2253   assert_different_registers(src1, dst);
2254   // Register "dst" and "tmp" are to be clobbered, and "src1" and "src2" should be preserved.
2255   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2256   switch (opc) {
2257     case Op_AddReductionVI: {
2258       sve_uaddv(tmp, size, pg, src2);
2259       if (bt == T_BYTE) {
2260         smov(dst, tmp, size, 0);
2261         addw(dst, src1, dst, ext::sxtb);
2262       } else if (bt == T_SHORT) {
2263         smov(dst, tmp, size, 0);
2264         addw(dst, src1, dst, ext::sxth);
2265       } else {
2266         umov(dst, tmp, size, 0);
2267         addw(dst, dst, src1);
2268       }
2269       break;
2270     }
2271     case Op_AddReductionVL: {
2272       sve_uaddv(tmp, size, pg, src2);
2273       umov(dst, tmp, size, 0);
2274       add(dst, dst, src1);
2275       break;
2276     }
2277     case Op_AndReductionV: {
2278       sve_andv(tmp, size, pg, src2);
2279       if (bt == T_INT || bt == T_LONG) {
2280         umov(dst, tmp, size, 0);
2281       } else {
2282         smov(dst, tmp, size, 0);
2283       }
2284       if (bt == T_LONG) {
2285         andr(dst, dst, src1);
2286       } else {
2287         andw(dst, dst, src1);
2288       }
2289       break;
2290     }
2291     case Op_OrReductionV: {
2292       sve_orv(tmp, size, pg, src2);
2293       if (bt == T_INT || bt == T_LONG) {
2294         umov(dst, tmp, size, 0);
2295       } else {
2296         smov(dst, tmp, size, 0);
2297       }
2298       if (bt == T_LONG) {
2299         orr(dst, dst, src1);
2300       } else {
2301         orrw(dst, dst, src1);
2302       }
2303       break;
2304     }
2305     case Op_XorReductionV: {
2306       sve_eorv(tmp, size, pg, src2);
2307       if (bt == T_INT || bt == T_LONG) {
2308         umov(dst, tmp, size, 0);
2309       } else {
2310         smov(dst, tmp, size, 0);
2311       }
2312       if (bt == T_LONG) {
2313         eor(dst, dst, src1);
2314       } else {
2315         eorw(dst, dst, src1);
2316       }
2317       break;
2318     }
2319     case Op_MaxReductionV: {
2320       sve_smaxv(tmp, size, pg, src2);
2321       if (bt == T_INT || bt == T_LONG) {
2322         umov(dst, tmp, size, 0);
2323       } else {
2324         smov(dst, tmp, size, 0);
2325       }
2326       if (bt == T_LONG) {
2327         cmp(dst, src1);
2328         csel(dst, dst, src1, Assembler::GT);
2329       } else {
2330         cmpw(dst, src1);
2331         cselw(dst, dst, src1, Assembler::GT);
2332       }
2333       break;
2334     }
2335     case Op_MinReductionV: {
2336       sve_sminv(tmp, size, pg, src2);
2337       if (bt == T_INT || bt == T_LONG) {
2338         umov(dst, tmp, size, 0);
2339       } else {
2340         smov(dst, tmp, size, 0);
2341       }
2342       if (bt == T_LONG) {
2343         cmp(dst, src1);
2344         csel(dst, dst, src1, Assembler::LT);
2345       } else {
2346         cmpw(dst, src1);
2347         cselw(dst, dst, src1, Assembler::LT);
2348       }
2349       break;
2350     }
2351     default:
2352       assert(false, "unsupported");
2353       ShouldNotReachHere();
2354   }
2355 
2356   if (opc == Op_AndReductionV || opc == Op_OrReductionV || opc == Op_XorReductionV) {
2357     if (bt == T_BYTE) {
2358       sxtb(dst, dst);
2359     } else if (bt == T_SHORT) {
2360       sxth(dst, dst);
2361     }
2362   }
2363 }
2364 
2365 // Set elements of the dst predicate to true for lanes in the range of [0, lane_cnt), or
2366 // to false otherwise. The input "lane_cnt" should be smaller than or equal to the supported
2367 // max vector length of the basic type. Clobbers: rscratch1 and the rFlagsReg.
2368 void C2_MacroAssembler::sve_gen_mask_imm(PRegister dst, BasicType bt, uint32_t lane_cnt) {
2369   uint32_t max_vector_length = Matcher::max_vector_size(bt);
2370   assert(lane_cnt <= max_vector_length, "unsupported input lane_cnt");
2371 
2372   // Set all elements to false if the input "lane_cnt" is zero.
2373   if (lane_cnt == 0) {
2374     sve_pfalse(dst);
2375     return;
2376   }
2377 
2378   SIMD_RegVariant size = elemType_to_regVariant(bt);
2379   assert(size != Q, "invalid size");
2380 
2381   // Set all elements to true if "lane_cnt" equals the max lane count.
2382   if (lane_cnt == max_vector_length) {
2383     sve_ptrue(dst, size, /* ALL */ 0b11111);
2384     return;
2385   }
2386 
2387   // Fixed numbers for "ptrue".
2388   switch(lane_cnt) {
2389   case 1: /* VL1 */
2390   case 2: /* VL2 */
2391   case 3: /* VL3 */
2392   case 4: /* VL4 */
2393   case 5: /* VL5 */
2394   case 6: /* VL6 */
2395   case 7: /* VL7 */
2396   case 8: /* VL8 */
2397     sve_ptrue(dst, size, lane_cnt);
2398     return;
2399   case 16:
2400     sve_ptrue(dst, size, /* VL16 */ 0b01001);
2401     return;
2402   case 32:
2403     sve_ptrue(dst, size, /* VL32 */ 0b01010);
2404     return;
2405   case 64:
2406     sve_ptrue(dst, size, /* VL64 */ 0b01011);
2407     return;
2408   case 128:
2409     sve_ptrue(dst, size, /* VL128 */ 0b01100);
2410     return;
2411   case 256:
2412     sve_ptrue(dst, size, /* VL256 */ 0b01101);
2413     return;
2414   default:
2415     break;
2416   }
2417 
2418   // Special patterns for "ptrue".
2419   if (lane_cnt == round_down_power_of_2(max_vector_length)) {
2420     sve_ptrue(dst, size, /* POW2 */ 0b00000);
2421   } else if (lane_cnt == max_vector_length - (max_vector_length % 4)) {
2422     sve_ptrue(dst, size, /* MUL4 */ 0b11101);
2423   } else if (lane_cnt == max_vector_length - (max_vector_length % 3)) {
2424     sve_ptrue(dst, size, /* MUL3 */ 0b11110);
2425   } else {
2426     // Encode to "whileltw" for the remaining cases.
2427     mov(rscratch1, lane_cnt);
2428     sve_whileltw(dst, size, zr, rscratch1);
2429   }
2430 }
2431 
2432 // Pack active elements of src, under the control of mask, into the lowest-numbered elements of dst.
2433 // Any remaining elements of dst will be filled with zero.
2434 // Clobbers: rscratch1
2435 // Preserves: src, mask
2436 void C2_MacroAssembler::sve_compress_short(FloatRegister dst, FloatRegister src, PRegister mask,
2437                                            FloatRegister vtmp1, FloatRegister vtmp2,
2438                                            PRegister pgtmp) {
2439   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2440   assert_different_registers(dst, src, vtmp1, vtmp2);
2441   assert_different_registers(mask, pgtmp);
2442 
2443   // Example input:   src   = 8888 7777 6666 5555 4444 3333 2222 1111
2444   //                  mask  = 0001 0000 0000 0001 0001 0000 0001 0001
2445   // Expected result: dst   = 0000 0000 0000 8888 5555 4444 2222 1111
2446   sve_dup(vtmp2, H, 0);
2447 
2448   // Extend lowest half to type INT.
2449   // dst = 00004444 00003333 00002222 00001111
2450   sve_uunpklo(dst, S, src);
2451   // pgtmp = 00000001 00000000 00000001 00000001
2452   sve_punpklo(pgtmp, mask);
2453   // Pack the active elements, now INT-sized, to the right,
2454   // and fill the remaining elements with zero.
2455   // dst = 00000000 00004444 00002222 00001111
2456   sve_compact(dst, S, dst, pgtmp);
2457   // Narrow the result back to type SHORT.
2458   // dst = 0000 0000 0000 0000 0000 4444 2222 1111
2459   sve_uzp1(dst, H, dst, vtmp2);
2460   // Count the active elements of lowest half.
2461   // rscratch1 = 3
2462   sve_cntp(rscratch1, S, ptrue, pgtmp);
2463 
2464   // Repeat to the highest half.
2465   // pgtmp = 00000001 00000000 00000000 00000001
2466   sve_punpkhi(pgtmp, mask);
2467   // vtmp1 = 00008888 00007777 00006666 00005555
2468   sve_uunpkhi(vtmp1, S, src);
2469   // vtmp1 = 00000000 00000000 00008888 00005555
2470   sve_compact(vtmp1, S, vtmp1, pgtmp);
2471   // vtmp1 = 0000 0000 0000 0000 0000 0000 8888 5555
2472   sve_uzp1(vtmp1, H, vtmp1, vtmp2);
2473 
2474   // Compressed low:   dst   = 0000 0000 0000 0000 0000 4444 2222 1111
2475   // Compressed high:  vtmp1 = 0000 0000 0000 0000 0000 0000 8888  5555
2476   // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2477   // TRUE_CNT is the number of active elements in the compressed low part.
2478   neg(rscratch1, rscratch1);
2479   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2480   sve_index(vtmp2, H, rscratch1, 1);
2481   // vtmp1 = 0000 0000 0000 8888 5555 0000 0000 0000
2482   sve_tbl(vtmp1, H, vtmp1, vtmp2);
2483 
2484   // Combine the compressed high part (after the shift) with the compressed low part.
2485   // dst = 0000 0000 0000 8888 5555 4444 2222 1111
2486   sve_orr(dst, dst, vtmp1);
2487 }
2488 
2489 // Clobbers: rscratch1, rscratch2
2490 // Preserves: src, mask
2491 void C2_MacroAssembler::sve_compress_byte(FloatRegister dst, FloatRegister src, PRegister mask,
2492                                           FloatRegister vtmp1, FloatRegister vtmp2,
2493                                           FloatRegister vtmp3, FloatRegister vtmp4,
2494                                           PRegister ptmp, PRegister pgtmp) {
2495   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2496   assert_different_registers(dst, src, vtmp1, vtmp2, vtmp3, vtmp4);
2497   assert_different_registers(mask, ptmp, pgtmp);
2498   // Example input:   src   = 88 77 66 55 44 33 22 11
2499   //                  mask  = 01 00 00 01 01 00 01 01
2500   // Expected result: dst   = 00 00 00 88 55 44 22 11
2501 
2502   sve_dup(vtmp4, B, 0);
2503   // Extend lowest half to type SHORT.
2504   // vtmp1 = 0044 0033 0022 0011
2505   sve_uunpklo(vtmp1, H, src);
2506   // ptmp = 0001 0000 0001 0001
2507   sve_punpklo(ptmp, mask);
2508   // Count the active elements of lowest half.
2509   // rscratch2 = 3
2510   sve_cntp(rscratch2, H, ptrue, ptmp);
2511   // Pack the active elements, now SHORT-sized, to the right,
2512   // and fill the remaining elements with zero.
2513   // dst = 0000 0044 0022 0011
2514   sve_compress_short(dst, vtmp1, ptmp, vtmp2, vtmp3, pgtmp);
2515   // Narrow the result back to type BYTE.
2516   // dst = 00 00 00 00 00 44 22 11
2517   sve_uzp1(dst, B, dst, vtmp4);
2518 
2519   // Repeat to the highest half.
2520   // ptmp = 0001 0000 0000 0001
2521   sve_punpkhi(ptmp, mask);
2522   // vtmp2 = 0088 0077 0066 0055
2523   sve_uunpkhi(vtmp2, H, src);
2524   // vtmp1 = 0000 0000 0088 0055
2525   sve_compress_short(vtmp1, vtmp2, ptmp, vtmp3, vtmp4, pgtmp);
2526 
2527   sve_dup(vtmp4, B, 0);
2528   // vtmp1 = 00 00 00 00 00 00 88 55
2529   sve_uzp1(vtmp1, B, vtmp1, vtmp4);
2530 
2531   // Compressed low:   dst   = 00 00 00 00 00 44 22 11
2532   // Compressed high:  vtmp1 = 00 00 00 00 00 00 88 55
2533   // Left shift (cross-lane) the compressed high part by TRUE_CNT lanes, where
2534   // TRUE_CNT is the number of active elements in the compressed low part.
2535   neg(rscratch2, rscratch2);
2536   // vtmp2 = {4 3 2 1 0 -1 -2 -3}
2537   sve_index(vtmp2, B, rscratch2, 1);
2538   // vtmp1 = 00 00 00 88 55 00 00 00
2539   sve_tbl(vtmp1, B, vtmp1, vtmp2);
2540   // Combine the compressed high part (after the shift) with the compressed low part.
2541   // dst = 00 00 00 88 55 44 22 11
2542   sve_orr(dst, dst, vtmp1);
2543 }
2544 
2545 void C2_MacroAssembler::neon_reverse_bits(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2546   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2547   SIMD_Arrangement size = isQ ? T16B : T8B;
2548   if (bt == T_BYTE) {
2549     rbit(dst, size, src);
2550   } else {
2551     neon_reverse_bytes(dst, src, bt, isQ);
2552     rbit(dst, size, dst);
2553   }
2554 }
2555 
2556 void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ) {
2557   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported basic type");
2558   SIMD_Arrangement size = isQ ? T16B : T8B;
2559   switch (bt) {
2560     case T_BYTE:
2561       if (dst != src) {
2562         orr(dst, size, src, src);
2563       }
2564       break;
2565     case T_SHORT:
2566       rev16(dst, size, src);
2567       break;
2568     case T_INT:
2569       rev32(dst, size, src);
2570       break;
2571     case T_LONG:
2572       rev64(dst, size, src);
2573       break;
2574     default:
2575       assert(false, "unsupported");
2576       ShouldNotReachHere();
2577   }
2578 }
2579 
2580 // Extract a scalar element from an sve vector at position 'idx'.
2581 // The input elements in src are expected to be of integral type.
2582 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
2583                                              int idx, FloatRegister vtmp) {
2584   assert(bt == T_BYTE || bt == T_SHORT || bt == T_INT || bt == T_LONG, "unsupported element type");
2585   Assembler::SIMD_RegVariant size = elemType_to_regVariant(bt);
2586   if (regVariant_to_elemBits(size) * idx < 128) { // generate lower cost NEON instruction
2587     if (bt == T_INT || bt == T_LONG) {
2588       umov(dst, src, size, idx);
2589     } else {
2590       smov(dst, src, size, idx);
2591     }
2592   } else {
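         // The element lies beyond the NEON-addressable low 128 bits: rotate it down
         // to element 0 with EXT (the byte offset is idx << size) and extract it from
         // there.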
2593     sve_orr(vtmp, src, src);
2594     sve_ext(vtmp, vtmp, idx << size);
2595     if (bt == T_INT || bt == T_LONG) {
2596       umov(dst, vtmp, size, 0);
2597     } else {
2598       smov(dst, vtmp, size, 0);
2599     }
2600   }
2601 }
2602 
2603 // java.lang.Math::round intrinsics
2604 
2605 // Clobbers: rscratch1, rflags
2606 void C2_MacroAssembler::vector_round_neon(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2607                                           FloatRegister tmp2, FloatRegister tmp3, SIMD_Arrangement T) {
2608   assert_different_registers(tmp1, tmp2, tmp3, src, dst);
2609   switch (T) {
2610     case T2S:
2611     case T4S:
2612       fmovs(tmp1, T, 0.5f);
2613       mov(rscratch1, jint_cast(0x1.0p23f));
2614       break;
2615     case T2D:
2616       fmovd(tmp1, T, 0.5);
2617       mov(rscratch1, julong_cast(0x1.0p52));
2618       break;
2619     default:
2620       assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2621   }
2622   fadd(tmp1, T, tmp1, src);
2623   fcvtms(tmp1, T, tmp1);
2624   // tmp1 = floor(src + 0.5, ties to even)
2625 
2626   fcvtas(dst, T, src);
2627   // dst = round(src), ties to away
2628 
2629   fneg(tmp3, T, src);
2630   dup(tmp2, T, rscratch1);
2631   cm(HS, tmp3, T, tmp3, tmp2);
2632   // tmp3 is now a set of flags
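       // Each lane of tmp3 is all ones when the bit pattern of -src, compared as an
       // unsigned integer, is >= the bit pattern of 2^23 (2^52 for T2D): that is, for
       // non-negative inputs and NaNs (sign bit set after the negation) and for
       // negative inputs of magnitude >= 2^23/2^52, which are already integral. The
       // bif below keeps the fcvtas result in those lanes and inserts the
       // floor(src + 0.5) result only where tmp3 is zero, the one case where Java's
       // round-half-up can differ from rounding half away from zero.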
2633 
2634   bif(dst, T16B, tmp1, tmp3);
2635   // result in dst
2636 }
2637 
2638 // Clobbers: rscratch1, rflags
2639 void C2_MacroAssembler::vector_round_sve(FloatRegister dst, FloatRegister src, FloatRegister tmp1,
2640                                          FloatRegister tmp2, PRegister pgtmp, SIMD_RegVariant T) {
2641   assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2642   assert_different_registers(tmp1, tmp2, src, dst);
2643 
2644   switch (T) {
2645     case S:
2646       mov(rscratch1, jint_cast(0x1.0p23f));
2647       break;
2648     case D:
2649       mov(rscratch1, julong_cast(0x1.0p52));
2650       break;
2651     default:
2652       assert(T == S || T == D, "invalid register variant");
2653   }
2654 
2655   sve_frinta(dst, T, ptrue, src);
2656   // dst = round(src), ties to away
2657 
2658   Label none;
2659 
2660   sve_fneg(tmp1, T, ptrue, src);
2661   sve_dup(tmp2, T, rscratch1);
2662   sve_cmp(HS, pgtmp, T, ptrue, tmp2, tmp1);
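       // For SVE compares, EQ means "no active element matched", i.e. no lane needs
       // the floor(src + 0.5) adjustment, so the block below can be skipped entirely.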
2663   br(EQ, none);
2664   {
2665     sve_cpy(tmp1, T, pgtmp, 0.5);
2666     sve_fadd(tmp1, T, pgtmp, src);
2667     sve_frintm(dst, T, pgtmp, tmp1);
2668     // dst = floor(src + 0.5, ties to even)
2669   }
2670   bind(none);
2671 
2672   sve_fcvtzs(dst, T, ptrue, dst, T);
2673   // result in dst
2674 }
2675 
2676 void C2_MacroAssembler::vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
2677                                            FloatRegister one, SIMD_Arrangement T) {
2678   assert_different_registers(dst, src, zero, one);
2679   assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
2680 
2681   facgt(dst, T, src, zero);
2682   ushr(dst, T, dst, 1); // dst=0 for +-0.0 and NaN. 0x7FF..F otherwise
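       // bsl takes, in each lane, the bits selected by the mask in dst from one and
       // the rest from src: for finite non-zero inputs the mask is 0x7ff..f, so the
       // result is 1.0 with src's sign bit; for +-0.0 and NaN the mask is zero and
       // src passes through unchanged.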
2683   bsl(dst, T == T2S ? T8B : T16B, one, src); // Result in dst
2684 }
2685 
2686 void C2_MacroAssembler::vector_signum_sve(FloatRegister dst, FloatRegister src, FloatRegister zero,
2687                                           FloatRegister one, FloatRegister vtmp, PRegister pgtmp, SIMD_RegVariant T) {
2688     assert_different_registers(dst, src, zero, one, vtmp);
2689     assert(pgtmp->is_governing(), "This register has to be a governing predicate register");
2690 
2691     sve_orr(vtmp, src, src);
2692     sve_fac(Assembler::GT, pgtmp, T, ptrue, src, zero); // pmtp=0 for +-0.0 and NaN. 0x1 otherwise
2693     switch (T) {
2694     case S:
2695       sve_and(vtmp, T, min_jint); // Extract the sign bit of float value in every lane of src
2696       sve_orr(vtmp, T, jint_cast(1.0)); // OR it with +1 to make the final result +1 or -1 depending
2697                                         // on the sign of the float value
2698       break;
2699     case D:
2700       sve_and(vtmp, T, min_jlong);
2701       sve_orr(vtmp, T, jlong_cast(1.0));
2702       break;
2703     default:
2704       assert(false, "unsupported");
2705       ShouldNotReachHere();
2706     }
2707     sve_sel(dst, T, pgtmp, vtmp, src); // Select either from src or vtmp based on the predicate register pgtmp
2708                                        // Result in dst
2709 }
2710 
2711 bool C2_MacroAssembler::in_scratch_emit_size() {
2712   if (ciEnv::current()->task() != nullptr) {
2713     PhaseOutput* phase_output = Compile::current()->output();
2714     if (phase_output != nullptr && phase_output->in_scratch_emit_size()) {
2715       return true;
2716     }
2717   }
2718   return MacroAssembler::in_scratch_emit_size();
2719 }