1 /* 2 * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "asm/assembler.hpp" 26 #include "asm/assembler.inline.hpp" 27 #include "code/compiledIC.hpp" 28 #include "compiler/compiler_globals.hpp" 29 #include "compiler/disassembler.hpp" 30 #include "crc32c.h" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "gc/shared/collectedHeap.inline.hpp" 34 #include "gc/shared/tlab_globals.hpp" 35 #include "interpreter/bytecodeHistogram.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "interpreter/interpreterRuntime.hpp" 38 #include "jvm.h" 39 #include "memory/resourceArea.hpp" 40 #include "memory/universe.hpp" 41 #include "oops/accessDecorators.hpp" 42 #include "oops/compressedKlass.inline.hpp" 43 #include "oops/compressedOops.inline.hpp" 44 #include "oops/klass.inline.hpp" 45 #include "prims/methodHandles.hpp" 46 #include "runtime/continuation.hpp" 47 #include "runtime/interfaceSupport.inline.hpp" 48 #include "runtime/javaThread.hpp" 49 #include "runtime/jniHandles.hpp" 50 #include "runtime/objectMonitor.hpp" 51 #include "runtime/os.hpp" 52 #include "runtime/safepoint.hpp" 53 #include "runtime/safepointMechanism.hpp" 54 #include "runtime/sharedRuntime.hpp" 55 #include "runtime/stubRoutines.hpp" 56 #include "utilities/checkedCast.hpp" 57 #include "utilities/macros.hpp" 58 59 #ifdef PRODUCT 60 #define BLOCK_COMMENT(str) /* nothing */ 61 #define STOP(error) stop(error) 62 #else 63 #define BLOCK_COMMENT(str) block_comment(str) 64 #define STOP(error) block_comment(error); stop(error) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 #ifdef ASSERT 70 bool AbstractAssembler::pd_check_instruction_mark() { return true; } 71 #endif 72 73 static const Assembler::Condition reverse[] = { 74 Assembler::noOverflow /* overflow = 0x0 */ , 75 Assembler::overflow /* noOverflow = 0x1 */ , 76 Assembler::aboveEqual /* carrySet = 0x2, below = 0x2 */ , 77 Assembler::below /* aboveEqual = 0x3, carryClear = 0x3 */ , 78 Assembler::notZero /* zero = 0x4, equal = 0x4 */ , 79 Assembler::zero /* notZero = 0x5, notEqual = 0x5 */ , 80 Assembler::above /* belowEqual = 0x6 */ , 81 Assembler::belowEqual /* above = 0x7 */ , 82 Assembler::positive /* negative = 0x8 */ , 83 Assembler::negative /* positive = 0x9 */ , 84 Assembler::noParity /* parity = 0xa */ , 85 Assembler::parity /* noParity = 0xb */ , 86 Assembler::greaterEqual /* less = 0xc */ , 87 Assembler::less /* greaterEqual = 0xd */ , 88 Assembler::greater 
/* lessEqual = 0xe */ , 89 Assembler::lessEqual /* greater = 0xf, */ 90 91 }; 92 93 94 // Implementation of MacroAssembler 95 96 // First all the versions that have distinct versions depending on 32/64 bit 97 // Unless the difference is trivial (1 line or so). 98 99 #ifndef _LP64 100 101 // 32bit versions 102 103 Address MacroAssembler::as_Address(AddressLiteral adr) { 104 return Address(adr.target(), adr.rspec()); 105 } 106 107 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) { 108 assert(rscratch == noreg, ""); 109 return Address::make_array(adr); 110 } 111 112 void MacroAssembler::call_VM_leaf_base(address entry_point, 113 int number_of_arguments) { 114 call(RuntimeAddress(entry_point)); 115 increment(rsp, number_of_arguments * wordSize); 116 } 117 118 void MacroAssembler::cmpklass(Address src1, Metadata* obj) { 119 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 120 } 121 122 123 void MacroAssembler::cmpklass(Register src1, Metadata* obj) { 124 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 125 } 126 127 void MacroAssembler::cmpoop(Address src1, jobject obj) { 128 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate()); 129 } 130 131 void MacroAssembler::cmpoop(Register src1, jobject obj, Register rscratch) { 132 assert(rscratch == noreg, "redundant"); 133 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate()); 134 } 135 136 void MacroAssembler::extend_sign(Register hi, Register lo) { 137 // According to Intel Doc. AP-526, "Integer Divide", p.18. 138 if (VM_Version::is_P6() && hi == rdx && lo == rax) { 139 cdql(); 140 } else { 141 movl(hi, lo); 142 sarl(hi, 31); 143 } 144 } 145 146 void MacroAssembler::jC2(Register tmp, Label& L) { 147 // set parity bit if FPU flag C2 is set (via rax) 148 save_rax(tmp); 149 fwait(); fnstsw_ax(); 150 sahf(); 151 restore_rax(tmp); 152 // branch 153 jcc(Assembler::parity, L); 154 } 155 156 void MacroAssembler::jnC2(Register tmp, Label& L) { 157 // set parity bit if FPU flag C2 is set (via rax) 158 save_rax(tmp); 159 fwait(); fnstsw_ax(); 160 sahf(); 161 restore_rax(tmp); 162 // branch 163 jcc(Assembler::noParity, L); 164 } 165 166 // 32bit can do a case table jump in one instruction but we no longer allow the base 167 // to be installed in the Address class 168 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) { 169 assert(rscratch == noreg, "not needed"); 170 jmp(as_Address(entry, noreg)); 171 } 172 173 // Note: y_lo will be destroyed 174 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) { 175 // Long compare for Java (semantics as described in JVM spec.) 
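  // Result convention matches the lcmp bytecode: x_hi ends up -1 if x < y,
  // 0 if x == y, and +1 if x > y. The high words are compared first as signed
  // values; only if they are equal do we compare the low words, unsigned.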
176 Label high, low, done; 177 178 cmpl(x_hi, y_hi); 179 jcc(Assembler::less, low); 180 jcc(Assembler::greater, high); 181 // x_hi is the return register 182 xorl(x_hi, x_hi); 183 cmpl(x_lo, y_lo); 184 jcc(Assembler::below, low); 185 jcc(Assembler::equal, done); 186 187 bind(high); 188 xorl(x_hi, x_hi); 189 increment(x_hi); 190 jmp(done); 191 192 bind(low); 193 xorl(x_hi, x_hi); 194 decrementl(x_hi); 195 196 bind(done); 197 } 198 199 void MacroAssembler::lea(Register dst, AddressLiteral src) { 200 mov_literal32(dst, (int32_t)src.target(), src.rspec()); 201 } 202 203 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) { 204 assert(rscratch == noreg, "not needed"); 205 206 // leal(dst, as_Address(adr)); 207 // see note in movl as to why we must use a move 208 mov_literal32(dst, (int32_t)adr.target(), adr.rspec()); 209 } 210 211 void MacroAssembler::leave() { 212 mov(rsp, rbp); 213 pop(rbp); 214 } 215 216 void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) { 217 // Multiplication of two Java long values stored on the stack 218 // as illustrated below. Result is in rdx:rax. 219 // 220 // rsp ---> [ ?? ] \ \ 221 // .... | y_rsp_offset | 222 // [ y_lo ] / (in bytes) | x_rsp_offset 223 // [ y_hi ] | (in bytes) 224 // .... | 225 // [ x_lo ] / 226 // [ x_hi ] 227 // .... 228 // 229 // Basic idea: lo(result) = lo(x_lo * y_lo) 230 // hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi) 231 Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset); 232 Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset); 233 Label quick; 234 // load x_hi, y_hi and check if quick 235 // multiplication is possible 236 movl(rbx, x_hi); 237 movl(rcx, y_hi); 238 movl(rax, rbx); 239 orl(rbx, rcx); // rbx, = 0 <=> x_hi = 0 and y_hi = 0 240 jcc(Assembler::zero, quick); // if rbx, = 0 do quick multiply 241 // do full multiplication 242 // 1st step 243 mull(y_lo); // x_hi * y_lo 244 movl(rbx, rax); // save lo(x_hi * y_lo) in rbx, 245 // 2nd step 246 movl(rax, x_lo); 247 mull(rcx); // x_lo * y_hi 248 addl(rbx, rax); // add lo(x_lo * y_hi) to rbx, 249 // 3rd step 250 bind(quick); // note: rbx, = 0 if quick multiply! 251 movl(rax, x_lo); 252 mull(y_lo); // x_lo * y_lo 253 addl(rdx, rbx); // correct hi(x_lo * y_lo) 254 } 255 256 void MacroAssembler::lneg(Register hi, Register lo) { 257 negl(lo); 258 adcl(hi, 0); 259 negl(hi); 260 } 261 262 void MacroAssembler::lshl(Register hi, Register lo) { 263 // Java shift left long support (semantics as described in JVM spec., p.305) 264 // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n)) 265 // shift value is in rcx ! 266 assert(hi != rcx, "must not use rcx"); 267 assert(lo != rcx, "must not use rcx"); 268 const Register s = rcx; // shift count 269 const int n = BitsPerWord; 270 Label L; 271 andl(s, 0x3f); // s := s & 0x3f (s < 0x40) 272 cmpl(s, n); // if (s < n) 273 jcc(Assembler::less, L); // else (s >= n) 274 movl(hi, lo); // x := x << n 275 xorl(lo, lo); 276 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! 
277 bind(L); // s (mod n) < n 278 shldl(hi, lo); // x := x << s 279 shll(lo); 280 } 281 282 283 void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) { 284 // Java shift right long support (semantics as described in JVM spec., p.306 & p.310) 285 // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n)) 286 assert(hi != rcx, "must not use rcx"); 287 assert(lo != rcx, "must not use rcx"); 288 const Register s = rcx; // shift count 289 const int n = BitsPerWord; 290 Label L; 291 andl(s, 0x3f); // s := s & 0x3f (s < 0x40) 292 cmpl(s, n); // if (s < n) 293 jcc(Assembler::less, L); // else (s >= n) 294 movl(lo, hi); // x := x >> n 295 if (sign_extension) sarl(hi, 31); 296 else xorl(hi, hi); 297 // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n! 298 bind(L); // s (mod n) < n 299 shrdl(lo, hi); // x := x >> s 300 if (sign_extension) sarl(hi); 301 else shrl(hi); 302 } 303 304 void MacroAssembler::movoop(Register dst, jobject obj) { 305 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate()); 306 } 307 308 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) { 309 assert(rscratch == noreg, "redundant"); 310 mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate()); 311 } 312 313 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 314 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 315 } 316 317 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) { 318 assert(rscratch == noreg, "redundant"); 319 mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate()); 320 } 321 322 void MacroAssembler::movptr(Register dst, AddressLiteral src) { 323 if (src.is_lval()) { 324 mov_literal32(dst, (intptr_t)src.target(), src.rspec()); 325 } else { 326 movl(dst, as_Address(src)); 327 } 328 } 329 330 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) { 331 assert(rscratch == noreg, "redundant"); 332 movl(as_Address(dst, noreg), src); 333 } 334 335 void MacroAssembler::movptr(Register dst, ArrayAddress src) { 336 movl(dst, as_Address(src, noreg)); 337 } 338 339 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) { 340 assert(rscratch == noreg, "redundant"); 341 movl(dst, src); 342 } 343 344 void MacroAssembler::pushoop(jobject obj, Register rscratch) { 345 assert(rscratch == noreg, "redundant"); 346 push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate()); 347 } 348 349 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) { 350 assert(rscratch == noreg, "redundant"); 351 push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate()); 352 } 353 354 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) { 355 assert(rscratch == noreg, "redundant"); 356 if (src.is_lval()) { 357 push_literal32((int32_t)src.target(), src.rspec()); 358 } else { 359 pushl(as_Address(src)); 360 } 361 } 362 363 static void pass_arg0(MacroAssembler* masm, Register arg) { 364 masm->push(arg); 365 } 366 367 static void pass_arg1(MacroAssembler* masm, Register arg) { 368 masm->push(arg); 369 } 370 371 static void pass_arg2(MacroAssembler* masm, Register arg) { 372 masm->push(arg); 373 } 374 375 static void pass_arg3(MacroAssembler* masm, Register arg) { 376 masm->push(arg); 377 } 378 379 #ifndef PRODUCT 380 extern "C" void findpc(intptr_t x); 381 #endif 382 383 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, 
                            int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  DebuggingContext debugging{};
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near top of stack.
  int* dump_sp = (int*) rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}

void MacroAssembler::stop(const char* msg) {
  // push address of message
  ExternalAddress message((address)msg);
  pushptr(message.addr(), noreg);
  { Label L; call(L, relocInfo::none); bind(L); }  // push eip
  pusha();                                         // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  // push address of message
  ExternalAddress message((address)msg);
  pushptr(message.addr(), noreg);

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);  // discard argument
  pop_CPU_state();
}

void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }  // push eip
  pusha();                                         // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);
}

#else // _LP64

// 64 bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
rval"); 490 assert(reachable(adr), "must be"); 491 return Address(checked_cast<int32_t>(adr.target() - pc()), adr.target(), adr.reloc()); 492 493 } 494 495 Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) { 496 AddressLiteral base = adr.base(); 497 lea(rscratch, base); 498 Address index = adr.index(); 499 assert(index._disp == 0, "must not have disp"); // maybe it can? 500 Address array(rscratch, index._index, index._scale, index._disp); 501 return array; 502 } 503 504 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) { 505 Label L, E; 506 507 #ifdef _WIN64 508 // Windows always allocates space for it's register args 509 assert(num_args <= 4, "only register arguments supported"); 510 subq(rsp, frame::arg_reg_save_area_bytes); 511 #endif 512 513 // Align stack if necessary 514 testl(rsp, 15); 515 jcc(Assembler::zero, L); 516 517 subq(rsp, 8); 518 call(RuntimeAddress(entry_point)); 519 addq(rsp, 8); 520 jmp(E); 521 522 bind(L); 523 call(RuntimeAddress(entry_point)); 524 525 bind(E); 526 527 #ifdef _WIN64 528 // restore stack pointer 529 addq(rsp, frame::arg_reg_save_area_bytes); 530 #endif 531 } 532 533 void MacroAssembler::cmp64(Register src1, AddressLiteral src2, Register rscratch) { 534 assert(!src2.is_lval(), "should use cmpptr"); 535 assert(rscratch != noreg || always_reachable(src2), "missing"); 536 537 if (reachable(src2)) { 538 cmpq(src1, as_Address(src2)); 539 } else { 540 lea(rscratch, src2); 541 Assembler::cmpq(src1, Address(rscratch, 0)); 542 } 543 } 544 545 int MacroAssembler::corrected_idivq(Register reg) { 546 // Full implementation of Java ldiv and lrem; checks for special 547 // case as described in JVM spec., p.243 & p.271. The function 548 // returns the (pc) offset of the idivl instruction - may be needed 549 // for implicit exceptions. 
550 // 551 // normal case special case 552 // 553 // input : rax: dividend min_long 554 // reg: divisor (may not be eax/edx) -1 555 // 556 // output: rax: quotient (= rax idiv reg) min_long 557 // rdx: remainder (= rax irem reg) 0 558 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register"); 559 static const int64_t min_long = 0x8000000000000000; 560 Label normal_case, special_case; 561 562 // check for special case 563 cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/); 564 jcc(Assembler::notEqual, normal_case); 565 xorl(rdx, rdx); // prepare rdx for possible special case (where 566 // remainder = 0) 567 cmpq(reg, -1); 568 jcc(Assembler::equal, special_case); 569 570 // handle normal case 571 bind(normal_case); 572 cdqq(); 573 int idivq_offset = offset(); 574 idivq(reg); 575 576 // normal and special case exit 577 bind(special_case); 578 579 return idivq_offset; 580 } 581 582 void MacroAssembler::decrementq(Register reg, int value) { 583 if (value == min_jint) { subq(reg, value); return; } 584 if (value < 0) { incrementq(reg, -value); return; } 585 if (value == 0) { ; return; } 586 if (value == 1 && UseIncDec) { decq(reg) ; return; } 587 /* else */ { subq(reg, value) ; return; } 588 } 589 590 void MacroAssembler::decrementq(Address dst, int value) { 591 if (value == min_jint) { subq(dst, value); return; } 592 if (value < 0) { incrementq(dst, -value); return; } 593 if (value == 0) { ; return; } 594 if (value == 1 && UseIncDec) { decq(dst) ; return; } 595 /* else */ { subq(dst, value) ; return; } 596 } 597 598 void MacroAssembler::incrementq(AddressLiteral dst, Register rscratch) { 599 assert(rscratch != noreg || always_reachable(dst), "missing"); 600 601 if (reachable(dst)) { 602 incrementq(as_Address(dst)); 603 } else { 604 lea(rscratch, dst); 605 incrementq(Address(rscratch, 0)); 606 } 607 } 608 609 void MacroAssembler::incrementq(Register reg, int value) { 610 if (value == min_jint) { addq(reg, value); return; } 611 if (value < 0) { decrementq(reg, -value); return; } 612 if (value == 0) { ; return; } 613 if (value == 1 && UseIncDec) { incq(reg) ; return; } 614 /* else */ { addq(reg, value) ; return; } 615 } 616 617 void MacroAssembler::incrementq(Address dst, int value) { 618 if (value == min_jint) { addq(dst, value); return; } 619 if (value < 0) { decrementq(dst, -value); return; } 620 if (value == 0) { ; return; } 621 if (value == 1 && UseIncDec) { incq(dst) ; return; } 622 /* else */ { addq(dst, value) ; return; } 623 } 624 625 // 32bit can do a case table jump in one instruction but we no longer allow the base 626 // to be installed in the Address class 627 void MacroAssembler::jump(ArrayAddress entry, Register rscratch) { 628 lea(rscratch, entry.base()); 629 Address dispatch = entry.index(); 630 assert(dispatch._base == noreg, "must be"); 631 dispatch._base = rscratch; 632 jmp(dispatch); 633 } 634 635 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) { 636 ShouldNotReachHere(); // 64bit doesn't use two regs 637 cmpq(x_lo, y_lo); 638 } 639 640 void MacroAssembler::lea(Register dst, AddressLiteral src) { 641 mov_literal64(dst, (intptr_t)src.target(), src.rspec()); 642 } 643 644 void MacroAssembler::lea(Address dst, AddressLiteral adr, Register rscratch) { 645 lea(rscratch, adr); 646 movptr(dst, rscratch); 647 } 648 649 void MacroAssembler::leave() { 650 // %%% is this really better? Why not on 32bit too? 
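  // 0xC9 is the single-byte LEAVE instruction, equivalent to the
  // two-instruction sequence used by the 32-bit version above:
  //   mov rsp, rbp
  //   pop rbp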
651 emit_int8((unsigned char)0xC9); // LEAVE 652 } 653 654 void MacroAssembler::lneg(Register hi, Register lo) { 655 ShouldNotReachHere(); // 64bit doesn't use two regs 656 negq(lo); 657 } 658 659 void MacroAssembler::movoop(Register dst, jobject obj) { 660 mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate()); 661 } 662 663 void MacroAssembler::movoop(Address dst, jobject obj, Register rscratch) { 664 mov_literal64(rscratch, (intptr_t)obj, oop_Relocation::spec_for_immediate()); 665 movq(dst, rscratch); 666 } 667 668 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 669 mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate()); 670 } 671 672 void MacroAssembler::mov_metadata(Address dst, Metadata* obj, Register rscratch) { 673 mov_literal64(rscratch, (intptr_t)obj, metadata_Relocation::spec_for_immediate()); 674 movq(dst, rscratch); 675 } 676 677 void MacroAssembler::movptr(Register dst, AddressLiteral src) { 678 if (src.is_lval()) { 679 mov_literal64(dst, (intptr_t)src.target(), src.rspec()); 680 } else { 681 if (reachable(src)) { 682 movq(dst, as_Address(src)); 683 } else { 684 lea(dst, src); 685 movq(dst, Address(dst, 0)); 686 } 687 } 688 } 689 690 void MacroAssembler::movptr(ArrayAddress dst, Register src, Register rscratch) { 691 movq(as_Address(dst, rscratch), src); 692 } 693 694 void MacroAssembler::movptr(Register dst, ArrayAddress src) { 695 movq(dst, as_Address(src, dst /*rscratch*/)); 696 } 697 698 // src should NEVER be a real pointer. Use AddressLiteral for true pointers 699 void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) { 700 if (is_simm32(src)) { 701 movptr(dst, checked_cast<int32_t>(src)); 702 } else { 703 mov64(rscratch, src); 704 movq(dst, rscratch); 705 } 706 } 707 708 void MacroAssembler::pushoop(jobject obj, Register rscratch) { 709 movoop(rscratch, obj); 710 push(rscratch); 711 } 712 713 void MacroAssembler::pushklass(Metadata* obj, Register rscratch) { 714 mov_metadata(rscratch, obj); 715 push(rscratch); 716 } 717 718 void MacroAssembler::pushptr(AddressLiteral src, Register rscratch) { 719 lea(rscratch, src); 720 if (src.is_lval()) { 721 push(rscratch); 722 } else { 723 pushq(Address(rscratch, 0)); 724 } 725 } 726 727 static void pass_arg0(MacroAssembler* masm, Register arg) { 728 if (c_rarg0 != arg ) { 729 masm->mov(c_rarg0, arg); 730 } 731 } 732 733 static void pass_arg1(MacroAssembler* masm, Register arg) { 734 if (c_rarg1 != arg ) { 735 masm->mov(c_rarg1, arg); 736 } 737 } 738 739 static void pass_arg2(MacroAssembler* masm, Register arg) { 740 if (c_rarg2 != arg ) { 741 masm->mov(c_rarg2, arg); 742 } 743 } 744 745 static void pass_arg3(MacroAssembler* masm, Register arg) { 746 if (c_rarg3 != arg ) { 747 masm->mov(c_rarg3, arg); 748 } 749 } 750 751 void MacroAssembler::stop(const char* msg) { 752 if (ShowMessageBoxOnError) { 753 address rip = pc(); 754 pusha(); // get regs on stack 755 lea(c_rarg1, InternalAddress(rip)); 756 movq(c_rarg2, rsp); // pass pointer to regs array 757 } 758 lea(c_rarg0, ExternalAddress((address) msg)); 759 andq(rsp, -16); // align stack as required by ABI 760 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64))); 761 hlt(); 762 } 763 764 void MacroAssembler::warn(const char* msg) { 765 push(rbp); 766 movq(rbp, rsp); 767 andq(rsp, -16); // align stack as required by push_CPU_state and call 768 push_CPU_state(); // keeps alignment at 16 bytes 769 770 #ifdef _WIN64 771 // Windows always allocates space for its register args 772 subq(rsp, 
frame::arg_reg_save_area_bytes);
#endif
  lea(c_rarg0, ExternalAddress((address) msg));
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  DebuggingContext debugging{};
  tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
  PRINT_REG(rsp, (intptr_t)(&regs[16]));
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near the top of the stack.
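  // regs[] holds the 16 general-purpose registers in the order pusha() stored
  // them (r15 at regs[0] ... rax at regs[15], see the PRINT_REG calls above),
  // so &regs[16] reproduces the value rsp had before that pusha().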
  int64_t* rsp = &regs[16];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}

// The java_calling_convention describes stack locations as ideal slots on
// a frame with no abi restrictions. Since we must observe abi restrictions
// (like the placement of the register window) the slots must be biased by
// the following value.
static int reg2offset_in(VMReg r) {
  // Account for saved rbp and return address
  // This should really be in_preserve_stack_slots
  return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
}

static int reg2offset_out(VMReg r) {
  return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
}

// A long move
void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {

  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      if (dst.first() != src.first()) {
        mov(dst.first()->as_Register(), src.first()->as_Register());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair: (%s, %s), (%s, %s)",
             src.first()->name(), src.second()->name(), dst.first()->name(), dst.second()->name());
      movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(), "not a stack pair");
    movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
  } else {
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
  }
}

// A double move
void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {

  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.
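  // As in long_move above: incoming stack arguments are addressed relative to
  // rbp (the caller's frame, via reg2offset_in), outgoing ones relative to rsp
  // (via reg2offset_out); in_stk_bias/out_stk_bias let the caller shift those
  // offsets when extra words have already been pushed on the respective frame.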

  if (src.is_single_phys_reg() ) {
    if (dst.is_single_phys_reg()) {
      // In theory these overlap but the ordering is such that this is likely a nop
      if ( src.first() != dst.first()) {
        movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
      }
    } else {
      assert(dst.is_single_reg(), "not a stack pair");
      movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
    }
  } else if (dst.is_single_phys_reg()) {
    assert(src.is_single_reg(), "not a stack pair");
    movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
  } else {
    assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
    movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
  }
}


// A float arg may have to do float reg to int reg conversion
void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
  assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");

  // The calling convention assures us that each VMRegPair is either
  // all really one physical register or adjacent stack slots.

  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
      movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
    } else {
      // stack to reg
      assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
      movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
    movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
  } else {
    // reg to reg
    // In theory these overlap but the ordering is such that this is likely a nop
    if ( src.first() != dst.first()) {
      movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
    }
  }
}

// On 64 bit we will store integer-like items to the stack as
// 64-bit items (x86_32/64 abi) even though Java would only store
// 32 bits for a parameter. On 32 bit it will simply be 32 bits.
// So this routine will do 32->32 on 32 bit and 32->64 on 64 bit.
void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
      movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
    } else {
      // stack to reg
      movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    // Do we really have to sign extend???
    // __ movslq(src.first()->as_Register(), src.first()->as_Register());
    movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
  } else {
    // Do we really have to sign extend???
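    // Note: only the stack->stack and stack->reg cases sign-extend with movslq;
    // the reg->stack case above and the reg->reg case below copy the full
    // 64-bit register contents unchanged.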
997 // __ movslq(dst.first()->as_Register(), src.first()->as_Register()); 998 if (dst.first() != src.first()) { 999 movq(dst.first()->as_Register(), src.first()->as_Register()); 1000 } 1001 } 1002 } 1003 1004 void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) { 1005 if (src.first()->is_stack()) { 1006 if (dst.first()->is_stack()) { 1007 // stack to stack 1008 movq(rax, Address(rbp, reg2offset_in(src.first()))); 1009 movq(Address(rsp, reg2offset_out(dst.first())), rax); 1010 } else { 1011 // stack to reg 1012 movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()))); 1013 } 1014 } else if (dst.first()->is_stack()) { 1015 // reg to stack 1016 movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register()); 1017 } else { 1018 if (dst.first() != src.first()) { 1019 movq(dst.first()->as_Register(), src.first()->as_Register()); 1020 } 1021 } 1022 } 1023 1024 // An oop arg. Must pass a handle not the oop itself 1025 void MacroAssembler::object_move(OopMap* map, 1026 int oop_handle_offset, 1027 int framesize_in_slots, 1028 VMRegPair src, 1029 VMRegPair dst, 1030 bool is_receiver, 1031 int* receiver_offset) { 1032 1033 // must pass a handle. First figure out the location we use as a handle 1034 1035 Register rHandle = dst.first()->is_stack() ? rax : dst.first()->as_Register(); 1036 1037 // See if oop is null if it is we need no handle 1038 1039 if (src.first()->is_stack()) { 1040 1041 // Oop is already on the stack as an argument 1042 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots(); 1043 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots)); 1044 if (is_receiver) { 1045 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size; 1046 } 1047 1048 cmpptr(Address(rbp, reg2offset_in(src.first())), NULL_WORD); 1049 lea(rHandle, Address(rbp, reg2offset_in(src.first()))); 1050 // conditionally move a null 1051 cmovptr(Assembler::equal, rHandle, Address(rbp, reg2offset_in(src.first()))); 1052 } else { 1053 1054 // Oop is in a register we must store it to the space we reserve 1055 // on the stack for oop_handles and pass a handle if oop is non-null 1056 1057 const Register rOop = src.first()->as_Register(); 1058 int oop_slot; 1059 if (rOop == j_rarg0) 1060 oop_slot = 0; 1061 else if (rOop == j_rarg1) 1062 oop_slot = 1; 1063 else if (rOop == j_rarg2) 1064 oop_slot = 2; 1065 else if (rOop == j_rarg3) 1066 oop_slot = 3; 1067 else if (rOop == j_rarg4) 1068 oop_slot = 4; 1069 else { 1070 assert(rOop == j_rarg5, "wrong register"); 1071 oop_slot = 5; 1072 } 1073 1074 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset; 1075 int offset = oop_slot*VMRegImpl::stack_slot_size; 1076 1077 map->set_oop(VMRegImpl::stack2reg(oop_slot)); 1078 // Store oop in handle area, may be null 1079 movptr(Address(rsp, offset), rOop); 1080 if (is_receiver) { 1081 *receiver_offset = offset; 1082 } 1083 1084 cmpptr(rOop, NULL_WORD); 1085 lea(rHandle, Address(rsp, offset)); 1086 // conditionally move a null from the handle area where it was just stored 1087 cmovptr(Assembler::equal, rHandle, Address(rsp, offset)); 1088 } 1089 1090 // If arg is on the stack then place it otherwise it is already in correct reg. 
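  // At this point rHandle is either the address of the oop slot (a valid
  // handle) or null if the oop itself was null, thanks to the cmov above.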
1091 if (dst.first()->is_stack()) { 1092 movptr(Address(rsp, reg2offset_out(dst.first())), rHandle); 1093 } 1094 } 1095 1096 #endif // _LP64 1097 1098 // Now versions that are common to 32/64 bit 1099 1100 void MacroAssembler::addptr(Register dst, int32_t imm32) { 1101 LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32)); 1102 } 1103 1104 void MacroAssembler::addptr(Register dst, Register src) { 1105 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); 1106 } 1107 1108 void MacroAssembler::addptr(Address dst, Register src) { 1109 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); 1110 } 1111 1112 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 1113 assert(rscratch != noreg || always_reachable(src), "missing"); 1114 1115 if (reachable(src)) { 1116 Assembler::addsd(dst, as_Address(src)); 1117 } else { 1118 lea(rscratch, src); 1119 Assembler::addsd(dst, Address(rscratch, 0)); 1120 } 1121 } 1122 1123 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src, Register rscratch) { 1124 assert(rscratch != noreg || always_reachable(src), "missing"); 1125 1126 if (reachable(src)) { 1127 addss(dst, as_Address(src)); 1128 } else { 1129 lea(rscratch, src); 1130 addss(dst, Address(rscratch, 0)); 1131 } 1132 } 1133 1134 void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src, Register rscratch) { 1135 assert(rscratch != noreg || always_reachable(src), "missing"); 1136 1137 if (reachable(src)) { 1138 Assembler::addpd(dst, as_Address(src)); 1139 } else { 1140 lea(rscratch, src); 1141 Assembler::addpd(dst, Address(rscratch, 0)); 1142 } 1143 } 1144 1145 // See 8273459. Function for ensuring 64-byte alignment, intended for stubs only. 1146 // Stub code is generated once and never copied. 1147 // NMethods can't use this because they get copied and we can't force alignment > 32 bytes. 1148 void MacroAssembler::align64() { 1149 align(64, (uint)(uintptr_t)pc()); 1150 } 1151 1152 void MacroAssembler::align32() { 1153 align(32, (uint)(uintptr_t)pc()); 1154 } 1155 1156 void MacroAssembler::align(uint modulus) { 1157 // 8273459: Ensure alignment is possible with current segment alignment 1158 assert(modulus <= (uintx)CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment"); 1159 align(modulus, offset()); 1160 } 1161 1162 void MacroAssembler::align(uint modulus, uint target) { 1163 if (target % modulus != 0) { 1164 nop(modulus - (target % modulus)); 1165 } 1166 } 1167 1168 void MacroAssembler::push_f(XMMRegister r) { 1169 subptr(rsp, wordSize); 1170 movflt(Address(rsp, 0), r); 1171 } 1172 1173 void MacroAssembler::pop_f(XMMRegister r) { 1174 movflt(r, Address(rsp, 0)); 1175 addptr(rsp, wordSize); 1176 } 1177 1178 void MacroAssembler::push_d(XMMRegister r) { 1179 subptr(rsp, 2 * wordSize); 1180 movdbl(Address(rsp, 0), r); 1181 } 1182 1183 void MacroAssembler::pop_d(XMMRegister r) { 1184 movdbl(r, Address(rsp, 0)); 1185 addptr(rsp, 2 * Interpreter::stackElementSize); 1186 } 1187 1188 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register rscratch) { 1189 // Used in sign-masking with aligned address. 
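  // Note on the AVX-512 path below: xmm16-xmm31 can only be encoded with EVEX,
  // and the EVEX form of andpd requires AVX512DQ (plus AVX512VL for vectors
  // shorter than 512 bits), so for such destinations we fall back to a 512-bit
  // vpand, which only needs AVX512F.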
1190 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 1191 assert(rscratch != noreg || always_reachable(src), "missing"); 1192 1193 if (UseAVX > 2 && 1194 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) && 1195 (dst->encoding() >= 16)) { 1196 vpand(dst, dst, src, AVX_512bit, rscratch); 1197 } else if (reachable(src)) { 1198 Assembler::andpd(dst, as_Address(src)); 1199 } else { 1200 lea(rscratch, src); 1201 Assembler::andpd(dst, Address(rscratch, 0)); 1202 } 1203 } 1204 1205 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register rscratch) { 1206 // Used in sign-masking with aligned address. 1207 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 1208 assert(rscratch != noreg || always_reachable(src), "missing"); 1209 1210 if (reachable(src)) { 1211 Assembler::andps(dst, as_Address(src)); 1212 } else { 1213 lea(rscratch, src); 1214 Assembler::andps(dst, Address(rscratch, 0)); 1215 } 1216 } 1217 1218 void MacroAssembler::andptr(Register dst, int32_t imm32) { 1219 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32)); 1220 } 1221 1222 #ifdef _LP64 1223 void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) { 1224 assert(rscratch != noreg || always_reachable(src), "missing"); 1225 1226 if (reachable(src)) { 1227 andq(dst, as_Address(src)); 1228 } else { 1229 lea(rscratch, src); 1230 andq(dst, Address(rscratch, 0)); 1231 } 1232 } 1233 #endif 1234 1235 void MacroAssembler::atomic_incl(Address counter_addr) { 1236 lock(); 1237 incrementl(counter_addr); 1238 } 1239 1240 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register rscratch) { 1241 assert(rscratch != noreg || always_reachable(counter_addr), "missing"); 1242 1243 if (reachable(counter_addr)) { 1244 atomic_incl(as_Address(counter_addr)); 1245 } else { 1246 lea(rscratch, counter_addr); 1247 atomic_incl(Address(rscratch, 0)); 1248 } 1249 } 1250 1251 #ifdef _LP64 1252 void MacroAssembler::atomic_incq(Address counter_addr) { 1253 lock(); 1254 incrementq(counter_addr); 1255 } 1256 1257 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register rscratch) { 1258 assert(rscratch != noreg || always_reachable(counter_addr), "missing"); 1259 1260 if (reachable(counter_addr)) { 1261 atomic_incq(as_Address(counter_addr)); 1262 } else { 1263 lea(rscratch, counter_addr); 1264 atomic_incq(Address(rscratch, 0)); 1265 } 1266 } 1267 #endif 1268 1269 // Writes to stack successive pages until offset reached to check for 1270 // stack overflow + shadow pages. This clobbers tmp. 1271 void MacroAssembler::bang_stack_size(Register size, Register tmp) { 1272 movptr(tmp, rsp); 1273 // Bang stack for total size given plus shadow page size. 1274 // Bang one page at a time because large size can bang beyond yellow and 1275 // red zones. 1276 Label loop; 1277 bind(loop); 1278 movl(Address(tmp, (-(int)os::vm_page_size())), size ); 1279 subptr(tmp, (int)os::vm_page_size()); 1280 subl(size, (int)os::vm_page_size()); 1281 jcc(Assembler::greater, loop); 1282 1283 // Bang down shadow pages too. 1284 // At this point, (tmp-0) is the last address touched, so don't 1285 // touch it again. (It was touched as (tmp-pagesize) but then tmp 1286 // was post-decremented.) Skip this address by starting at i=1, and 1287 // touch a few more pages below. N.B. It is important to touch all 1288 // the way down including all pages in the shadow zone. 
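  // For example, with 4K pages and a 32K shadow zone the loop below emits
  // seven more stores, at tmp-4K, tmp-8K, ..., tmp-28K.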
1289 for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()); i++) { 1290 // this could be any sized move but this is can be a debugging crumb 1291 // so the bigger the better. 1292 movptr(Address(tmp, (-i*(int)os::vm_page_size())), size ); 1293 } 1294 } 1295 1296 void MacroAssembler::reserved_stack_check() { 1297 // testing if reserved zone needs to be enabled 1298 Label no_reserved_zone_enabling; 1299 Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread); 1300 NOT_LP64(get_thread(rsi);) 1301 1302 cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset())); 1303 jcc(Assembler::below, no_reserved_zone_enabling); 1304 1305 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread); 1306 jump(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry())); 1307 should_not_reach_here(); 1308 1309 bind(no_reserved_zone_enabling); 1310 } 1311 1312 void MacroAssembler::c2bool(Register x) { 1313 // implements x == 0 ? 0 : 1 1314 // note: must only look at least-significant byte of x 1315 // since C-style booleans are stored in one byte 1316 // only! (was bug) 1317 andl(x, 0xFF); 1318 setb(Assembler::notZero, x); 1319 } 1320 1321 // Wouldn't need if AddressLiteral version had new name 1322 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) { 1323 Assembler::call(L, rtype); 1324 } 1325 1326 void MacroAssembler::call(Register entry) { 1327 Assembler::call(entry); 1328 } 1329 1330 void MacroAssembler::call(AddressLiteral entry, Register rscratch) { 1331 assert(rscratch != noreg || always_reachable(entry), "missing"); 1332 1333 if (reachable(entry)) { 1334 Assembler::call_literal(entry.target(), entry.rspec()); 1335 } else { 1336 lea(rscratch, entry); 1337 Assembler::call(rscratch); 1338 } 1339 } 1340 1341 void MacroAssembler::ic_call(address entry, jint method_index) { 1342 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index); 1343 #ifdef _LP64 1344 // Needs full 64-bit immediate for later patching. 1345 mov64(rax, (int64_t)Universe::non_oop_word()); 1346 #else 1347 movptr(rax, (intptr_t)Universe::non_oop_word()); 1348 #endif 1349 call(AddressLiteral(entry, rh)); 1350 } 1351 1352 int MacroAssembler::ic_check_size() { 1353 return 1354 LP64_ONLY(UseCompactObjectHeaders ? 17 : 14) NOT_LP64(12); 1355 } 1356 1357 int MacroAssembler::ic_check(int end_alignment) { 1358 Register receiver = LP64_ONLY(j_rarg0) NOT_LP64(rcx); 1359 Register data = rax; 1360 Register temp = LP64_ONLY(rscratch1) NOT_LP64(rbx); 1361 1362 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed 1363 // before the inline cache check, so we don't have to execute any nop instructions when dispatching 1364 // through the UEP, yet we can ensure that the VEP is aligned appropriately. 
That's why we align 1365 // before the inline cache check here, and not after 1366 align(end_alignment, offset() + ic_check_size()); 1367 1368 int uep_offset = offset(); 1369 1370 #ifdef _LP64 1371 if (UseCompactObjectHeaders) { 1372 load_narrow_klass_compact(temp, receiver); 1373 cmpl(temp, Address(data, CompiledICData::speculated_klass_offset())); 1374 } else 1375 #endif 1376 if (UseCompressedClassPointers) { 1377 movl(temp, Address(receiver, oopDesc::klass_offset_in_bytes())); 1378 cmpl(temp, Address(data, CompiledICData::speculated_klass_offset())); 1379 } else { 1380 movptr(temp, Address(receiver, oopDesc::klass_offset_in_bytes())); 1381 cmpptr(temp, Address(data, CompiledICData::speculated_klass_offset())); 1382 } 1383 1384 // if inline cache check fails, then jump to runtime routine 1385 jump_cc(Assembler::notEqual, RuntimeAddress(SharedRuntime::get_ic_miss_stub())); 1386 assert((offset() % end_alignment) == 0, "Misaligned verified entry point (%d, %d, %d)", uep_offset, offset(), end_alignment); 1387 1388 return uep_offset; 1389 } 1390 1391 void MacroAssembler::emit_static_call_stub() { 1392 // Static stub relocation also tags the Method* in the code-stream. 1393 mov_metadata(rbx, (Metadata*) nullptr); // Method is zapped till fixup time. 1394 // This is recognized as unresolved by relocs/nativeinst/ic code. 1395 jump(RuntimeAddress(pc())); 1396 } 1397 1398 // Implementation of call_VM versions 1399 1400 void MacroAssembler::call_VM(Register oop_result, 1401 address entry_point, 1402 bool check_exceptions) { 1403 Label C, E; 1404 call(C, relocInfo::none); 1405 jmp(E); 1406 1407 bind(C); 1408 call_VM_helper(oop_result, entry_point, 0, check_exceptions); 1409 ret(0); 1410 1411 bind(E); 1412 } 1413 1414 void MacroAssembler::call_VM(Register oop_result, 1415 address entry_point, 1416 Register arg_1, 1417 bool check_exceptions) { 1418 Label C, E; 1419 call(C, relocInfo::none); 1420 jmp(E); 1421 1422 bind(C); 1423 pass_arg1(this, arg_1); 1424 call_VM_helper(oop_result, entry_point, 1, check_exceptions); 1425 ret(0); 1426 1427 bind(E); 1428 } 1429 1430 void MacroAssembler::call_VM(Register oop_result, 1431 address entry_point, 1432 Register arg_1, 1433 Register arg_2, 1434 bool check_exceptions) { 1435 Label C, E; 1436 call(C, relocInfo::none); 1437 jmp(E); 1438 1439 bind(C); 1440 1441 LP64_ONLY(assert_different_registers(arg_1, c_rarg2)); 1442 1443 pass_arg2(this, arg_2); 1444 pass_arg1(this, arg_1); 1445 call_VM_helper(oop_result, entry_point, 2, check_exceptions); 1446 ret(0); 1447 1448 bind(E); 1449 } 1450 1451 void MacroAssembler::call_VM(Register oop_result, 1452 address entry_point, 1453 Register arg_1, 1454 Register arg_2, 1455 Register arg_3, 1456 bool check_exceptions) { 1457 Label C, E; 1458 call(C, relocInfo::none); 1459 jmp(E); 1460 1461 bind(C); 1462 1463 LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3)); 1464 LP64_ONLY(assert_different_registers(arg_2, c_rarg3)); 1465 pass_arg3(this, arg_3); 1466 pass_arg2(this, arg_2); 1467 pass_arg1(this, arg_1); 1468 call_VM_helper(oop_result, entry_point, 3, check_exceptions); 1469 ret(0); 1470 1471 bind(E); 1472 } 1473 1474 void MacroAssembler::call_VM(Register oop_result, 1475 Register last_java_sp, 1476 address entry_point, 1477 int number_of_arguments, 1478 bool check_exceptions) { 1479 call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions); 1480 } 1481 1482 void MacroAssembler::call_VM(Register oop_result, 1483 Register last_java_sp, 1484 address entry_point, 1485 Register arg_1, 1486 
bool check_exceptions) { 1487 pass_arg1(this, arg_1); 1488 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 1489 } 1490 1491 void MacroAssembler::call_VM(Register oop_result, 1492 Register last_java_sp, 1493 address entry_point, 1494 Register arg_1, 1495 Register arg_2, 1496 bool check_exceptions) { 1497 1498 LP64_ONLY(assert_different_registers(arg_1, c_rarg2)); 1499 pass_arg2(this, arg_2); 1500 pass_arg1(this, arg_1); 1501 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 1502 } 1503 1504 void MacroAssembler::call_VM(Register oop_result, 1505 Register last_java_sp, 1506 address entry_point, 1507 Register arg_1, 1508 Register arg_2, 1509 Register arg_3, 1510 bool check_exceptions) { 1511 LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3)); 1512 LP64_ONLY(assert_different_registers(arg_2, c_rarg3)); 1513 pass_arg3(this, arg_3); 1514 pass_arg2(this, arg_2); 1515 pass_arg1(this, arg_1); 1516 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 1517 } 1518 1519 void MacroAssembler::super_call_VM(Register oop_result, 1520 Register last_java_sp, 1521 address entry_point, 1522 int number_of_arguments, 1523 bool check_exceptions) { 1524 MacroAssembler::call_VM_base(oop_result, last_java_sp, entry_point, number_of_arguments, check_exceptions); 1525 } 1526 1527 void MacroAssembler::super_call_VM(Register oop_result, 1528 Register last_java_sp, 1529 address entry_point, 1530 Register arg_1, 1531 bool check_exceptions) { 1532 pass_arg1(this, arg_1); 1533 super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 1534 } 1535 1536 void MacroAssembler::super_call_VM(Register oop_result, 1537 Register last_java_sp, 1538 address entry_point, 1539 Register arg_1, 1540 Register arg_2, 1541 bool check_exceptions) { 1542 1543 LP64_ONLY(assert_different_registers(arg_1, c_rarg2)); 1544 pass_arg2(this, arg_2); 1545 pass_arg1(this, arg_1); 1546 super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 1547 } 1548 1549 void MacroAssembler::super_call_VM(Register oop_result, 1550 Register last_java_sp, 1551 address entry_point, 1552 Register arg_1, 1553 Register arg_2, 1554 Register arg_3, 1555 bool check_exceptions) { 1556 LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3)); 1557 LP64_ONLY(assert_different_registers(arg_2, c_rarg3)); 1558 pass_arg3(this, arg_3); 1559 pass_arg2(this, arg_2); 1560 pass_arg1(this, arg_1); 1561 super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 1562 } 1563 1564 void MacroAssembler::call_VM_base(Register oop_result, 1565 Register last_java_sp, 1566 address entry_point, 1567 int number_of_arguments, 1568 bool check_exceptions) { 1569 Register java_thread = r15_thread; 1570 1571 // determine last_java_sp register 1572 if (!last_java_sp->is_valid()) { 1573 last_java_sp = rsp; 1574 } 1575 // debugging support 1576 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 1577 #ifdef ASSERT 1578 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 1579 // r12 is the heapbase. 
  if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, r15_thread);

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(last_java_sp, rbp, nullptr, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

#ifdef ASSERT
  // Check that thread register is not clobbered.
  guarantee(java_thread != rax, "change this code");
  push(rax);
  { Label L;
    get_thread_slow(rax);
    cmpptr(java_thread, rax);
    jcc(Assembler::equal, L);
    STOP("MacroAssembler::call_VM_base: java_thread not callee saved?");
    bind(L);
  }
  pop(rax);
#endif

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe();
  check_and_handle_earlyret();

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  // The calculation of the value for last_Java_sp is somewhat subtle.
  // call_VM does an intermediate call which places a return address on
  // the stack just under the stack pointer as the user finished with it.
  // This allows us to retrieve last_Java_pc from last_Java_sp[-1].

  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));

  call_VM_base(oop_result, rax, entry_point, number_of_arguments, check_exceptions);
}

// Use this method when the MacroAssembler version of call_VM_leaf_base() should be called from the Interpreter.
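// Unlike the call_VM variants above, the call_VM_leaf* variants call straight
// into C without setting up a last_Java_frame and without checking for pending
// exceptions afterwards, so they are only appropriate for leaf runtime entries
// that neither block nor throw.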
1653 void MacroAssembler::call_VM_leaf0(address entry_point) { 1654 MacroAssembler::call_VM_leaf_base(entry_point, 0); 1655 } 1656 1657 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 1658 call_VM_leaf_base(entry_point, number_of_arguments); 1659 } 1660 1661 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 1662 pass_arg0(this, arg_0); 1663 call_VM_leaf(entry_point, 1); 1664 } 1665 1666 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1667 1668 LP64_ONLY(assert_different_registers(arg_0, c_rarg1)); 1669 pass_arg1(this, arg_1); 1670 pass_arg0(this, arg_0); 1671 call_VM_leaf(entry_point, 2); 1672 } 1673 1674 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1675 LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2)); 1676 LP64_ONLY(assert_different_registers(arg_1, c_rarg2)); 1677 pass_arg2(this, arg_2); 1678 pass_arg1(this, arg_1); 1679 pass_arg0(this, arg_0); 1680 call_VM_leaf(entry_point, 3); 1681 } 1682 1683 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1684 LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3)); 1685 LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3)); 1686 LP64_ONLY(assert_different_registers(arg_2, c_rarg3)); 1687 pass_arg3(this, arg_3); 1688 pass_arg2(this, arg_2); 1689 pass_arg1(this, arg_1); 1690 pass_arg0(this, arg_0); 1691 call_VM_leaf(entry_point, 3); 1692 } 1693 1694 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 1695 pass_arg0(this, arg_0); 1696 MacroAssembler::call_VM_leaf_base(entry_point, 1); 1697 } 1698 1699 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 1700 LP64_ONLY(assert_different_registers(arg_0, c_rarg1)); 1701 pass_arg1(this, arg_1); 1702 pass_arg0(this, arg_0); 1703 MacroAssembler::call_VM_leaf_base(entry_point, 2); 1704 } 1705 1706 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 1707 LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2)); 1708 LP64_ONLY(assert_different_registers(arg_1, c_rarg2)); 1709 pass_arg2(this, arg_2); 1710 pass_arg1(this, arg_1); 1711 pass_arg0(this, arg_0); 1712 MacroAssembler::call_VM_leaf_base(entry_point, 3); 1713 } 1714 1715 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 1716 LP64_ONLY(assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3)); 1717 LP64_ONLY(assert_different_registers(arg_1, c_rarg2, c_rarg3)); 1718 LP64_ONLY(assert_different_registers(arg_2, c_rarg3)); 1719 pass_arg3(this, arg_3); 1720 pass_arg2(this, arg_2); 1721 pass_arg1(this, arg_1); 1722 pass_arg0(this, arg_0); 1723 MacroAssembler::call_VM_leaf_base(entry_point, 4); 1724 } 1725 1726 void MacroAssembler::get_vm_result(Register oop_result) { 1727 movptr(oop_result, Address(r15_thread, JavaThread::vm_result_offset())); 1728 movptr(Address(r15_thread, JavaThread::vm_result_offset()), NULL_WORD); 1729 verify_oop_msg(oop_result, "broken oop in call_VM_base"); 1730 } 1731 1732 void MacroAssembler::get_vm_result_2(Register metadata_result) { 1733 movptr(metadata_result, Address(r15_thread, JavaThread::vm_result_2_offset())); 1734 movptr(Address(r15_thread, JavaThread::vm_result_2_offset()), NULL_WORD); 1735 } 1736 1737 void 
MacroAssembler::check_and_handle_earlyret() { 1738 } 1739 1740 void MacroAssembler::check_and_handle_popframe() { 1741 } 1742 1743 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm, Register rscratch) { 1744 assert(rscratch != noreg || always_reachable(src1), "missing"); 1745 1746 if (reachable(src1)) { 1747 cmpl(as_Address(src1), imm); 1748 } else { 1749 lea(rscratch, src1); 1750 cmpl(Address(rscratch, 0), imm); 1751 } 1752 } 1753 1754 void MacroAssembler::cmp32(Register src1, AddressLiteral src2, Register rscratch) { 1755 assert(!src2.is_lval(), "use cmpptr"); 1756 assert(rscratch != noreg || always_reachable(src2), "missing"); 1757 1758 if (reachable(src2)) { 1759 cmpl(src1, as_Address(src2)); 1760 } else { 1761 lea(rscratch, src2); 1762 cmpl(src1, Address(rscratch, 0)); 1763 } 1764 } 1765 1766 void MacroAssembler::cmp32(Register src1, int32_t imm) { 1767 Assembler::cmpl(src1, imm); 1768 } 1769 1770 void MacroAssembler::cmp32(Register src1, Address src2) { 1771 Assembler::cmpl(src1, src2); 1772 } 1773 1774 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { 1775 ucomisd(opr1, opr2); 1776 1777 Label L; 1778 if (unordered_is_less) { 1779 movl(dst, -1); 1780 jcc(Assembler::parity, L); 1781 jcc(Assembler::below , L); 1782 movl(dst, 0); 1783 jcc(Assembler::equal , L); 1784 increment(dst); 1785 } else { // unordered is greater 1786 movl(dst, 1); 1787 jcc(Assembler::parity, L); 1788 jcc(Assembler::above , L); 1789 movl(dst, 0); 1790 jcc(Assembler::equal , L); 1791 decrementl(dst); 1792 } 1793 bind(L); 1794 } 1795 1796 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { 1797 ucomiss(opr1, opr2); 1798 1799 Label L; 1800 if (unordered_is_less) { 1801 movl(dst, -1); 1802 jcc(Assembler::parity, L); 1803 jcc(Assembler::below , L); 1804 movl(dst, 0); 1805 jcc(Assembler::equal , L); 1806 increment(dst); 1807 } else { // unordered is greater 1808 movl(dst, 1); 1809 jcc(Assembler::parity, L); 1810 jcc(Assembler::above , L); 1811 movl(dst, 0); 1812 jcc(Assembler::equal , L); 1813 decrementl(dst); 1814 } 1815 bind(L); 1816 } 1817 1818 1819 void MacroAssembler::cmp8(AddressLiteral src1, int imm, Register rscratch) { 1820 assert(rscratch != noreg || always_reachable(src1), "missing"); 1821 1822 if (reachable(src1)) { 1823 cmpb(as_Address(src1), imm); 1824 } else { 1825 lea(rscratch, src1); 1826 cmpb(Address(rscratch, 0), imm); 1827 } 1828 } 1829 1830 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2, Register rscratch) { 1831 #ifdef _LP64 1832 assert(rscratch != noreg || always_reachable(src2), "missing"); 1833 1834 if (src2.is_lval()) { 1835 movptr(rscratch, src2); 1836 Assembler::cmpq(src1, rscratch); 1837 } else if (reachable(src2)) { 1838 cmpq(src1, as_Address(src2)); 1839 } else { 1840 lea(rscratch, src2); 1841 Assembler::cmpq(src1, Address(rscratch, 0)); 1842 } 1843 #else 1844 assert(rscratch == noreg, "not needed"); 1845 if (src2.is_lval()) { 1846 cmp_literal32(src1, (int32_t)src2.target(), src2.rspec()); 1847 } else { 1848 cmpl(src1, as_Address(src2)); 1849 } 1850 #endif // _LP64 1851 } 1852 1853 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2, Register rscratch) { 1854 assert(src2.is_lval(), "not a mem-mem compare"); 1855 #ifdef _LP64 1856 // moves src2's literal address 1857 movptr(rscratch, src2); 1858 Assembler::cmpq(src1, rscratch); 1859 #else 1860 assert(rscratch == noreg, "not needed"); 1861 cmp_literal32(src1, (int32_t)src2.target(), 
src2.rspec()); 1862 #endif // _LP64 1863 } 1864 1865 void MacroAssembler::cmpoop(Register src1, Register src2) { 1866 cmpptr(src1, src2); 1867 } 1868 1869 void MacroAssembler::cmpoop(Register src1, Address src2) { 1870 cmpptr(src1, src2); 1871 } 1872 1873 #ifdef _LP64 1874 void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) { 1875 movoop(rscratch, src2); 1876 cmpptr(src1, rscratch); 1877 } 1878 #endif 1879 1880 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) { 1881 assert(rscratch != noreg || always_reachable(adr), "missing"); 1882 1883 if (reachable(adr)) { 1884 lock(); 1885 cmpxchgptr(reg, as_Address(adr)); 1886 } else { 1887 lea(rscratch, adr); 1888 lock(); 1889 cmpxchgptr(reg, Address(rscratch, 0)); 1890 } 1891 } 1892 1893 void MacroAssembler::cmpxchgptr(Register reg, Address adr) { 1894 LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr)); 1895 } 1896 1897 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src, Register rscratch) { 1898 assert(rscratch != noreg || always_reachable(src), "missing"); 1899 1900 if (reachable(src)) { 1901 Assembler::comisd(dst, as_Address(src)); 1902 } else { 1903 lea(rscratch, src); 1904 Assembler::comisd(dst, Address(rscratch, 0)); 1905 } 1906 } 1907 1908 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src, Register rscratch) { 1909 assert(rscratch != noreg || always_reachable(src), "missing"); 1910 1911 if (reachable(src)) { 1912 Assembler::comiss(dst, as_Address(src)); 1913 } else { 1914 lea(rscratch, src); 1915 Assembler::comiss(dst, Address(rscratch, 0)); 1916 } 1917 } 1918 1919 1920 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr, Register rscratch) { 1921 assert(rscratch != noreg || always_reachable(counter_addr), "missing"); 1922 1923 Condition negated_cond = negate_condition(cond); 1924 Label L; 1925 jcc(negated_cond, L); 1926 pushf(); // Preserve flags 1927 atomic_incl(counter_addr, rscratch); 1928 popf(); 1929 bind(L); 1930 } 1931 1932 int MacroAssembler::corrected_idivl(Register reg) { 1933 // Full implementation of Java idiv and irem; checks for 1934 // special case as described in JVM spec., p.243 & p.271. 1935 // The function returns the (pc) offset of the idivl 1936 // instruction - may be needed for implicit exceptions. 
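// Background: idivl traps not only on a zero divisor but also when the
// quotient does not fit in 32 bits; min_int / -1 would be +2^31, so the CPU
// raises a divide-error (#DE) for that input pair. The JVM spec instead
// requires min_int / -1 == min_int and min_int % -1 == 0, which the special
// case below produces without ever reaching the idivl instruction.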
1937 // 1938 // normal case special case 1939 // 1940 // input : rax,: dividend min_int 1941 // reg: divisor (may not be rax,/rdx) -1 1942 // 1943 // output: rax,: quotient (= rax, idiv reg) min_int 1944 // rdx: remainder (= rax, irem reg) 0 1945 assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register"); 1946 const int min_int = 0x80000000; 1947 Label normal_case, special_case; 1948 1949 // check for special case 1950 cmpl(rax, min_int); 1951 jcc(Assembler::notEqual, normal_case); 1952 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0) 1953 cmpl(reg, -1); 1954 jcc(Assembler::equal, special_case); 1955 1956 // handle normal case 1957 bind(normal_case); 1958 cdql(); 1959 int idivl_offset = offset(); 1960 idivl(reg); 1961 1962 // normal and special case exit 1963 bind(special_case); 1964 1965 return idivl_offset; 1966 } 1967 1968 1969 1970 void MacroAssembler::decrementl(Register reg, int value) { 1971 if (value == min_jint) {subl(reg, value) ; return; } 1972 if (value < 0) { incrementl(reg, -value); return; } 1973 if (value == 0) { ; return; } 1974 if (value == 1 && UseIncDec) { decl(reg) ; return; } 1975 /* else */ { subl(reg, value) ; return; } 1976 } 1977 1978 void MacroAssembler::decrementl(Address dst, int value) { 1979 if (value == min_jint) {subl(dst, value) ; return; } 1980 if (value < 0) { incrementl(dst, -value); return; } 1981 if (value == 0) { ; return; } 1982 if (value == 1 && UseIncDec) { decl(dst) ; return; } 1983 /* else */ { subl(dst, value) ; return; } 1984 } 1985 1986 void MacroAssembler::division_with_shift (Register reg, int shift_value) { 1987 assert(shift_value > 0, "illegal shift value"); 1988 Label _is_positive; 1989 testl (reg, reg); 1990 jcc (Assembler::positive, _is_positive); 1991 int offset = (1 << shift_value) - 1 ; 1992 1993 if (offset == 1) { 1994 incrementl(reg); 1995 } else { 1996 addl(reg, offset); 1997 } 1998 1999 bind (_is_positive); 2000 sarl(reg, shift_value); 2001 } 2002 2003 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2004 assert(rscratch != noreg || always_reachable(src), "missing"); 2005 2006 if (reachable(src)) { 2007 Assembler::divsd(dst, as_Address(src)); 2008 } else { 2009 lea(rscratch, src); 2010 Assembler::divsd(dst, Address(rscratch, 0)); 2011 } 2012 } 2013 2014 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src, Register rscratch) { 2015 assert(rscratch != noreg || always_reachable(src), "missing"); 2016 2017 if (reachable(src)) { 2018 Assembler::divss(dst, as_Address(src)); 2019 } else { 2020 lea(rscratch, src); 2021 Assembler::divss(dst, Address(rscratch, 0)); 2022 } 2023 } 2024 2025 void MacroAssembler::enter() { 2026 push(rbp); 2027 mov(rbp, rsp); 2028 } 2029 2030 void MacroAssembler::post_call_nop() { 2031 if (!Continuations::enabled()) { 2032 return; 2033 } 2034 InstructionMark im(this); 2035 relocate(post_call_nop_Relocation::spec()); 2036 InlineSkippedInstructionsCounter skipCounter(this); 2037 emit_int8((uint8_t)0x0f); 2038 emit_int8((uint8_t)0x1f); 2039 emit_int8((uint8_t)0x84); 2040 emit_int8((uint8_t)0x00); 2041 emit_int32(0x00); 2042 } 2043 2044 // A 5 byte nop that is safe for patching (see patch_verified_entry) 2045 void MacroAssembler::fat_nop() { 2046 if (UseAddressNop) { 2047 addr_nop_5(); 2048 } else { 2049 emit_int8((uint8_t)0x26); // es: 2050 emit_int8((uint8_t)0x2e); // cs: 2051 emit_int8((uint8_t)0x64); // fs: 2052 emit_int8((uint8_t)0x65); // gs: 2053 emit_int8((uint8_t)0x90); 2054 } 2055 } 2056 2057 #ifndef _LP64 2058 void 
MacroAssembler::fcmp(Register tmp) { 2059 fcmp(tmp, 1, true, true); 2060 } 2061 2062 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) { 2063 assert(!pop_right || pop_left, "usage error"); 2064 if (VM_Version::supports_cmov()) { 2065 assert(tmp == noreg, "unneeded temp"); 2066 if (pop_left) { 2067 fucomip(index); 2068 } else { 2069 fucomi(index); 2070 } 2071 if (pop_right) { 2072 fpop(); 2073 } 2074 } else { 2075 assert(tmp != noreg, "need temp"); 2076 if (pop_left) { 2077 if (pop_right) { 2078 fcompp(); 2079 } else { 2080 fcomp(index); 2081 } 2082 } else { 2083 fcom(index); 2084 } 2085 // convert FPU condition into eflags condition via rax, 2086 save_rax(tmp); 2087 fwait(); fnstsw_ax(); 2088 sahf(); 2089 restore_rax(tmp); 2090 } 2091 // condition codes set as follows: 2092 // 2093 // CF (corresponds to C0) if x < y 2094 // PF (corresponds to C2) if unordered 2095 // ZF (corresponds to C3) if x = y 2096 } 2097 2098 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) { 2099 fcmp2int(dst, unordered_is_less, 1, true, true); 2100 } 2101 2102 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) { 2103 fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right); 2104 Label L; 2105 if (unordered_is_less) { 2106 movl(dst, -1); 2107 jcc(Assembler::parity, L); 2108 jcc(Assembler::below , L); 2109 movl(dst, 0); 2110 jcc(Assembler::equal , L); 2111 increment(dst); 2112 } else { // unordered is greater 2113 movl(dst, 1); 2114 jcc(Assembler::parity, L); 2115 jcc(Assembler::above , L); 2116 movl(dst, 0); 2117 jcc(Assembler::equal , L); 2118 decrementl(dst); 2119 } 2120 bind(L); 2121 } 2122 2123 void MacroAssembler::fld_d(AddressLiteral src) { 2124 fld_d(as_Address(src)); 2125 } 2126 2127 void MacroAssembler::fld_s(AddressLiteral src) { 2128 fld_s(as_Address(src)); 2129 } 2130 2131 void MacroAssembler::fldcw(AddressLiteral src) { 2132 fldcw(as_Address(src)); 2133 } 2134 2135 void MacroAssembler::fpop() { 2136 ffree(); 2137 fincstp(); 2138 } 2139 2140 void MacroAssembler::fremr(Register tmp) { 2141 save_rax(tmp); 2142 { Label L; 2143 bind(L); 2144 fprem(); 2145 fwait(); fnstsw_ax(); 2146 sahf(); 2147 jcc(Assembler::parity, L); 2148 } 2149 restore_rax(tmp); 2150 // Result is in ST0. 
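// (The loop above repeats fprem until the FPU C2 flag clears: fnstsw_ax/sahf
// map C2 onto the parity flag, so jcc(parity, L) keeps iterating while the
// partial remainder is still incomplete.)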
2151 // Note: fxch & fpop to get rid of ST1 2152 // (otherwise FPU stack could overflow eventually) 2153 fxch(1); 2154 fpop(); 2155 } 2156 2157 void MacroAssembler::empty_FPU_stack() { 2158 if (VM_Version::supports_mmx()) { 2159 emms(); 2160 } else { 2161 for (int i = 8; i-- > 0; ) ffree(i); 2162 } 2163 } 2164 #endif // !LP64 2165 2166 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2167 assert(rscratch != noreg || always_reachable(src), "missing"); 2168 if (reachable(src)) { 2169 Assembler::mulpd(dst, as_Address(src)); 2170 } else { 2171 lea(rscratch, src); 2172 Assembler::mulpd(dst, Address(rscratch, 0)); 2173 } 2174 } 2175 2176 void MacroAssembler::load_float(Address src) { 2177 #ifdef _LP64 2178 movflt(xmm0, src); 2179 #else 2180 if (UseSSE >= 1) { 2181 movflt(xmm0, src); 2182 } else { 2183 fld_s(src); 2184 } 2185 #endif // LP64 2186 } 2187 2188 void MacroAssembler::store_float(Address dst) { 2189 #ifdef _LP64 2190 movflt(dst, xmm0); 2191 #else 2192 if (UseSSE >= 1) { 2193 movflt(dst, xmm0); 2194 } else { 2195 fstp_s(dst); 2196 } 2197 #endif // LP64 2198 } 2199 2200 void MacroAssembler::load_double(Address src) { 2201 #ifdef _LP64 2202 movdbl(xmm0, src); 2203 #else 2204 if (UseSSE >= 2) { 2205 movdbl(xmm0, src); 2206 } else { 2207 fld_d(src); 2208 } 2209 #endif // LP64 2210 } 2211 2212 void MacroAssembler::store_double(Address dst) { 2213 #ifdef _LP64 2214 movdbl(dst, xmm0); 2215 #else 2216 if (UseSSE >= 2) { 2217 movdbl(dst, xmm0); 2218 } else { 2219 fstp_d(dst); 2220 } 2221 #endif // LP64 2222 } 2223 2224 // dst = c = a * b + c 2225 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) { 2226 Assembler::vfmadd231sd(c, a, b); 2227 if (dst != c) { 2228 movdbl(dst, c); 2229 } 2230 } 2231 2232 // dst = c = a * b + c 2233 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) { 2234 Assembler::vfmadd231ss(c, a, b); 2235 if (dst != c) { 2236 movflt(dst, c); 2237 } 2238 } 2239 2240 // dst = c = a * b + c 2241 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { 2242 Assembler::vfmadd231pd(c, a, b, vector_len); 2243 if (dst != c) { 2244 vmovdqu(dst, c); 2245 } 2246 } 2247 2248 // dst = c = a * b + c 2249 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { 2250 Assembler::vfmadd231ps(c, a, b, vector_len); 2251 if (dst != c) { 2252 vmovdqu(dst, c); 2253 } 2254 } 2255 2256 // dst = c = a * b + c 2257 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { 2258 Assembler::vfmadd231pd(c, a, b, vector_len); 2259 if (dst != c) { 2260 vmovdqu(dst, c); 2261 } 2262 } 2263 2264 // dst = c = a * b + c 2265 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { 2266 Assembler::vfmadd231ps(c, a, b, vector_len); 2267 if (dst != c) { 2268 vmovdqu(dst, c); 2269 } 2270 } 2271 2272 void MacroAssembler::incrementl(AddressLiteral dst, Register rscratch) { 2273 assert(rscratch != noreg || always_reachable(dst), "missing"); 2274 2275 if (reachable(dst)) { 2276 incrementl(as_Address(dst)); 2277 } else { 2278 lea(rscratch, dst); 2279 incrementl(Address(rscratch, 0)); 2280 } 2281 } 2282 2283 void MacroAssembler::incrementl(ArrayAddress dst, Register rscratch) { 2284 incrementl(as_Address(dst, rscratch)); 2285 } 2286 2287 void MacroAssembler::incrementl(Register reg, int value) { 2288 if (value == min_jint) 
{addl(reg, value) ; return; } 2289 if (value < 0) { decrementl(reg, -value); return; } 2290 if (value == 0) { ; return; } 2291 if (value == 1 && UseIncDec) { incl(reg) ; return; } 2292 /* else */ { addl(reg, value) ; return; } 2293 } 2294 2295 void MacroAssembler::incrementl(Address dst, int value) { 2296 if (value == min_jint) {addl(dst, value) ; return; } 2297 if (value < 0) { decrementl(dst, -value); return; } 2298 if (value == 0) { ; return; } 2299 if (value == 1 && UseIncDec) { incl(dst) ; return; } 2300 /* else */ { addl(dst, value) ; return; } 2301 } 2302 2303 void MacroAssembler::jump(AddressLiteral dst, Register rscratch) { 2304 assert(rscratch != noreg || always_reachable(dst), "missing"); 2305 assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump"); 2306 if (reachable(dst)) { 2307 jmp_literal(dst.target(), dst.rspec()); 2308 } else { 2309 lea(rscratch, dst); 2310 jmp(rscratch); 2311 } 2312 } 2313 2314 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst, Register rscratch) { 2315 assert(rscratch != noreg || always_reachable(dst), "missing"); 2316 assert(!dst.rspec().reloc()->is_data(), "should not use ExternalAddress for jump_cc"); 2317 if (reachable(dst)) { 2318 InstructionMark im(this); 2319 relocate(dst.reloc()); 2320 const int short_size = 2; 2321 const int long_size = 6; 2322 int offs = (intptr_t)dst.target() - ((intptr_t)pc()); 2323 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) { 2324 // 0111 tttn #8-bit disp 2325 emit_int8(0x70 | cc); 2326 emit_int8((offs - short_size) & 0xFF); 2327 } else { 2328 // 0000 1111 1000 tttn #32-bit disp 2329 emit_int8(0x0F); 2330 emit_int8((unsigned char)(0x80 | cc)); 2331 emit_int32(offs - long_size); 2332 } 2333 } else { 2334 #ifdef ASSERT 2335 warning("reversing conditional branch"); 2336 #endif /* ASSERT */ 2337 Label skip; 2338 jccb(reverse[cc], skip); 2339 lea(rscratch, dst); 2340 Assembler::jmp(rscratch); 2341 bind(skip); 2342 } 2343 } 2344 2345 void MacroAssembler::cmp32_mxcsr_std(Address mxcsr_save, Register tmp, Register rscratch) { 2346 ExternalAddress mxcsr_std(StubRoutines::x86::addr_mxcsr_std()); 2347 assert(rscratch != noreg || always_reachable(mxcsr_std), "missing"); 2348 2349 stmxcsr(mxcsr_save); 2350 movl(tmp, mxcsr_save); 2351 if (EnableX86ECoreOpts) { 2352 // The mxcsr_std has status bits set for performance on ECore 2353 orl(tmp, 0x003f); 2354 } else { 2355 // Mask out status bits (only check control and mask bits) 2356 andl(tmp, 0xFFC0); 2357 } 2358 cmp32(tmp, mxcsr_std, rscratch); 2359 } 2360 2361 void MacroAssembler::ldmxcsr(AddressLiteral src, Register rscratch) { 2362 assert(rscratch != noreg || always_reachable(src), "missing"); 2363 2364 if (reachable(src)) { 2365 Assembler::ldmxcsr(as_Address(src)); 2366 } else { 2367 lea(rscratch, src); 2368 Assembler::ldmxcsr(Address(rscratch, 0)); 2369 } 2370 } 2371 2372 int MacroAssembler::load_signed_byte(Register dst, Address src) { 2373 int off; 2374 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 2375 off = offset(); 2376 movsbl(dst, src); // movsxb 2377 } else { 2378 off = load_unsigned_byte(dst, src); 2379 shll(dst, 24); 2380 sarl(dst, 24); 2381 } 2382 return off; 2383 } 2384 2385 // Note: load_signed_short used to be called load_signed_word. 2386 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler 2387 // manual, which means 16 bits, that usage is found nowhere in HotSpot code. 2388 // The term "word" in HotSpot means a 32- or 64-bit machine word. 
2389 int MacroAssembler::load_signed_short(Register dst, Address src) { 2390 int off; 2391 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 2392 // This is dubious to me since it seems safe to do a signed 16 => 64 bit 2393 // version, but this is what 64-bit has always done. This seems to imply 2394 // that users are only using 32 bits' worth. 2395 off = offset(); 2396 movswl(dst, src); // movsxw 2397 } else { 2398 off = load_unsigned_short(dst, src); 2399 shll(dst, 16); 2400 sarl(dst, 16); 2401 } 2402 return off; 2403 } 2404 2405 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 2406 // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16, 2407 // and "3.9 Partial Register Penalties", p. 22. 2408 int off; 2409 if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) { 2410 off = offset(); 2411 movzbl(dst, src); // movzxb 2412 } else { 2413 xorl(dst, dst); 2414 off = offset(); 2415 movb(dst, src); 2416 } 2417 return off; 2418 } 2419 2420 // Note: load_unsigned_short used to be called load_unsigned_word. 2421 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 2422 // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16, 2423 // and "3.9 Partial Register Penalties", p. 22. 2424 int off; 2425 if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) { 2426 off = offset(); 2427 movzwl(dst, src); // movzxw 2428 } else { 2429 xorl(dst, dst); 2430 off = offset(); 2431 movw(dst, src); 2432 } 2433 return off; 2434 } 2435 2436 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 2437 switch (size_in_bytes) { 2438 #ifndef _LP64 2439 case 8: 2440 assert(dst2 != noreg, "second dest register required"); 2441 movl(dst, src); 2442 movl(dst2, src.plus_disp(BytesPerInt)); 2443 break; 2444 #else 2445 case 8: movq(dst, src); break; 2446 #endif 2447 case 4: movl(dst, src); break; 2448 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 2449 case 1: is_signed ?
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 2450 default: ShouldNotReachHere(); 2451 } 2452 } 2453 2454 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 2455 switch (size_in_bytes) { 2456 #ifndef _LP64 2457 case 8: 2458 assert(src2 != noreg, "second source register required"); 2459 movl(dst, src); 2460 movl(dst.plus_disp(BytesPerInt), src2); 2461 break; 2462 #else 2463 case 8: movq(dst, src); break; 2464 #endif 2465 case 4: movl(dst, src); break; 2466 case 2: movw(dst, src); break; 2467 case 1: movb(dst, src); break; 2468 default: ShouldNotReachHere(); 2469 } 2470 } 2471 2472 void MacroAssembler::mov32(AddressLiteral dst, Register src, Register rscratch) { 2473 assert(rscratch != noreg || always_reachable(dst), "missing"); 2474 2475 if (reachable(dst)) { 2476 movl(as_Address(dst), src); 2477 } else { 2478 lea(rscratch, dst); 2479 movl(Address(rscratch, 0), src); 2480 } 2481 } 2482 2483 void MacroAssembler::mov32(Register dst, AddressLiteral src) { 2484 if (reachable(src)) { 2485 movl(dst, as_Address(src)); 2486 } else { 2487 lea(dst, src); 2488 movl(dst, Address(dst, 0)); 2489 } 2490 } 2491 2492 // C++ bool manipulation 2493 2494 void MacroAssembler::movbool(Register dst, Address src) { 2495 if(sizeof(bool) == 1) 2496 movb(dst, src); 2497 else if(sizeof(bool) == 2) 2498 movw(dst, src); 2499 else if(sizeof(bool) == 4) 2500 movl(dst, src); 2501 else 2502 // unsupported 2503 ShouldNotReachHere(); 2504 } 2505 2506 void MacroAssembler::movbool(Address dst, bool boolconst) { 2507 if(sizeof(bool) == 1) 2508 movb(dst, (int) boolconst); 2509 else if(sizeof(bool) == 2) 2510 movw(dst, (int) boolconst); 2511 else if(sizeof(bool) == 4) 2512 movl(dst, (int) boolconst); 2513 else 2514 // unsupported 2515 ShouldNotReachHere(); 2516 } 2517 2518 void MacroAssembler::movbool(Address dst, Register src) { 2519 if(sizeof(bool) == 1) 2520 movb(dst, src); 2521 else if(sizeof(bool) == 2) 2522 movw(dst, src); 2523 else if(sizeof(bool) == 4) 2524 movl(dst, src); 2525 else 2526 // unsupported 2527 ShouldNotReachHere(); 2528 } 2529 2530 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src, Register rscratch) { 2531 assert(rscratch != noreg || always_reachable(src), "missing"); 2532 2533 if (reachable(src)) { 2534 movdl(dst, as_Address(src)); 2535 } else { 2536 lea(rscratch, src); 2537 movdl(dst, Address(rscratch, 0)); 2538 } 2539 } 2540 2541 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src, Register rscratch) { 2542 assert(rscratch != noreg || always_reachable(src), "missing"); 2543 2544 if (reachable(src)) { 2545 movq(dst, as_Address(src)); 2546 } else { 2547 lea(rscratch, src); 2548 movq(dst, Address(rscratch, 0)); 2549 } 2550 } 2551 2552 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src, Register rscratch) { 2553 assert(rscratch != noreg || always_reachable(src), "missing"); 2554 2555 if (reachable(src)) { 2556 if (UseXmmLoadAndClearUpper) { 2557 movsd (dst, as_Address(src)); 2558 } else { 2559 movlpd(dst, as_Address(src)); 2560 } 2561 } else { 2562 lea(rscratch, src); 2563 if (UseXmmLoadAndClearUpper) { 2564 movsd (dst, Address(rscratch, 0)); 2565 } else { 2566 movlpd(dst, Address(rscratch, 0)); 2567 } 2568 } 2569 } 2570 2571 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src, Register rscratch) { 2572 assert(rscratch != noreg || always_reachable(src), "missing"); 2573 2574 if (reachable(src)) { 2575 movss(dst, as_Address(src)); 2576 } else { 2577 lea(rscratch, src); 2578 movss(dst, 
Address(rscratch, 0)); 2579 } 2580 } 2581 2582 void MacroAssembler::movptr(Register dst, Register src) { 2583 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2584 } 2585 2586 void MacroAssembler::movptr(Register dst, Address src) { 2587 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2588 } 2589 2590 // src should NEVER be a real pointer. Use AddressLiteral for true pointers 2591 void MacroAssembler::movptr(Register dst, intptr_t src) { 2592 #ifdef _LP64 2593 if (is_uimm32(src)) { 2594 movl(dst, checked_cast<uint32_t>(src)); 2595 } else if (is_simm32(src)) { 2596 movq(dst, checked_cast<int32_t>(src)); 2597 } else { 2598 mov64(dst, src); 2599 } 2600 #else 2601 movl(dst, src); 2602 #endif 2603 } 2604 2605 void MacroAssembler::movptr(Address dst, Register src) { 2606 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2607 } 2608 2609 void MacroAssembler::movptr(Address dst, int32_t src) { 2610 LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); 2611 } 2612 2613 void MacroAssembler::movdqu(Address dst, XMMRegister src) { 2614 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2615 Assembler::movdqu(dst, src); 2616 } 2617 2618 void MacroAssembler::movdqu(XMMRegister dst, Address src) { 2619 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2620 Assembler::movdqu(dst, src); 2621 } 2622 2623 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) { 2624 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2625 Assembler::movdqu(dst, src); 2626 } 2627 2628 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register rscratch) { 2629 assert(rscratch != noreg || always_reachable(src), "missing"); 2630 2631 if (reachable(src)) { 2632 movdqu(dst, as_Address(src)); 2633 } else { 2634 lea(rscratch, src); 2635 movdqu(dst, Address(rscratch, 0)); 2636 } 2637 } 2638 2639 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) { 2640 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2641 Assembler::vmovdqu(dst, src); 2642 } 2643 2644 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) { 2645 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2646 Assembler::vmovdqu(dst, src); 2647 } 2648 2649 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) { 2650 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2651 Assembler::vmovdqu(dst, src); 2652 } 2653 2654 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register rscratch) { 2655 assert(rscratch != noreg || always_reachable(src), "missing"); 2656 2657 if (reachable(src)) { 2658 vmovdqu(dst, as_Address(src)); 2659 } 2660 else { 2661 lea(rscratch, src); 2662 vmovdqu(dst, Address(rscratch, 0)); 2663 } 2664 } 2665 2666 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2667 assert(rscratch != noreg || always_reachable(src), "missing"); 2668 2669 if (vector_len == AVX_512bit) { 2670 evmovdquq(dst, src, AVX_512bit, rscratch); 2671 } else if (vector_len == AVX_256bit) { 2672 vmovdqu(dst, src, rscratch); 2673 } else { 2674 movdqu(dst, src, rscratch); 2675 } 2676 } 2677 2678 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src, int vector_len) { 2679 if (vector_len == AVX_512bit) { 2680 evmovdquq(dst, src, 
AVX_512bit); 2681 } else if (vector_len == AVX_256bit) { 2682 vmovdqu(dst, src); 2683 } else { 2684 movdqu(dst, src); 2685 } 2686 } 2687 2688 void MacroAssembler::vmovdqu(Address dst, XMMRegister src, int vector_len) { 2689 if (vector_len == AVX_512bit) { 2690 evmovdquq(dst, src, AVX_512bit); 2691 } else if (vector_len == AVX_256bit) { 2692 vmovdqu(dst, src); 2693 } else { 2694 movdqu(dst, src); 2695 } 2696 } 2697 2698 void MacroAssembler::vmovdqu(XMMRegister dst, Address src, int vector_len) { 2699 if (vector_len == AVX_512bit) { 2700 evmovdquq(dst, src, AVX_512bit); 2701 } else if (vector_len == AVX_256bit) { 2702 vmovdqu(dst, src); 2703 } else { 2704 movdqu(dst, src); 2705 } 2706 } 2707 2708 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, Register rscratch) { 2709 assert(rscratch != noreg || always_reachable(src), "missing"); 2710 2711 if (reachable(src)) { 2712 vmovdqa(dst, as_Address(src)); 2713 } 2714 else { 2715 lea(rscratch, src); 2716 vmovdqa(dst, Address(rscratch, 0)); 2717 } 2718 } 2719 2720 void MacroAssembler::vmovdqa(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2721 assert(rscratch != noreg || always_reachable(src), "missing"); 2722 2723 if (vector_len == AVX_512bit) { 2724 evmovdqaq(dst, src, AVX_512bit, rscratch); 2725 } else if (vector_len == AVX_256bit) { 2726 vmovdqa(dst, src, rscratch); 2727 } else { 2728 movdqa(dst, src, rscratch); 2729 } 2730 } 2731 2732 void MacroAssembler::kmov(KRegister dst, Address src) { 2733 if (VM_Version::supports_avx512bw()) { 2734 kmovql(dst, src); 2735 } else { 2736 assert(VM_Version::supports_evex(), ""); 2737 kmovwl(dst, src); 2738 } 2739 } 2740 2741 void MacroAssembler::kmov(Address dst, KRegister src) { 2742 if (VM_Version::supports_avx512bw()) { 2743 kmovql(dst, src); 2744 } else { 2745 assert(VM_Version::supports_evex(), ""); 2746 kmovwl(dst, src); 2747 } 2748 } 2749 2750 void MacroAssembler::kmov(KRegister dst, KRegister src) { 2751 if (VM_Version::supports_avx512bw()) { 2752 kmovql(dst, src); 2753 } else { 2754 assert(VM_Version::supports_evex(), ""); 2755 kmovwl(dst, src); 2756 } 2757 } 2758 2759 void MacroAssembler::kmov(Register dst, KRegister src) { 2760 if (VM_Version::supports_avx512bw()) { 2761 kmovql(dst, src); 2762 } else { 2763 assert(VM_Version::supports_evex(), ""); 2764 kmovwl(dst, src); 2765 } 2766 } 2767 2768 void MacroAssembler::kmov(KRegister dst, Register src) { 2769 if (VM_Version::supports_avx512bw()) { 2770 kmovql(dst, src); 2771 } else { 2772 assert(VM_Version::supports_evex(), ""); 2773 kmovwl(dst, src); 2774 } 2775 } 2776 2777 void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, Register rscratch) { 2778 assert(rscratch != noreg || always_reachable(src), "missing"); 2779 2780 if (reachable(src)) { 2781 kmovql(dst, as_Address(src)); 2782 } else { 2783 lea(rscratch, src); 2784 kmovql(dst, Address(rscratch, 0)); 2785 } 2786 } 2787 2788 void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register rscratch) { 2789 assert(rscratch != noreg || always_reachable(src), "missing"); 2790 2791 if (reachable(src)) { 2792 kmovwl(dst, as_Address(src)); 2793 } else { 2794 lea(rscratch, src); 2795 kmovwl(dst, Address(rscratch, 0)); 2796 } 2797 } 2798 2799 void MacroAssembler::evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, 2800 int vector_len, Register rscratch) { 2801 assert(rscratch != noreg || always_reachable(src), "missing"); 2802 2803 if (reachable(src)) { 2804 Assembler::evmovdqub(dst, mask, as_Address(src), merge, 
vector_len); 2805 } else { 2806 lea(rscratch, src); 2807 Assembler::evmovdqub(dst, mask, Address(rscratch, 0), merge, vector_len); 2808 } 2809 } 2810 2811 void MacroAssembler::evmovdquw(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, 2812 int vector_len, Register rscratch) { 2813 assert(rscratch != noreg || always_reachable(src), "missing"); 2814 2815 if (reachable(src)) { 2816 Assembler::evmovdquw(dst, mask, as_Address(src), merge, vector_len); 2817 } else { 2818 lea(rscratch, src); 2819 Assembler::evmovdquw(dst, mask, Address(rscratch, 0), merge, vector_len); 2820 } 2821 } 2822 2823 void MacroAssembler::evmovdqul(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) { 2824 assert(rscratch != noreg || always_reachable(src), "missing"); 2825 2826 if (reachable(src)) { 2827 Assembler::evmovdqul(dst, mask, as_Address(src), merge, vector_len); 2828 } else { 2829 lea(rscratch, src); 2830 Assembler::evmovdqul(dst, mask, Address(rscratch, 0), merge, vector_len); 2831 } 2832 } 2833 2834 void MacroAssembler::evmovdquq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) { 2835 assert(rscratch != noreg || always_reachable(src), "missing"); 2836 2837 if (reachable(src)) { 2838 Assembler::evmovdquq(dst, mask, as_Address(src), merge, vector_len); 2839 } else { 2840 lea(rscratch, src); 2841 Assembler::evmovdquq(dst, mask, Address(rscratch, 0), merge, vector_len); 2842 } 2843 } 2844 2845 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2846 assert(rscratch != noreg || always_reachable(src), "missing"); 2847 2848 if (reachable(src)) { 2849 Assembler::evmovdquq(dst, as_Address(src), vector_len); 2850 } else { 2851 lea(rscratch, src); 2852 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len); 2853 } 2854 } 2855 2856 void MacroAssembler::evmovdqaq(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register rscratch) { 2857 assert(rscratch != noreg || always_reachable(src), "missing"); 2858 2859 if (reachable(src)) { 2860 Assembler::evmovdqaq(dst, mask, as_Address(src), merge, vector_len); 2861 } else { 2862 lea(rscratch, src); 2863 Assembler::evmovdqaq(dst, mask, Address(rscratch, 0), merge, vector_len); 2864 } 2865 } 2866 2867 void MacroAssembler::evmovdqaq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2868 assert(rscratch != noreg || always_reachable(src), "missing"); 2869 2870 if (reachable(src)) { 2871 Assembler::evmovdqaq(dst, as_Address(src), vector_len); 2872 } else { 2873 lea(rscratch, src); 2874 Assembler::evmovdqaq(dst, Address(rscratch, 0), vector_len); 2875 } 2876 } 2877 2878 2879 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src, Register rscratch) { 2880 assert(rscratch != noreg || always_reachable(src), "missing"); 2881 2882 if (reachable(src)) { 2883 Assembler::movdqa(dst, as_Address(src)); 2884 } else { 2885 lea(rscratch, src); 2886 Assembler::movdqa(dst, Address(rscratch, 0)); 2887 } 2888 } 2889 2890 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2891 assert(rscratch != noreg || always_reachable(src), "missing"); 2892 2893 if (reachable(src)) { 2894 Assembler::movsd(dst, as_Address(src)); 2895 } else { 2896 lea(rscratch, src); 2897 Assembler::movsd(dst, Address(rscratch, 0)); 2898 } 2899 } 2900 2901 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src, Register rscratch) { 2902 assert(rscratch != noreg 
|| always_reachable(src), "missing"); 2903 2904 if (reachable(src)) { 2905 Assembler::movss(dst, as_Address(src)); 2906 } else { 2907 lea(rscratch, src); 2908 Assembler::movss(dst, Address(rscratch, 0)); 2909 } 2910 } 2911 2912 void MacroAssembler::movddup(XMMRegister dst, AddressLiteral src, Register rscratch) { 2913 assert(rscratch != noreg || always_reachable(src), "missing"); 2914 2915 if (reachable(src)) { 2916 Assembler::movddup(dst, as_Address(src)); 2917 } else { 2918 lea(rscratch, src); 2919 Assembler::movddup(dst, Address(rscratch, 0)); 2920 } 2921 } 2922 2923 void MacroAssembler::vmovddup(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2924 assert(rscratch != noreg || always_reachable(src), "missing"); 2925 2926 if (reachable(src)) { 2927 Assembler::vmovddup(dst, as_Address(src), vector_len); 2928 } else { 2929 lea(rscratch, src); 2930 Assembler::vmovddup(dst, Address(rscratch, 0), vector_len); 2931 } 2932 } 2933 2934 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 2935 assert(rscratch != noreg || always_reachable(src), "missing"); 2936 2937 if (reachable(src)) { 2938 Assembler::mulsd(dst, as_Address(src)); 2939 } else { 2940 lea(rscratch, src); 2941 Assembler::mulsd(dst, Address(rscratch, 0)); 2942 } 2943 } 2944 2945 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src, Register rscratch) { 2946 assert(rscratch != noreg || always_reachable(src), "missing"); 2947 2948 if (reachable(src)) { 2949 Assembler::mulss(dst, as_Address(src)); 2950 } else { 2951 lea(rscratch, src); 2952 Assembler::mulss(dst, Address(rscratch, 0)); 2953 } 2954 } 2955 2956 void MacroAssembler::null_check(Register reg, int offset) { 2957 if (needs_explicit_null_check(offset)) { 2958 // provoke OS null exception if reg is null by 2959 // accessing M[reg] w/o changing any (non-CC) registers 2960 // NOTE: cmpl is plenty here to provoke a segv 2961 cmpptr(rax, Address(reg, 0)); 2962 // Note: should probably use testl(rax, Address(reg, 0)); 2963 // may be shorter code (however, this version of 2964 // testl needs to be implemented first) 2965 } else { 2966 // nothing to do, (later) access of M[reg + offset] 2967 // will provoke OS null exception if reg is null 2968 } 2969 } 2970 2971 void MacroAssembler::os_breakpoint() { 2972 // instead of directly emitting a breakpoint, call os:breakpoint for better debugability 2973 // (e.g., MSVC can't call ps() otherwise) 2974 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint))); 2975 } 2976 2977 void MacroAssembler::unimplemented(const char* what) { 2978 const char* buf = nullptr; 2979 { 2980 ResourceMark rm; 2981 stringStream ss; 2982 ss.print("unimplemented: %s", what); 2983 buf = code_string(ss.as_string()); 2984 } 2985 stop(buf); 2986 } 2987 2988 #ifdef _LP64 2989 #define XSTATE_BV 0x200 2990 #endif 2991 2992 void MacroAssembler::pop_CPU_state() { 2993 pop_FPU_state(); 2994 pop_IU_state(); 2995 } 2996 2997 void MacroAssembler::pop_FPU_state() { 2998 #ifndef _LP64 2999 frstor(Address(rsp, 0)); 3000 #else 3001 fxrstor(Address(rsp, 0)); 3002 #endif 3003 addptr(rsp, FPUStateSizeInWords * wordSize); 3004 } 3005 3006 void MacroAssembler::pop_IU_state() { 3007 popa(); 3008 LP64_ONLY(addq(rsp, 8)); 3009 popf(); 3010 } 3011 3012 // Save Integer and Float state 3013 // Warning: Stack must be 16 byte aligned (64bit) 3014 void MacroAssembler::push_CPU_state() { 3015 push_IU_state(); 3016 push_FPU_state(); 3017 } 3018 3019 void MacroAssembler::push_FPU_state() { 3020 subptr(rsp, FPUStateSizeInWords * 
wordSize); 3021 #ifndef _LP64 3022 fnsave(Address(rsp, 0)); 3023 fwait(); 3024 #else 3025 fxsave(Address(rsp, 0)); 3026 #endif // LP64 3027 } 3028 3029 void MacroAssembler::push_IU_state() { 3030 // Push flags first because pusha kills them 3031 pushf(); 3032 // Make sure rsp stays 16-byte aligned 3033 LP64_ONLY(subq(rsp, 8)); 3034 pusha(); 3035 } 3036 3037 void MacroAssembler::push_cont_fastpath() { 3038 if (!Continuations::enabled()) return; 3039 3040 #ifndef _LP64 3041 Register rthread = rax; 3042 Register rrealsp = rbx; 3043 push(rthread); 3044 push(rrealsp); 3045 3046 get_thread(rthread); 3047 3048 // The code below wants the original RSP. 3049 // Move it back after the pushes above. 3050 movptr(rrealsp, rsp); 3051 addptr(rrealsp, 2*wordSize); 3052 #else 3053 Register rthread = r15_thread; 3054 Register rrealsp = rsp; 3055 #endif 3056 3057 Label done; 3058 cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset())); 3059 jccb(Assembler::belowEqual, done); 3060 movptr(Address(rthread, JavaThread::cont_fastpath_offset()), rrealsp); 3061 bind(done); 3062 3063 #ifndef _LP64 3064 pop(rrealsp); 3065 pop(rthread); 3066 #endif 3067 } 3068 3069 void MacroAssembler::pop_cont_fastpath() { 3070 if (!Continuations::enabled()) return; 3071 3072 #ifndef _LP64 3073 Register rthread = rax; 3074 Register rrealsp = rbx; 3075 push(rthread); 3076 push(rrealsp); 3077 3078 get_thread(rthread); 3079 3080 // The code below wants the original RSP. 3081 // Move it back after the pushes above. 3082 movptr(rrealsp, rsp); 3083 addptr(rrealsp, 2*wordSize); 3084 #else 3085 Register rthread = r15_thread; 3086 Register rrealsp = rsp; 3087 #endif 3088 3089 Label done; 3090 cmpptr(rrealsp, Address(rthread, JavaThread::cont_fastpath_offset())); 3091 jccb(Assembler::below, done); 3092 movptr(Address(rthread, JavaThread::cont_fastpath_offset()), 0); 3093 bind(done); 3094 3095 #ifndef _LP64 3096 pop(rrealsp); 3097 pop(rthread); 3098 #endif 3099 } 3100 3101 void MacroAssembler::inc_held_monitor_count() { 3102 #ifdef _LP64 3103 incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 3104 #endif 3105 } 3106 3107 void MacroAssembler::dec_held_monitor_count() { 3108 #ifdef _LP64 3109 decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); 3110 #endif 3111 } 3112 3113 #ifdef ASSERT 3114 void MacroAssembler::stop_if_in_cont(Register cont, const char* name) { 3115 #ifdef _LP64 3116 Label no_cont; 3117 movptr(cont, Address(r15_thread, JavaThread::cont_entry_offset())); 3118 testl(cont, cont); 3119 jcc(Assembler::zero, no_cont); 3120 stop(name); 3121 bind(no_cont); 3122 #else 3123 Unimplemented(); 3124 #endif 3125 } 3126 #endif 3127 3128 void MacroAssembler::reset_last_Java_frame(bool clear_fp) { // determine java_thread register 3129 // we must set sp to zero to clear frame 3130 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD); 3131 // must clear fp, so that compiled frames are not confused; it is 3132 // possible that we need it only for debugging 3133 if (clear_fp) { 3134 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 3135 } 3136 // Always clear the pc because it could have been set by make_walkable() 3137 movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD); 3138 vzeroupper(); 3139 } 3140 3141 void MacroAssembler::restore_rax(Register tmp) { 3142 if (tmp == noreg) pop(rax); 3143 else if (tmp != rax) mov(rax, tmp); 3144 } 3145 3146 void MacroAssembler::round_to(Register reg, int modulus) { 3147 addptr(reg, 
modulus - 1); 3148 andptr(reg, -modulus); 3149 } 3150 3151 void MacroAssembler::save_rax(Register tmp) { 3152 if (tmp == noreg) push(rax); 3153 else if (tmp != rax) mov(tmp, rax); 3154 } 3155 3156 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod) { 3157 if (at_return) { 3158 // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore, 3159 // we may safely use rsp instead to perform the stack watermark check. 3160 cmpptr(in_nmethod ? rsp : rbp, Address(r15_thread, JavaThread::polling_word_offset())); 3161 jcc(Assembler::above, slow_path); 3162 return; 3163 } 3164 testb(Address(r15_thread, JavaThread::polling_word_offset()), SafepointMechanism::poll_bit()); 3165 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll 3166 } 3167 3168 // Calls to C land 3169 // 3170 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded 3171 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 3172 // has to be reset to 0. This is required to allow proper stack traversal. 3173 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 3174 Register last_java_fp, 3175 address last_java_pc, 3176 Register rscratch) { 3177 vzeroupper(); 3178 // determine last_java_sp register 3179 if (!last_java_sp->is_valid()) { 3180 last_java_sp = rsp; 3181 } 3182 // last_java_fp is optional 3183 if (last_java_fp->is_valid()) { 3184 movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), last_java_fp); 3185 } 3186 // last_java_pc is optional 3187 if (last_java_pc != nullptr) { 3188 Address java_pc(r15_thread, 3189 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()); 3190 lea(java_pc, InternalAddress(last_java_pc), rscratch); 3191 } 3192 movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp); 3193 } 3194 3195 void MacroAssembler::set_last_Java_frame(Register last_java_sp, 3196 Register last_java_fp, 3197 Label &L, 3198 Register scratch) { 3199 lea(scratch, L); 3200 movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), scratch); 3201 set_last_Java_frame(last_java_sp, last_java_fp, nullptr, scratch); 3202 } 3203 3204 void MacroAssembler::shlptr(Register dst, int imm8) { 3205 LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8)); 3206 } 3207 3208 void MacroAssembler::shrptr(Register dst, int imm8) { 3209 LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8)); 3210 } 3211 3212 void MacroAssembler::sign_extend_byte(Register reg) { 3213 if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) { 3214 movsbl(reg, reg); // movsxb 3215 } else { 3216 shll(reg, 24); 3217 sarl(reg, 24); 3218 } 3219 } 3220 3221 void MacroAssembler::sign_extend_short(Register reg) { 3222 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 3223 movswl(reg, reg); // movsxw 3224 } else { 3225 shll(reg, 16); 3226 sarl(reg, 16); 3227 } 3228 } 3229 3230 void MacroAssembler::testl(Address dst, int32_t imm32) { 3231 if (imm32 >= 0 && is8bit(imm32)) { 3232 testb(dst, imm32); 3233 } else { 3234 Assembler::testl(dst, imm32); 3235 } 3236 } 3237 3238 void MacroAssembler::testl(Register dst, int32_t imm32) { 3239 if (imm32 >= 0 && is8bit(imm32) && dst->has_byte_register()) { 3240 testb(dst, imm32); 3241 } else { 3242 Assembler::testl(dst, imm32); 3243 } 3244 } 3245 3246 void MacroAssembler::testl(Register dst, AddressLiteral src) { 3247 assert(always_reachable(src), "Address should be reachable"); 3248 testl(dst, as_Address(src)); 3249 } 3250 3251 #ifdef 
_LP64 3252 3253 void MacroAssembler::testq(Address dst, int32_t imm32) { 3254 if (imm32 >= 0) { 3255 testl(dst, imm32); 3256 } else { 3257 Assembler::testq(dst, imm32); 3258 } 3259 } 3260 3261 void MacroAssembler::testq(Register dst, int32_t imm32) { 3262 if (imm32 >= 0) { 3263 testl(dst, imm32); 3264 } else { 3265 Assembler::testq(dst, imm32); 3266 } 3267 } 3268 3269 #endif 3270 3271 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) { 3272 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3273 Assembler::pcmpeqb(dst, src); 3274 } 3275 3276 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) { 3277 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3278 Assembler::pcmpeqw(dst, src); 3279 } 3280 3281 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) { 3282 assert((dst->encoding() < 16),"XMM register should be 0-15"); 3283 Assembler::pcmpestri(dst, src, imm8); 3284 } 3285 3286 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { 3287 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3288 Assembler::pcmpestri(dst, src, imm8); 3289 } 3290 3291 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) { 3292 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3293 Assembler::pmovzxbw(dst, src); 3294 } 3295 3296 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) { 3297 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3298 Assembler::pmovzxbw(dst, src); 3299 } 3300 3301 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) { 3302 assert((src->encoding() < 16),"XMM register should be 0-15"); 3303 Assembler::pmovmskb(dst, src); 3304 } 3305 3306 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) { 3307 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3308 Assembler::ptest(dst, src); 3309 } 3310 3311 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src, Register rscratch) { 3312 assert(rscratch != noreg || always_reachable(src), "missing"); 3313 3314 if (reachable(src)) { 3315 Assembler::sqrtss(dst, as_Address(src)); 3316 } else { 3317 lea(rscratch, src); 3318 Assembler::sqrtss(dst, Address(rscratch, 0)); 3319 } 3320 } 3321 3322 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src, Register rscratch) { 3323 assert(rscratch != noreg || always_reachable(src), "missing"); 3324 3325 if (reachable(src)) { 3326 Assembler::subsd(dst, as_Address(src)); 3327 } else { 3328 lea(rscratch, src); 3329 Assembler::subsd(dst, Address(rscratch, 0)); 3330 } 3331 } 3332 3333 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register rscratch) { 3334 assert(rscratch != noreg || always_reachable(src), "missing"); 3335 3336 if (reachable(src)) { 3337 Assembler::roundsd(dst, as_Address(src), rmode); 3338 } else { 3339 lea(rscratch, src); 3340 Assembler::roundsd(dst, Address(rscratch, 0), rmode); 3341 } 3342 } 3343 3344 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src, Register rscratch) { 3345 assert(rscratch != noreg || always_reachable(src), "missing"); 3346 3347 if (reachable(src)) { 3348 Assembler::subss(dst, as_Address(src)); 3349 } else { 3350 lea(rscratch, src); 3351 Assembler::subss(dst, Address(rscratch, 0)); 
3352 } 3353 } 3354 3355 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src, Register rscratch) { 3356 assert(rscratch != noreg || always_reachable(src), "missing"); 3357 3358 if (reachable(src)) { 3359 Assembler::ucomisd(dst, as_Address(src)); 3360 } else { 3361 lea(rscratch, src); 3362 Assembler::ucomisd(dst, Address(rscratch, 0)); 3363 } 3364 } 3365 3366 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src, Register rscratch) { 3367 assert(rscratch != noreg || always_reachable(src), "missing"); 3368 3369 if (reachable(src)) { 3370 Assembler::ucomiss(dst, as_Address(src)); 3371 } else { 3372 lea(rscratch, src); 3373 Assembler::ucomiss(dst, Address(rscratch, 0)); 3374 } 3375 } 3376 3377 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register rscratch) { 3378 assert(rscratch != noreg || always_reachable(src), "missing"); 3379 3380 // Used in sign-bit flipping with aligned address. 3381 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 3382 3383 if (UseAVX > 2 && 3384 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) && 3385 (dst->encoding() >= 16)) { 3386 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch); 3387 } else if (reachable(src)) { 3388 Assembler::xorpd(dst, as_Address(src)); 3389 } else { 3390 lea(rscratch, src); 3391 Assembler::xorpd(dst, Address(rscratch, 0)); 3392 } 3393 } 3394 3395 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) { 3396 if (UseAVX > 2 && 3397 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) && 3398 ((dst->encoding() >= 16) || (src->encoding() >= 16))) { 3399 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); 3400 } else { 3401 Assembler::xorpd(dst, src); 3402 } 3403 } 3404 3405 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) { 3406 if (UseAVX > 2 && 3407 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) && 3408 ((dst->encoding() >= 16) || (src->encoding() >= 16))) { 3409 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); 3410 } else { 3411 Assembler::xorps(dst, src); 3412 } 3413 } 3414 3415 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register rscratch) { 3416 assert(rscratch != noreg || always_reachable(src), "missing"); 3417 3418 // Used in sign-bit flipping with aligned address. 3419 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 3420 3421 if (UseAVX > 2 && 3422 (!VM_Version::supports_avx512dq() || !VM_Version::supports_avx512vl()) && 3423 (dst->encoding() >= 16)) { 3424 vpxor(dst, dst, src, Assembler::AVX_512bit, rscratch); 3425 } else if (reachable(src)) { 3426 Assembler::xorps(dst, as_Address(src)); 3427 } else { 3428 lea(rscratch, src); 3429 Assembler::xorps(dst, Address(rscratch, 0)); 3430 } 3431 } 3432 3433 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src, Register rscratch) { 3434 assert(rscratch != noreg || always_reachable(src), "missing"); 3435 3436 // Used in sign-bit flipping with aligned address. 
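// (Legacy SSE encodings of pshufb with a memory operand require the operand
// to be 16-byte aligned, hence the alignment check below; the VEX/AVX
// encodings carry no such alignment restriction.)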
3437 bool aligned_adr = (((intptr_t)src.target() & 15) == 0); 3438 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes"); 3439 if (reachable(src)) { 3440 Assembler::pshufb(dst, as_Address(src)); 3441 } else { 3442 lea(rscratch, src); 3443 Assembler::pshufb(dst, Address(rscratch, 0)); 3444 } 3445 } 3446 3447 // AVX 3-operands instructions 3448 3449 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3450 assert(rscratch != noreg || always_reachable(src), "missing"); 3451 3452 if (reachable(src)) { 3453 vaddsd(dst, nds, as_Address(src)); 3454 } else { 3455 lea(rscratch, src); 3456 vaddsd(dst, nds, Address(rscratch, 0)); 3457 } 3458 } 3459 3460 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3461 assert(rscratch != noreg || always_reachable(src), "missing"); 3462 3463 if (reachable(src)) { 3464 vaddss(dst, nds, as_Address(src)); 3465 } else { 3466 lea(rscratch, src); 3467 vaddss(dst, nds, Address(rscratch, 0)); 3468 } 3469 } 3470 3471 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3472 assert(UseAVX > 0, "requires some form of AVX"); 3473 assert(rscratch != noreg || always_reachable(src), "missing"); 3474 3475 if (reachable(src)) { 3476 Assembler::vpaddb(dst, nds, as_Address(src), vector_len); 3477 } else { 3478 lea(rscratch, src); 3479 Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len); 3480 } 3481 } 3482 3483 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3484 assert(UseAVX > 0, "requires some form of AVX"); 3485 assert(rscratch != noreg || always_reachable(src), "missing"); 3486 3487 if (reachable(src)) { 3488 Assembler::vpaddd(dst, nds, as_Address(src), vector_len); 3489 } else { 3490 lea(rscratch, src); 3491 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len); 3492 } 3493 } 3494 3495 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) { 3496 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3497 assert(rscratch != noreg || always_reachable(negate_field), "missing"); 3498 3499 vandps(dst, nds, negate_field, vector_len, rscratch); 3500 } 3501 3502 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len, Register rscratch) { 3503 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3504 assert(rscratch != noreg || always_reachable(negate_field), "missing"); 3505 3506 vandpd(dst, nds, negate_field, vector_len, rscratch); 3507 } 3508 3509 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3510 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3511 Assembler::vpaddb(dst, nds, src, vector_len); 3512 } 3513 3514 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3515 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3516 Assembler::vpaddb(dst, nds, src, vector_len); 3517 } 3518 3519 void 
MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3520 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3521 Assembler::vpaddw(dst, nds, src, vector_len); 3522 } 3523 3524 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3525 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3526 Assembler::vpaddw(dst, nds, src, vector_len); 3527 } 3528 3529 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3530 assert(rscratch != noreg || always_reachable(src), "missing"); 3531 3532 if (reachable(src)) { 3533 Assembler::vpand(dst, nds, as_Address(src), vector_len); 3534 } else { 3535 lea(rscratch, src); 3536 Assembler::vpand(dst, nds, Address(rscratch, 0), vector_len); 3537 } 3538 } 3539 3540 void MacroAssembler::vpbroadcastd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3541 assert(rscratch != noreg || always_reachable(src), "missing"); 3542 3543 if (reachable(src)) { 3544 Assembler::vpbroadcastd(dst, as_Address(src), vector_len); 3545 } else { 3546 lea(rscratch, src); 3547 Assembler::vpbroadcastd(dst, Address(rscratch, 0), vector_len); 3548 } 3549 } 3550 3551 void MacroAssembler::vbroadcasti128(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3552 assert(rscratch != noreg || always_reachable(src), "missing"); 3553 3554 if (reachable(src)) { 3555 Assembler::vbroadcasti128(dst, as_Address(src), vector_len); 3556 } else { 3557 lea(rscratch, src); 3558 Assembler::vbroadcasti128(dst, Address(rscratch, 0), vector_len); 3559 } 3560 } 3561 3562 void MacroAssembler::vpbroadcastq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3563 assert(rscratch != noreg || always_reachable(src), "missing"); 3564 3565 if (reachable(src)) { 3566 Assembler::vpbroadcastq(dst, as_Address(src), vector_len); 3567 } else { 3568 lea(rscratch, src); 3569 Assembler::vpbroadcastq(dst, Address(rscratch, 0), vector_len); 3570 } 3571 } 3572 3573 void MacroAssembler::vbroadcastsd(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3574 assert(rscratch != noreg || always_reachable(src), "missing"); 3575 3576 if (reachable(src)) { 3577 Assembler::vbroadcastsd(dst, as_Address(src), vector_len); 3578 } else { 3579 lea(rscratch, src); 3580 Assembler::vbroadcastsd(dst, Address(rscratch, 0), vector_len); 3581 } 3582 } 3583 3584 void MacroAssembler::vbroadcastss(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 3585 assert(rscratch != noreg || always_reachable(src), "missing"); 3586 3587 if (reachable(src)) { 3588 Assembler::vbroadcastss(dst, as_Address(src), vector_len); 3589 } else { 3590 lea(rscratch, src); 3591 Assembler::vbroadcastss(dst, Address(rscratch, 0), vector_len); 3592 } 3593 } 3594 3595 // Vector float blend 3596 // vblendvps(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg) 3597 void MacroAssembler::vblendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) { 3598 // WARN: Allow dst == (src1|src2), mask == scratch 3599 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1; 3600 bool scratch_available = scratch != xnoreg 
&& scratch != src1 && scratch != src2 && scratch != dst; 3601 bool dst_available = dst != mask && (dst != src1 || dst != src2); 3602 if (blend_emulation && scratch_available && dst_available) { 3603 if (compute_mask) { 3604 vpsrad(scratch, mask, 32, vector_len); 3605 mask = scratch; 3606 } 3607 if (dst == src1) { 3608 vpandn(dst, mask, src1, vector_len); // if mask == 0, src1 3609 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2 3610 } else { 3611 vpand (dst, mask, src2, vector_len); // if mask == 1, src2 3612 vpandn(scratch, mask, src1, vector_len); // if mask == 0, src1 3613 } 3614 vpor(dst, dst, scratch, vector_len); 3615 } else { 3616 Assembler::vblendvps(dst, src1, src2, mask, vector_len); 3617 } 3618 } 3619 3620 // vblendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister mask, int vector_len, bool compute_mask = true, XMMRegister scratch = xnoreg) 3621 void MacroAssembler::vblendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister mask, int vector_len, bool compute_mask, XMMRegister scratch) { 3622 // WARN: Allow dst == (src1|src2), mask == scratch 3623 bool blend_emulation = EnableX86ECoreOpts && UseAVX > 1; 3624 bool scratch_available = scratch != xnoreg && scratch != src1 && scratch != src2 && scratch != dst && (!compute_mask || scratch != mask); 3625 bool dst_available = dst != mask && (dst != src1 || dst != src2); 3626 if (blend_emulation && scratch_available && dst_available) { 3627 if (compute_mask) { 3628 vpxor(scratch, scratch, scratch, vector_len); 3629 vpcmpgtq(scratch, scratch, mask, vector_len); 3630 mask = scratch; 3631 } 3632 if (dst == src1) { 3633 vpandn(dst, mask, src1, vector_len); // if mask == 0, src 3634 vpand (scratch, mask, src2, vector_len); // if mask == 1, src2 3635 } else { 3636 vpand (dst, mask, src2, vector_len); // if mask == 1, src2 3637 vpandn(scratch, mask, src1, vector_len); // if mask == 0, src 3638 } 3639 vpor(dst, dst, scratch, vector_len); 3640 } else { 3641 Assembler::vblendvpd(dst, src1, src2, mask, vector_len); 3642 } 3643 } 3644 3645 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3646 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3647 Assembler::vpcmpeqb(dst, nds, src, vector_len); 3648 } 3649 3650 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister src1, Address src2, int vector_len) { 3651 assert(((dst->encoding() < 16 && src1->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3652 Assembler::vpcmpeqb(dst, src1, src2, vector_len); 3653 } 3654 3655 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3656 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3657 Assembler::vpcmpeqw(dst, nds, src, vector_len); 3658 } 3659 3660 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3661 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3662 Assembler::vpcmpeqw(dst, nds, src, vector_len); 3663 } 3664 3665 void MacroAssembler::evpcmpeqd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3666 assert(rscratch != noreg || always_reachable(src), "missing"); 3667 3668 if (reachable(src)) { 3669 
Assembler::evpcmpeqd(kdst, mask, nds, as_Address(src), vector_len); 3670 } else { 3671 lea(rscratch, src); 3672 Assembler::evpcmpeqd(kdst, mask, nds, Address(rscratch, 0), vector_len); 3673 } 3674 } 3675 3676 void MacroAssembler::evpcmpd(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3677 int comparison, bool is_signed, int vector_len, Register rscratch) { 3678 assert(rscratch != noreg || always_reachable(src), "missing"); 3679 3680 if (reachable(src)) { 3681 Assembler::evpcmpd(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3682 } else { 3683 lea(rscratch, src); 3684 Assembler::evpcmpd(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3685 } 3686 } 3687 3688 void MacroAssembler::evpcmpq(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3689 int comparison, bool is_signed, int vector_len, Register rscratch) { 3690 assert(rscratch != noreg || always_reachable(src), "missing"); 3691 3692 if (reachable(src)) { 3693 Assembler::evpcmpq(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3694 } else { 3695 lea(rscratch, src); 3696 Assembler::evpcmpq(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3697 } 3698 } 3699 3700 void MacroAssembler::evpcmpb(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3701 int comparison, bool is_signed, int vector_len, Register rscratch) { 3702 assert(rscratch != noreg || always_reachable(src), "missing"); 3703 3704 if (reachable(src)) { 3705 Assembler::evpcmpb(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3706 } else { 3707 lea(rscratch, src); 3708 Assembler::evpcmpb(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3709 } 3710 } 3711 3712 void MacroAssembler::evpcmpw(KRegister kdst, KRegister mask, XMMRegister nds, AddressLiteral src, 3713 int comparison, bool is_signed, int vector_len, Register rscratch) { 3714 assert(rscratch != noreg || always_reachable(src), "missing"); 3715 3716 if (reachable(src)) { 3717 Assembler::evpcmpw(kdst, mask, nds, as_Address(src), comparison, is_signed, vector_len); 3718 } else { 3719 lea(rscratch, src); 3720 Assembler::evpcmpw(kdst, mask, nds, Address(rscratch, 0), comparison, is_signed, vector_len); 3721 } 3722 } 3723 3724 void MacroAssembler::vpcmpCC(XMMRegister dst, XMMRegister nds, XMMRegister src, int cond_encoding, Width width, int vector_len) { 3725 if (width == Assembler::Q) { 3726 Assembler::vpcmpCCq(dst, nds, src, cond_encoding, vector_len); 3727 } else { 3728 Assembler::vpcmpCCbwd(dst, nds, src, cond_encoding, vector_len); 3729 } 3730 } 3731 3732 void MacroAssembler::vpcmpCCW(XMMRegister dst, XMMRegister nds, XMMRegister src, XMMRegister xtmp, ComparisonPredicate cond, Width width, int vector_len) { 3733 int eq_cond_enc = 0x29; 3734 int gt_cond_enc = 0x37; 3735 if (width != Assembler::Q) { 3736 eq_cond_enc = 0x74 + width; 3737 gt_cond_enc = 0x64 + width; 3738 } 3739 switch (cond) { 3740 case eq: 3741 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len); 3742 break; 3743 case neq: 3744 vpcmpCC(dst, nds, src, eq_cond_enc, width, vector_len); 3745 vallones(xtmp, vector_len); 3746 vpxor(dst, xtmp, dst, vector_len); 3747 break; 3748 case le: 3749 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len); 3750 vallones(xtmp, vector_len); 3751 vpxor(dst, xtmp, dst, vector_len); 3752 break; 3753 case nlt: 3754 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len); 3755 vallones(xtmp, vector_len); 3756 vpxor(dst, xtmp, dst, 
vector_len); 3757 break; 3758 case lt: 3759 vpcmpCC(dst, src, nds, gt_cond_enc, width, vector_len); 3760 break; 3761 case nle: 3762 vpcmpCC(dst, nds, src, gt_cond_enc, width, vector_len); 3763 break; 3764 default: 3765 assert(false, "Should not reach here"); 3766 } 3767 } 3768 3769 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { 3770 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3771 Assembler::vpmovzxbw(dst, src, vector_len); 3772 } 3773 3774 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src, int vector_len) { 3775 assert((src->encoding() < 16),"XMM register should be 0-15"); 3776 Assembler::vpmovmskb(dst, src, vector_len); 3777 } 3778 3779 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3780 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3781 Assembler::vpmullw(dst, nds, src, vector_len); 3782 } 3783 3784 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3785 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3786 Assembler::vpmullw(dst, nds, src, vector_len); 3787 } 3788 3789 void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3790 assert((UseAVX > 0), "AVX support is needed"); 3791 assert(rscratch != noreg || always_reachable(src), "missing"); 3792 3793 if (reachable(src)) { 3794 Assembler::vpmulld(dst, nds, as_Address(src), vector_len); 3795 } else { 3796 lea(rscratch, src); 3797 Assembler::vpmulld(dst, nds, Address(rscratch, 0), vector_len); 3798 } 3799 } 3800 3801 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3802 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3803 Assembler::vpsubb(dst, nds, src, vector_len); 3804 } 3805 3806 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3807 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3808 Assembler::vpsubb(dst, nds, src, vector_len); 3809 } 3810 3811 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3812 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3813 Assembler::vpsubw(dst, nds, src, vector_len); 3814 } 3815 3816 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3817 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3818 Assembler::vpsubw(dst, nds, src, vector_len); 3819 } 3820 3821 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3822 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3823 Assembler::vpsraw(dst, nds, shift, vector_len); 3824 } 3825 3826 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3827 assert(((dst->encoding() < 16 && nds->encoding() < 16) || 
VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3828 Assembler::vpsraw(dst, nds, shift, vector_len); 3829 } 3830 3831 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3832 assert(UseAVX > 2,""); 3833 if (!VM_Version::supports_avx512vl() && vector_len < 2) { 3834 vector_len = 2; 3835 } 3836 Assembler::evpsraq(dst, nds, shift, vector_len); 3837 } 3838 3839 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3840 assert(UseAVX > 2,""); 3841 if (!VM_Version::supports_avx512vl() && vector_len < 2) { 3842 vector_len = 2; 3843 } 3844 Assembler::evpsraq(dst, nds, shift, vector_len); 3845 } 3846 3847 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3848 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3849 Assembler::vpsrlw(dst, nds, shift, vector_len); 3850 } 3851 3852 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3853 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3854 Assembler::vpsrlw(dst, nds, shift, vector_len); 3855 } 3856 3857 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3858 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3859 Assembler::vpsllw(dst, nds, shift, vector_len); 3860 } 3861 3862 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3863 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3864 Assembler::vpsllw(dst, nds, shift, vector_len); 3865 } 3866 3867 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) { 3868 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3869 Assembler::vptest(dst, src); 3870 } 3871 3872 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) { 3873 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3874 Assembler::punpcklbw(dst, src); 3875 } 3876 3877 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) { 3878 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 3879 Assembler::pshufd(dst, src, mode); 3880 } 3881 3882 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { 3883 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3884 Assembler::pshuflw(dst, src, mode); 3885 } 3886 3887 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3888 assert(rscratch != noreg || always_reachable(src), "missing"); 3889 3890 if (reachable(src)) { 3891 vandpd(dst, nds, as_Address(src), vector_len); 3892 } else { 3893 lea(rscratch, src); 3894 vandpd(dst, nds, Address(rscratch, 0), vector_len); 3895 } 3896 } 3897 3898 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 3899 assert(rscratch != noreg || always_reachable(src), "missing"); 3900 3901 if (reachable(src)) { 3902 vandps(dst, nds, as_Address(src), vector_len); 
3903 } else { 3904 lea(rscratch, src); 3905 vandps(dst, nds, Address(rscratch, 0), vector_len); 3906 } 3907 } 3908 3909 void MacroAssembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, 3910 bool merge, int vector_len, Register rscratch) { 3911 assert(rscratch != noreg || always_reachable(src), "missing"); 3912 3913 if (reachable(src)) { 3914 Assembler::evpord(dst, mask, nds, as_Address(src), merge, vector_len); 3915 } else { 3916 lea(rscratch, src); 3917 Assembler::evpord(dst, mask, nds, Address(rscratch, 0), merge, vector_len); 3918 } 3919 } 3920 3921 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3922 assert(rscratch != noreg || always_reachable(src), "missing"); 3923 3924 if (reachable(src)) { 3925 vdivsd(dst, nds, as_Address(src)); 3926 } else { 3927 lea(rscratch, src); 3928 vdivsd(dst, nds, Address(rscratch, 0)); 3929 } 3930 } 3931 3932 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3933 assert(rscratch != noreg || always_reachable(src), "missing"); 3934 3935 if (reachable(src)) { 3936 vdivss(dst, nds, as_Address(src)); 3937 } else { 3938 lea(rscratch, src); 3939 vdivss(dst, nds, Address(rscratch, 0)); 3940 } 3941 } 3942 3943 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3944 assert(rscratch != noreg || always_reachable(src), "missing"); 3945 3946 if (reachable(src)) { 3947 vmulsd(dst, nds, as_Address(src)); 3948 } else { 3949 lea(rscratch, src); 3950 vmulsd(dst, nds, Address(rscratch, 0)); 3951 } 3952 } 3953 3954 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3955 assert(rscratch != noreg || always_reachable(src), "missing"); 3956 3957 if (reachable(src)) { 3958 vmulss(dst, nds, as_Address(src)); 3959 } else { 3960 lea(rscratch, src); 3961 vmulss(dst, nds, Address(rscratch, 0)); 3962 } 3963 } 3964 3965 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3966 assert(rscratch != noreg || always_reachable(src), "missing"); 3967 3968 if (reachable(src)) { 3969 vsubsd(dst, nds, as_Address(src)); 3970 } else { 3971 lea(rscratch, src); 3972 vsubsd(dst, nds, Address(rscratch, 0)); 3973 } 3974 } 3975 3976 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3977 assert(rscratch != noreg || always_reachable(src), "missing"); 3978 3979 if (reachable(src)) { 3980 vsubss(dst, nds, as_Address(src)); 3981 } else { 3982 lea(rscratch, src); 3983 vsubss(dst, nds, Address(rscratch, 0)); 3984 } 3985 } 3986 3987 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3988 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3989 assert(rscratch != noreg || always_reachable(src), "missing"); 3990 3991 vxorps(dst, nds, src, Assembler::AVX_128bit, rscratch); 3992 } 3993 3994 void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src, Register rscratch) { 3995 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3996 assert(rscratch != noreg || always_reachable(src), "missing"); 3997 3998 vxorpd(dst, nds, src, Assembler::AVX_128bit, rscratch); 3999 } 4000 4001 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, 
int vector_len, Register rscratch) { 4002 assert(rscratch != noreg || always_reachable(src), "missing"); 4003 4004 if (reachable(src)) { 4005 vxorpd(dst, nds, as_Address(src), vector_len); 4006 } else { 4007 lea(rscratch, src); 4008 vxorpd(dst, nds, Address(rscratch, 0), vector_len); 4009 } 4010 } 4011 4012 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 4013 assert(rscratch != noreg || always_reachable(src), "missing"); 4014 4015 if (reachable(src)) { 4016 vxorps(dst, nds, as_Address(src), vector_len); 4017 } else { 4018 lea(rscratch, src); 4019 vxorps(dst, nds, Address(rscratch, 0), vector_len); 4020 } 4021 } 4022 4023 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 4024 assert(rscratch != noreg || always_reachable(src), "missing"); 4025 4026 if (UseAVX > 1 || (vector_len < 1)) { 4027 if (reachable(src)) { 4028 Assembler::vpxor(dst, nds, as_Address(src), vector_len); 4029 } else { 4030 lea(rscratch, src); 4031 Assembler::vpxor(dst, nds, Address(rscratch, 0), vector_len); 4032 } 4033 } else { 4034 MacroAssembler::vxorpd(dst, nds, src, vector_len, rscratch); 4035 } 4036 } 4037 4038 void MacroAssembler::vpermd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 4039 assert(rscratch != noreg || always_reachable(src), "missing"); 4040 4041 if (reachable(src)) { 4042 Assembler::vpermd(dst, nds, as_Address(src), vector_len); 4043 } else { 4044 lea(rscratch, src); 4045 Assembler::vpermd(dst, nds, Address(rscratch, 0), vector_len); 4046 } 4047 } 4048 4049 void MacroAssembler::clear_jobject_tag(Register possibly_non_local) { 4050 const int32_t inverted_mask = ~static_cast<int32_t>(JNIHandles::tag_mask); 4051 STATIC_ASSERT(inverted_mask == -4); // otherwise check this code 4052 // The inverted mask is sign-extended 4053 andptr(possibly_non_local, inverted_mask); 4054 } 4055 4056 void MacroAssembler::resolve_jobject(Register value, 4057 Register tmp) { 4058 Register thread = r15_thread; 4059 assert_different_registers(value, thread, tmp); 4060 Label done, tagged, weak_tagged; 4061 testptr(value, value); 4062 jcc(Assembler::zero, done); // Use null as-is. 4063 testptr(value, JNIHandles::tag_mask); // Test for tag. 4064 jcc(Assembler::notZero, tagged); 4065 4066 // Resolve local handle 4067 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp, thread); 4068 verify_oop(value); 4069 jmp(done); 4070 4071 bind(tagged); 4072 testptr(value, JNIHandles::TypeTag::weak_global); // Test for weak tag. 4073 jcc(Assembler::notZero, weak_tagged); 4074 4075 // Resolve global handle 4076 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread); 4077 verify_oop(value); 4078 jmp(done); 4079 4080 bind(weak_tagged); 4081 // Resolve jweak. 4082 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, 4083 value, Address(value, -JNIHandles::TypeTag::weak_global), tmp, thread); 4084 verify_oop(value); 4085 4086 bind(done); 4087 } 4088 4089 void MacroAssembler::resolve_global_jobject(Register value, 4090 Register tmp) { 4091 Register thread = r15_thread; 4092 assert_different_registers(value, thread, tmp); 4093 Label done; 4094 4095 testptr(value, value); 4096 jcc(Assembler::zero, done); // Use null as-is. 4097 4098 #ifdef ASSERT 4099 { 4100 Label valid_global_tag; 4101 testptr(value, JNIHandles::TypeTag::global); // Test for global tag. 
4102 jcc(Assembler::notZero, valid_global_tag); 4103 stop("non global jobject using resolve_global_jobject"); 4104 bind(valid_global_tag); 4105 } 4106 #endif 4107 4108 // Resolve global handle 4109 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, -JNIHandles::TypeTag::global), tmp, thread); 4110 verify_oop(value); 4111 4112 bind(done); 4113 } 4114 4115 void MacroAssembler::subptr(Register dst, int32_t imm32) { 4116 LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32)); 4117 } 4118 4119 // Force generation of a 4 byte immediate value even if it fits into 8bit 4120 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) { 4121 LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32)); 4122 } 4123 4124 void MacroAssembler::subptr(Register dst, Register src) { 4125 LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); 4126 } 4127 4128 // C++ bool manipulation 4129 void MacroAssembler::testbool(Register dst) { 4130 if(sizeof(bool) == 1) 4131 testb(dst, 0xff); 4132 else if(sizeof(bool) == 2) { 4133 // testw implementation needed for two byte bools 4134 ShouldNotReachHere(); 4135 } else if(sizeof(bool) == 4) 4136 testl(dst, dst); 4137 else 4138 // unsupported 4139 ShouldNotReachHere(); 4140 } 4141 4142 void MacroAssembler::testptr(Register dst, Register src) { 4143 LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src)); 4144 } 4145 4146 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 4147 void MacroAssembler::tlab_allocate(Register thread, Register obj, 4148 Register var_size_in_bytes, 4149 int con_size_in_bytes, 4150 Register t1, 4151 Register t2, 4152 Label& slow_case) { 4153 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 4154 bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 4155 } 4156 4157 RegSet MacroAssembler::call_clobbered_gp_registers() { 4158 RegSet regs; 4159 #ifdef _LP64 4160 regs += RegSet::of(rax, rcx, rdx); 4161 #ifndef _WINDOWS 4162 regs += RegSet::of(rsi, rdi); 4163 #endif 4164 regs += RegSet::range(r8, r11); 4165 #else 4166 regs += RegSet::of(rax, rcx, rdx); 4167 #endif 4168 #ifdef _LP64 4169 if (UseAPX) { 4170 regs += RegSet::range(r16, as_Register(Register::number_of_registers - 1)); 4171 } 4172 #endif 4173 return regs; 4174 } 4175 4176 XMMRegSet MacroAssembler::call_clobbered_xmm_registers() { 4177 int num_xmm_registers = XMMRegister::available_xmm_registers(); 4178 #if defined(_WINDOWS) 4179 XMMRegSet result = XMMRegSet::range(xmm0, xmm5); 4180 if (num_xmm_registers > 16) { 4181 result += XMMRegSet::range(xmm16, as_XMMRegister(num_xmm_registers - 1)); 4182 } 4183 return result; 4184 #else 4185 return XMMRegSet::range(xmm0, as_XMMRegister(num_xmm_registers - 1)); 4186 #endif 4187 } 4188 4189 static int FPUSaveAreaSize = align_up(108, StackAlignmentInBytes); // 108 bytes needed for FPU state by fsave/frstor 4190 4191 #ifndef _LP64 4192 static bool use_x87_registers() { return UseSSE < 2; } 4193 #endif 4194 static bool use_xmm_registers() { return UseSSE >= 1; } 4195 4196 // C1 only ever uses the first double/float of the XMM register. 4197 static int xmm_save_size() { return UseSSE >= 2 ? 
sizeof(double) : sizeof(float); } 4198 4199 static void save_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) { 4200 if (UseSSE == 1) { 4201 masm->movflt(Address(rsp, offset), reg); 4202 } else { 4203 masm->movdbl(Address(rsp, offset), reg); 4204 } 4205 } 4206 4207 static void restore_xmm_register(MacroAssembler* masm, int offset, XMMRegister reg) { 4208 if (UseSSE == 1) { 4209 masm->movflt(reg, Address(rsp, offset)); 4210 } else { 4211 masm->movdbl(reg, Address(rsp, offset)); 4212 } 4213 } 4214 4215 static int register_section_sizes(RegSet gp_registers, XMMRegSet xmm_registers, 4216 bool save_fpu, int& gp_area_size, 4217 int& fp_area_size, int& xmm_area_size) { 4218 4219 gp_area_size = align_up(gp_registers.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size, 4220 StackAlignmentInBytes); 4221 #ifdef _LP64 4222 fp_area_size = 0; 4223 #else 4224 fp_area_size = (save_fpu && use_x87_registers()) ? FPUSaveAreaSize : 0; 4225 #endif 4226 xmm_area_size = (save_fpu && use_xmm_registers()) ? xmm_registers.size() * xmm_save_size() : 0; 4227 4228 return gp_area_size + fp_area_size + xmm_area_size; 4229 } 4230 4231 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude, bool save_fpu) { 4232 block_comment("push_call_clobbered_registers start"); 4233 // Regular registers 4234 RegSet gp_registers_to_push = call_clobbered_gp_registers() - exclude; 4235 4236 int gp_area_size; 4237 int fp_area_size; 4238 int xmm_area_size; 4239 int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu, 4240 gp_area_size, fp_area_size, xmm_area_size); 4241 subptr(rsp, total_save_size); 4242 4243 push_set(gp_registers_to_push, 0); 4244 4245 #ifndef _LP64 4246 if (save_fpu && use_x87_registers()) { 4247 fnsave(Address(rsp, gp_area_size)); 4248 fwait(); 4249 } 4250 #endif 4251 if (save_fpu && use_xmm_registers()) { 4252 push_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size); 4253 } 4254 4255 block_comment("push_call_clobbered_registers end"); 4256 } 4257 4258 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude, bool restore_fpu) { 4259 block_comment("pop_call_clobbered_registers start"); 4260 4261 RegSet gp_registers_to_pop = call_clobbered_gp_registers() - exclude; 4262 4263 int gp_area_size; 4264 int fp_area_size; 4265 int xmm_area_size; 4266 int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu, 4267 gp_area_size, fp_area_size, xmm_area_size); 4268 4269 if (restore_fpu && use_xmm_registers()) { 4270 pop_set(call_clobbered_xmm_registers(), gp_area_size + fp_area_size); 4271 } 4272 #ifndef _LP64 4273 if (restore_fpu && use_x87_registers()) { 4274 frstor(Address(rsp, gp_area_size)); 4275 } 4276 #endif 4277 4278 pop_set(gp_registers_to_pop, 0); 4279 4280 addptr(rsp, total_save_size); 4281 4282 vzeroupper(); 4283 4284 block_comment("pop_call_clobbered_registers end"); 4285 } 4286 4287 void MacroAssembler::push_set(XMMRegSet set, int offset) { 4288 assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be"); 4289 int spill_offset = offset; 4290 4291 for (RegSetIterator<XMMRegister> it = set.begin(); *it != xnoreg; ++it) { 4292 save_xmm_register(this, spill_offset, *it); 4293 spill_offset += xmm_save_size(); 4294 } 4295 } 4296 4297 void MacroAssembler::pop_set(XMMRegSet set, int offset) { 4298 int restore_size = set.size() * xmm_save_size(); 4299 assert(is_aligned(restore_size, StackAlignmentInBytes), "must be"); 4300 
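  // Walk the set in reverse, starting at the highest spill slot, so registers
  // are restored in the opposite order to the spills done by push_set above.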
4301 int restore_offset = offset + restore_size - xmm_save_size(); 4302 4303 for (ReverseRegSetIterator<XMMRegister> it = set.rbegin(); *it != xnoreg; ++it) { 4304 restore_xmm_register(this, restore_offset, *it); 4305 restore_offset -= xmm_save_size(); 4306 } 4307 } 4308 4309 void MacroAssembler::push_set(RegSet set, int offset) { 4310 int spill_offset; 4311 if (offset == -1) { 4312 int register_push_size = set.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size; 4313 int aligned_size = align_up(register_push_size, StackAlignmentInBytes); 4314 subptr(rsp, aligned_size); 4315 spill_offset = 0; 4316 } else { 4317 spill_offset = offset; 4318 } 4319 4320 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) { 4321 movptr(Address(rsp, spill_offset), *it); 4322 spill_offset += Register::max_slots_per_register * VMRegImpl::stack_slot_size; 4323 } 4324 } 4325 4326 void MacroAssembler::pop_set(RegSet set, int offset) { 4327 4328 int gp_reg_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size; 4329 int restore_size = set.size() * gp_reg_size; 4330 int aligned_size = align_up(restore_size, StackAlignmentInBytes); 4331 4332 int restore_offset; 4333 if (offset == -1) { 4334 restore_offset = restore_size - gp_reg_size; 4335 } else { 4336 restore_offset = offset + restore_size - gp_reg_size; 4337 } 4338 for (ReverseRegSetIterator<Register> it = set.rbegin(); *it != noreg; ++it) { 4339 movptr(*it, Address(rsp, restore_offset)); 4340 restore_offset -= gp_reg_size; 4341 } 4342 4343 if (offset == -1) { 4344 addptr(rsp, aligned_size); 4345 } 4346 } 4347 4348 // Preserves the contents of address, destroys the contents length_in_bytes and temp. 4349 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) { 4350 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different"); 4351 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord"); 4352 Label done; 4353 4354 testptr(length_in_bytes, length_in_bytes); 4355 jcc(Assembler::zero, done); 4356 4357 // initialize topmost word, divide index by 2, check if odd and test if zero 4358 // note: for the remaining code to work, index must be a multiple of BytesPerWord 4359 #ifdef ASSERT 4360 { 4361 Label L; 4362 testptr(length_in_bytes, BytesPerWord - 1); 4363 jcc(Assembler::zero, L); 4364 stop("length must be a multiple of BytesPerWord"); 4365 bind(L); 4366 } 4367 #endif 4368 Register index = length_in_bytes; 4369 xorptr(temp, temp); // use _zero reg to clear memory (shorter code) 4370 if (UseIncDec) { 4371 shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set 4372 } else { 4373 shrptr(index, 2); // use 2 instructions to avoid partial flag stall 4374 shrptr(index, 1); 4375 } 4376 #ifndef _LP64 4377 // index could have not been a multiple of 8 (i.e., bit 2 was set) 4378 { 4379 Label even; 4380 // note: if index was a multiple of 8, then it cannot 4381 // be 0 now otherwise it must have been 0 before 4382 // => if it is even, we don't need to check for 0 again 4383 jcc(Assembler::carryClear, even); 4384 // clear topmost word (no jump would be needed if conditional assignment worked here) 4385 movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp); 4386 // index could be 0 now, must check again 4387 jcc(Assembler::zero, done); 4388 bind(even); 4389 } 4390 #endif // !_LP64 4391 // initialize remaining object fields: index is a 
multiple of 2 now 4392 { 4393 Label loop; 4394 bind(loop); 4395 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp); 4396 NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);) 4397 decrement(index); 4398 jcc(Assembler::notZero, loop); 4399 } 4400 4401 bind(done); 4402 } 4403 4404 // Look up the method for a megamorphic invokeinterface call. 4405 // The target method is determined by <intf_klass, itable_index>. 4406 // The receiver klass is in recv_klass. 4407 // On success, the result will be in method_result, and execution falls through. 4408 // On failure, execution transfers to the given label. 4409 void MacroAssembler::lookup_interface_method(Register recv_klass, 4410 Register intf_klass, 4411 RegisterOrConstant itable_index, 4412 Register method_result, 4413 Register scan_temp, 4414 Label& L_no_such_interface, 4415 bool return_method) { 4416 assert_different_registers(recv_klass, intf_klass, scan_temp); 4417 assert_different_registers(method_result, intf_klass, scan_temp); 4418 assert(recv_klass != method_result || !return_method, 4419 "recv_klass can be destroyed when method isn't needed"); 4420 4421 assert(itable_index.is_constant() || itable_index.as_register() == method_result, 4422 "caller must use same register for non-constant itable index as for method"); 4423 4424 // Compute start of first itableOffsetEntry (which is at the end of the vtable) 4425 int vtable_base = in_bytes(Klass::vtable_start_offset()); 4426 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 4427 int scan_step = itableOffsetEntry::size() * wordSize; 4428 int vte_size = vtableEntry::size_in_bytes(); 4429 Address::ScaleFactor times_vte_scale = Address::times_ptr; 4430 assert(vte_size == wordSize, "else adjust times_vte_scale"); 4431 4432 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 4433 4434 // Could store the aligned, prescaled offset in the klass. 4435 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base)); 4436 4437 if (return_method) { 4438 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 4439 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 4440 lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); 4441 } 4442 4443 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 4444 // if (scan->interface() == intf) { 4445 // result = (klass + scan->offset() + itable_index); 4446 // } 4447 // } 4448 Label search, found_method; 4449 4450 for (int peel = 1; peel >= 0; peel--) { 4451 movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset())); 4452 cmpptr(intf_klass, method_result); 4453 4454 if (peel) { 4455 jccb(Assembler::equal, found_method); 4456 } else { 4457 jccb(Assembler::notEqual, search); 4458 // (invert the test to fall through to found_method...) 4459 } 4460 4461 if (!peel) break; 4462 4463 bind(search); 4464 4465 // Check that the previous entry is non-null. A null entry means that 4466 // the receiver class doesn't implement the interface, and wasn't the 4467 // same as when the caller was compiled. 4468 testptr(method_result, method_result); 4469 jcc(Assembler::zero, L_no_such_interface); 4470 addptr(scan_temp, scan_step); 4471 } 4472 4473 bind(found_method); 4474 4475 if (return_method) { 4476 // Got a hit. 
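    // scan_temp points at the matching itableOffsetEntry: load its offset and
    // use it to index off recv_klass, which was already biased above by the
    // scaled itable_index plus itentry_off, to fetch the Method*.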
4477 movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset())); 4478 movptr(method_result, Address(recv_klass, scan_temp, Address::times_1)); 4479 } 4480 } 4481 4482 // Look up the method for a megamorphic invokeinterface call in a single pass over itable: 4483 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData 4484 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index 4485 // The target method is determined by <holder_klass, itable_index>. 4486 // The receiver klass is in recv_klass. 4487 // On success, the result will be in method_result, and execution falls through. 4488 // On failure, execution transfers to the given label. 4489 void MacroAssembler::lookup_interface_method_stub(Register recv_klass, 4490 Register holder_klass, 4491 Register resolved_klass, 4492 Register method_result, 4493 Register scan_temp, 4494 Register temp_reg2, 4495 Register receiver, 4496 int itable_index, 4497 Label& L_no_such_interface) { 4498 assert_different_registers(recv_klass, method_result, holder_klass, resolved_klass, scan_temp, temp_reg2, receiver); 4499 Register temp_itbl_klass = method_result; 4500 Register temp_reg = (temp_reg2 == noreg ? recv_klass : temp_reg2); // reuse recv_klass register on 32-bit x86 impl 4501 4502 int vtable_base = in_bytes(Klass::vtable_start_offset()); 4503 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 4504 int scan_step = itableOffsetEntry::size() * wordSize; 4505 int vte_size = vtableEntry::size_in_bytes(); 4506 int ioffset = in_bytes(itableOffsetEntry::interface_offset()); 4507 int ooffset = in_bytes(itableOffsetEntry::offset_offset()); 4508 Address::ScaleFactor times_vte_scale = Address::times_ptr; 4509 assert(vte_size == wordSize, "adjust times_vte_scale"); 4510 4511 Label L_loop_scan_resolved_entry, L_resolved_found, L_holder_found; 4512 4513 // temp_itbl_klass = recv_klass.itable[0] 4514 // scan_temp = &recv_klass.itable[0] + step 4515 movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset())); 4516 movptr(temp_itbl_klass, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset)); 4517 lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base + ioffset + scan_step)); 4518 xorptr(temp_reg, temp_reg); 4519 4520 // Initial checks: 4521 // - if (holder_klass != resolved_klass), go to "scan for resolved" 4522 // - if (itable[0] == 0), no such interface 4523 // - if (itable[0] == holder_klass), shortcut to "holder found" 4524 cmpptr(holder_klass, resolved_klass); 4525 jccb(Assembler::notEqual, L_loop_scan_resolved_entry); 4526 testptr(temp_itbl_klass, temp_itbl_klass); 4527 jccb(Assembler::zero, L_no_such_interface); 4528 cmpptr(holder_klass, temp_itbl_klass); 4529 jccb(Assembler::equal, L_holder_found); 4530 4531 // Loop: Look for holder_klass record in itable 4532 // do { 4533 // tmp = itable[index]; 4534 // index += step; 4535 // if (tmp == holder_klass) { 4536 // goto L_holder_found; // Found! 4537 // } 4538 // } while (tmp != 0); 4539 // goto L_no_such_interface // Not found. 
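  // L_scan_holder is also reached from L_resolved_found below, when the
  // resolved-klass scan finished without recording a holder offset.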
4540 Label L_scan_holder; 4541 bind(L_scan_holder); 4542 movptr(temp_itbl_klass, Address(scan_temp, 0)); 4543 addptr(scan_temp, scan_step); 4544 cmpptr(holder_klass, temp_itbl_klass); 4545 jccb(Assembler::equal, L_holder_found); 4546 testptr(temp_itbl_klass, temp_itbl_klass); 4547 jccb(Assembler::notZero, L_scan_holder); 4548 4549 jmpb(L_no_such_interface); 4550 4551 // Loop: Look for resolved_class record in itable 4552 // do { 4553 // tmp = itable[index]; 4554 // index += step; 4555 // if (tmp == holder_klass) { 4556 // // Also check if we have met a holder klass 4557 // holder_tmp = itable[index-step-ioffset]; 4558 // } 4559 // if (tmp == resolved_klass) { 4560 // goto L_resolved_found; // Found! 4561 // } 4562 // } while (tmp != 0); 4563 // goto L_no_such_interface // Not found. 4564 // 4565 Label L_loop_scan_resolved; 4566 bind(L_loop_scan_resolved); 4567 movptr(temp_itbl_klass, Address(scan_temp, 0)); 4568 addptr(scan_temp, scan_step); 4569 bind(L_loop_scan_resolved_entry); 4570 cmpptr(holder_klass, temp_itbl_klass); 4571 cmovl(Assembler::equal, temp_reg, Address(scan_temp, ooffset - ioffset - scan_step)); 4572 cmpptr(resolved_klass, temp_itbl_klass); 4573 jccb(Assembler::equal, L_resolved_found); 4574 testptr(temp_itbl_klass, temp_itbl_klass); 4575 jccb(Assembler::notZero, L_loop_scan_resolved); 4576 4577 jmpb(L_no_such_interface); 4578 4579 Label L_ready; 4580 4581 // See if we already have a holder klass. If not, go and scan for it. 4582 bind(L_resolved_found); 4583 testptr(temp_reg, temp_reg); 4584 jccb(Assembler::zero, L_scan_holder); 4585 jmpb(L_ready); 4586 4587 bind(L_holder_found); 4588 movl(temp_reg, Address(scan_temp, ooffset - ioffset - scan_step)); 4589 4590 // Finally, temp_reg contains holder_klass vtable offset 4591 bind(L_ready); 4592 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 4593 if (temp_reg2 == noreg) { // recv_klass register is clobbered for 32-bit x86 impl 4594 load_klass(scan_temp, receiver, noreg); 4595 movptr(method_result, Address(scan_temp, temp_reg, Address::times_1, itable_index * wordSize + itentry_off)); 4596 } else { 4597 movptr(method_result, Address(recv_klass, temp_reg, Address::times_1, itable_index * wordSize + itentry_off)); 4598 } 4599 } 4600 4601 4602 // virtual method calling 4603 void MacroAssembler::lookup_virtual_method(Register recv_klass, 4604 RegisterOrConstant vtable_index, 4605 Register method_result) { 4606 const ByteSize base = Klass::vtable_start_offset(); 4607 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below"); 4608 Address vtable_entry_addr(recv_klass, 4609 vtable_index, Address::times_ptr, 4610 base + vtableEntry::method_offset()); 4611 movptr(method_result, vtable_entry_addr); 4612 } 4613 4614 4615 void MacroAssembler::check_klass_subtype(Register sub_klass, 4616 Register super_klass, 4617 Register temp_reg, 4618 Label& L_success) { 4619 Label L_failure; 4620 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, nullptr); 4621 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, nullptr); 4622 bind(L_failure); 4623 } 4624 4625 4626 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 4627 Register super_klass, 4628 Register temp_reg, 4629 Label* L_success, 4630 Label* L_failure, 4631 Label* L_slow_path, 4632 RegisterOrConstant super_check_offset) { 4633 assert_different_registers(sub_klass, super_klass, temp_reg); 4634 bool must_load_sco = 
(super_check_offset.constant_or_zero() == -1); 4635 if (super_check_offset.is_register()) { 4636 assert_different_registers(sub_klass, super_klass, 4637 super_check_offset.as_register()); 4638 } else if (must_load_sco) { 4639 assert(temp_reg != noreg, "supply either a temp or a register offset"); 4640 } 4641 4642 Label L_fallthrough; 4643 int label_nulls = 0; 4644 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 4645 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 4646 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; } 4647 assert(label_nulls <= 1, "at most one null in the batch"); 4648 4649 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 4650 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 4651 Address super_check_offset_addr(super_klass, sco_offset); 4652 4653 // Hacked jcc, which "knows" that L_fallthrough, at least, is in 4654 // range of a jccb. If this routine grows larger, reconsider at 4655 // least some of these. 4656 #define local_jcc(assembler_cond, label) \ 4657 if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \ 4658 else jcc( assembler_cond, label) /*omit semi*/ 4659 4660 // Hacked jmp, which may only be used just before L_fallthrough. 4661 #define final_jmp(label) \ 4662 if (&(label) == &L_fallthrough) { /*do nothing*/ } \ 4663 else jmp(label) /*omit semi*/ 4664 4665 // If the pointers are equal, we are done (e.g., String[] elements). 4666 // This self-check enables sharing of secondary supertype arrays among 4667 // non-primary types such as array-of-interface. Otherwise, each such 4668 // type would need its own customized SSA. 4669 // We move this check to the front of the fast path because many 4670 // type checks are in fact trivially successful in this manner, 4671 // so we get a nicely predicted branch right at the start of the check. 4672 cmpptr(sub_klass, super_klass); 4673 local_jcc(Assembler::equal, *L_success); 4674 4675 // Check the supertype display: 4676 if (must_load_sco) { 4677 // Positive movl does right thing on LP64. 4678 movl(temp_reg, super_check_offset_addr); 4679 super_check_offset = RegisterOrConstant(temp_reg); 4680 } 4681 Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0); 4682 cmpptr(super_klass, super_check_addr); // load displayed supertype 4683 4684 // This check has worked decisively for primary supers. 4685 // Secondary supers are sought in the super_cache ('super_cache_addr'). 4686 // (Secondary supers are interfaces and very deeply nested subtypes.) 4687 // This works in the same check above because of a tricky aliasing 4688 // between the super_cache and the primary super display elements. 4689 // (The 'super_check_addr' can address either, as the case requires.) 4690 // Note that the cache is updated below if it does not help us find 4691 // what we need immediately. 4692 // So if it was a primary super, we can just fail immediately. 4693 // Otherwise, it's the slow path for us (no success at this point). 4694 4695 if (super_check_offset.is_register()) { 4696 local_jcc(Assembler::equal, *L_success); 4697 cmpl(super_check_offset.as_register(), sc_offset); 4698 if (L_failure == &L_fallthrough) { 4699 local_jcc(Assembler::equal, *L_slow_path); 4700 } else { 4701 local_jcc(Assembler::notEqual, *L_failure); 4702 final_jmp(*L_slow_path); 4703 } 4704 } else if (super_check_offset.as_constant() == sc_offset) { 4705 // Need a slow path; fast failure is impossible. 
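    // Here the constant offset is the secondary_super_cache slot itself, so an
    // equal compare is a cache hit (success) and a miss can only mean "take
    // the slow path", never an outright failure.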
4706 if (L_slow_path == &L_fallthrough) { 4707 local_jcc(Assembler::equal, *L_success); 4708 } else { 4709 local_jcc(Assembler::notEqual, *L_slow_path); 4710 final_jmp(*L_success); 4711 } 4712 } else { 4713 // No slow path; it's a fast decision. 4714 if (L_failure == &L_fallthrough) { 4715 local_jcc(Assembler::equal, *L_success); 4716 } else { 4717 local_jcc(Assembler::notEqual, *L_failure); 4718 final_jmp(*L_success); 4719 } 4720 } 4721 4722 bind(L_fallthrough); 4723 4724 #undef local_jcc 4725 #undef final_jmp 4726 } 4727 4728 4729 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass, 4730 Register super_klass, 4731 Register temp_reg, 4732 Register temp2_reg, 4733 Label* L_success, 4734 Label* L_failure, 4735 bool set_cond_codes) { 4736 assert_different_registers(sub_klass, super_klass, temp_reg); 4737 if (temp2_reg != noreg) 4738 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg); 4739 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) 4740 4741 Label L_fallthrough; 4742 int label_nulls = 0; 4743 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 4744 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 4745 assert(label_nulls <= 1, "at most one null in the batch"); 4746 4747 // a couple of useful fields in sub_klass: 4748 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 4749 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 4750 Address secondary_supers_addr(sub_klass, ss_offset); 4751 Address super_cache_addr( sub_klass, sc_offset); 4752 4753 // Do a linear scan of the secondary super-klass chain. 4754 // This code is rarely used, so simplicity is a virtue here. 4755 // The repne_scan instruction uses fixed registers, which we must spill. 4756 // Don't worry too much about pre-existing connections with the input regs. 4757 4758 assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super) 4759 assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter) 4760 4761 // Get super_klass value into rax (even if it was in rdi or rcx). 4762 bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false; 4763 if (super_klass != rax) { 4764 if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; } 4765 mov(rax, super_klass); 4766 } 4767 if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; } 4768 if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; } 4769 4770 #ifndef PRODUCT 4771 uint* pst_counter = &SharedRuntime::_partial_subtype_ctr; 4772 ExternalAddress pst_counter_addr((address) pst_counter); 4773 NOT_LP64( incrementl(pst_counter_addr) ); 4774 LP64_ONLY( lea(rcx, pst_counter_addr) ); 4775 LP64_ONLY( incrementl(Address(rcx, 0)) ); 4776 #endif //PRODUCT 4777 4778 // We will consult the secondary-super array. 4779 movptr(rdi, secondary_supers_addr); 4780 // Load the array length. (Positive movl does right thing on LP64.) 4781 movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes())); 4782 // Skip to start of data. 4783 addptr(rdi, Array<Klass*>::base_offset_in_bytes()); 4784 4785 // Scan RCX words at [RDI] for an occurrence of RAX. 4786 // Set NZ/Z based on last compare. 4787 // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does 4788 // not change flags (only scas instruction which is repeated sets flags). 4789 // Set Z = 0 (not equal) before 'repne' to indicate that class was not found. 4790 4791 testptr(rax,rax); // Set Z = 0 4792 repne_scan(); 4793 4794 // Unspill the temp. 
registers: 4795 if (pushed_rdi) pop(rdi); 4796 if (pushed_rcx) pop(rcx); 4797 if (pushed_rax) pop(rax); 4798 4799 if (set_cond_codes) { 4800 // Special hack for the AD files: rdi is guaranteed non-zero. 4801 assert(!pushed_rdi, "rdi must be left non-null"); 4802 // Also, the condition codes are properly set Z/NZ on succeed/failure. 4803 } 4804 4805 if (L_failure == &L_fallthrough) 4806 jccb(Assembler::notEqual, *L_failure); 4807 else jcc(Assembler::notEqual, *L_failure); 4808 4809 // Success. Cache the super we found and proceed in triumph. 4810 movptr(super_cache_addr, super_klass); 4811 4812 if (L_success != &L_fallthrough) { 4813 jmp(*L_success); 4814 } 4815 4816 #undef IS_A_TEMP 4817 4818 bind(L_fallthrough); 4819 } 4820 4821 #ifndef _LP64 4822 4823 // 32-bit x86 only: always use the linear search. 4824 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 4825 Register super_klass, 4826 Register temp_reg, 4827 Register temp2_reg, 4828 Label* L_success, 4829 Label* L_failure, 4830 bool set_cond_codes) { 4831 check_klass_subtype_slow_path_linear 4832 (sub_klass, super_klass, temp_reg, temp2_reg, L_success, L_failure, set_cond_codes); 4833 } 4834 4835 #else // _LP64 4836 4837 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 4838 Register super_klass, 4839 Register temp_reg, 4840 Register temp2_reg, 4841 Label* L_success, 4842 Label* L_failure, 4843 bool set_cond_codes) { 4844 assert(set_cond_codes == false, "must be false on 64-bit x86"); 4845 check_klass_subtype_slow_path 4846 (sub_klass, super_klass, temp_reg, temp2_reg, noreg, noreg, 4847 L_success, L_failure); 4848 } 4849 4850 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 4851 Register super_klass, 4852 Register temp_reg, 4853 Register temp2_reg, 4854 Register temp3_reg, 4855 Register temp4_reg, 4856 Label* L_success, 4857 Label* L_failure) { 4858 if (UseSecondarySupersTable) { 4859 check_klass_subtype_slow_path_table 4860 (sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, temp4_reg, 4861 L_success, L_failure); 4862 } else { 4863 check_klass_subtype_slow_path_linear 4864 (sub_klass, super_klass, temp_reg, temp2_reg, L_success, L_failure, /*set_cond_codes*/false); 4865 } 4866 } 4867 4868 Register MacroAssembler::allocate_if_noreg(Register r, 4869 RegSetIterator<Register> &available_regs, 4870 RegSet ®s_to_push) { 4871 if (!r->is_valid()) { 4872 r = *available_regs++; 4873 regs_to_push += r; 4874 } 4875 return r; 4876 } 4877 4878 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass, 4879 Register super_klass, 4880 Register temp_reg, 4881 Register temp2_reg, 4882 Register temp3_reg, 4883 Register result_reg, 4884 Label* L_success, 4885 Label* L_failure) { 4886 // NB! Callers may assume that, when temp2_reg is a valid register, 4887 // this code sets it to a nonzero value. 
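  // (The movq(temp2_reg, 1) emitted after the table lookup below keeps that
  // promise.)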
4888 bool temp2_reg_was_valid = temp2_reg->is_valid(); 4889 4890 RegSet temps = RegSet::of(temp_reg, temp2_reg, temp3_reg); 4891 4892 Label L_fallthrough; 4893 int label_nulls = 0; 4894 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 4895 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 4896 assert(label_nulls <= 1, "at most one null in the batch"); 4897 4898 BLOCK_COMMENT("check_klass_subtype_slow_path_table"); 4899 4900 RegSetIterator<Register> available_regs 4901 = (RegSet::of(rax, rcx, rdx, r8) + r9 + r10 + r11 + r12 - temps - sub_klass - super_klass).begin(); 4902 4903 RegSet pushed_regs; 4904 4905 temp_reg = allocate_if_noreg(temp_reg, available_regs, pushed_regs); 4906 temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs); 4907 temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs); 4908 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs); 4909 Register temp4_reg = allocate_if_noreg(noreg, available_regs, pushed_regs); 4910 4911 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg, result_reg); 4912 4913 { 4914 4915 int register_push_size = pushed_regs.size() * Register::max_slots_per_register * VMRegImpl::stack_slot_size; 4916 int aligned_size = align_up(register_push_size, StackAlignmentInBytes); 4917 subptr(rsp, aligned_size); 4918 push_set(pushed_regs, 0); 4919 4920 lookup_secondary_supers_table_var(sub_klass, 4921 super_klass, 4922 temp_reg, temp2_reg, temp3_reg, temp4_reg, result_reg); 4923 cmpq(result_reg, 0); 4924 4925 // Unspill the temp. registers: 4926 pop_set(pushed_regs, 0); 4927 // Increment SP but do not clobber flags. 4928 lea(rsp, Address(rsp, aligned_size)); 4929 } 4930 4931 if (temp2_reg_was_valid) { 4932 movq(temp2_reg, 1); 4933 } 4934 4935 jcc(Assembler::notEqual, *L_failure); 4936 4937 if (L_success != &L_fallthrough) { 4938 jmp(*L_success); 4939 } 4940 4941 bind(L_fallthrough); 4942 } 4943 4944 // population_count variant for running without the POPCNT 4945 // instruction, which was introduced with SSE4.2 in 2008. 4946 void MacroAssembler::population_count(Register dst, Register src, 4947 Register scratch1, Register scratch2) { 4948 assert_different_registers(src, scratch1, scratch2); 4949 if (UsePopCountInstruction) { 4950 Assembler::popcntq(dst, src); 4951 } else { 4952 assert_different_registers(src, scratch1, scratch2); 4953 assert_different_registers(dst, scratch1, scratch2); 4954 Label loop, done; 4955 4956 mov(scratch1, src); 4957 // dst = 0; 4958 // while(scratch1 != 0) { 4959 // dst++; 4960 // scratch1 &= (scratch1 - 1); 4961 // } 4962 xorl(dst, dst); 4963 testq(scratch1, scratch1); 4964 jccb(Assembler::equal, done); 4965 { 4966 bind(loop); 4967 incq(dst); 4968 movq(scratch2, scratch1); 4969 decq(scratch2); 4970 andq(scratch1, scratch2); 4971 jccb(Assembler::notEqual, loop); 4972 } 4973 bind(done); 4974 } 4975 #ifdef ASSERT 4976 mov64(scratch1, 0xCafeBabeDeadBeef); 4977 movq(scratch2, scratch1); 4978 #endif 4979 } 4980 4981 // Ensure that the inline code and the stub are using the same registers. 
4982 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \ 4983 do { \ 4984 assert(r_super_klass == rax, "mismatch"); \ 4985 assert(r_array_base == rbx, "mismatch"); \ 4986 assert(r_array_length == rcx, "mismatch"); \ 4987 assert(r_array_index == rdx, "mismatch"); \ 4988 assert(r_sub_klass == rsi || r_sub_klass == noreg, "mismatch"); \ 4989 assert(r_bitmap == r11 || r_bitmap == noreg, "mismatch"); \ 4990 assert(result == rdi || result == noreg, "mismatch"); \ 4991 } while(0) 4992 4993 // Versions of salq and rorq that don't need count to be in rcx 4994 4995 void MacroAssembler::salq(Register dest, Register count) { 4996 if (count == rcx) { 4997 Assembler::salq(dest); 4998 } else { 4999 assert_different_registers(rcx, dest); 5000 xchgq(rcx, count); 5001 Assembler::salq(dest); 5002 xchgq(rcx, count); 5003 } 5004 } 5005 5006 void MacroAssembler::rorq(Register dest, Register count) { 5007 if (count == rcx) { 5008 Assembler::rorq(dest); 5009 } else { 5010 assert_different_registers(rcx, dest); 5011 xchgq(rcx, count); 5012 Assembler::rorq(dest); 5013 xchgq(rcx, count); 5014 } 5015 } 5016 5017 // Return true: we succeeded in generating this code 5018 // 5019 // At runtime, return 0 in result if r_super_klass is a superclass of 5020 // r_sub_klass, otherwise return nonzero. Use this if you know the 5021 // super_klass_slot of the class you're looking for. This is always 5022 // the case for instanceof and checkcast. 5023 void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass, 5024 Register r_super_klass, 5025 Register temp1, 5026 Register temp2, 5027 Register temp3, 5028 Register temp4, 5029 Register result, 5030 u1 super_klass_slot) { 5031 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result); 5032 5033 Label L_fallthrough, L_success, L_failure; 5034 5035 BLOCK_COMMENT("lookup_secondary_supers_table {"); 5036 5037 const Register 5038 r_array_index = temp1, 5039 r_array_length = temp2, 5040 r_array_base = temp3, 5041 r_bitmap = temp4; 5042 5043 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; 5044 5045 xorq(result, result); // = 0 5046 5047 movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset())); 5048 movq(r_array_index, r_bitmap); 5049 5050 // First check the bitmap to see if super_klass might be present. If 5051 // the bit is zero, we are certain that super_klass is not one of 5052 // the secondary supers. 5053 u1 bit = super_klass_slot; 5054 { 5055 // NB: If the count in a x86 shift instruction is 0, the flags are 5056 // not affected, so we do a testq instead. 5057 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit; 5058 if (shift_count != 0) { 5059 salq(r_array_index, shift_count); 5060 } else { 5061 testq(r_array_index, r_array_index); 5062 } 5063 } 5064 // We test the MSB of r_array_index, i.e. its sign bit 5065 jcc(Assembler::positive, L_failure); 5066 5067 // Get the first array index that can contain super_klass into r_array_index. 5068 if (bit != 0) { 5069 population_count(r_array_index, r_array_index, temp2, temp3); 5070 } else { 5071 movl(r_array_index, 1); 5072 } 5073 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word. 5074 5075 // We will consult the secondary-super array. 5076 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset()))); 5077 5078 // We're asserting that the first word in an Array<Klass*> is the 5079 // length, and the second word is the first word of the data. 
If 5080 // that ever changes, r_array_base will have to be adjusted here. 5081 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code"); 5082 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code"); 5083 5084 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8)); 5085 jccb(Assembler::equal, L_success); 5086 5087 // Is there another entry to check? Consult the bitmap. 5088 btq(r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK); 5089 jccb(Assembler::carryClear, L_failure); 5090 5091 // Linear probe. Rotate the bitmap so that the next bit to test is 5092 // in Bit 1. 5093 if (bit != 0) { 5094 rorq(r_bitmap, bit); 5095 } 5096 5097 // Calls into the stub generated by lookup_secondary_supers_table_slow_path. 5098 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap. 5099 // Kills: r_array_length. 5100 // Returns: result. 5101 call(RuntimeAddress(StubRoutines::lookup_secondary_supers_table_slow_path_stub())); 5102 // Result (0/1) is in rdi 5103 jmpb(L_fallthrough); 5104 5105 bind(L_failure); 5106 incq(result); // 0 => 1 5107 5108 bind(L_success); 5109 // result = 0; 5110 5111 bind(L_fallthrough); 5112 BLOCK_COMMENT("} lookup_secondary_supers_table"); 5113 5114 if (VerifySecondarySupers) { 5115 verify_secondary_supers_table(r_sub_klass, r_super_klass, result, 5116 temp1, temp2, temp3); 5117 } 5118 } 5119 5120 // At runtime, return 0 in result if r_super_klass is a superclass of 5121 // r_sub_klass, otherwise return nonzero. Use this version of 5122 // lookup_secondary_supers_table() if you don't know ahead of time 5123 // which superclass will be searched for. Used by interpreter and 5124 // runtime stubs. It is larger and has somewhat greater latency than 5125 // the version above, which takes a constant super_klass_slot. 5126 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass, 5127 Register r_super_klass, 5128 Register temp1, 5129 Register temp2, 5130 Register temp3, 5131 Register temp4, 5132 Register result) { 5133 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result); 5134 assert_different_registers(r_sub_klass, r_super_klass, rcx); 5135 RegSet temps = RegSet::of(temp1, temp2, temp3, temp4); 5136 5137 Label L_fallthrough, L_success, L_failure; 5138 5139 BLOCK_COMMENT("lookup_secondary_supers_table {"); 5140 5141 RegSetIterator<Register> available_regs = (temps - rcx).begin(); 5142 5143 // FIXME. Once we are sure that all paths reaching this point really 5144 // do pass rcx as one of our temps we can get rid of the following 5145 // workaround. 5146 assert(temps.contains(rcx), "fix this code"); 5147 5148 // We prefer to have our shift count in rcx. If rcx is one of our 5149 // temps, use it for slot. If not, pick any of our temps. 5150 Register slot; 5151 if (!temps.contains(rcx)) { 5152 slot = *available_regs++; 5153 } else { 5154 slot = rcx; 5155 } 5156 5157 const Register r_array_index = *available_regs++; 5158 const Register r_bitmap = *available_regs++; 5159 5160 // The logic above guarantees this property, but we state it here. 5161 assert_different_registers(r_array_index, r_bitmap, rcx); 5162 5163 movq(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset())); 5164 movq(r_array_index, r_bitmap); 5165 5166 // First check the bitmap to see if super_klass might be present. If 5167 // the bit is zero, we are certain that super_klass is not one of 5168 // the secondary supers. 
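  // Illustrative sketch: the instructions below shift the bit selected by the
  // super class's hash slot into the sign position and test it, which is
  // semantically equivalent to
  //   if ((bitmap & (1ULL << hash_slot)) == 0) goto L_failure;
  // where hash_slot is the byte loaded from Klass::hash_slot_offset().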
5169 movb(slot, Address(r_super_klass, Klass::hash_slot_offset())); 5170 xorl(slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1)); // slot ^ 63 === 63 - slot (mod 64) 5171 salq(r_array_index, slot); 5172 5173 testq(r_array_index, r_array_index); 5174 // We test the MSB of r_array_index, i.e. its sign bit 5175 jcc(Assembler::positive, L_failure); 5176 5177 const Register r_array_base = *available_regs++; 5178 5179 // Get the first array index that can contain super_klass into r_array_index. 5180 // Note: Clobbers r_array_base and slot. 5181 population_count(r_array_index, r_array_index, /*temp2*/r_array_base, /*temp3*/slot); 5182 5183 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word. 5184 5185 // We will consult the secondary-super array. 5186 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset()))); 5187 5188 // We're asserting that the first word in an Array<Klass*> is the 5189 // length, and the second word is the first word of the data. If 5190 // that ever changes, r_array_base will have to be adjusted here. 5191 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code"); 5192 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code"); 5193 5194 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8)); 5195 jccb(Assembler::equal, L_success); 5196 5197 // Restore slot to its true value 5198 movb(slot, Address(r_super_klass, Klass::hash_slot_offset())); 5199 5200 // Linear probe. Rotate the bitmap so that the next bit to test is 5201 // in Bit 1. 5202 rorq(r_bitmap, slot); 5203 5204 // Is there another entry to check? Consult the bitmap. 5205 btq(r_bitmap, 1); 5206 jccb(Assembler::carryClear, L_failure); 5207 5208 // Calls into the stub generated by lookup_secondary_supers_table_slow_path. 5209 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap. 5210 // Kills: r_array_length. 5211 // Returns: result. 5212 lookup_secondary_supers_table_slow_path(r_super_klass, 5213 r_array_base, 5214 r_array_index, 5215 r_bitmap, 5216 /*temp1*/result, 5217 /*temp2*/slot, 5218 &L_success, 5219 nullptr); 5220 5221 bind(L_failure); 5222 movq(result, 1); 5223 jmpb(L_fallthrough); 5224 5225 bind(L_success); 5226 xorq(result, result); // = 0 5227 5228 bind(L_fallthrough); 5229 BLOCK_COMMENT("} lookup_secondary_supers_table"); 5230 5231 if (VerifySecondarySupers) { 5232 verify_secondary_supers_table(r_sub_klass, r_super_klass, result, 5233 temp1, temp2, temp3); 5234 } 5235 } 5236 5237 void MacroAssembler::repne_scanq(Register addr, Register value, Register count, Register limit, 5238 Label* L_success, Label* L_failure) { 5239 Label L_loop, L_fallthrough; 5240 { 5241 int label_nulls = 0; 5242 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 5243 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 5244 assert(label_nulls <= 1, "at most one null in the batch"); 5245 } 5246 bind(L_loop); 5247 cmpq(value, Address(addr, count, Address::times_8)); 5248 jcc(Assembler::equal, *L_success); 5249 addl(count, 1); 5250 cmpl(count, limit); 5251 jcc(Assembler::less, L_loop); 5252 5253 if (&L_fallthrough != L_failure) { 5254 jmp(*L_failure); 5255 } 5256 bind(L_fallthrough); 5257 } 5258 5259 // Called by code generated by check_klass_subtype_slow_path 5260 // above. This is called when there is a collision in the hashed 5261 // lookup in the secondary supers array. 
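//
// Illustrative outline of the probe: starting at the slot after the hashed
// one, compare secondary_supers[index] with r_super_klass, wrapping index at
// the array length; stop with a miss as soon as the corresponding bitmap bit
// is zero, or with a hit when an equal entry is found.  Arrays with more than
// SECONDARY_SUPERS_TABLE_SIZE - 2 entries fall back to a plain linear scan
// (L_huge).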
5262 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass, 5263 Register r_array_base, 5264 Register r_array_index, 5265 Register r_bitmap, 5266 Register temp1, 5267 Register temp2, 5268 Label* L_success, 5269 Label* L_failure) { 5270 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, temp2); 5271 5272 const Register 5273 r_array_length = temp1, 5274 r_sub_klass = noreg, 5275 result = noreg; 5276 5277 Label L_fallthrough; 5278 int label_nulls = 0; 5279 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 5280 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 5281 assert(label_nulls <= 1, "at most one null in the batch"); 5282 5283 // Load the array length. 5284 movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes())); 5285 // And adjust the array base to point to the data. 5286 // NB! Effectively increments current slot index by 1. 5287 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, ""); 5288 addptr(r_array_base, Array<Klass*>::base_offset_in_bytes()); 5289 5290 // Linear probe 5291 Label L_huge; 5292 5293 // The bitmap is full to bursting. 5294 // Implicit invariant: BITMAP_FULL implies (length > 0) 5295 cmpl(r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2); 5296 jcc(Assembler::greater, L_huge); 5297 5298 // NB! Our caller has checked bits 0 and 1 in the bitmap. The 5299 // current slot (at secondary_supers[r_array_index]) has not yet 5300 // been inspected, and r_array_index may be out of bounds if we 5301 // wrapped around the end of the array. 5302 5303 { // This is conventional linear probing, but instead of terminating 5304 // when a null entry is found in the table, we maintain a bitmap 5305 // in which a 0 indicates missing entries. 5306 // The check above guarantees there are 0s in the bitmap, so the loop 5307 // eventually terminates. 5308 5309 xorl(temp2, temp2); // = 0; 5310 5311 Label L_again; 5312 bind(L_again); 5313 5314 // Check for array wraparound. 5315 cmpl(r_array_index, r_array_length); 5316 cmovl(Assembler::greaterEqual, r_array_index, temp2); 5317 5318 cmpq(r_super_klass, Address(r_array_base, r_array_index, Address::times_8)); 5319 jcc(Assembler::equal, *L_success); 5320 5321 // If the next bit in bitmap is zero, we're done. 5322 btq(r_bitmap, 2); // look-ahead check (Bit 2); Bits 0 and 1 are tested by now 5323 jcc(Assembler::carryClear, *L_failure); 5324 5325 rorq(r_bitmap, 1); // Bits 1/2 => 0/1 5326 addl(r_array_index, 1); 5327 5328 jmp(L_again); 5329 } 5330 5331 { // Degenerate case: more than 64 secondary supers. 5332 // FIXME: We could do something smarter here, maybe a vectorized 5333 // comparison or a binary search, but is that worth any added 5334 // complexity? 5335 bind(L_huge); 5336 xorl(r_array_index, r_array_index); // = 0 5337 repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, 5338 L_success, 5339 (&L_fallthrough != L_failure ? L_failure : nullptr)); 5340 5341 bind(L_fallthrough); 5342 } 5343 } 5344 5345 struct VerifyHelperArguments { 5346 Klass* _super; 5347 Klass* _sub; 5348 intptr_t _linear_result; 5349 intptr_t _table_result; 5350 }; 5351 5352 static void verify_secondary_supers_table_helper(const char* msg, VerifyHelperArguments* args) { 5353 Klass::on_secondary_supers_verification_failure(args->_super, 5354 args->_sub, 5355 args->_linear_result, 5356 args->_table_result, 5357 msg); 5358 } 5359 5360 // Make sure that the hashed lookup and a linear scan agree. 
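// The linear scan is done with repne_scanq over the secondary-supers array;
// if its hit/miss result disagrees with the hashed result we build a small
// VerifyHelperArguments record on the stack and call
// verify_secondary_supers_table_helper(), which is not expected to return
// (note the should_not_reach_here() after the call).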
5361 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass, 5362 Register r_super_klass, 5363 Register result, 5364 Register temp1, 5365 Register temp2, 5366 Register temp3) { 5367 const Register 5368 r_array_index = temp1, 5369 r_array_length = temp2, 5370 r_array_base = temp3, 5371 r_bitmap = noreg; 5372 5373 BLOCK_COMMENT("verify_secondary_supers_table {"); 5374 5375 Label L_success, L_failure, L_check, L_done; 5376 5377 movptr(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset()))); 5378 movl(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes())); 5379 // And adjust the array base to point to the data. 5380 addptr(r_array_base, Array<Klass*>::base_offset_in_bytes()); 5381 5382 testl(r_array_length, r_array_length); // array_length == 0? 5383 jcc(Assembler::zero, L_failure); 5384 5385 movl(r_array_index, 0); 5386 repne_scanq(r_array_base, r_super_klass, r_array_index, r_array_length, &L_success); 5387 // fall through to L_failure 5388 5389 const Register linear_result = r_array_index; // reuse temp1 5390 5391 bind(L_failure); // not present 5392 movl(linear_result, 1); 5393 jmp(L_check); 5394 5395 bind(L_success); // present 5396 movl(linear_result, 0); 5397 5398 bind(L_check); 5399 cmpl(linear_result, result); 5400 jcc(Assembler::equal, L_done); 5401 5402 { // To avoid calling convention issues, build a record on the stack 5403 // and pass the pointer to that instead. 5404 push(result); 5405 push(linear_result); 5406 push(r_sub_klass); 5407 push(r_super_klass); 5408 movptr(c_rarg1, rsp); 5409 movptr(c_rarg0, (uintptr_t) "mismatch"); 5410 call(RuntimeAddress(CAST_FROM_FN_PTR(address, verify_secondary_supers_table_helper))); 5411 should_not_reach_here(); 5412 } 5413 bind(L_done); 5414 5415 BLOCK_COMMENT("} verify_secondary_supers_table"); 5416 } 5417 5418 #undef LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS 5419 5420 #endif // LP64 5421 5422 void MacroAssembler::clinit_barrier(Register klass, Label* L_fast_path, Label* L_slow_path) { 5423 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 5424 5425 Label L_fallthrough; 5426 if (L_fast_path == nullptr) { 5427 L_fast_path = &L_fallthrough; 5428 } else if (L_slow_path == nullptr) { 5429 L_slow_path = &L_fallthrough; 5430 } 5431 5432 // Fast path check: class is fully initialized. 5433 // init_state needs acquire, but x86 is TSO, and so we are already good. 
5434 cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized); 5435 jcc(Assembler::equal, *L_fast_path); 5436 5437 // Fast path check: current thread is initializer thread 5438 cmpptr(r15_thread, Address(klass, InstanceKlass::init_thread_offset())); 5439 if (L_slow_path == &L_fallthrough) { 5440 jcc(Assembler::equal, *L_fast_path); 5441 bind(*L_slow_path); 5442 } else if (L_fast_path == &L_fallthrough) { 5443 jcc(Assembler::notEqual, *L_slow_path); 5444 bind(*L_fast_path); 5445 } else { 5446 Unimplemented(); 5447 } 5448 } 5449 5450 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) { 5451 if (VM_Version::supports_cmov()) { 5452 cmovl(cc, dst, src); 5453 } else { 5454 Label L; 5455 jccb(negate_condition(cc), L); 5456 movl(dst, src); 5457 bind(L); 5458 } 5459 } 5460 5461 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) { 5462 if (VM_Version::supports_cmov()) { 5463 cmovl(cc, dst, src); 5464 } else { 5465 Label L; 5466 jccb(negate_condition(cc), L); 5467 movl(dst, src); 5468 bind(L); 5469 } 5470 } 5471 5472 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) { 5473 if (!VerifyOops) return; 5474 5475 BLOCK_COMMENT("verify_oop {"); 5476 #ifdef _LP64 5477 push(rscratch1); 5478 #endif 5479 push(rax); // save rax 5480 push(reg); // pass register argument 5481 5482 // Pass register number to verify_oop_subroutine 5483 const char* b = nullptr; 5484 { 5485 ResourceMark rm; 5486 stringStream ss; 5487 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line); 5488 b = code_string(ss.as_string()); 5489 } 5490 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate()); 5491 pushptr(buffer.addr(), rscratch1); 5492 5493 // call indirectly to solve generation ordering problem 5494 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 5495 call(rax); 5496 // Caller pops the arguments (oop, message) and restores rax, r10 5497 BLOCK_COMMENT("} verify_oop"); 5498 } 5499 5500 void MacroAssembler::vallones(XMMRegister dst, int vector_len) { 5501 if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) { 5502 // Only pcmpeq has dependency breaking treatment (i.e the execution can begin without 5503 // waiting for the previous result on dst), not vpcmpeqd, so just use vpternlog 5504 vpternlogd(dst, 0xFF, dst, dst, vector_len); 5505 } else if (VM_Version::supports_avx()) { 5506 vpcmpeqd(dst, dst, dst, vector_len); 5507 } else { 5508 pcmpeqd(dst, dst); 5509 } 5510 } 5511 5512 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, 5513 int extra_slot_offset) { 5514 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
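  // In effect (illustrative): the returned address is
  //   rsp + wordSize /* return PC */ + expr_offset_in_bytes(extra_slot_offset)
  //       + arg_slot * Interpreter::stackElementSize
  // with the arg_slot term folded into the displacement when arg_slot is a
  // constant, and expressed as a scaled index register otherwise.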
5515 int stackElementSize = Interpreter::stackElementSize; 5516 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); 5517 #ifdef ASSERT 5518 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); 5519 assert(offset1 - offset == stackElementSize, "correct arithmetic"); 5520 #endif 5521 Register scale_reg = noreg; 5522 Address::ScaleFactor scale_factor = Address::no_scale; 5523 if (arg_slot.is_constant()) { 5524 offset += arg_slot.as_constant() * stackElementSize; 5525 } else { 5526 scale_reg = arg_slot.as_register(); 5527 scale_factor = Address::times(stackElementSize); 5528 } 5529 offset += wordSize; // return PC is on stack 5530 return Address(rsp, scale_reg, scale_factor, offset); 5531 } 5532 5533 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) { 5534 if (!VerifyOops) return; 5535 5536 #ifdef _LP64 5537 push(rscratch1); 5538 #endif 5539 push(rax); // save rax, 5540 // addr may contain rsp so we will have to adjust it based on the push 5541 // we just did (and on 64 bit we do two pushes) 5542 // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which 5543 // stores rax into addr which is backwards of what was intended. 5544 if (addr.uses(rsp)) { 5545 lea(rax, addr); 5546 pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord)); 5547 } else { 5548 pushptr(addr); 5549 } 5550 5551 // Pass register number to verify_oop_subroutine 5552 const char* b = nullptr; 5553 { 5554 ResourceMark rm; 5555 stringStream ss; 5556 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line); 5557 b = code_string(ss.as_string()); 5558 } 5559 AddressLiteral buffer((address) b, external_word_Relocation::spec_for_immediate()); 5560 pushptr(buffer.addr(), rscratch1); 5561 5562 // call indirectly to solve generation ordering problem 5563 movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); 5564 call(rax); 5565 // Caller pops the arguments (addr, message) and restores rax, r10. 
5566 } 5567 5568 void MacroAssembler::verify_tlab() { 5569 #ifdef ASSERT 5570 if (UseTLAB && VerifyOops) { 5571 Label next, ok; 5572 Register t1 = rsi; 5573 Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread); 5574 5575 push(t1); 5576 NOT_LP64(push(thread_reg)); 5577 NOT_LP64(get_thread(thread_reg)); 5578 5579 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); 5580 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset()))); 5581 jcc(Assembler::aboveEqual, next); 5582 STOP("assert(top >= start)"); 5583 should_not_reach_here(); 5584 5585 bind(next); 5586 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset()))); 5587 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); 5588 jcc(Assembler::aboveEqual, ok); 5589 STOP("assert(top <= end)"); 5590 should_not_reach_here(); 5591 5592 bind(ok); 5593 NOT_LP64(pop(thread_reg)); 5594 pop(t1); 5595 } 5596 #endif 5597 } 5598 5599 class ControlWord { 5600 public: 5601 int32_t _value; 5602 5603 int rounding_control() const { return (_value >> 10) & 3 ; } 5604 int precision_control() const { return (_value >> 8) & 3 ; } 5605 bool precision() const { return ((_value >> 5) & 1) != 0; } 5606 bool underflow() const { return ((_value >> 4) & 1) != 0; } 5607 bool overflow() const { return ((_value >> 3) & 1) != 0; } 5608 bool zero_divide() const { return ((_value >> 2) & 1) != 0; } 5609 bool denormalized() const { return ((_value >> 1) & 1) != 0; } 5610 bool invalid() const { return ((_value >> 0) & 1) != 0; } 5611 5612 void print() const { 5613 // rounding control 5614 const char* rc; 5615 switch (rounding_control()) { 5616 case 0: rc = "round near"; break; 5617 case 1: rc = "round down"; break; 5618 case 2: rc = "round up "; break; 5619 case 3: rc = "chop "; break; 5620 default: 5621 rc = nullptr; // silence compiler warnings 5622 fatal("Unknown rounding control: %d", rounding_control()); 5623 }; 5624 // precision control 5625 const char* pc; 5626 switch (precision_control()) { 5627 case 0: pc = "24 bits "; break; 5628 case 1: pc = "reserved"; break; 5629 case 2: pc = "53 bits "; break; 5630 case 3: pc = "64 bits "; break; 5631 default: 5632 pc = nullptr; // silence compiler warnings 5633 fatal("Unknown precision control: %d", precision_control()); 5634 }; 5635 // flags 5636 char f[9]; 5637 f[0] = ' '; 5638 f[1] = ' '; 5639 f[2] = (precision ()) ? 'P' : 'p'; 5640 f[3] = (underflow ()) ? 'U' : 'u'; 5641 f[4] = (overflow ()) ? 'O' : 'o'; 5642 f[5] = (zero_divide ()) ? 'Z' : 'z'; 5643 f[6] = (denormalized()) ? 'D' : 'd'; 5644 f[7] = (invalid ()) ? 
'I' : 'i'; 5645 f[8] = '\x0'; 5646 // output 5647 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc); 5648 } 5649 5650 }; 5651 5652 class StatusWord { 5653 public: 5654 int32_t _value; 5655 5656 bool busy() const { return ((_value >> 15) & 1) != 0; } 5657 bool C3() const { return ((_value >> 14) & 1) != 0; } 5658 bool C2() const { return ((_value >> 10) & 1) != 0; } 5659 bool C1() const { return ((_value >> 9) & 1) != 0; } 5660 bool C0() const { return ((_value >> 8) & 1) != 0; } 5661 int top() const { return (_value >> 11) & 7 ; } 5662 bool error_status() const { return ((_value >> 7) & 1) != 0; } 5663 bool stack_fault() const { return ((_value >> 6) & 1) != 0; } 5664 bool precision() const { return ((_value >> 5) & 1) != 0; } 5665 bool underflow() const { return ((_value >> 4) & 1) != 0; } 5666 bool overflow() const { return ((_value >> 3) & 1) != 0; } 5667 bool zero_divide() const { return ((_value >> 2) & 1) != 0; } 5668 bool denormalized() const { return ((_value >> 1) & 1) != 0; } 5669 bool invalid() const { return ((_value >> 0) & 1) != 0; } 5670 5671 void print() const { 5672 // condition codes 5673 char c[5]; 5674 c[0] = (C3()) ? '3' : '-'; 5675 c[1] = (C2()) ? '2' : '-'; 5676 c[2] = (C1()) ? '1' : '-'; 5677 c[3] = (C0()) ? '0' : '-'; 5678 c[4] = '\x0'; 5679 // flags 5680 char f[9]; 5681 f[0] = (error_status()) ? 'E' : '-'; 5682 f[1] = (stack_fault ()) ? 'S' : '-'; 5683 f[2] = (precision ()) ? 'P' : '-'; 5684 f[3] = (underflow ()) ? 'U' : '-'; 5685 f[4] = (overflow ()) ? 'O' : '-'; 5686 f[5] = (zero_divide ()) ? 'Z' : '-'; 5687 f[6] = (denormalized()) ? 'D' : '-'; 5688 f[7] = (invalid ()) ? 'I' : '-'; 5689 f[8] = '\x0'; 5690 // output 5691 printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top()); 5692 } 5693 5694 }; 5695 5696 class TagWord { 5697 public: 5698 int32_t _value; 5699 5700 int tag_at(int i) const { return (_value >> (i*2)) & 3; } 5701 5702 void print() const { 5703 printf("%04x", _value & 0xFFFF); 5704 } 5705 5706 }; 5707 5708 class FPU_Register { 5709 public: 5710 int32_t _m0; 5711 int32_t _m1; 5712 int16_t _ex; 5713 5714 bool is_indefinite() const { 5715 return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0; 5716 } 5717 5718 void print() const { 5719 char sign = (_ex < 0) ? '-' : '+'; 5720 const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : " "; 5721 printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind); 5722 }; 5723 5724 }; 5725 5726 class FPU_State { 5727 public: 5728 enum { 5729 register_size = 10, 5730 number_of_registers = 8, 5731 register_mask = 7 5732 }; 5733 5734 ControlWord _control_word; 5735 StatusWord _status_word; 5736 TagWord _tag_word; 5737 int32_t _error_offset; 5738 int32_t _error_selector; 5739 int32_t _data_offset; 5740 int32_t _data_selector; 5741 int8_t _register[register_size * number_of_registers]; 5742 5743 int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); } 5744 FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; } 5745 5746 const char* tag_as_string(int tag) const { 5747 switch (tag) { 5748 case 0: return "valid"; 5749 case 1: return "zero"; 5750 case 2: return "special"; 5751 case 3: return "empty"; 5752 } 5753 ShouldNotReachHere(); 5754 return nullptr; 5755 } 5756 5757 void print() const { 5758 // print computation registers 5759 { int t = _status_word.top(); 5760 for (int i = 0; i < number_of_registers; i++) { 5761 int j = (i - t) & register_mask; 5762 printf("%c r%d = ST%d = ", (j == 0 ? 
'*' : ' '), i, j); 5763 st(j)->print(); 5764 printf(" %s\n", tag_as_string(_tag_word.tag_at(i))); 5765 } 5766 } 5767 printf("\n"); 5768 // print control registers 5769 printf("ctrl = "); _control_word.print(); printf("\n"); 5770 printf("stat = "); _status_word .print(); printf("\n"); 5771 printf("tags = "); _tag_word .print(); printf("\n"); 5772 } 5773 5774 }; 5775 5776 class Flag_Register { 5777 public: 5778 int32_t _value; 5779 5780 bool overflow() const { return ((_value >> 11) & 1) != 0; } 5781 bool direction() const { return ((_value >> 10) & 1) != 0; } 5782 bool sign() const { return ((_value >> 7) & 1) != 0; } 5783 bool zero() const { return ((_value >> 6) & 1) != 0; } 5784 bool auxiliary_carry() const { return ((_value >> 4) & 1) != 0; } 5785 bool parity() const { return ((_value >> 2) & 1) != 0; } 5786 bool carry() const { return ((_value >> 0) & 1) != 0; } 5787 5788 void print() const { 5789 // flags 5790 char f[8]; 5791 f[0] = (overflow ()) ? 'O' : '-'; 5792 f[1] = (direction ()) ? 'D' : '-'; 5793 f[2] = (sign ()) ? 'S' : '-'; 5794 f[3] = (zero ()) ? 'Z' : '-'; 5795 f[4] = (auxiliary_carry()) ? 'A' : '-'; 5796 f[5] = (parity ()) ? 'P' : '-'; 5797 f[6] = (carry ()) ? 'C' : '-'; 5798 f[7] = '\x0'; 5799 // output 5800 printf("%08x flags = %s", _value, f); 5801 } 5802 5803 }; 5804 5805 class IU_Register { 5806 public: 5807 int32_t _value; 5808 5809 void print() const { 5810 printf("%08x %11d", _value, _value); 5811 } 5812 5813 }; 5814 5815 class IU_State { 5816 public: 5817 Flag_Register _eflags; 5818 IU_Register _rdi; 5819 IU_Register _rsi; 5820 IU_Register _rbp; 5821 IU_Register _rsp; 5822 IU_Register _rbx; 5823 IU_Register _rdx; 5824 IU_Register _rcx; 5825 IU_Register _rax; 5826 5827 void print() const { 5828 // computation registers 5829 printf("rax, = "); _rax.print(); printf("\n"); 5830 printf("rbx, = "); _rbx.print(); printf("\n"); 5831 printf("rcx = "); _rcx.print(); printf("\n"); 5832 printf("rdx = "); _rdx.print(); printf("\n"); 5833 printf("rdi = "); _rdi.print(); printf("\n"); 5834 printf("rsi = "); _rsi.print(); printf("\n"); 5835 printf("rbp, = "); _rbp.print(); printf("\n"); 5836 printf("rsp = "); _rsp.print(); printf("\n"); 5837 printf("\n"); 5838 // control registers 5839 printf("flgs = "); _eflags.print(); printf("\n"); 5840 } 5841 }; 5842 5843 5844 class CPU_State { 5845 public: 5846 FPU_State _fpu_state; 5847 IU_State _iu_state; 5848 5849 void print() const { 5850 printf("--------------------------------------------------\n"); 5851 _iu_state .print(); 5852 printf("\n"); 5853 _fpu_state.print(); 5854 printf("--------------------------------------------------\n"); 5855 } 5856 5857 }; 5858 5859 5860 static void _print_CPU_state(CPU_State* state) { 5861 state->print(); 5862 }; 5863 5864 5865 void MacroAssembler::print_CPU_state() { 5866 push_CPU_state(); 5867 push(rsp); // pass CPU state 5868 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state))); 5869 addptr(rsp, wordSize); // discard argument 5870 pop_CPU_state(); 5871 } 5872 5873 5874 #ifndef _LP64 5875 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) { 5876 static int counter = 0; 5877 FPU_State* fs = &state->_fpu_state; 5878 counter++; 5879 // For leaf calls, only verify that the top few elements remain empty. 5880 // We only need 1 empty at the top for C2 code. 
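  // Convention (illustrative note): a negative stack_depth means "at most
  // -stack_depth elements may be live"; in that case only ST7 is checked to
  // be empty here.  A non-negative stack_depth must be matched exactly.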
5881 if( stack_depth < 0 ) { 5882 if( fs->tag_for_st(7) != 3 ) { 5883 printf("FPR7 not empty\n"); 5884 state->print(); 5885 assert(false, "error"); 5886 return false; 5887 } 5888 return true; // All other stack states do not matter 5889 } 5890 5891 assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(), 5892 "bad FPU control word"); 5893 5894 // compute stack depth 5895 int i = 0; 5896 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++; 5897 int d = i; 5898 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++; 5899 // verify findings 5900 if (i != FPU_State::number_of_registers) { 5901 // stack not contiguous 5902 printf("%s: stack not contiguous at ST%d\n", s, i); 5903 state->print(); 5904 assert(false, "error"); 5905 return false; 5906 } 5907 // check if computed stack depth corresponds to expected stack depth 5908 if (stack_depth < 0) { 5909 // expected stack depth is -stack_depth or less 5910 if (d > -stack_depth) { 5911 // too many elements on the stack 5912 printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d); 5913 state->print(); 5914 assert(false, "error"); 5915 return false; 5916 } 5917 } else { 5918 // expected stack depth is stack_depth 5919 if (d != stack_depth) { 5920 // wrong stack depth 5921 printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d); 5922 state->print(); 5923 assert(false, "error"); 5924 return false; 5925 } 5926 } 5927 // everything is cool 5928 return true; 5929 } 5930 5931 void MacroAssembler::verify_FPU(int stack_depth, const char* s) { 5932 if (!VerifyFPU) return; 5933 push_CPU_state(); 5934 push(rsp); // pass CPU state 5935 ExternalAddress msg((address) s); 5936 // pass message string s 5937 pushptr(msg.addr(), noreg); 5938 push(stack_depth); // pass stack depth 5939 call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU))); 5940 addptr(rsp, 3 * wordSize); // discard arguments 5941 // check for error 5942 { Label L; 5943 testl(rax, rax); 5944 jcc(Assembler::notZero, L); 5945 int3(); // break if error condition 5946 bind(L); 5947 } 5948 pop_CPU_state(); 5949 } 5950 #endif // _LP64 5951 5952 void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) { 5953 // Either restore the MXCSR register after returning from the JNI Call 5954 // or verify that it wasn't changed (with -Xcheck:jni flag). 5955 if (VM_Version::supports_sse()) { 5956 if (RestoreMXCSROnJNICalls) { 5957 ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch); 5958 } else if (CheckJNICalls) { 5959 call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry())); 5960 } 5961 } 5962 // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty. 5963 vzeroupper(); 5964 5965 #ifndef _LP64 5966 // Either restore the x87 floating pointer control word after returning 5967 // from the JNI call or verify that it wasn't changed. 5968 if (CheckJNICalls) { 5969 call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry())); 5970 } 5971 #endif // _LP64 5972 } 5973 5974 // ((OopHandle)result).resolve(); 5975 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 5976 assert_different_registers(result, tmp); 5977 5978 // Only 64 bit platforms support GCs that require a tmp register 5979 // Only IN_HEAP loads require a thread_tmp register 5980 // OopHandle::resolve is an indirection like jobject. 
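  // In effect: result = *(oop*)result, loaded through the barrier-set
  // assembler so that any GC-required load barrier is applied.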
5981 access_load_at(T_OBJECT, IN_NATIVE, 5982 result, Address(result, 0), tmp, /*tmp_thread*/noreg); 5983 } 5984 5985 // ((WeakHandle)result).resolve(); 5986 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) { 5987 assert_different_registers(rresult, rtmp); 5988 Label resolved; 5989 5990 // A null weak handle resolves to null. 5991 cmpptr(rresult, 0); 5992 jcc(Assembler::equal, resolved); 5993 5994 // Only 64 bit platforms support GCs that require a tmp register 5995 // Only IN_HEAP loads require a thread_tmp register 5996 // WeakHandle::resolve is an indirection like jweak. 5997 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, 5998 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg); 5999 bind(resolved); 6000 } 6001 6002 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) { 6003 // get mirror 6004 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 6005 load_method_holder(mirror, method); 6006 movptr(mirror, Address(mirror, mirror_offset)); 6007 resolve_oop_handle(mirror, tmp); 6008 } 6009 6010 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) { 6011 load_method_holder(rresult, rmethod); 6012 movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset())); 6013 } 6014 6015 void MacroAssembler::load_method_holder(Register holder, Register method) { 6016 movptr(holder, Address(method, Method::const_offset())); // ConstMethod* 6017 movptr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool* 6018 movptr(holder, Address(holder, ConstantPool::pool_holder_offset())); // InstanceKlass* 6019 } 6020 6021 #ifdef _LP64 6022 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) { 6023 assert(UseCompactObjectHeaders, "expect compact object headers"); 6024 movq(dst, Address(src, oopDesc::mark_offset_in_bytes())); 6025 shrq(dst, markWord::klass_shift); 6026 } 6027 #endif 6028 6029 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) { 6030 assert_different_registers(src, tmp); 6031 assert_different_registers(dst, tmp); 6032 #ifdef _LP64 6033 if (UseCompactObjectHeaders) { 6034 load_narrow_klass_compact(dst, src); 6035 decode_klass_not_null(dst, tmp); 6036 } else if (UseCompressedClassPointers) { 6037 movl(dst, Address(src, oopDesc::klass_offset_in_bytes())); 6038 decode_klass_not_null(dst, tmp); 6039 } else 6040 #endif 6041 { 6042 movptr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 6043 } 6044 } 6045 6046 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) { 6047 assert(!UseCompactObjectHeaders, "not with compact headers"); 6048 assert_different_registers(src, tmp); 6049 assert_different_registers(dst, tmp); 6050 #ifdef _LP64 6051 if (UseCompressedClassPointers) { 6052 encode_klass_not_null(src, tmp); 6053 movl(Address(dst, oopDesc::klass_offset_in_bytes()), src); 6054 } else 6055 #endif 6056 movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src); 6057 } 6058 6059 void MacroAssembler::cmp_klass(Register klass, Register obj, Register tmp) { 6060 #ifdef _LP64 6061 if (UseCompactObjectHeaders) { 6062 assert(tmp != noreg, "need tmp"); 6063 assert_different_registers(klass, obj, tmp); 6064 load_narrow_klass_compact(tmp, obj); 6065 cmpl(klass, tmp); 6066 } else if (UseCompressedClassPointers) { 6067 cmpl(klass, Address(obj, oopDesc::klass_offset_in_bytes())); 6068 } else 6069 #endif 6070 { 6071 cmpptr(klass, Address(obj, oopDesc::klass_offset_in_bytes())); 6072 } 6073 } 6074 6075 void 
MacroAssembler::cmp_klasses_from_objects(Register obj1, Register obj2, Register tmp1, Register tmp2) { 6076 #ifdef _LP64 6077 if (UseCompactObjectHeaders) { 6078 assert(tmp2 != noreg, "need tmp2"); 6079 assert_different_registers(obj1, obj2, tmp1, tmp2); 6080 load_narrow_klass_compact(tmp1, obj1); 6081 load_narrow_klass_compact(tmp2, obj2); 6082 cmpl(tmp1, tmp2); 6083 } else if (UseCompressedClassPointers) { 6084 movl(tmp1, Address(obj1, oopDesc::klass_offset_in_bytes())); 6085 cmpl(tmp1, Address(obj2, oopDesc::klass_offset_in_bytes())); 6086 } else 6087 #endif 6088 { 6089 movptr(tmp1, Address(obj1, oopDesc::klass_offset_in_bytes())); 6090 cmpptr(tmp1, Address(obj2, oopDesc::klass_offset_in_bytes())); 6091 } 6092 } 6093 6094 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src, 6095 Register tmp1, Register thread_tmp) { 6096 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 6097 decorators = AccessInternal::decorator_fixup(decorators, type); 6098 bool as_raw = (decorators & AS_RAW) != 0; 6099 if (as_raw) { 6100 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 6101 } else { 6102 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp); 6103 } 6104 } 6105 6106 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register val, 6107 Register tmp1, Register tmp2, Register tmp3) { 6108 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 6109 decorators = AccessInternal::decorator_fixup(decorators, type); 6110 bool as_raw = (decorators & AS_RAW) != 0; 6111 if (as_raw) { 6112 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3); 6113 } else { 6114 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3); 6115 } 6116 } 6117 6118 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 6119 Register thread_tmp, DecoratorSet decorators) { 6120 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 6121 } 6122 6123 // Doesn't do verification, generates fixed size code 6124 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 6125 Register thread_tmp, DecoratorSet decorators) { 6126 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 6127 } 6128 6129 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1, 6130 Register tmp2, Register tmp3, DecoratorSet decorators) { 6131 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3); 6132 } 6133 6134 // Used for storing nulls. 
6135 void MacroAssembler::store_heap_oop_null(Address dst) { 6136 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg); 6137 } 6138 6139 #ifdef _LP64 6140 void MacroAssembler::store_klass_gap(Register dst, Register src) { 6141 assert(!UseCompactObjectHeaders, "Don't use with compact headers"); 6142 if (UseCompressedClassPointers) { 6143 // Store to klass gap in destination 6144 movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src); 6145 } 6146 } 6147 6148 #ifdef ASSERT 6149 void MacroAssembler::verify_heapbase(const char* msg) { 6150 assert (UseCompressedOops, "should be compressed"); 6151 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6152 if (CheckCompressedOops) { 6153 Label ok; 6154 ExternalAddress src2(CompressedOops::base_addr()); 6155 const bool is_src2_reachable = reachable(src2); 6156 if (!is_src2_reachable) { 6157 push(rscratch1); // cmpptr trashes rscratch1 6158 } 6159 cmpptr(r12_heapbase, src2, rscratch1); 6160 jcc(Assembler::equal, ok); 6161 STOP(msg); 6162 bind(ok); 6163 if (!is_src2_reachable) { 6164 pop(rscratch1); 6165 } 6166 } 6167 } 6168 #endif 6169 6170 // Algorithm must match oop.inline.hpp encode_heap_oop. 6171 void MacroAssembler::encode_heap_oop(Register r) { 6172 #ifdef ASSERT 6173 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); 6174 #endif 6175 verify_oop_msg(r, "broken oop in encode_heap_oop"); 6176 if (CompressedOops::base() == nullptr) { 6177 if (CompressedOops::shift() != 0) { 6178 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6179 shrq(r, LogMinObjAlignmentInBytes); 6180 } 6181 return; 6182 } 6183 testq(r, r); 6184 cmovq(Assembler::equal, r, r12_heapbase); 6185 subq(r, r12_heapbase); 6186 shrq(r, LogMinObjAlignmentInBytes); 6187 } 6188 6189 void MacroAssembler::encode_heap_oop_not_null(Register r) { 6190 #ifdef ASSERT 6191 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 6192 if (CheckCompressedOops) { 6193 Label ok; 6194 testq(r, r); 6195 jcc(Assembler::notEqual, ok); 6196 STOP("null oop passed to encode_heap_oop_not_null"); 6197 bind(ok); 6198 } 6199 #endif 6200 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null"); 6201 if (CompressedOops::base() != nullptr) { 6202 subq(r, r12_heapbase); 6203 } 6204 if (CompressedOops::shift() != 0) { 6205 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6206 shrq(r, LogMinObjAlignmentInBytes); 6207 } 6208 } 6209 6210 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 6211 #ifdef ASSERT 6212 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 6213 if (CheckCompressedOops) { 6214 Label ok; 6215 testq(src, src); 6216 jcc(Assembler::notEqual, ok); 6217 STOP("null oop passed to encode_heap_oop_not_null2"); 6218 bind(ok); 6219 } 6220 #endif 6221 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2"); 6222 if (dst != src) { 6223 movq(dst, src); 6224 } 6225 if (CompressedOops::base() != nullptr) { 6226 subq(dst, r12_heapbase); 6227 } 6228 if (CompressedOops::shift() != 0) { 6229 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6230 shrq(dst, LogMinObjAlignmentInBytes); 6231 } 6232 } 6233 6234 void MacroAssembler::decode_heap_oop(Register r) { 6235 #ifdef ASSERT 6236 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 6237 #endif 6238 if (CompressedOops::base() == nullptr) { 6239 if (CompressedOops::shift() != 0) { 6240 assert 
(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6241 shlq(r, LogMinObjAlignmentInBytes); 6242 } 6243 } else { 6244 Label done; 6245 shlq(r, LogMinObjAlignmentInBytes); 6246 jccb(Assembler::equal, done); 6247 addq(r, r12_heapbase); 6248 bind(done); 6249 } 6250 verify_oop_msg(r, "broken oop in decode_heap_oop"); 6251 } 6252 6253 void MacroAssembler::decode_heap_oop_not_null(Register r) { 6254 // Note: it will change flags 6255 assert (UseCompressedOops, "should only be used for compressed headers"); 6256 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6257 // Cannot assert, unverified entry point counts instructions (see .ad file) 6258 // vtableStubs also counts instructions in pd_code_size_limit. 6259 // Also do not verify_oop as this is called by verify_oop. 6260 if (CompressedOops::shift() != 0) { 6261 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6262 shlq(r, LogMinObjAlignmentInBytes); 6263 if (CompressedOops::base() != nullptr) { 6264 addq(r, r12_heapbase); 6265 } 6266 } else { 6267 assert (CompressedOops::base() == nullptr, "sanity"); 6268 } 6269 } 6270 6271 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 6272 // Note: it will change flags 6273 assert (UseCompressedOops, "should only be used for compressed headers"); 6274 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6275 // Cannot assert, unverified entry point counts instructions (see .ad file) 6276 // vtableStubs also counts instructions in pd_code_size_limit. 6277 // Also do not verify_oop as this is called by verify_oop. 6278 if (CompressedOops::shift() != 0) { 6279 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 6280 if (LogMinObjAlignmentInBytes == Address::times_8) { 6281 leaq(dst, Address(r12_heapbase, src, Address::times_8, 0)); 6282 } else { 6283 if (dst != src) { 6284 movq(dst, src); 6285 } 6286 shlq(dst, LogMinObjAlignmentInBytes); 6287 if (CompressedOops::base() != nullptr) { 6288 addq(dst, r12_heapbase); 6289 } 6290 } 6291 } else { 6292 assert (CompressedOops::base() == nullptr, "sanity"); 6293 if (dst != src) { 6294 movq(dst, src); 6295 } 6296 } 6297 } 6298 6299 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) { 6300 assert_different_registers(r, tmp); 6301 if (CompressedKlassPointers::base() != nullptr) { 6302 mov64(tmp, (int64_t)CompressedKlassPointers::base()); 6303 subq(r, tmp); 6304 } 6305 if (CompressedKlassPointers::shift() != 0) { 6306 shrq(r, CompressedKlassPointers::shift()); 6307 } 6308 } 6309 6310 void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) { 6311 assert_different_registers(src, dst); 6312 if (CompressedKlassPointers::base() != nullptr) { 6313 mov64(dst, -(int64_t)CompressedKlassPointers::base()); 6314 addq(dst, src); 6315 } else { 6316 movptr(dst, src); 6317 } 6318 if (CompressedKlassPointers::shift() != 0) { 6319 shrq(dst, CompressedKlassPointers::shift()); 6320 } 6321 } 6322 6323 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) { 6324 assert_different_registers(r, tmp); 6325 // Note: it will change flags 6326 assert(UseCompressedClassPointers, "should only be used for compressed headers"); 6327 // Cannot assert, unverified entry point counts instructions (see .ad file) 6328 // vtableStubs also counts instructions in pd_code_size_limit. 6329 // Also do not verify_oop as this is called by verify_oop. 
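  // Decoding performed below (illustrative):
  //   klass = (narrow_klass << CompressedKlassPointers::shift()) + CompressedKlassPointers::base()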
6330 if (CompressedKlassPointers::shift() != 0) { 6331 shlq(r, CompressedKlassPointers::shift()); 6332 } 6333 if (CompressedKlassPointers::base() != nullptr) { 6334 mov64(tmp, (int64_t)CompressedKlassPointers::base()); 6335 addq(r, tmp); 6336 } 6337 } 6338 6339 void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) { 6340 assert_different_registers(src, dst); 6341 // Note: it will change flags 6342 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 6343 // Cannot assert, unverified entry point counts instructions (see .ad file) 6344 // vtableStubs also counts instructions in pd_code_size_limit. 6345 // Also do not verify_oop as this is called by verify_oop. 6346 6347 if (CompressedKlassPointers::base() == nullptr && 6348 CompressedKlassPointers::shift() == 0) { 6349 // The best case scenario is that there is no base or shift. Then it is already 6350 // a pointer that needs nothing but a register rename. 6351 movl(dst, src); 6352 } else { 6353 if (CompressedKlassPointers::shift() <= Address::times_8) { 6354 if (CompressedKlassPointers::base() != nullptr) { 6355 mov64(dst, (int64_t)CompressedKlassPointers::base()); 6356 } else { 6357 xorq(dst, dst); 6358 } 6359 if (CompressedKlassPointers::shift() != 0) { 6360 assert(CompressedKlassPointers::shift() == Address::times_8, "klass not aligned on 64bits?"); 6361 leaq(dst, Address(dst, src, Address::times_8, 0)); 6362 } else { 6363 addq(dst, src); 6364 } 6365 } else { 6366 if (CompressedKlassPointers::base() != nullptr) { 6367 const uint64_t base_right_shifted = 6368 (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift(); 6369 mov64(dst, base_right_shifted); 6370 } else { 6371 xorq(dst, dst); 6372 } 6373 addq(dst, src); 6374 shlq(dst, CompressedKlassPointers::shift()); 6375 } 6376 } 6377 } 6378 6379 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) { 6380 assert (UseCompressedOops, "should only be used for compressed headers"); 6381 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6382 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6383 int oop_index = oop_recorder()->find_index(obj); 6384 RelocationHolder rspec = oop_Relocation::spec(oop_index); 6385 mov_narrow_oop(dst, oop_index, rspec); 6386 } 6387 6388 void MacroAssembler::set_narrow_oop(Address dst, jobject obj) { 6389 assert (UseCompressedOops, "should only be used for compressed headers"); 6390 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6391 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6392 int oop_index = oop_recorder()->find_index(obj); 6393 RelocationHolder rspec = oop_Relocation::spec(oop_index); 6394 mov_narrow_oop(dst, oop_index, rspec); 6395 } 6396 6397 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) { 6398 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 6399 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6400 int klass_index = oop_recorder()->find_index(k); 6401 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 6402 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 6403 } 6404 6405 void MacroAssembler::set_narrow_klass(Address dst, Klass* k) { 6406 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 6407 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6408 int klass_index = oop_recorder()->find_index(k); 6409 
RelocationHolder rspec = metadata_Relocation::spec(klass_index); 6410 mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 6411 } 6412 6413 void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) { 6414 assert (UseCompressedOops, "should only be used for compressed headers"); 6415 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6416 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6417 int oop_index = oop_recorder()->find_index(obj); 6418 RelocationHolder rspec = oop_Relocation::spec(oop_index); 6419 Assembler::cmp_narrow_oop(dst, oop_index, rspec); 6420 } 6421 6422 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) { 6423 assert (UseCompressedOops, "should only be used for compressed headers"); 6424 assert (Universe::heap() != nullptr, "java heap should be initialized"); 6425 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6426 int oop_index = oop_recorder()->find_index(obj); 6427 RelocationHolder rspec = oop_Relocation::spec(oop_index); 6428 Assembler::cmp_narrow_oop(dst, oop_index, rspec); 6429 } 6430 6431 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) { 6432 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 6433 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6434 int klass_index = oop_recorder()->find_index(k); 6435 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 6436 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 6437 } 6438 6439 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) { 6440 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 6441 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 6442 int klass_index = oop_recorder()->find_index(k); 6443 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 6444 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 6445 } 6446 6447 void MacroAssembler::reinit_heapbase() { 6448 if (UseCompressedOops) { 6449 if (Universe::heap() != nullptr) { 6450 if (CompressedOops::base() == nullptr) { 6451 MacroAssembler::xorptr(r12_heapbase, r12_heapbase); 6452 } else { 6453 mov64(r12_heapbase, (int64_t)CompressedOops::base()); 6454 } 6455 } else { 6456 movptr(r12_heapbase, ExternalAddress(CompressedOops::base_addr())); 6457 } 6458 } 6459 } 6460 6461 #endif // _LP64 6462 6463 #if COMPILER2_OR_JVMCI 6464 6465 // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM/ZMM registers 6466 void MacroAssembler::xmm_clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, KRegister mask) { 6467 // cnt - number of qwords (8-byte words). 6468 // base - start address, qword aligned. 
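  // Strategy (illustrative sketch): zero xtmp once and store it in 64-byte
  // strides; the trailing < 64 bytes are written either with masked vector
  // stores (when 64-byte vectors are enabled) or with 32-byte and 8-byte
  // stores.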
6469 Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end; 6470 bool use64byteVector = (MaxVectorSize == 64) && (VM_Version::avx3_threshold() == 0); 6471 if (use64byteVector) { 6472 vpxor(xtmp, xtmp, xtmp, AVX_512bit); 6473 } else if (MaxVectorSize >= 32) { 6474 vpxor(xtmp, xtmp, xtmp, AVX_256bit); 6475 } else { 6476 pxor(xtmp, xtmp); 6477 } 6478 jmp(L_zero_64_bytes); 6479 6480 BIND(L_loop); 6481 if (MaxVectorSize >= 32) { 6482 fill64(base, 0, xtmp, use64byteVector); 6483 } else { 6484 movdqu(Address(base, 0), xtmp); 6485 movdqu(Address(base, 16), xtmp); 6486 movdqu(Address(base, 32), xtmp); 6487 movdqu(Address(base, 48), xtmp); 6488 } 6489 addptr(base, 64); 6490 6491 BIND(L_zero_64_bytes); 6492 subptr(cnt, 8); 6493 jccb(Assembler::greaterEqual, L_loop); 6494 6495 // Copy trailing 64 bytes 6496 if (use64byteVector) { 6497 addptr(cnt, 8); 6498 jccb(Assembler::equal, L_end); 6499 fill64_masked(3, base, 0, xtmp, mask, cnt, rtmp, true); 6500 jmp(L_end); 6501 } else { 6502 addptr(cnt, 4); 6503 jccb(Assembler::less, L_tail); 6504 if (MaxVectorSize >= 32) { 6505 vmovdqu(Address(base, 0), xtmp); 6506 } else { 6507 movdqu(Address(base, 0), xtmp); 6508 movdqu(Address(base, 16), xtmp); 6509 } 6510 } 6511 addptr(base, 32); 6512 subptr(cnt, 4); 6513 6514 BIND(L_tail); 6515 addptr(cnt, 4); 6516 jccb(Assembler::lessEqual, L_end); 6517 if (UseAVX > 2 && MaxVectorSize >= 32 && VM_Version::supports_avx512vl()) { 6518 fill32_masked(3, base, 0, xtmp, mask, cnt, rtmp); 6519 } else { 6520 decrement(cnt); 6521 6522 BIND(L_sloop); 6523 movq(Address(base, 0), xtmp); 6524 addptr(base, 8); 6525 decrement(cnt); 6526 jccb(Assembler::greaterEqual, L_sloop); 6527 } 6528 BIND(L_end); 6529 } 6530 6531 // Clearing constant sized memory using YMM/ZMM registers. 6532 void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) { 6533 assert(UseAVX > 2 && VM_Version::supports_avx512vl(), ""); 6534 bool use64byteVector = (MaxVectorSize > 32) && (VM_Version::avx3_threshold() == 0); 6535 6536 int vector64_count = (cnt & (~0x7)) >> 3; 6537 cnt = cnt & 0x7; 6538 const int fill64_per_loop = 4; 6539 const int max_unrolled_fill64 = 8; 6540 6541 // 64 byte initialization loop. 6542 vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit); 6543 int start64 = 0; 6544 if (vector64_count > max_unrolled_fill64) { 6545 Label LOOP; 6546 Register index = rtmp; 6547 6548 start64 = vector64_count - (vector64_count % fill64_per_loop); 6549 6550 movl(index, 0); 6551 BIND(LOOP); 6552 for (int i = 0; i < fill64_per_loop; i++) { 6553 fill64(Address(base, index, Address::times_1, i * 64), xtmp, use64byteVector); 6554 } 6555 addl(index, fill64_per_loop * 64); 6556 cmpl(index, start64 * 64); 6557 jccb(Assembler::less, LOOP); 6558 } 6559 for (int i = start64; i < vector64_count; i++) { 6560 fill64(base, i * 64, xtmp, use64byteVector); 6561 } 6562 6563 // Clear remaining 64 byte tail. 
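  // The remaining cnt (1..7) qwords are cleared with one or two stores sized
  // (or opmask-masked, on AVX-512) to cover exactly 8 * cnt bytes
  // (illustrative note).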
6564 int disp = vector64_count * 64; 6565 if (cnt) { 6566 switch (cnt) { 6567 case 1: 6568 movq(Address(base, disp), xtmp); 6569 break; 6570 case 2: 6571 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_128bit); 6572 break; 6573 case 3: 6574 movl(rtmp, 0x7); 6575 kmovwl(mask, rtmp); 6576 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_256bit); 6577 break; 6578 case 4: 6579 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 6580 break; 6581 case 5: 6582 if (use64byteVector) { 6583 movl(rtmp, 0x1F); 6584 kmovwl(mask, rtmp); 6585 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit); 6586 } else { 6587 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 6588 movq(Address(base, disp + 32), xtmp); 6589 } 6590 break; 6591 case 6: 6592 if (use64byteVector) { 6593 movl(rtmp, 0x3F); 6594 kmovwl(mask, rtmp); 6595 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit); 6596 } else { 6597 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 6598 evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, false, Assembler::AVX_128bit); 6599 } 6600 break; 6601 case 7: 6602 if (use64byteVector) { 6603 movl(rtmp, 0x7F); 6604 kmovwl(mask, rtmp); 6605 evmovdqu(T_LONG, mask, Address(base, disp), xtmp, true, Assembler::AVX_512bit); 6606 } else { 6607 evmovdqu(T_LONG, k0, Address(base, disp), xtmp, false, Assembler::AVX_256bit); 6608 movl(rtmp, 0x7); 6609 kmovwl(mask, rtmp); 6610 evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, true, Assembler::AVX_256bit); 6611 } 6612 break; 6613 default: 6614 fatal("Unexpected length : %d\n",cnt); 6615 break; 6616 } 6617 } 6618 } 6619 6620 void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, 6621 bool is_large, KRegister mask) { 6622 // cnt - number of qwords (8-byte words). 6623 // base - start address, qword aligned. 
6624 // is_large - if optimizers know cnt is larger than InitArrayShortSize 6625 assert(base==rdi, "base register must be edi for rep stos"); 6626 assert(tmp==rax, "tmp register must be eax for rep stos"); 6627 assert(cnt==rcx, "cnt register must be ecx for rep stos"); 6628 assert(InitArrayShortSize % BytesPerLong == 0, 6629 "InitArrayShortSize should be the multiple of BytesPerLong"); 6630 6631 Label DONE; 6632 if (!is_large || !UseXMMForObjInit) { 6633 xorptr(tmp, tmp); 6634 } 6635 6636 if (!is_large) { 6637 Label LOOP, LONG; 6638 cmpptr(cnt, InitArrayShortSize/BytesPerLong); 6639 jccb(Assembler::greater, LONG); 6640 6641 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM 6642 6643 decrement(cnt); 6644 jccb(Assembler::negative, DONE); // Zero length 6645 6646 // Use individual pointer-sized stores for small counts: 6647 BIND(LOOP); 6648 movptr(Address(base, cnt, Address::times_ptr), tmp); 6649 decrement(cnt); 6650 jccb(Assembler::greaterEqual, LOOP); 6651 jmpb(DONE); 6652 6653 BIND(LONG); 6654 } 6655 6656 // Use longer rep-prefixed ops for non-small counts: 6657 if (UseFastStosb) { 6658 shlptr(cnt, 3); // convert to number of bytes 6659 rep_stosb(); 6660 } else if (UseXMMForObjInit) { 6661 xmm_clear_mem(base, cnt, tmp, xtmp, mask); 6662 } else { 6663 NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM 6664 rep_stos(); 6665 } 6666 6667 BIND(DONE); 6668 } 6669 6670 #endif //COMPILER2_OR_JVMCI 6671 6672 6673 void MacroAssembler::generate_fill(BasicType t, bool aligned, 6674 Register to, Register value, Register count, 6675 Register rtmp, XMMRegister xtmp) { 6676 ShortBranchVerifier sbv(this); 6677 assert_different_registers(to, value, count, rtmp); 6678 Label L_exit; 6679 Label L_fill_2_bytes, L_fill_4_bytes; 6680 6681 #if defined(COMPILER2) && defined(_LP64) 6682 if(MaxVectorSize >=32 && 6683 VM_Version::supports_avx512vlbw() && 6684 VM_Version::supports_bmi2()) { 6685 generate_fill_avx3(t, to, value, count, rtmp, xtmp); 6686 return; 6687 } 6688 #endif 6689 6690 int shift = -1; 6691 switch (t) { 6692 case T_BYTE: 6693 shift = 2; 6694 break; 6695 case T_SHORT: 6696 shift = 1; 6697 break; 6698 case T_INT: 6699 shift = 0; 6700 break; 6701 default: ShouldNotReachHere(); 6702 } 6703 6704 if (t == T_BYTE) { 6705 andl(value, 0xff); 6706 movl(rtmp, value); 6707 shll(rtmp, 8); 6708 orl(value, rtmp); 6709 } 6710 if (t == T_SHORT) { 6711 andl(value, 0xffff); 6712 } 6713 if (t == T_BYTE || t == T_SHORT) { 6714 movl(rtmp, value); 6715 shll(rtmp, 16); 6716 orl(value, rtmp); 6717 } 6718 6719 cmpptr(count, 2<<shift); // Short arrays (< 8 bytes) fill by element 6720 jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp 6721 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) { 6722 Label L_skip_align2; 6723 // align source address at 4 bytes address boundary 6724 if (t == T_BYTE) { 6725 Label L_skip_align1; 6726 // One byte misalignment happens only for byte arrays 6727 testptr(to, 1); 6728 jccb(Assembler::zero, L_skip_align1); 6729 movb(Address(to, 0), value); 6730 increment(to); 6731 decrement(count); 6732 BIND(L_skip_align1); 6733 } 6734 // Two bytes misalignment happens only for byte and short (char) arrays 6735 testptr(to, 2); 6736 jccb(Assembler::zero, L_skip_align2); 6737 movw(Address(to, 0), value); 6738 addptr(to, 2); 6739 subptr(count, 1<<(shift-1)); 6740 BIND(L_skip_align2); 6741 } 6742 if (UseSSE < 2) { 6743 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; 6744 // Fill 32-byte 
chunks 6745 subptr(count, 8 << shift); 6746 jcc(Assembler::less, L_check_fill_8_bytes); 6747 align(16); 6748 6749 BIND(L_fill_32_bytes_loop); 6750 6751 for (int i = 0; i < 32; i += 4) { 6752 movl(Address(to, i), value); 6753 } 6754 6755 addptr(to, 32); 6756 subptr(count, 8 << shift); 6757 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); 6758 BIND(L_check_fill_8_bytes); 6759 addptr(count, 8 << shift); 6760 jccb(Assembler::zero, L_exit); 6761 jmpb(L_fill_8_bytes); 6762 6763 // 6764 // length is too short, just fill qwords 6765 // 6766 BIND(L_fill_8_bytes_loop); 6767 movl(Address(to, 0), value); 6768 movl(Address(to, 4), value); 6769 addptr(to, 8); 6770 BIND(L_fill_8_bytes); 6771 subptr(count, 1 << (shift + 1)); 6772 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); 6773 // fall through to fill 4 bytes 6774 } else { 6775 Label L_fill_32_bytes; 6776 if (!UseUnalignedLoadStores) { 6777 // align to 8 bytes, we know we are 4 byte aligned to start 6778 testptr(to, 4); 6779 jccb(Assembler::zero, L_fill_32_bytes); 6780 movl(Address(to, 0), value); 6781 addptr(to, 4); 6782 subptr(count, 1<<shift); 6783 } 6784 BIND(L_fill_32_bytes); 6785 { 6786 assert( UseSSE >= 2, "supported cpu only" ); 6787 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; 6788 movdl(xtmp, value); 6789 if (UseAVX >= 2 && UseUnalignedLoadStores) { 6790 Label L_check_fill_32_bytes; 6791 if (UseAVX > 2) { 6792 // Fill 64-byte chunks 6793 Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2; 6794 6795 // If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2 6796 cmpptr(count, VM_Version::avx3_threshold()); 6797 jccb(Assembler::below, L_check_fill_64_bytes_avx2); 6798 6799 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit); 6800 6801 subptr(count, 16 << shift); 6802 jccb(Assembler::less, L_check_fill_32_bytes); 6803 align(16); 6804 6805 BIND(L_fill_64_bytes_loop_avx3); 6806 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit); 6807 addptr(to, 64); 6808 subptr(count, 16 << shift); 6809 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3); 6810 jmpb(L_check_fill_32_bytes); 6811 6812 BIND(L_check_fill_64_bytes_avx2); 6813 } 6814 // Fill 64-byte chunks 6815 Label L_fill_64_bytes_loop; 6816 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit); 6817 6818 subptr(count, 16 << shift); 6819 jcc(Assembler::less, L_check_fill_32_bytes); 6820 align(16); 6821 6822 BIND(L_fill_64_bytes_loop); 6823 vmovdqu(Address(to, 0), xtmp); 6824 vmovdqu(Address(to, 32), xtmp); 6825 addptr(to, 64); 6826 subptr(count, 16 << shift); 6827 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); 6828 6829 BIND(L_check_fill_32_bytes); 6830 addptr(count, 8 << shift); 6831 jccb(Assembler::less, L_check_fill_8_bytes); 6832 vmovdqu(Address(to, 0), xtmp); 6833 addptr(to, 32); 6834 subptr(count, 8 << shift); 6835 6836 BIND(L_check_fill_8_bytes); 6837 // clean upper bits of YMM registers 6838 movdl(xtmp, value); 6839 pshufd(xtmp, xtmp, 0); 6840 } else { 6841 // Fill 32-byte chunks 6842 pshufd(xtmp, xtmp, 0); 6843 6844 subptr(count, 8 << shift); 6845 jcc(Assembler::less, L_check_fill_8_bytes); 6846 align(16); 6847 6848 BIND(L_fill_32_bytes_loop); 6849 6850 if (UseUnalignedLoadStores) { 6851 movdqu(Address(to, 0), xtmp); 6852 movdqu(Address(to, 16), xtmp); 6853 } else { 6854 movq(Address(to, 0), xtmp); 6855 movq(Address(to, 8), xtmp); 6856 movq(Address(to, 16), xtmp); 6857 movq(Address(to, 24), xtmp); 6858 } 6859 6860 addptr(to, 32); 6861 subptr(count, 8 << shift); 6862 jcc(Assembler::greaterEqual, 
L_fill_32_bytes_loop); 6863 6864 BIND(L_check_fill_8_bytes); 6865 } 6866 addptr(count, 8 << shift); 6867 jccb(Assembler::zero, L_exit); 6868 jmpb(L_fill_8_bytes); 6869 6870 // 6871 // length is too short, just fill qwords 6872 // 6873 BIND(L_fill_8_bytes_loop); 6874 movq(Address(to, 0), xtmp); 6875 addptr(to, 8); 6876 BIND(L_fill_8_bytes); 6877 subptr(count, 1 << (shift + 1)); 6878 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); 6879 } 6880 } 6881 // fill trailing 4 bytes 6882 BIND(L_fill_4_bytes); 6883 testl(count, 1<<shift); 6884 jccb(Assembler::zero, L_fill_2_bytes); 6885 movl(Address(to, 0), value); 6886 if (t == T_BYTE || t == T_SHORT) { 6887 Label L_fill_byte; 6888 addptr(to, 4); 6889 BIND(L_fill_2_bytes); 6890 // fill trailing 2 bytes 6891 testl(count, 1<<(shift-1)); 6892 jccb(Assembler::zero, L_fill_byte); 6893 movw(Address(to, 0), value); 6894 if (t == T_BYTE) { 6895 addptr(to, 2); 6896 BIND(L_fill_byte); 6897 // fill trailing byte 6898 testl(count, 1); 6899 jccb(Assembler::zero, L_exit); 6900 movb(Address(to, 0), value); 6901 } else { 6902 BIND(L_fill_byte); 6903 } 6904 } else { 6905 BIND(L_fill_2_bytes); 6906 } 6907 BIND(L_exit); 6908 } 6909 6910 void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) { 6911 switch(type) { 6912 case T_BYTE: 6913 case T_BOOLEAN: 6914 evpbroadcastb(dst, src, vector_len); 6915 break; 6916 case T_SHORT: 6917 case T_CHAR: 6918 evpbroadcastw(dst, src, vector_len); 6919 break; 6920 case T_INT: 6921 case T_FLOAT: 6922 evpbroadcastd(dst, src, vector_len); 6923 break; 6924 case T_LONG: 6925 case T_DOUBLE: 6926 evpbroadcastq(dst, src, vector_len); 6927 break; 6928 default: 6929 fatal("Unhandled type : %s", type2name(type)); 6930 break; 6931 } 6932 } 6933 6934 // encode char[] to byte[] in ISO_8859_1 or ASCII 6935 //@IntrinsicCandidate 6936 //private static int implEncodeISOArray(byte[] sa, int sp, 6937 //byte[] da, int dp, int len) { 6938 // int i = 0; 6939 // for (; i < len; i++) { 6940 // char c = StringUTF16.getChar(sa, sp++); 6941 // if (c > '\u00FF') 6942 // break; 6943 // da[dp++] = (byte)c; 6944 // } 6945 // return i; 6946 //} 6947 // 6948 //@IntrinsicCandidate 6949 //private static int implEncodeAsciiArray(char[] sa, int sp, 6950 // byte[] da, int dp, int len) { 6951 // int i = 0; 6952 // for (; i < len; i++) { 6953 // char c = sa[sp++]; 6954 // if (c >= '\u0080') 6955 // break; 6956 // da[dp++] = (byte)c; 6957 // } 6958 // return i; 6959 //} 6960 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, 6961 XMMRegister tmp1Reg, XMMRegister tmp2Reg, 6962 XMMRegister tmp3Reg, XMMRegister tmp4Reg, 6963 Register tmp5, Register result, bool ascii) { 6964 6965 // rsi: src 6966 // rdi: dst 6967 // rdx: len 6968 // rcx: tmp5 6969 // rax: result 6970 ShortBranchVerifier sbv(this); 6971 assert_different_registers(src, dst, len, tmp5, result); 6972 Label L_done, L_copy_1_char, L_copy_1_char_exit; 6973 6974 int mask = ascii ? 0xff80ff80 : 0xff00ff00; 6975 int short_mask = ascii ? 
0xff80 : 0xff00; 6976 6977 // set result 6978 xorl(result, result); 6979 // check for zero length 6980 testl(len, len); 6981 jcc(Assembler::zero, L_done); 6982 6983 movl(result, len); 6984 6985 // Setup pointers 6986 lea(src, Address(src, len, Address::times_2)); // char[] 6987 lea(dst, Address(dst, len, Address::times_1)); // byte[] 6988 negptr(len); 6989 6990 if (UseSSE42Intrinsics || UseAVX >= 2) { 6991 Label L_copy_8_chars, L_copy_8_chars_exit; 6992 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit; 6993 6994 if (UseAVX >= 2) { 6995 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit; 6996 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector 6997 movdl(tmp1Reg, tmp5); 6998 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit); 6999 jmp(L_chars_32_check); 7000 7001 bind(L_copy_32_chars); 7002 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64)); 7003 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32)); 7004 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); 7005 vptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector 7006 jccb(Assembler::notZero, L_copy_32_chars_exit); 7007 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); 7008 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1); 7009 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg); 7010 7011 bind(L_chars_32_check); 7012 addptr(len, 32); 7013 jcc(Assembler::lessEqual, L_copy_32_chars); 7014 7015 bind(L_copy_32_chars_exit); 7016 subptr(len, 16); 7017 jccb(Assembler::greater, L_copy_16_chars_exit); 7018 7019 } else if (UseSSE42Intrinsics) { 7020 movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector 7021 movdl(tmp1Reg, tmp5); 7022 pshufd(tmp1Reg, tmp1Reg, 0); 7023 jmpb(L_chars_16_check); 7024 } 7025 7026 bind(L_copy_16_chars); 7027 if (UseAVX >= 2) { 7028 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32)); 7029 vptest(tmp2Reg, tmp1Reg); 7030 jcc(Assembler::notZero, L_copy_16_chars_exit); 7031 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1); 7032 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1); 7033 } else { 7034 if (UseAVX > 0) { 7035 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); 7036 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16)); 7037 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0); 7038 } else { 7039 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32)); 7040 por(tmp2Reg, tmp3Reg); 7041 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16)); 7042 por(tmp2Reg, tmp4Reg); 7043 } 7044 ptest(tmp2Reg, tmp1Reg); // check for Unicode or non-ASCII chars in vector 7045 jccb(Assembler::notZero, L_copy_16_chars_exit); 7046 packuswb(tmp3Reg, tmp4Reg); 7047 } 7048 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg); 7049 7050 bind(L_chars_16_check); 7051 addptr(len, 16); 7052 jcc(Assembler::lessEqual, L_copy_16_chars); 7053 7054 bind(L_copy_16_chars_exit); 7055 if (UseAVX >= 2) { 7056 // clean upper bits of YMM registers 7057 vpxor(tmp2Reg, tmp2Reg); 7058 vpxor(tmp3Reg, tmp3Reg); 7059 vpxor(tmp4Reg, tmp4Reg); 7060 movdl(tmp1Reg, tmp5); 7061 pshufd(tmp1Reg, tmp1Reg, 0); 7062 } 7063 subptr(len, 8); 7064 jccb(Assembler::greater, L_copy_8_chars_exit); 7065 7066 bind(L_copy_8_chars); 7067 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16)); 7068 ptest(tmp3Reg, tmp1Reg); 7069 jccb(Assembler::notZero, L_copy_8_chars_exit); 7070 packuswb(tmp3Reg, tmp1Reg); 7071 movq(Address(dst, len, Address::times_1, -8), tmp3Reg); 7072 addptr(len, 8); 7073 
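  // (note: len is the negated count of characters still to encode, so it
  //  climbs toward zero here; the next branch keeps looping while it is <= 0)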
jccb(Assembler::lessEqual, L_copy_8_chars); 7074 7075 bind(L_copy_8_chars_exit); 7076 subptr(len, 8); 7077 jccb(Assembler::zero, L_done); 7078 } 7079 7080 bind(L_copy_1_char); 7081 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0)); 7082 testl(tmp5, short_mask); // check if Unicode or non-ASCII char 7083 jccb(Assembler::notZero, L_copy_1_char_exit); 7084 movb(Address(dst, len, Address::times_1, 0), tmp5); 7085 addptr(len, 1); 7086 jccb(Assembler::less, L_copy_1_char); 7087 7088 bind(L_copy_1_char_exit); 7089 addptr(result, len); // len is negative count of not processed elements 7090 7091 bind(L_done); 7092 } 7093 7094 #ifdef _LP64 7095 /** 7096 * Helper for multiply_to_len(). 7097 */ 7098 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) { 7099 addq(dest_lo, src1); 7100 adcq(dest_hi, 0); 7101 addq(dest_lo, src2); 7102 adcq(dest_hi, 0); 7103 } 7104 7105 /** 7106 * Multiply 64 bit by 64 bit first loop. 7107 */ 7108 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart, 7109 Register y, Register y_idx, Register z, 7110 Register carry, Register product, 7111 Register idx, Register kdx) { 7112 // 7113 // jlong carry, x[], y[], z[]; 7114 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 7115 // huge_128 product = y[idx] * x[xstart] + carry; 7116 // z[kdx] = (jlong)product; 7117 // carry = (jlong)(product >>> 64); 7118 // } 7119 // z[xstart] = carry; 7120 // 7121 7122 Label L_first_loop, L_first_loop_exit; 7123 Label L_one_x, L_one_y, L_multiply; 7124 7125 decrementl(xstart); 7126 jcc(Assembler::negative, L_one_x); 7127 7128 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 7129 rorq(x_xstart, 32); // convert big-endian to little-endian 7130 7131 bind(L_first_loop); 7132 decrementl(idx); 7133 jcc(Assembler::negative, L_first_loop_exit); 7134 decrementl(idx); 7135 jcc(Assembler::negative, L_one_y); 7136 movq(y_idx, Address(y, idx, Address::times_4, 0)); 7137 rorq(y_idx, 32); // convert big-endian to little-endian 7138 bind(L_multiply); 7139 movq(product, x_xstart); 7140 mulq(y_idx); // product(rax) * y_idx -> rdx:rax 7141 addq(product, carry); 7142 adcq(rdx, 0); 7143 subl(kdx, 2); 7144 movl(Address(z, kdx, Address::times_4, 4), product); 7145 shrq(product, 32); 7146 movl(Address(z, kdx, Address::times_4, 0), product); 7147 movq(carry, rdx); 7148 jmp(L_first_loop); 7149 7150 bind(L_one_y); 7151 movl(y_idx, Address(y, 0)); 7152 jmp(L_multiply); 7153 7154 bind(L_one_x); 7155 movl(x_xstart, Address(x, 0)); 7156 jmp(L_first_loop); 7157 7158 bind(L_first_loop_exit); 7159 } 7160 7161 /** 7162 * Multiply 64 bit by 64 bit and add 128 bit. 
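 *
 * Informal sketch of the step this emits (illustrative only; huge_128 is the
 * 128-bit pseudo type used in the surrounding comments, and the names do not
 * mirror the exact register flow):
 *
 *   huge_128 p = (huge_128)y_word * x_xstart + z_word + carry;
 *   z_word = (jlong)p;             // low 64 bits written back into z
 *   // high 64 bits stay in rdx; the caller moves them into carry
 *
 * y_word and z_word are two adjacent 32-bit limbs read as one 64-bit value;
 * BigInteger keeps limbs most-significant-first, hence the rorq(..., 32)
 * swap on load and the pair of 32-bit stores on the way back out.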
7163 */ 7164 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z, 7165 Register yz_idx, Register idx, 7166 Register carry, Register product, int offset) { 7167 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 7168 // z[kdx] = (jlong)product; 7169 7170 movq(yz_idx, Address(y, idx, Address::times_4, offset)); 7171 rorq(yz_idx, 32); // convert big-endian to little-endian 7172 movq(product, x_xstart); 7173 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) 7174 movq(yz_idx, Address(z, idx, Address::times_4, offset)); 7175 rorq(yz_idx, 32); // convert big-endian to little-endian 7176 7177 add2_with_carry(rdx, product, carry, yz_idx); 7178 7179 movl(Address(z, idx, Address::times_4, offset+4), product); 7180 shrq(product, 32); 7181 movl(Address(z, idx, Address::times_4, offset), product); 7182 7183 } 7184 7185 /** 7186 * Multiply 128 bit by 128 bit. Unrolled inner loop. 7187 */ 7188 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z, 7189 Register yz_idx, Register idx, Register jdx, 7190 Register carry, Register product, 7191 Register carry2) { 7192 // jlong carry, x[], y[], z[]; 7193 // int kdx = ystart+1; 7194 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 7195 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 7196 // z[kdx+idx+1] = (jlong)product; 7197 // jlong carry2 = (jlong)(product >>> 64); 7198 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 7199 // z[kdx+idx] = (jlong)product; 7200 // carry = (jlong)(product >>> 64); 7201 // } 7202 // idx += 2; 7203 // if (idx > 0) { 7204 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 7205 // z[kdx+idx] = (jlong)product; 7206 // carry = (jlong)(product >>> 64); 7207 // } 7208 // 7209 7210 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 7211 7212 movl(jdx, idx); 7213 andl(jdx, 0xFFFFFFFC); 7214 shrl(jdx, 2); 7215 7216 bind(L_third_loop); 7217 subl(jdx, 1); 7218 jcc(Assembler::negative, L_third_loop_exit); 7219 subl(idx, 4); 7220 7221 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8); 7222 movq(carry2, rdx); 7223 7224 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0); 7225 movq(carry, rdx); 7226 jmp(L_third_loop); 7227 7228 bind (L_third_loop_exit); 7229 7230 andl (idx, 0x3); 7231 jcc(Assembler::zero, L_post_third_loop_done); 7232 7233 Label L_check_1; 7234 subl(idx, 2); 7235 jcc(Assembler::negative, L_check_1); 7236 7237 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0); 7238 movq(carry, rdx); 7239 7240 bind (L_check_1); 7241 addl (idx, 0x2); 7242 andl (idx, 0x1); 7243 subl(idx, 1); 7244 jcc(Assembler::negative, L_post_third_loop_done); 7245 7246 movl(yz_idx, Address(y, idx, Address::times_4, 0)); 7247 movq(product, x_xstart); 7248 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) 7249 movl(yz_idx, Address(z, idx, Address::times_4, 0)); 7250 7251 add2_with_carry(rdx, product, yz_idx, carry); 7252 7253 movl(Address(z, idx, Address::times_4, 0), product); 7254 shrq(product, 32); 7255 7256 shlq(rdx, 32); 7257 orq(product, rdx); 7258 movq(carry, product); 7259 7260 bind(L_post_third_loop_done); 7261 } 7262 7263 /** 7264 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop. 
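 *
 * mulxq (BMI2) multiplies the implicit operand rdx (here the 64-bit word
 * x[xstart], preloaded by the caller) by its source and writes the full
 * 128-bit product to two explicit registers without touching the flags.
 * When ADX is available, adcxq/adoxq keep two independent carry chains
 * (CF and OF) in flight at once; otherwise the sums fall back to
 * add2_with_carry().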
7265 * 7266 */ 7267 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z, 7268 Register carry, Register carry2, 7269 Register idx, Register jdx, 7270 Register yz_idx1, Register yz_idx2, 7271 Register tmp, Register tmp3, Register tmp4) { 7272 assert(UseBMI2Instructions, "should be used only when BMI2 is available"); 7273 7274 // jlong carry, x[], y[], z[]; 7275 // int kdx = ystart+1; 7276 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 7277 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry; 7278 // jlong carry2 = (jlong)(tmp3 >>> 64); 7279 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2; 7280 // carry = (jlong)(tmp4 >>> 64); 7281 // z[kdx+idx+1] = (jlong)tmp3; 7282 // z[kdx+idx] = (jlong)tmp4; 7283 // } 7284 // idx += 2; 7285 // if (idx > 0) { 7286 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry; 7287 // z[kdx+idx] = (jlong)yz_idx1; 7288 // carry = (jlong)(yz_idx1 >>> 64); 7289 // } 7290 // 7291 7292 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 7293 7294 movl(jdx, idx); 7295 andl(jdx, 0xFFFFFFFC); 7296 shrl(jdx, 2); 7297 7298 bind(L_third_loop); 7299 subl(jdx, 1); 7300 jcc(Assembler::negative, L_third_loop_exit); 7301 subl(idx, 4); 7302 7303 movq(yz_idx1, Address(y, idx, Address::times_4, 8)); 7304 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 7305 movq(yz_idx2, Address(y, idx, Address::times_4, 0)); 7306 rorxq(yz_idx2, yz_idx2, 32); 7307 7308 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 7309 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp 7310 7311 movq(yz_idx1, Address(z, idx, Address::times_4, 8)); 7312 rorxq(yz_idx1, yz_idx1, 32); 7313 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 7314 rorxq(yz_idx2, yz_idx2, 32); 7315 7316 if (VM_Version::supports_adx()) { 7317 adcxq(tmp3, carry); 7318 adoxq(tmp3, yz_idx1); 7319 7320 adcxq(tmp4, tmp); 7321 adoxq(tmp4, yz_idx2); 7322 7323 movl(carry, 0); // does not affect flags 7324 adcxq(carry2, carry); 7325 adoxq(carry2, carry); 7326 } else { 7327 add2_with_carry(tmp4, tmp3, carry, yz_idx1); 7328 add2_with_carry(carry2, tmp4, tmp, yz_idx2); 7329 } 7330 movq(carry, carry2); 7331 7332 movl(Address(z, idx, Address::times_4, 12), tmp3); 7333 shrq(tmp3, 32); 7334 movl(Address(z, idx, Address::times_4, 8), tmp3); 7335 7336 movl(Address(z, idx, Address::times_4, 4), tmp4); 7337 shrq(tmp4, 32); 7338 movl(Address(z, idx, Address::times_4, 0), tmp4); 7339 7340 jmp(L_third_loop); 7341 7342 bind (L_third_loop_exit); 7343 7344 andl (idx, 0x3); 7345 jcc(Assembler::zero, L_post_third_loop_done); 7346 7347 Label L_check_1; 7348 subl(idx, 2); 7349 jcc(Assembler::negative, L_check_1); 7350 7351 movq(yz_idx1, Address(y, idx, Address::times_4, 0)); 7352 rorxq(yz_idx1, yz_idx1, 32); 7353 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 7354 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 7355 rorxq(yz_idx2, yz_idx2, 32); 7356 7357 add2_with_carry(tmp4, tmp3, carry, yz_idx2); 7358 7359 movl(Address(z, idx, Address::times_4, 4), tmp3); 7360 shrq(tmp3, 32); 7361 movl(Address(z, idx, Address::times_4, 0), tmp3); 7362 movq(carry, tmp4); 7363 7364 bind (L_check_1); 7365 addl (idx, 0x2); 7366 andl (idx, 0x1); 7367 subl(idx, 1); 7368 jcc(Assembler::negative, L_post_third_loop_done); 7369 movl(tmp4, Address(y, idx, Address::times_4, 0)); 7370 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3 7371 movl(tmp4, Address(z, idx, Address::times_4, 0)); 7372 7373 add2_with_carry(carry2, tmp3, tmp4, carry); 7374 7375 movl(Address(z, idx, 
Address::times_4, 0), tmp3); 7376 shrq(tmp3, 32); 7377 7378 shlq(carry2, 32); 7379 orq(tmp3, carry2); 7380 movq(carry, tmp3); 7381 7382 bind(L_post_third_loop_done); 7383 } 7384 7385 /** 7386 * Code for BigInteger::multiplyToLen() intrinsic. 7387 * 7388 * rdi: x 7389 * rax: xlen 7390 * rsi: y 7391 * rcx: ylen 7392 * r8: z 7393 * r11: tmp0 7394 * r12: tmp1 7395 * r13: tmp2 7396 * r14: tmp3 7397 * r15: tmp4 7398 * rbx: tmp5 7399 * 7400 */ 7401 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register tmp0, 7402 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 7403 ShortBranchVerifier sbv(this); 7404 assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, rdx); 7405 7406 push(tmp0); 7407 push(tmp1); 7408 push(tmp2); 7409 push(tmp3); 7410 push(tmp4); 7411 push(tmp5); 7412 7413 push(xlen); 7414 7415 const Register idx = tmp1; 7416 const Register kdx = tmp2; 7417 const Register xstart = tmp3; 7418 7419 const Register y_idx = tmp4; 7420 const Register carry = tmp5; 7421 const Register product = xlen; 7422 const Register x_xstart = tmp0; 7423 7424 // First Loop. 7425 // 7426 // final static long LONG_MASK = 0xffffffffL; 7427 // int xstart = xlen - 1; 7428 // int ystart = ylen - 1; 7429 // long carry = 0; 7430 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 7431 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 7432 // z[kdx] = (int)product; 7433 // carry = product >>> 32; 7434 // } 7435 // z[xstart] = (int)carry; 7436 // 7437 7438 movl(idx, ylen); // idx = ylen; 7439 lea(kdx, Address(xlen, ylen)); // kdx = xlen+ylen; 7440 xorq(carry, carry); // carry = 0; 7441 7442 Label L_done; 7443 7444 movl(xstart, xlen); 7445 decrementl(xstart); 7446 jcc(Assembler::negative, L_done); 7447 7448 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); 7449 7450 Label L_second_loop; 7451 testl(kdx, kdx); 7452 jcc(Assembler::zero, L_second_loop); 7453 7454 Label L_carry; 7455 subl(kdx, 1); 7456 jcc(Assembler::zero, L_carry); 7457 7458 movl(Address(z, kdx, Address::times_4, 0), carry); 7459 shrq(carry, 32); 7460 subl(kdx, 1); 7461 7462 bind(L_carry); 7463 movl(Address(z, kdx, Address::times_4, 0), carry); 7464 7465 // Second and third (nested) loops. 
7466 // 7467 // for (int i = xstart-1; i >= 0; i--) { // Second loop 7468 // carry = 0; 7469 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 7470 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 7471 // (z[k] & LONG_MASK) + carry; 7472 // z[k] = (int)product; 7473 // carry = product >>> 32; 7474 // } 7475 // z[i] = (int)carry; 7476 // } 7477 // 7478 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 7479 7480 const Register jdx = tmp1; 7481 7482 bind(L_second_loop); 7483 xorl(carry, carry); // carry = 0; 7484 movl(jdx, ylen); // j = ystart+1 7485 7486 subl(xstart, 1); // i = xstart-1; 7487 jcc(Assembler::negative, L_done); 7488 7489 push (z); 7490 7491 Label L_last_x; 7492 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j 7493 subl(xstart, 1); // i = xstart-1; 7494 jcc(Assembler::negative, L_last_x); 7495 7496 if (UseBMI2Instructions) { 7497 movq(rdx, Address(x, xstart, Address::times_4, 0)); 7498 rorxq(rdx, rdx, 32); // convert big-endian to little-endian 7499 } else { 7500 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 7501 rorq(x_xstart, 32); // convert big-endian to little-endian 7502 } 7503 7504 Label L_third_loop_prologue; 7505 bind(L_third_loop_prologue); 7506 7507 push (x); 7508 push (xstart); 7509 push (ylen); 7510 7511 7512 if (UseBMI2Instructions) { 7513 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4); 7514 } else { // !UseBMI2Instructions 7515 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x); 7516 } 7517 7518 pop(ylen); 7519 pop(xlen); 7520 pop(x); 7521 pop(z); 7522 7523 movl(tmp3, xlen); 7524 addl(tmp3, 1); 7525 movl(Address(z, tmp3, Address::times_4, 0), carry); 7526 subl(tmp3, 1); 7527 jccb(Assembler::negative, L_done); 7528 7529 shrq(carry, 32); 7530 movl(Address(z, tmp3, Address::times_4, 0), carry); 7531 jmp(L_second_loop); 7532 7533 // Next infrequent code is moved outside loops. 7534 bind(L_last_x); 7535 if (UseBMI2Instructions) { 7536 movl(rdx, Address(x, 0)); 7537 } else { 7538 movl(x_xstart, Address(x, 0)); 7539 } 7540 jmp(L_third_loop_prologue); 7541 7542 bind(L_done); 7543 7544 pop(xlen); 7545 7546 pop(tmp5); 7547 pop(tmp4); 7548 pop(tmp3); 7549 pop(tmp2); 7550 pop(tmp1); 7551 pop(tmp0); 7552 } 7553 7554 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale, 7555 Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){ 7556 assert(UseSSE42Intrinsics, "SSE4.2 must be enabled."); 7557 Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP; 7558 Label VECTOR8_TAIL, VECTOR4_TAIL; 7559 Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL; 7560 Label SAME_TILL_END, DONE; 7561 Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL; 7562 7563 //scale is in rcx in both Win64 and Unix 7564 ShortBranchVerifier sbv(this); 7565 7566 shlq(length); 7567 xorq(result, result); 7568 7569 if ((AVX3Threshold == 0) && (UseAVX > 2) && 7570 VM_Version::supports_avx512vlbw()) { 7571 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL; 7572 7573 cmpq(length, 64); 7574 jcc(Assembler::less, VECTOR32_TAIL); 7575 7576 movq(tmp1, length); 7577 andq(tmp1, 0x3F); // tail count 7578 andq(length, ~(0x3F)); //vector count 7579 7580 bind(VECTOR64_LOOP); 7581 // AVX512 code to compare 64 byte vectors. 
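    // Each iteration loads 64 bytes of obja and compares them byte-wise
    // against objb: evpcmpeqb leaves an equality mask in k7, and kortestql
    // sets CF only when that mask is all ones, so aboveEqual (carryClear)
    // branches to the mismatch handler as soon as any byte differs.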
7582 evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit); 7583 evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit); 7584 kortestql(k7, k7); 7585 jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL); // mismatch 7586 addq(result, 64); 7587 subq(length, 64); 7588 jccb(Assembler::notZero, VECTOR64_LOOP); 7589 7590 //bind(VECTOR64_TAIL); 7591 testq(tmp1, tmp1); 7592 jcc(Assembler::zero, SAME_TILL_END); 7593 7594 //bind(VECTOR64_TAIL); 7595 // AVX512 code to compare up to 63 byte vectors. 7596 mov64(tmp2, 0xFFFFFFFFFFFFFFFF); 7597 shlxq(tmp2, tmp2, tmp1); 7598 notq(tmp2); 7599 kmovql(k3, tmp2); 7600 7601 evmovdqub(rymm0, k3, Address(obja, result), false, Assembler::AVX_512bit); 7602 evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit); 7603 7604 ktestql(k7, k3); 7605 jcc(Assembler::below, SAME_TILL_END); // not mismatch 7606 7607 bind(VECTOR64_NOT_EQUAL); 7608 kmovql(tmp1, k7); 7609 notq(tmp1); 7610 tzcntq(tmp1, tmp1); 7611 addq(result, tmp1); 7612 shrq(result); 7613 jmp(DONE); 7614 bind(VECTOR32_TAIL); 7615 } 7616 7617 cmpq(length, 8); 7618 jcc(Assembler::equal, VECTOR8_LOOP); 7619 jcc(Assembler::less, VECTOR4_TAIL); 7620 7621 if (UseAVX >= 2) { 7622 Label VECTOR16_TAIL, VECTOR32_LOOP; 7623 7624 cmpq(length, 16); 7625 jcc(Assembler::equal, VECTOR16_LOOP); 7626 jcc(Assembler::less, VECTOR8_LOOP); 7627 7628 cmpq(length, 32); 7629 jccb(Assembler::less, VECTOR16_TAIL); 7630 7631 subq(length, 32); 7632 bind(VECTOR32_LOOP); 7633 vmovdqu(rymm0, Address(obja, result)); 7634 vmovdqu(rymm1, Address(objb, result)); 7635 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit); 7636 vptest(rymm2, rymm2); 7637 jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found 7638 addq(result, 32); 7639 subq(length, 32); 7640 jcc(Assembler::greaterEqual, VECTOR32_LOOP); 7641 addq(length, 32); 7642 jcc(Assembler::equal, SAME_TILL_END); 7643 //falling through if less than 32 bytes left //close the branch here. 
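  // From this point fewer than 32 bytes remain; they are handled by
  // progressively narrower compares (16-byte, 8-byte, 4-byte, then single
  // bytes).  Each path jumps to its *_NOT_EQUAL handler, which locates the
  // first differing byte and converts the byte offset back to an element
  // index by shifting right by the scale held in rcx.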
7644 7645 bind(VECTOR16_TAIL); 7646 cmpq(length, 16); 7647 jccb(Assembler::less, VECTOR8_TAIL); 7648 bind(VECTOR16_LOOP); 7649 movdqu(rymm0, Address(obja, result)); 7650 movdqu(rymm1, Address(objb, result)); 7651 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit); 7652 ptest(rymm2, rymm2); 7653 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found 7654 addq(result, 16); 7655 subq(length, 16); 7656 jcc(Assembler::equal, SAME_TILL_END); 7657 //falling through if less than 16 bytes left 7658 } else {//regular intrinsics 7659 7660 cmpq(length, 16); 7661 jccb(Assembler::less, VECTOR8_TAIL); 7662 7663 subq(length, 16); 7664 bind(VECTOR16_LOOP); 7665 movdqu(rymm0, Address(obja, result)); 7666 movdqu(rymm1, Address(objb, result)); 7667 pxor(rymm0, rymm1); 7668 ptest(rymm0, rymm0); 7669 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found 7670 addq(result, 16); 7671 subq(length, 16); 7672 jccb(Assembler::greaterEqual, VECTOR16_LOOP); 7673 addq(length, 16); 7674 jcc(Assembler::equal, SAME_TILL_END); 7675 //falling through if less than 16 bytes left 7676 } 7677 7678 bind(VECTOR8_TAIL); 7679 cmpq(length, 8); 7680 jccb(Assembler::less, VECTOR4_TAIL); 7681 bind(VECTOR8_LOOP); 7682 movq(tmp1, Address(obja, result)); 7683 movq(tmp2, Address(objb, result)); 7684 xorq(tmp1, tmp2); 7685 testq(tmp1, tmp1); 7686 jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found 7687 addq(result, 8); 7688 subq(length, 8); 7689 jcc(Assembler::equal, SAME_TILL_END); 7690 //falling through if less than 8 bytes left 7691 7692 bind(VECTOR4_TAIL); 7693 cmpq(length, 4); 7694 jccb(Assembler::less, BYTES_TAIL); 7695 bind(VECTOR4_LOOP); 7696 movl(tmp1, Address(obja, result)); 7697 xorl(tmp1, Address(objb, result)); 7698 testl(tmp1, tmp1); 7699 jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found 7700 addq(result, 4); 7701 subq(length, 4); 7702 jcc(Assembler::equal, SAME_TILL_END); 7703 //falling through if less than 4 bytes left 7704 7705 bind(BYTES_TAIL); 7706 bind(BYTES_LOOP); 7707 load_unsigned_byte(tmp1, Address(obja, result)); 7708 load_unsigned_byte(tmp2, Address(objb, result)); 7709 xorl(tmp1, tmp2); 7710 testl(tmp1, tmp1); 7711 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 7712 decq(length); 7713 jcc(Assembler::zero, SAME_TILL_END); 7714 incq(result); 7715 load_unsigned_byte(tmp1, Address(obja, result)); 7716 load_unsigned_byte(tmp2, Address(objb, result)); 7717 xorl(tmp1, tmp2); 7718 testl(tmp1, tmp1); 7719 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 7720 decq(length); 7721 jcc(Assembler::zero, SAME_TILL_END); 7722 incq(result); 7723 load_unsigned_byte(tmp1, Address(obja, result)); 7724 load_unsigned_byte(tmp2, Address(objb, result)); 7725 xorl(tmp1, tmp2); 7726 testl(tmp1, tmp1); 7727 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 7728 jmp(SAME_TILL_END); 7729 7730 if (UseAVX >= 2) { 7731 bind(VECTOR32_NOT_EQUAL); 7732 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit); 7733 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit); 7734 vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit); 7735 vpmovmskb(tmp1, rymm0); 7736 bsfq(tmp1, tmp1); 7737 addq(result, tmp1); 7738 shrq(result); 7739 jmp(DONE); 7740 } 7741 7742 bind(VECTOR16_NOT_EQUAL); 7743 if (UseAVX >= 2) { 7744 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit); 7745 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit); 7746 pxor(rymm0, rymm2); 7747 } else { 7748 pcmpeqb(rymm2, rymm2); 7749 pxor(rymm0, rymm1); 7750 pcmpeqb(rymm0, rymm1); 7751 pxor(rymm0, rymm2); 7752 } 7753 pmovmskb(tmp1, rymm0); 7754 
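  // tmp1 now has one bit set for every byte lane that differs; bsf picks the
  // lowest set bit, i.e. the first mismatching byte of this 16-byte chunk.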
bsfq(tmp1, tmp1); 7755 addq(result, tmp1); 7756 shrq(result); 7757 jmpb(DONE); 7758 7759 bind(VECTOR8_NOT_EQUAL); 7760 bind(VECTOR4_NOT_EQUAL); 7761 bsfq(tmp1, tmp1); 7762 shrq(tmp1, 3); 7763 addq(result, tmp1); 7764 bind(BYTES_NOT_EQUAL); 7765 shrq(result); 7766 jmpb(DONE); 7767 7768 bind(SAME_TILL_END); 7769 mov64(result, -1); 7770 7771 bind(DONE); 7772 } 7773 7774 //Helper functions for square_to_len() 7775 7776 /** 7777 * Store the squares of x[], right shifted one bit (divided by 2) into z[] 7778 * Preserves x and z and modifies rest of the registers. 7779 */ 7780 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7781 // Perform square and right shift by 1 7782 // Handle odd xlen case first, then for even xlen do the following 7783 // jlong carry = 0; 7784 // for (int j=0, i=0; j < xlen; j+=2, i+=4) { 7785 // huge_128 product = x[j:j+1] * x[j:j+1]; 7786 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); 7787 // z[i+2:i+3] = (jlong)(product >>> 1); 7788 // carry = (jlong)product; 7789 // } 7790 7791 xorq(tmp5, tmp5); // carry 7792 xorq(rdxReg, rdxReg); 7793 xorl(tmp1, tmp1); // index for x 7794 xorl(tmp4, tmp4); // index for z 7795 7796 Label L_first_loop, L_first_loop_exit; 7797 7798 testl(xlen, 1); 7799 jccb(Assembler::zero, L_first_loop); //jump if xlen is even 7800 7801 // Square and right shift by 1 the odd element using 32 bit multiply 7802 movl(raxReg, Address(x, tmp1, Address::times_4, 0)); 7803 imulq(raxReg, raxReg); 7804 shrq(raxReg, 1); 7805 adcq(tmp5, 0); 7806 movq(Address(z, tmp4, Address::times_4, 0), raxReg); 7807 incrementl(tmp1); 7808 addl(tmp4, 2); 7809 7810 // Square and right shift by 1 the rest using 64 bit multiply 7811 bind(L_first_loop); 7812 cmpptr(tmp1, xlen); 7813 jccb(Assembler::equal, L_first_loop_exit); 7814 7815 // Square 7816 movq(raxReg, Address(x, tmp1, Address::times_4, 0)); 7817 rorq(raxReg, 32); // convert big-endian to little-endian 7818 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax 7819 7820 // Right shift by 1 and save carry 7821 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 7822 rcrq(rdxReg, 1); 7823 rcrq(raxReg, 1); 7824 adcq(tmp5, 0); 7825 7826 // Store result in z 7827 movq(Address(z, tmp4, Address::times_4, 0), rdxReg); 7828 movq(Address(z, tmp4, Address::times_4, 8), raxReg); 7829 7830 // Update indices for x and z 7831 addl(tmp1, 2); 7832 addl(tmp4, 4); 7833 jmp(L_first_loop); 7834 7835 bind(L_first_loop_exit); 7836 } 7837 7838 7839 /** 7840 * Perform the following multiply add operation using BMI2 instructions 7841 * carry:sum = sum + op1*op2 + carry 7842 * op2 should be in rdx 7843 * op2 is preserved, all other registers are modified 7844 */ 7845 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) { 7846 // assert op2 is rdx 7847 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1 7848 addq(sum, carry); 7849 adcq(tmp2, 0); 7850 addq(sum, op1); 7851 adcq(tmp2, 0); 7852 movq(carry, tmp2); 7853 } 7854 7855 /** 7856 * Perform the following multiply add operation: 7857 * carry:sum = sum + op1*op2 + carry 7858 * Preserves op1, op2 and modifies rest of registers 7859 */ 7860 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) { 7861 // rdx:rax = op1 * op2 7862 movq(raxReg, op2); 7863 mulq(op1); 7864 7865 // rdx:rax = sum + carry + rdx:rax 7866 addq(sum, carry); 7867 
adcq(rdxReg, 0); 7868 addq(sum, raxReg); 7869 adcq(rdxReg, 0); 7870 7871 // carry:sum = rdx:sum 7872 movq(carry, rdxReg); 7873 } 7874 7875 /** 7876 * Add 64 bit long carry into z[] with carry propagation. 7877 * Preserves z and carry register values and modifies rest of registers. 7878 * 7879 */ 7880 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) { 7881 Label L_fourth_loop, L_fourth_loop_exit; 7882 7883 movl(tmp1, 1); 7884 subl(zlen, 2); 7885 addq(Address(z, zlen, Address::times_4, 0), carry); 7886 7887 bind(L_fourth_loop); 7888 jccb(Assembler::carryClear, L_fourth_loop_exit); 7889 subl(zlen, 2); 7890 jccb(Assembler::negative, L_fourth_loop_exit); 7891 addq(Address(z, zlen, Address::times_4, 0), tmp1); 7892 jmp(L_fourth_loop); 7893 bind(L_fourth_loop_exit); 7894 } 7895 7896 /** 7897 * Shift z[] left by 1 bit. 7898 * Preserves x, len, z and zlen registers and modifies rest of the registers. 7899 * 7900 */ 7901 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { 7902 7903 Label L_fifth_loop, L_fifth_loop_exit; 7904 7905 // Fifth loop 7906 // Perform primitiveLeftShift(z, zlen, 1) 7907 7908 const Register prev_carry = tmp1; 7909 const Register new_carry = tmp4; 7910 const Register value = tmp2; 7911 const Register zidx = tmp3; 7912 7913 // int zidx, carry; 7914 // long value; 7915 // carry = 0; 7916 // for (zidx = zlen-2; zidx >=0; zidx -= 2) { 7917 // (carry:value) = (z[i] << 1) | carry ; 7918 // z[i] = value; 7919 // } 7920 7921 movl(zidx, zlen); 7922 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register 7923 7924 bind(L_fifth_loop); 7925 decl(zidx); // Use decl to preserve carry flag 7926 decl(zidx); 7927 jccb(Assembler::negative, L_fifth_loop_exit); 7928 7929 if (UseBMI2Instructions) { 7930 movq(value, Address(z, zidx, Address::times_4, 0)); 7931 rclq(value, 1); 7932 rorxq(value, value, 32); 7933 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 7934 } 7935 else { 7936 // clear new_carry 7937 xorl(new_carry, new_carry); 7938 7939 // Shift z[i] by 1, or in previous carry and save new carry 7940 movq(value, Address(z, zidx, Address::times_4, 0)); 7941 shlq(value, 1); 7942 adcl(new_carry, 0); 7943 7944 orq(value, prev_carry); 7945 rorq(value, 0x20); 7946 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 7947 7948 // Set previous carry = new carry 7949 movl(prev_carry, new_carry); 7950 } 7951 jmp(L_fifth_loop); 7952 7953 bind(L_fifth_loop_exit); 7954 } 7955 7956 7957 /** 7958 * Code for BigInteger::squareToLen() intrinsic 7959 * 7960 * rdi: x 7961 * rsi: len 7962 * r8: z 7963 * rcx: zlen 7964 * r12: tmp1 7965 * r13: tmp2 7966 * r14: tmp3 7967 * r15: tmp4 7968 * rbx: tmp5 7969 * 7970 */ 7971 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7972 7973 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply; 7974 push(tmp1); 7975 push(tmp2); 7976 push(tmp3); 7977 push(tmp4); 7978 push(tmp5); 7979 7980 // First loop 7981 // Store the squares, right shifted one bit (i.e., divided by 2). 7982 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg); 7983 7984 // Add in off-diagonal sums. 7985 // 7986 // Second, third (nested) and fourth loops. 
7987 // zlen +=2; 7988 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) { 7989 // carry = 0; 7990 // long op2 = x[xidx:xidx+1]; 7991 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) { 7992 // k -= 2; 7993 // long op1 = x[j:j+1]; 7994 // long sum = z[k:k+1]; 7995 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs); 7996 // z[k:k+1] = sum; 7997 // } 7998 // add_one_64(z, k, carry, tmp_regs); 7999 // } 8000 8001 const Register carry = tmp5; 8002 const Register sum = tmp3; 8003 const Register op1 = tmp4; 8004 Register op2 = tmp2; 8005 8006 push(zlen); 8007 push(len); 8008 addl(zlen,2); 8009 bind(L_second_loop); 8010 xorq(carry, carry); 8011 subl(zlen, 4); 8012 subl(len, 2); 8013 push(zlen); 8014 push(len); 8015 cmpl(len, 0); 8016 jccb(Assembler::lessEqual, L_second_loop_exit); 8017 8018 // Multiply an array by one 64 bit long. 8019 if (UseBMI2Instructions) { 8020 op2 = rdxReg; 8021 movq(op2, Address(x, len, Address::times_4, 0)); 8022 rorxq(op2, op2, 32); 8023 } 8024 else { 8025 movq(op2, Address(x, len, Address::times_4, 0)); 8026 rorq(op2, 32); 8027 } 8028 8029 bind(L_third_loop); 8030 decrementl(len); 8031 jccb(Assembler::negative, L_third_loop_exit); 8032 decrementl(len); 8033 jccb(Assembler::negative, L_last_x); 8034 8035 movq(op1, Address(x, len, Address::times_4, 0)); 8036 rorq(op1, 32); 8037 8038 bind(L_multiply); 8039 subl(zlen, 2); 8040 movq(sum, Address(z, zlen, Address::times_4, 0)); 8041 8042 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry. 8043 if (UseBMI2Instructions) { 8044 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2); 8045 } 8046 else { 8047 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 8048 } 8049 8050 movq(Address(z, zlen, Address::times_4, 0), sum); 8051 8052 jmp(L_third_loop); 8053 bind(L_third_loop_exit); 8054 8055 // Fourth loop 8056 // Add 64 bit long carry into z with carry propagation. 8057 // Uses offsetted zlen. 8058 add_one_64(z, zlen, carry, tmp1); 8059 8060 pop(len); 8061 pop(zlen); 8062 jmp(L_second_loop); 8063 8064 // Next infrequent code is moved outside loops. 8065 bind(L_last_x); 8066 movl(op1, Address(x, 0)); 8067 jmp(L_multiply); 8068 8069 bind(L_second_loop_exit); 8070 pop(len); 8071 pop(zlen); 8072 pop(len); 8073 pop(zlen); 8074 8075 // Fifth loop 8076 // Shift z left 1 bit. 8077 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4); 8078 8079 // z[zlen-1] |= x[len-1] & 1; 8080 movl(tmp3, Address(x, len, Address::times_4, -4)); 8081 andl(tmp3, 1); 8082 orl(Address(z, zlen, Address::times_4, -4), tmp3); 8083 8084 pop(tmp5); 8085 pop(tmp4); 8086 pop(tmp3); 8087 pop(tmp2); 8088 pop(tmp1); 8089 } 8090 8091 /** 8092 * Helper function for mul_add() 8093 * Multiply the in[] by int k and add to out[] starting at offset offs using 8094 * 128 bit by 32 bit multiply and return the carry in tmp5. 8095 * Only quad int aligned length of in[] is operated on in this function. 8096 * k is in rdxReg for BMI2Instructions, for others it is in tmp2. 8097 * This function preserves out, in and k registers. 8098 * len and offset point to the appropriate index in "in" & "out" correspondingly 8099 * tmp5 has the carry. 8100 * other registers are temporary and are modified. 
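 *
 * Informally, each iteration consumes one aligned quad-int (16-byte) chunk
 * of in[] as two 64-bit words (a sketch only, using the slice notation of
 * the comments above, not the exact register flow):
 *
 *   carry:out[j+2:j+3] = out[j+2:j+3] + in[j+2:j+3] * k + carry
 *   carry:out[j:j+1]   = out[j:j+1]   + in[j:j+1]   * k + carry
 *
 * with rorq(..., 32) converting between BigInteger's big-endian int order
 * and the little-endian 64-bit words the multiply operates on.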
8101 * 8102 */ 8103 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, 8104 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, 8105 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 8106 8107 Label L_first_loop, L_first_loop_exit; 8108 8109 movl(tmp1, len); 8110 shrl(tmp1, 2); 8111 8112 bind(L_first_loop); 8113 subl(tmp1, 1); 8114 jccb(Assembler::negative, L_first_loop_exit); 8115 8116 subl(len, 4); 8117 subl(offset, 4); 8118 8119 Register op2 = tmp2; 8120 const Register sum = tmp3; 8121 const Register op1 = tmp4; 8122 const Register carry = tmp5; 8123 8124 if (UseBMI2Instructions) { 8125 op2 = rdxReg; 8126 } 8127 8128 movq(op1, Address(in, len, Address::times_4, 8)); 8129 rorq(op1, 32); 8130 movq(sum, Address(out, offset, Address::times_4, 8)); 8131 rorq(sum, 32); 8132 if (UseBMI2Instructions) { 8133 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 8134 } 8135 else { 8136 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 8137 } 8138 // Store back in big endian from little endian 8139 rorq(sum, 0x20); 8140 movq(Address(out, offset, Address::times_4, 8), sum); 8141 8142 movq(op1, Address(in, len, Address::times_4, 0)); 8143 rorq(op1, 32); 8144 movq(sum, Address(out, offset, Address::times_4, 0)); 8145 rorq(sum, 32); 8146 if (UseBMI2Instructions) { 8147 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 8148 } 8149 else { 8150 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 8151 } 8152 // Store back in big endian from little endian 8153 rorq(sum, 0x20); 8154 movq(Address(out, offset, Address::times_4, 0), sum); 8155 8156 jmp(L_first_loop); 8157 bind(L_first_loop_exit); 8158 } 8159 8160 /** 8161 * Code for BigInteger::mulAdd() intrinsic 8162 * 8163 * rdi: out 8164 * rsi: in 8165 * r11: offs (out.length - offset) 8166 * rcx: len 8167 * r8: k 8168 * r12: tmp1 8169 * r13: tmp2 8170 * r14: tmp3 8171 * r15: tmp4 8172 * rbx: tmp5 8173 * Multiply the in[] by word k and add to out[], return the carry in rax 8174 */ 8175 void MacroAssembler::mul_add(Register out, Register in, Register offs, 8176 Register len, Register k, Register tmp1, Register tmp2, Register tmp3, 8177 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 8178 8179 Label L_carry, L_last_in, L_done; 8180 8181 // carry = 0; 8182 // for (int j=len-1; j >= 0; j--) { 8183 // long product = (in[j] & LONG_MASK) * kLong + 8184 // (out[offs] & LONG_MASK) + carry; 8185 // out[offs--] = (int)product; 8186 // carry = product >>> 32; 8187 // } 8188 // 8189 push(tmp1); 8190 push(tmp2); 8191 push(tmp3); 8192 push(tmp4); 8193 push(tmp5); 8194 8195 Register op2 = tmp2; 8196 const Register sum = tmp3; 8197 const Register op1 = tmp4; 8198 const Register carry = tmp5; 8199 8200 if (UseBMI2Instructions) { 8201 op2 = rdxReg; 8202 movl(op2, k); 8203 } 8204 else { 8205 movl(op2, k); 8206 } 8207 8208 xorq(carry, carry); 8209 8210 //First loop 8211 8212 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply 8213 //The carry is in tmp5 8214 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); 8215 8216 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any 8217 decrementl(len); 8218 jccb(Assembler::negative, L_carry); 8219 decrementl(len); 8220 jccb(Assembler::negative, L_last_in); 8221 8222 movq(op1, Address(in, len, Address::times_4, 0)); 8223 rorq(op1, 32); 8224 8225 subl(offs, 2); 8226 movq(sum, Address(out, offs, Address::times_4, 0)); 8227 rorq(sum, 32); 8228 8229 if (UseBMI2Instructions) { 8230 
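  // op2 is rdxReg on this path and already holds k (loaded at entry), which
  // is what mulxq inside multiply_add_64_bmi2() expects.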
multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 8231 } 8232 else { 8233 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 8234 } 8235 8236 // Store back in big endian from little endian 8237 rorq(sum, 0x20); 8238 movq(Address(out, offs, Address::times_4, 0), sum); 8239 8240 testl(len, len); 8241 jccb(Assembler::zero, L_carry); 8242 8243 //Multiply the last in[] entry, if any 8244 bind(L_last_in); 8245 movl(op1, Address(in, 0)); 8246 movl(sum, Address(out, offs, Address::times_4, -4)); 8247 8248 movl(raxReg, k); 8249 mull(op1); //tmp4 * eax -> edx:eax 8250 addl(sum, carry); 8251 adcl(rdxReg, 0); 8252 addl(sum, raxReg); 8253 adcl(rdxReg, 0); 8254 movl(carry, rdxReg); 8255 8256 movl(Address(out, offs, Address::times_4, -4), sum); 8257 8258 bind(L_carry); 8259 //return tmp5/carry as carry in rax 8260 movl(rax, carry); 8261 8262 bind(L_done); 8263 pop(tmp5); 8264 pop(tmp4); 8265 pop(tmp3); 8266 pop(tmp2); 8267 pop(tmp1); 8268 } 8269 #endif 8270 8271 /** 8272 * Emits code to update CRC-32 with a byte value according to constants in table 8273 * 8274 * @param [in,out]crc Register containing the crc. 8275 * @param [in]val Register containing the byte to fold into the CRC. 8276 * @param [in]table Register containing the table of crc constants. 8277 * 8278 * uint32_t crc; 8279 * val = crc_table[(val ^ crc) & 0xFF]; 8280 * crc = val ^ (crc >> 8); 8281 * 8282 */ 8283 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 8284 xorl(val, crc); 8285 andl(val, 0xFF); 8286 shrl(crc, 8); // unsigned shift 8287 xorl(crc, Address(table, val, Address::times_4, 0)); 8288 } 8289 8290 /** 8291 * Fold 128-bit data chunk 8292 */ 8293 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { 8294 if (UseAVX > 0) { 8295 vpclmulhdq(xtmp, xK, xcrc); // [123:64] 8296 vpclmulldq(xcrc, xK, xcrc); // [63:0] 8297 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */); 8298 pxor(xcrc, xtmp); 8299 } else { 8300 movdqa(xtmp, xcrc); 8301 pclmulhdq(xtmp, xK); // [123:64] 8302 pclmulldq(xcrc, xK); // [63:0] 8303 pxor(xcrc, xtmp); 8304 movdqu(xtmp, Address(buf, offset)); 8305 pxor(xcrc, xtmp); 8306 } 8307 } 8308 8309 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) { 8310 if (UseAVX > 0) { 8311 vpclmulhdq(xtmp, xK, xcrc); 8312 vpclmulldq(xcrc, xK, xcrc); 8313 pxor(xcrc, xbuf); 8314 pxor(xcrc, xtmp); 8315 } else { 8316 movdqa(xtmp, xcrc); 8317 pclmulhdq(xtmp, xK); 8318 pclmulldq(xcrc, xK); 8319 pxor(xcrc, xbuf); 8320 pxor(xcrc, xtmp); 8321 } 8322 } 8323 8324 /** 8325 * 8-bit folds to compute 32-bit CRC 8326 * 8327 * uint64_t xcrc; 8328 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8); 8329 */ 8330 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) { 8331 movdl(tmp, xcrc); 8332 andl(tmp, 0xFF); 8333 movdl(xtmp, Address(table, tmp, Address::times_4, 0)); 8334 psrldq(xcrc, 1); // unsigned shift one byte 8335 pxor(xcrc, xtmp); 8336 } 8337 8338 /** 8339 * uint32_t crc; 8340 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 8341 */ 8342 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 8343 movl(tmp, crc); 8344 andl(tmp, 0xFF); 8345 shrl(crc, 8); 8346 xorl(crc, Address(table, tmp, Address::times_4, 0)); 8347 } 8348 8349 /** 8350 * @param crc register containing existing CRC (32-bit) 8351 * @param buf register pointing to input byte buffer (byte*) 8352 * @param len register containing number of bytes 8353 * 
@param table register that will contain address of CRC table 8354 * @param tmp scratch register 8355 */ 8356 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) { 8357 assert_different_registers(crc, buf, len, table, tmp, rax); 8358 8359 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; 8360 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; 8361 8362 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 8363 // context for the registers used, where all instructions below are using 128-bit mode 8364 // On EVEX without VL and BW, these instructions will all be AVX. 8365 lea(table, ExternalAddress(StubRoutines::crc_table_addr())); 8366 notl(crc); // ~crc 8367 cmpl(len, 16); 8368 jcc(Assembler::less, L_tail); 8369 8370 // Align buffer to 16 bytes 8371 movl(tmp, buf); 8372 andl(tmp, 0xF); 8373 jccb(Assembler::zero, L_aligned); 8374 subl(tmp, 16); 8375 addl(len, tmp); 8376 8377 align(4); 8378 BIND(L_align_loop); 8379 movsbl(rax, Address(buf, 0)); // load byte with sign extension 8380 update_byte_crc32(crc, rax, table); 8381 increment(buf); 8382 incrementl(tmp); 8383 jccb(Assembler::less, L_align_loop); 8384 8385 BIND(L_aligned); 8386 movl(tmp, len); // save 8387 shrl(len, 4); 8388 jcc(Assembler::zero, L_tail_restore); 8389 8390 // Fold crc into first bytes of vector 8391 movdqa(xmm1, Address(buf, 0)); 8392 movdl(rax, xmm1); 8393 xorl(crc, rax); 8394 if (VM_Version::supports_sse4_1()) { 8395 pinsrd(xmm1, crc, 0); 8396 } else { 8397 pinsrw(xmm1, crc, 0); 8398 shrl(crc, 16); 8399 pinsrw(xmm1, crc, 1); 8400 } 8401 addptr(buf, 16); 8402 subl(len, 4); // len > 0 8403 jcc(Assembler::less, L_fold_tail); 8404 8405 movdqa(xmm2, Address(buf, 0)); 8406 movdqa(xmm3, Address(buf, 16)); 8407 movdqa(xmm4, Address(buf, 32)); 8408 addptr(buf, 48); 8409 subl(len, 3); 8410 jcc(Assembler::lessEqual, L_fold_512b); 8411 8412 // Fold total 512 bits of polynomial on each iteration, 8413 // 128 bits per each of 4 parallel streams. 8414 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1); 8415 8416 align32(); 8417 BIND(L_fold_512b_loop); 8418 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 8419 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16); 8420 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32); 8421 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48); 8422 addptr(buf, 64); 8423 subl(len, 4); 8424 jcc(Assembler::greater, L_fold_512b_loop); 8425 8426 // Fold 512 bits to 128 bits. 8427 BIND(L_fold_512b); 8428 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1); 8429 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2); 8430 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3); 8431 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4); 8432 8433 // Fold the rest of 128 bits data chunks 8434 BIND(L_fold_tail); 8435 addl(len, 3); 8436 jccb(Assembler::lessEqual, L_fold_128b); 8437 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16), rscratch1); 8438 8439 BIND(L_fold_tail_loop); 8440 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 8441 addptr(buf, 16); 8442 decrementl(len); 8443 jccb(Assembler::greater, L_fold_tail_loop); 8444 8445 // Fold 128 bits in xmm1 down into 32 bits in crc register. 
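  // The carry-less multiplies below fold the 128-bit value down, and the
  // remainder is then reduced one byte at a time using the same table-driven
  // identity as fold_8bit_crc32, crc' = table[crc & 0xFF] ^ (crc >> 8):
  // four folds on the xmm register, then four more on the general register.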
8446 BIND(L_fold_128b); 8447 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()), rscratch1); 8448 if (UseAVX > 0) { 8449 vpclmulqdq(xmm2, xmm0, xmm1, 0x1); 8450 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */); 8451 vpclmulqdq(xmm0, xmm0, xmm3, 0x1); 8452 } else { 8453 movdqa(xmm2, xmm0); 8454 pclmulqdq(xmm2, xmm1, 0x1); 8455 movdqa(xmm3, xmm0); 8456 pand(xmm3, xmm2); 8457 pclmulqdq(xmm0, xmm3, 0x1); 8458 } 8459 psrldq(xmm1, 8); 8460 psrldq(xmm2, 4); 8461 pxor(xmm0, xmm1); 8462 pxor(xmm0, xmm2); 8463 8464 // 8 8-bit folds to compute 32-bit CRC. 8465 for (int j = 0; j < 4; j++) { 8466 fold_8bit_crc32(xmm0, table, xmm1, rax); 8467 } 8468 movdl(crc, xmm0); // mov 32 bits to general register 8469 for (int j = 0; j < 4; j++) { 8470 fold_8bit_crc32(crc, table, rax); 8471 } 8472 8473 BIND(L_tail_restore); 8474 movl(len, tmp); // restore 8475 BIND(L_tail); 8476 andl(len, 0xf); 8477 jccb(Assembler::zero, L_exit); 8478 8479 // Fold the rest of bytes 8480 align(4); 8481 BIND(L_tail_loop); 8482 movsbl(rax, Address(buf, 0)); // load byte with sign extension 8483 update_byte_crc32(crc, rax, table); 8484 increment(buf); 8485 decrementl(len); 8486 jccb(Assembler::greater, L_tail_loop); 8487 8488 BIND(L_exit); 8489 notl(crc); // ~c 8490 } 8491 8492 #ifdef _LP64 8493 // Helper function for AVX 512 CRC32 8494 // Fold 512-bit data chunks 8495 void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, 8496 Register pos, int offset) { 8497 evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit); 8498 evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64] 8499 evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0] 8500 evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */); 8501 evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */); 8502 } 8503 8504 // Helper function for AVX 512 CRC32 8505 // Compute CRC32 for < 256B buffers 8506 void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register table, Register pos, 8507 Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop, 8508 Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) { 8509 8510 Label L_less_than_32, L_exact_16_left, L_less_than_16_left; 8511 Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left; 8512 Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2; 8513 8514 // check if there is enough buffer to be able to fold 16B at a time 8515 cmpl(len, 32); 8516 jcc(Assembler::less, L_less_than_32); 8517 8518 // if there is, load the constants 8519 movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10 8520 movdl(xmm0, crc); // get the initial crc value 8521 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext 8522 pxor(xmm7, xmm0); 8523 8524 // update the buffer pointer 8525 addl(pos, 16); 8526 //update the counter.subtract 32 instead of 16 to save one instruction from the loop 8527 subl(len, 32); 8528 jmp(L_16B_reduction_loop); 8529 8530 bind(L_less_than_32); 8531 //mov initial crc to the return value. this is necessary for zero - length buffers. 
8532 movl(rax, crc); 8533 testl(len, len); 8534 jcc(Assembler::equal, L_cleanup); 8535 8536 movdl(xmm0, crc); //get the initial crc value 8537 8538 cmpl(len, 16); 8539 jcc(Assembler::equal, L_exact_16_left); 8540 jcc(Assembler::less, L_less_than_16_left); 8541 8542 movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext 8543 pxor(xmm7, xmm0); //xor the initial crc value 8544 addl(pos, 16); 8545 subl(len, 16); 8546 movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10 8547 jmp(L_get_last_two_xmms); 8548 8549 bind(L_less_than_16_left); 8550 //use stack space to load data less than 16 bytes, zero - out the 16B in memory first. 8551 pxor(xmm1, xmm1); 8552 movptr(tmp1, rsp); 8553 movdqu(Address(tmp1, 0 * 16), xmm1); 8554 8555 cmpl(len, 4); 8556 jcc(Assembler::less, L_only_less_than_4); 8557 8558 //backup the counter value 8559 movl(tmp2, len); 8560 cmpl(len, 8); 8561 jcc(Assembler::less, L_less_than_8_left); 8562 8563 //load 8 Bytes 8564 movq(rax, Address(buf, pos, Address::times_1, 0 * 16)); 8565 movq(Address(tmp1, 0 * 16), rax); 8566 addptr(tmp1, 8); 8567 subl(len, 8); 8568 addl(pos, 8); 8569 8570 bind(L_less_than_8_left); 8571 cmpl(len, 4); 8572 jcc(Assembler::less, L_less_than_4_left); 8573 8574 //load 4 Bytes 8575 movl(rax, Address(buf, pos, Address::times_1, 0)); 8576 movl(Address(tmp1, 0 * 16), rax); 8577 addptr(tmp1, 4); 8578 subl(len, 4); 8579 addl(pos, 4); 8580 8581 bind(L_less_than_4_left); 8582 cmpl(len, 2); 8583 jcc(Assembler::less, L_less_than_2_left); 8584 8585 // load 2 Bytes 8586 movw(rax, Address(buf, pos, Address::times_1, 0)); 8587 movl(Address(tmp1, 0 * 16), rax); 8588 addptr(tmp1, 2); 8589 subl(len, 2); 8590 addl(pos, 2); 8591 8592 bind(L_less_than_2_left); 8593 cmpl(len, 1); 8594 jcc(Assembler::less, L_zero_left); 8595 8596 // load 1 Byte 8597 movb(rax, Address(buf, pos, Address::times_1, 0)); 8598 movb(Address(tmp1, 0 * 16), rax); 8599 8600 bind(L_zero_left); 8601 movdqu(xmm7, Address(rsp, 0)); 8602 pxor(xmm7, xmm0); //xor the initial crc value 8603 8604 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr())); 8605 movdqu(xmm0, Address(rax, tmp2)); 8606 pshufb(xmm7, xmm0); 8607 jmp(L_128_done); 8608 8609 bind(L_exact_16_left); 8610 movdqu(xmm7, Address(buf, pos, Address::times_1, 0)); 8611 pxor(xmm7, xmm0); //xor the initial crc value 8612 jmp(L_128_done); 8613 8614 bind(L_only_less_than_4); 8615 cmpl(len, 3); 8616 jcc(Assembler::less, L_only_less_than_3); 8617 8618 // load 3 Bytes 8619 movb(rax, Address(buf, pos, Address::times_1, 0)); 8620 movb(Address(tmp1, 0), rax); 8621 8622 movb(rax, Address(buf, pos, Address::times_1, 1)); 8623 movb(Address(tmp1, 1), rax); 8624 8625 movb(rax, Address(buf, pos, Address::times_1, 2)); 8626 movb(Address(tmp1, 2), rax); 8627 8628 movdqu(xmm7, Address(rsp, 0)); 8629 pxor(xmm7, xmm0); //xor the initial crc value 8630 8631 pslldq(xmm7, 0x5); 8632 jmp(L_barrett); 8633 bind(L_only_less_than_3); 8634 cmpl(len, 2); 8635 jcc(Assembler::less, L_only_less_than_2); 8636 8637 // load 2 Bytes 8638 movb(rax, Address(buf, pos, Address::times_1, 0)); 8639 movb(Address(tmp1, 0), rax); 8640 8641 movb(rax, Address(buf, pos, Address::times_1, 1)); 8642 movb(Address(tmp1, 1), rax); 8643 8644 movdqu(xmm7, Address(rsp, 0)); 8645 pxor(xmm7, xmm0); //xor the initial crc value 8646 8647 pslldq(xmm7, 0x6); 8648 jmp(L_barrett); 8649 8650 bind(L_only_less_than_2); 8651 //load 1 Byte 8652 movb(rax, Address(buf, pos, Address::times_1, 0)); 8653 movb(Address(tmp1, 0), rax); 8654 8655 movdqu(xmm7, Address(rsp, 
0)); 8656 pxor(xmm7, xmm0); //xor the initial crc value 8657 8658 pslldq(xmm7, 0x7); 8659 } 8660 8661 /** 8662 * Compute CRC32 using AVX512 instructions 8663 * param crc register containing existing CRC (32-bit) 8664 * param buf register pointing to input byte buffer (byte*) 8665 * param len register containing number of bytes 8666 * param table address of crc or crc32c table 8667 * param tmp1 scratch register 8668 * param tmp2 scratch register 8669 * return rax result register 8670 * 8671 * This routine is identical for crc32c with the exception of the precomputed constant 8672 * table which will be passed as the table argument. The calculation steps are 8673 * the same for both variants. 8674 */ 8675 void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) { 8676 assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12); 8677 8678 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; 8679 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; 8680 Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop; 8681 Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop; 8682 Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup; 8683 8684 const Register pos = r12; 8685 push(r12); 8686 subptr(rsp, 16 * 2 + 8); 8687 8688 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 8689 // context for the registers used, where all instructions below are using 128-bit mode 8690 // On EVEX without VL and BW, these instructions will all be AVX. 8691 movl(pos, 0); 8692 8693 // check if smaller than 256B 8694 cmpl(len, 256); 8695 jcc(Assembler::less, L_less_than_256); 8696 8697 // load the initial crc value 8698 movdl(xmm10, crc); 8699 8700 // receive the initial 64B data, xor the initial crc value 8701 evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit); 8702 evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit); 8703 evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit); 8704 evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4 8705 8706 subl(len, 256); 8707 cmpl(len, 256); 8708 jcc(Assembler::less, L_fold_128_B_loop); 8709 8710 evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit); 8711 evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit); 8712 evbroadcasti32x4(xmm16, Address(table, 0 * 16), Assembler::AVX_512bit); //zmm16 has rk-1 and rk-2 8713 subl(len, 256); 8714 8715 bind(L_fold_256_B_loop); 8716 addl(pos, 256); 8717 fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64); 8718 fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64); 8719 fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64); 8720 fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64); 8721 8722 subl(len, 256); 8723 jcc(Assembler::greaterEqual, L_fold_256_B_loop); 8724 8725 // Fold 256 into 128 8726 addl(pos, 256); 8727 evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit); 8728 evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit); 8729 vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC 8730 8731 evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit); 8732 evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit); 8733 vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC 8734 8735 evmovdquq(xmm0, xmm7, 
Assembler::AVX_512bit); 8736 evmovdquq(xmm4, xmm8, Assembler::AVX_512bit); 8737 8738 addl(len, 128); 8739 jmp(L_fold_128_B_register); 8740 8741 // at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop 8742 // loop will fold 128B at a time until we have 128 + y Bytes of buffer 8743 8744 // fold 128B at a time.This section of the code folds 8 xmm registers in parallel 8745 bind(L_fold_128_B_loop); 8746 addl(pos, 128); 8747 fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64); 8748 fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64); 8749 8750 subl(len, 128); 8751 jcc(Assembler::greaterEqual, L_fold_128_B_loop); 8752 8753 addl(pos, 128); 8754 8755 // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 8756 // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 8757 bind(L_fold_128_B_register); 8758 evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16 8759 evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0 8760 evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit); 8761 evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit); 8762 // save last that has no multiplicand 8763 vextracti64x2(xmm7, xmm4, 3); 8764 8765 evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit); 8766 evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit); 8767 // Needed later in reduction loop 8768 movdqu(xmm10, Address(table, 1 * 16)); 8769 vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC 8770 vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC 8771 8772 // Swap 1,0,3,2 - 01 00 11 10 8773 evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit); 8774 evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit); 8775 vextracti128(xmm5, xmm8, 1); 8776 evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit); 8777 8778 // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop 8779 // instead of a cmp instruction, we use the negative flag with the jl instruction 8780 addl(len, 128 - 16); 8781 jcc(Assembler::less, L_final_reduction_for_128); 8782 8783 bind(L_16B_reduction_loop); 8784 vpclmulqdq(xmm8, xmm7, xmm10, 0x01); 8785 vpclmulqdq(xmm7, xmm7, xmm10, 0x10); 8786 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); 8787 movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16)); 8788 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8789 addl(pos, 16); 8790 subl(len, 16); 8791 jcc(Assembler::greaterEqual, L_16B_reduction_loop); 8792 8793 bind(L_final_reduction_for_128); 8794 addl(len, 16); 8795 jcc(Assembler::equal, L_128_done); 8796 8797 bind(L_get_last_two_xmms); 8798 movdqu(xmm2, xmm7); 8799 addl(pos, len); 8800 movdqu(xmm1, Address(buf, pos, Address::times_1, -16)); 8801 subl(pos, len); 8802 8803 // get rid of the extra data that was loaded before 8804 // load the shift constant 8805 lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr())); 8806 movdqu(xmm0, Address(rax, len)); 8807 addl(rax, len); 8808 8809 vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8810 //Change mask to 512 8811 vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2); 8812 vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit); 8813 8814 blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit); 8815 vpclmulqdq(xmm8, xmm7, xmm10, 0x01); 8816 vpclmulqdq(xmm7, 
xmm7, xmm10, 0x10); 8817 vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit); 8818 vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit); 8819 8820 bind(L_128_done); 8821 // compute crc of a 128-bit value 8822 movdqu(xmm10, Address(table, 3 * 16)); 8823 movdqu(xmm0, xmm7); 8824 8825 // 64b fold 8826 vpclmulqdq(xmm7, xmm7, xmm10, 0x0); 8827 vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit); 8828 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8829 8830 // 32b fold 8831 movdqu(xmm0, xmm7); 8832 vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit); 8833 vpclmulqdq(xmm7, xmm7, xmm10, 0x10); 8834 vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit); 8835 jmp(L_barrett); 8836 8837 bind(L_less_than_256); 8838 kernel_crc32_avx512_256B(crc, buf, len, table, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup); 8839 8840 //barrett reduction 8841 bind(L_barrett); 8842 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2); 8843 movdqu(xmm1, xmm7); 8844 movdqu(xmm2, xmm7); 8845 movdqu(xmm10, Address(table, 4 * 16)); 8846 8847 pclmulqdq(xmm7, xmm10, 0x0); 8848 pxor(xmm7, xmm2); 8849 vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2); 8850 movdqu(xmm2, xmm7); 8851 pclmulqdq(xmm7, xmm10, 0x10); 8852 pxor(xmm7, xmm2); 8853 pxor(xmm7, xmm1); 8854 pextrd(crc, xmm7, 2); 8855 8856 bind(L_cleanup); 8857 addptr(rsp, 16 * 2 + 8); 8858 pop(r12); 8859 } 8860 8861 // S. Gueron / Information Processing Letters 112 (2012) 184 8862 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table. 8863 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0]. 8864 // Output: the 64-bit carry-less product of B * CONST 8865 void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n, 8866 Register tmp1, Register tmp2, Register tmp3) { 8867 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); 8868 if (n > 0) { 8869 addq(tmp3, n * 256 * 8); 8870 } 8871 // Q1 = TABLEExt[n][B & 0xFF]; 8872 movl(tmp1, in); 8873 andl(tmp1, 0x000000FF); 8874 shll(tmp1, 3); 8875 addq(tmp1, tmp3); 8876 movq(tmp1, Address(tmp1, 0)); 8877 8878 // Q2 = TABLEExt[n][B >> 8 & 0xFF]; 8879 movl(tmp2, in); 8880 shrl(tmp2, 8); 8881 andl(tmp2, 0x000000FF); 8882 shll(tmp2, 3); 8883 addq(tmp2, tmp3); 8884 movq(tmp2, Address(tmp2, 0)); 8885 8886 shlq(tmp2, 8); 8887 xorq(tmp1, tmp2); 8888 8889 // Q3 = TABLEExt[n][B >> 16 & 0xFF]; 8890 movl(tmp2, in); 8891 shrl(tmp2, 16); 8892 andl(tmp2, 0x000000FF); 8893 shll(tmp2, 3); 8894 addq(tmp2, tmp3); 8895 movq(tmp2, Address(tmp2, 0)); 8896 8897 shlq(tmp2, 16); 8898 xorq(tmp1, tmp2); 8899 8900 // Q4 = TABLEExt[n][B >> 24 & 0xFF]; 8901 shrl(in, 24); 8902 andl(in, 0x000000FF); 8903 shll(in, 3); 8904 addq(in, tmp3); 8905 movq(in, Address(in, 0)); 8906 8907 shlq(in, 24); 8908 xorq(in, tmp1); 8909 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; 8910 } 8911 8912 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, 8913 Register in_out, 8914 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, 8915 XMMRegister w_xtmp2, 8916 Register tmp1, 8917 Register n_tmp2, Register n_tmp3) { 8918 if (is_pclmulqdq_supported) { 8919 movdl(w_xtmp1, in_out); // modified blindly 8920 8921 movl(tmp1, const_or_pre_comp_const_index); 8922 movdl(w_xtmp2, tmp1); 8923 pclmulqdq(w_xtmp1, w_xtmp2, 0); 8924 8925 movdq(in_out, w_xtmp1); 8926 } else { 8927 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3); 8928 } 8929 } 8930 8931 // 
Recombination Alternative 2: No bit-reflections 8932 // T1 = (CRC_A * U1) << 1 8933 // T2 = (CRC_B * U2) << 1 8934 // C1 = T1 >> 32 8935 // C2 = T2 >> 32 8936 // T1 = T1 & 0xFFFFFFFF 8937 // T2 = T2 & 0xFFFFFFFF 8938 // T1 = CRC32(0, T1) 8939 // T2 = CRC32(0, T2) 8940 // C1 = C1 ^ T1 8941 // C2 = C2 ^ T2 8942 // CRC = C1 ^ C2 ^ CRC_C 8943 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, 8944 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8945 Register tmp1, Register tmp2, 8946 Register n_tmp3) { 8947 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 8948 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 8949 shlq(in_out, 1); 8950 movl(tmp1, in_out); 8951 shrq(in_out, 32); 8952 xorl(tmp2, tmp2); 8953 crc32(tmp2, tmp1, 4); 8954 xorl(in_out, tmp2); // we don't care about upper 32 bit contents here 8955 shlq(in1, 1); 8956 movl(tmp1, in1); 8957 shrq(in1, 32); 8958 xorl(tmp2, tmp2); 8959 crc32(tmp2, tmp1, 4); 8960 xorl(in1, tmp2); 8961 xorl(in_out, in1); 8962 xorl(in_out, in2); 8963 } 8964 8965 // Set N to predefined value 8966 // Subtract from a length of a buffer 8967 // execute in a loop: 8968 // CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0 8969 // for i = 1 to N do 8970 // CRC_A = CRC32(CRC_A, A[i]) 8971 // CRC_B = CRC32(CRC_B, B[i]) 8972 // CRC_C = CRC32(CRC_C, C[i]) 8973 // end for 8974 // Recombine 8975 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, 8976 Register in_out1, Register in_out2, Register in_out3, 8977 Register tmp1, Register tmp2, Register tmp3, 8978 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 8979 Register tmp4, Register tmp5, 8980 Register n_tmp6) { 8981 Label L_processPartitions; 8982 Label L_processPartition; 8983 Label L_exit; 8984 8985 bind(L_processPartitions); 8986 cmpl(in_out1, 3 * size); 8987 jcc(Assembler::less, L_exit); 8988 xorl(tmp1, tmp1); 8989 xorl(tmp2, tmp2); 8990 movq(tmp3, in_out2); 8991 addq(tmp3, size); 8992 8993 bind(L_processPartition); 8994 crc32(in_out3, Address(in_out2, 0), 8); 8995 crc32(tmp1, Address(in_out2, size), 8); 8996 crc32(tmp2, Address(in_out2, size * 2), 8); 8997 addq(in_out2, 8); 8998 cmpq(in_out2, tmp3); 8999 jcc(Assembler::less, L_processPartition); 9000 crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2, 9001 w_xtmp1, w_xtmp2, w_xtmp3, 9002 tmp4, tmp5, 9003 n_tmp6); 9004 addq(in_out2, 2 * size); 9005 subl(in_out1, 3 * size); 9006 jmp(L_processPartitions); 9007 9008 bind(L_exit); 9009 } 9010 #else 9011 void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n, 9012 Register tmp1, Register tmp2, Register tmp3, 9013 XMMRegister xtmp1, XMMRegister xtmp2) { 9014 lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr())); 9015 if (n > 0) { 9016 addl(tmp3, n * 256 * 8); 9017 } 9018 // Q1 = TABLEExt[n][B & 0xFF]; 9019 movl(tmp1, in_out); 9020 andl(tmp1, 0x000000FF); 9021 shll(tmp1, 3); 9022 addl(tmp1, tmp3); 9023 movq(xtmp1, Address(tmp1, 0)); 9024 9025 // Q2 = TABLEExt[n][B >> 8 & 0xFF]; 9026 movl(tmp2, in_out); 9027 shrl(tmp2, 8); 9028 andl(tmp2, 0x000000FF); 9029 shll(tmp2, 3); 9030 addl(tmp2, tmp3); 9031 movq(xtmp2, 
Address(tmp2, 0)); 9032 9033 psllq(xtmp2, 8); 9034 pxor(xtmp1, xtmp2); 9035 9036 // Q3 = TABLEExt[n][B >> 16 & 0xFF]; 9037 movl(tmp2, in_out); 9038 shrl(tmp2, 16); 9039 andl(tmp2, 0x000000FF); 9040 shll(tmp2, 3); 9041 addl(tmp2, tmp3); 9042 movq(xtmp2, Address(tmp2, 0)); 9043 9044 psllq(xtmp2, 16); 9045 pxor(xtmp1, xtmp2); 9046 9047 // Q4 = TABLEExt[n][B >> 24 & 0xFF]; 9048 shrl(in_out, 24); 9049 andl(in_out, 0x000000FF); 9050 shll(in_out, 3); 9051 addl(in_out, tmp3); 9052 movq(xtmp2, Address(in_out, 0)); 9053 9054 psllq(xtmp2, 24); 9055 pxor(xtmp1, xtmp2); // Result in CXMM 9056 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; 9057 } 9058 9059 void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1, 9060 Register in_out, 9061 uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported, 9062 XMMRegister w_xtmp2, 9063 Register tmp1, 9064 Register n_tmp2, Register n_tmp3) { 9065 if (is_pclmulqdq_supported) { 9066 movdl(w_xtmp1, in_out); 9067 9068 movl(tmp1, const_or_pre_comp_const_index); 9069 movdl(w_xtmp2, tmp1); 9070 pclmulqdq(w_xtmp1, w_xtmp2, 0); 9071 // Keep result in XMM since GPR is 32 bit in length 9072 } else { 9073 crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2); 9074 } 9075 } 9076 9077 void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2, 9078 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 9079 Register tmp1, Register tmp2, 9080 Register n_tmp3) { 9081 crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 9082 crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3); 9083 9084 psllq(w_xtmp1, 1); 9085 movdl(tmp1, w_xtmp1); 9086 psrlq(w_xtmp1, 32); 9087 movdl(in_out, w_xtmp1); 9088 9089 xorl(tmp2, tmp2); 9090 crc32(tmp2, tmp1, 4); 9091 xorl(in_out, tmp2); 9092 9093 psllq(w_xtmp2, 1); 9094 movdl(tmp1, w_xtmp2); 9095 psrlq(w_xtmp2, 32); 9096 movdl(in1, w_xtmp2); 9097 9098 xorl(tmp2, tmp2); 9099 crc32(tmp2, tmp1, 4); 9100 xorl(in1, tmp2); 9101 xorl(in_out, in1); 9102 xorl(in_out, in2); 9103 } 9104 9105 void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, 9106 Register in_out1, Register in_out2, Register in_out3, 9107 Register tmp1, Register tmp2, Register tmp3, 9108 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 9109 Register tmp4, Register tmp5, 9110 Register n_tmp6) { 9111 Label L_processPartitions; 9112 Label L_processPartition; 9113 Label L_exit; 9114 9115 bind(L_processPartitions); 9116 cmpl(in_out1, 3 * size); 9117 jcc(Assembler::less, L_exit); 9118 xorl(tmp1, tmp1); 9119 xorl(tmp2, tmp2); 9120 movl(tmp3, in_out2); 9121 addl(tmp3, size); 9122 9123 bind(L_processPartition); 9124 crc32(in_out3, Address(in_out2, 0), 4); 9125 crc32(tmp1, Address(in_out2, size), 4); 9126 crc32(tmp2, Address(in_out2, size*2), 4); 9127 crc32(in_out3, Address(in_out2, 0+4), 4); 9128 crc32(tmp1, Address(in_out2, size+4), 4); 9129 crc32(tmp2, Address(in_out2, size*2+4), 4); 9130 addl(in_out2, 8); 9131 cmpl(in_out2, tmp3); 9132 jcc(Assembler::less, L_processPartition); 9133 9134 push(tmp3); 9135 push(in_out1); 9136 push(in_out2); 9137 tmp4 = tmp3; 9138 tmp5 = in_out1; 9139 n_tmp6 = in_out2; 9140 9141 
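// Added commentary (not part of the original code comments): the 32-bit port does not
// have enough general purpose registers to give the recombination step dedicated
// scratch registers, so tmp3/in_out1/in_out2 are saved on the stack above and the same
// physical registers are re-used as tmp4/tmp5/n_tmp6 for the call below; the pops that
// follow restore the loop state. Schematically:
//
//   push(saved regs);          // preserve partition pointer/counter state
//   scratch = saved regs;      // alias the same registers as temporaries
//   crc32c_rec_alt2(...);      // free to clobber the scratch registers
//   pop(saved regs);           // partition state is intact again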
crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2, 9142 w_xtmp1, w_xtmp2, w_xtmp3, 9143 tmp4, tmp5, 9144 n_tmp6); 9145 9146 pop(in_out2); 9147 pop(in_out1); 9148 pop(tmp3); 9149 9150 addl(in_out2, 2 * size); 9151 subl(in_out1, 3 * size); 9152 jmp(L_processPartitions); 9153 9154 bind(L_exit); 9155 } 9156 #endif //LP64 9157 9158 #ifdef _LP64 9159 // Algorithm 2: Pipelined usage of the CRC32 instruction. 9160 // Input: A buffer I of L bytes. 9161 // Output: the CRC32C value of the buffer. 9162 // Notations: 9163 // Write L = 24N + r, with N = floor (L/24). 9164 // r = L mod 24 (0 <= r < 24). 9165 // Consider I as the concatenation of A|B|C|R, where A, B, C, each, 9166 // N quadwords, and R consists of r bytes. 9167 // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1 9168 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1 9169 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1 9170 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1 9171 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, 9172 Register tmp1, Register tmp2, Register tmp3, 9173 Register tmp4, Register tmp5, Register tmp6, 9174 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 9175 bool is_pclmulqdq_supported) { 9176 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; 9177 Label L_wordByWord; 9178 Label L_byteByByteProlog; 9179 Label L_byteByByte; 9180 Label L_exit; 9181 9182 if (is_pclmulqdq_supported ) { 9183 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::crc32c_table_addr(); 9184 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 1); 9185 9186 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 2); 9187 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 3); 9188 9189 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 4); 9190 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 5); 9191 assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\""); 9192 } else { 9193 const_or_pre_comp_const_index[0] = 1; 9194 const_or_pre_comp_const_index[1] = 0; 9195 9196 const_or_pre_comp_const_index[2] = 3; 9197 const_or_pre_comp_const_index[3] = 2; 9198 9199 const_or_pre_comp_const_index[4] = 5; 9200 const_or_pre_comp_const_index[5] = 4; 9201 } 9202 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, 9203 in2, in1, in_out, 9204 tmp1, tmp2, tmp3, 9205 w_xtmp1, w_xtmp2, w_xtmp3, 9206 tmp4, tmp5, 9207 tmp6); 9208 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported, 9209 in2, in1, in_out, 9210 tmp1, tmp2, tmp3, 9211 w_xtmp1, w_xtmp2, w_xtmp3, 9212 tmp4, tmp5, 9213 tmp6); 9214 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported, 9215 in2, in1, in_out, 9216 tmp1, tmp2, tmp3, 9217 w_xtmp1, w_xtmp2, w_xtmp3, 9218 tmp4, tmp5, 9219 tmp6); 9220 movl(tmp1, in2); 9221 andl(tmp1, 0x00000007); 9222 negl(tmp1); 9223 addl(tmp1, in2); 9224 addq(tmp1, in1); 9225 9226 cmpq(in1, tmp1); 9227 jccb(Assembler::greaterEqual, L_byteByByteProlog); 9228 align(16); 9229 BIND(L_wordByWord); 9230 crc32(in_out, Address(in1, 0), 8); 9231 addq(in1, 8); 9232 cmpq(in1, tmp1); 9233 
jcc(Assembler::less, L_wordByWord); 9234 9235 BIND(L_byteByByteProlog); 9236 andl(in2, 0x00000007); 9237 movl(tmp2, 1); 9238 9239 cmpl(tmp2, in2); 9240 jccb(Assembler::greater, L_exit); 9241 BIND(L_byteByByte); 9242 crc32(in_out, Address(in1, 0), 1); 9243 incq(in1); 9244 incl(tmp2); 9245 cmpl(tmp2, in2); 9246 jcc(Assembler::lessEqual, L_byteByByte); 9247 9248 BIND(L_exit); 9249 } 9250 #else 9251 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, 9252 Register tmp1, Register tmp2, Register tmp3, 9253 Register tmp4, Register tmp5, Register tmp6, 9254 XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, 9255 bool is_pclmulqdq_supported) { 9256 uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS]; 9257 Label L_wordByWord; 9258 Label L_byteByByteProlog; 9259 Label L_byteByByte; 9260 Label L_exit; 9261 9262 if (is_pclmulqdq_supported) { 9263 const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::crc32c_table_addr(); 9264 const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 1); 9265 9266 const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 2); 9267 const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 3); 9268 9269 const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 4); 9270 const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::crc32c_table_addr() + 5); 9271 } else { 9272 const_or_pre_comp_const_index[0] = 1; 9273 const_or_pre_comp_const_index[1] = 0; 9274 9275 const_or_pre_comp_const_index[2] = 3; 9276 const_or_pre_comp_const_index[3] = 2; 9277 9278 const_or_pre_comp_const_index[4] = 5; 9279 const_or_pre_comp_const_index[5] = 4; 9280 } 9281 crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported, 9282 in2, in1, in_out, 9283 tmp1, tmp2, tmp3, 9284 w_xtmp1, w_xtmp2, w_xtmp3, 9285 tmp4, tmp5, 9286 tmp6); 9287 crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported, 9288 in2, in1, in_out, 9289 tmp1, tmp2, tmp3, 9290 w_xtmp1, w_xtmp2, w_xtmp3, 9291 tmp4, tmp5, 9292 tmp6); 9293 crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported, 9294 in2, in1, in_out, 9295 tmp1, tmp2, tmp3, 9296 w_xtmp1, w_xtmp2, w_xtmp3, 9297 tmp4, tmp5, 9298 tmp6); 9299 movl(tmp1, in2); 9300 andl(tmp1, 0x00000007); 9301 negl(tmp1); 9302 addl(tmp1, in2); 9303 addl(tmp1, in1); 9304 9305 BIND(L_wordByWord); 9306 cmpl(in1, tmp1); 9307 jcc(Assembler::greaterEqual, L_byteByByteProlog); 9308 crc32(in_out, Address(in1,0), 4); 9309 addl(in1, 4); 9310 jmp(L_wordByWord); 9311 9312 BIND(L_byteByByteProlog); 9313 andl(in2, 0x00000007); 9314 movl(tmp2, 1); 9315 9316 BIND(L_byteByByte); 9317 cmpl(tmp2, in2); 9318 jccb(Assembler::greater, L_exit); 9319 movb(tmp1, Address(in1, 0)); 9320 crc32(in_out, tmp1, 1); 9321 incl(in1); 9322 incl(tmp2); 9323 jmp(L_byteByByte); 9324 9325 BIND(L_exit); 9326 } 9327 #endif // LP64 9328 #undef BIND 9329 #undef BLOCK_COMMENT 9330 9331 // Compress char[] array to byte[]. 9332 // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) 9333 // Return the array length if every element in array can be encoded, 9334 // otherwise, the index of first non-latin1 (> 0xff) character. 
9335 // @IntrinsicCandidate 9336 // public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) { 9337 // for (int i = 0; i < len; i++) { 9338 // char c = src[srcOff]; 9339 // if (c > 0xff) { 9340 // return i; // return index of non-latin1 char 9341 // } 9342 // dst[dstOff] = (byte)c; 9343 // srcOff++; 9344 // dstOff++; 9345 // } 9346 // return len; 9347 // } 9348 void MacroAssembler::char_array_compress(Register src, Register dst, Register len, 9349 XMMRegister tmp1Reg, XMMRegister tmp2Reg, 9350 XMMRegister tmp3Reg, XMMRegister tmp4Reg, 9351 Register tmp5, Register result, KRegister mask1, KRegister mask2) { 9352 Label copy_chars_loop, done, reset_sp, copy_tail; 9353 9354 // rsi: src 9355 // rdi: dst 9356 // rdx: len 9357 // rcx: tmp5 9358 // rax: result 9359 9360 // rsi holds start addr of source char[] to be compressed 9361 // rdi holds start addr of destination byte[] 9362 // rdx holds length 9363 9364 assert(len != result, ""); 9365 9366 // save length for return 9367 movl(result, len); 9368 9369 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 9370 VM_Version::supports_avx512vlbw() && 9371 VM_Version::supports_bmi2()) { 9372 9373 Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail; 9374 9375 // alignment 9376 Label post_alignment; 9377 9378 // if length of the string is less than 32, handle it the old fashioned way 9379 testl(len, -32); 9380 jcc(Assembler::zero, below_threshold); 9381 9382 // First check whether a character is compressible ( <= 0xFF). 9383 // Create mask to test for Unicode chars inside zmm vector 9384 movl(tmp5, 0x00FF); 9385 evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit); 9386 9387 testl(len, -64); 9388 jccb(Assembler::zero, post_alignment); 9389 9390 movl(tmp5, dst); 9391 andl(tmp5, (32 - 1)); 9392 negl(tmp5); 9393 andl(tmp5, (32 - 1)); 9394 9395 // bail out when there is nothing to be done 9396 testl(tmp5, 0xFFFFFFFF); 9397 jccb(Assembler::zero, post_alignment); 9398 9399 // ~(~0 << len), where len is the # of remaining elements to process 9400 movl(len, 0xFFFFFFFF); 9401 shlxl(len, len, tmp5); 9402 notl(len); 9403 kmovdl(mask2, len); 9404 movl(len, result); 9405 9406 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); 9407 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit); 9408 ktestd(mask1, mask2); 9409 jcc(Assembler::carryClear, copy_tail); 9410 9411 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); 9412 9413 addptr(src, tmp5); 9414 addptr(src, tmp5); 9415 addptr(dst, tmp5); 9416 subl(len, tmp5); 9417 9418 bind(post_alignment); 9419 // end of alignment 9420 9421 movl(tmp5, len); 9422 andl(tmp5, (32 - 1)); // tail count (in chars) 9423 andl(len, ~(32 - 1)); // vector count (in chars) 9424 jccb(Assembler::zero, copy_loop_tail); 9425 9426 lea(src, Address(src, len, Address::times_2)); 9427 lea(dst, Address(dst, len, Address::times_1)); 9428 negptr(len); 9429 9430 bind(copy_32_loop); 9431 evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit); 9432 evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); 9433 kortestdl(mask1, mask1); 9434 jccb(Assembler::carryClear, reset_for_copy_tail); 9435 9436 // All elements in current processed chunk are valid candidates for 9437 // compression. Write a truncated byte elements to the memory. 
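// Added note (illustration only): evpmovwb keeps the low byte of every 16-bit element,
// i.e. the "(byte) c" narrowing from the Java reference above; this is lossless here
// only because the evpcmpuw test just performed proved that every char in the chunk is
// <= 0xFF. Scalar equivalent of this one store, roughly:
//
//   for (int j = 0; j < 32; j++) {
//     dst[len + j] = (byte) src[len + j];   // each src[len + j] is known to be <= 0xFF
//   }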
9438 evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit); 9439 addptr(len, 32); 9440 jccb(Assembler::notZero, copy_32_loop); 9441 9442 bind(copy_loop_tail); 9443 // bail out when there is nothing to be done 9444 testl(tmp5, 0xFFFFFFFF); 9445 jcc(Assembler::zero, done); 9446 9447 movl(len, tmp5); 9448 9449 // ~(~0 << len), where len is the # of remaining elements to process 9450 movl(tmp5, 0xFFFFFFFF); 9451 shlxl(tmp5, tmp5, len); 9452 notl(tmp5); 9453 9454 kmovdl(mask2, tmp5); 9455 9456 evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); 9457 evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit); 9458 ktestd(mask1, mask2); 9459 jcc(Assembler::carryClear, copy_tail); 9460 9461 evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); 9462 jmp(done); 9463 9464 bind(reset_for_copy_tail); 9465 lea(src, Address(src, tmp5, Address::times_2)); 9466 lea(dst, Address(dst, tmp5, Address::times_1)); 9467 subptr(len, tmp5); 9468 jmp(copy_chars_loop); 9469 9470 bind(below_threshold); 9471 } 9472 9473 if (UseSSE42Intrinsics) { 9474 Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail; 9475 9476 // vectored compression 9477 testl(len, 0xfffffff8); 9478 jcc(Assembler::zero, copy_tail); 9479 9480 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors 9481 movdl(tmp1Reg, tmp5); 9482 pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg 9483 9484 andl(len, 0xfffffff0); 9485 jccb(Assembler::zero, copy_16); 9486 9487 // compress 16 chars per iter 9488 pxor(tmp4Reg, tmp4Reg); 9489 9490 lea(src, Address(src, len, Address::times_2)); 9491 lea(dst, Address(dst, len, Address::times_1)); 9492 negptr(len); 9493 9494 bind(copy_32_loop); 9495 movdqu(tmp2Reg, Address(src, len, Address::times_2)); // load 1st 8 characters 9496 por(tmp4Reg, tmp2Reg); 9497 movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters 9498 por(tmp4Reg, tmp3Reg); 9499 ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector 9500 jccb(Assembler::notZero, reset_for_copy_tail); 9501 packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte 9502 movdqu(Address(dst, len, Address::times_1), tmp2Reg); 9503 addptr(len, 16); 9504 jccb(Assembler::notZero, copy_32_loop); 9505 9506 // compress next vector of 8 chars (if any) 9507 bind(copy_16); 9508 // len = 0 9509 testl(result, 0x00000008); // check if there's a block of 8 chars to compress 9510 jccb(Assembler::zero, copy_tail_sse); 9511 9512 pxor(tmp3Reg, tmp3Reg); 9513 9514 movdqu(tmp2Reg, Address(src, 0)); 9515 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector 9516 jccb(Assembler::notZero, reset_for_copy_tail); 9517 packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte 9518 movq(Address(dst, 0), tmp2Reg); 9519 addptr(src, 16); 9520 addptr(dst, 8); 9521 jmpb(copy_tail_sse); 9522 9523 bind(reset_for_copy_tail); 9524 movl(tmp5, result); 9525 andl(tmp5, 0x0000000f); 9526 lea(src, Address(src, tmp5, Address::times_2)); 9527 lea(dst, Address(dst, tmp5, Address::times_1)); 9528 subptr(len, tmp5); 9529 jmpb(copy_chars_loop); 9530 9531 bind(copy_tail_sse); 9532 movl(len, result); 9533 andl(len, 0x00000007); // tail count (in chars) 9534 } 9535 // compress 1 char per iter 9536 bind(copy_tail); 9537 testl(len, len); 9538 jccb(Assembler::zero, done); 9539 lea(src, Address(src, len, Address::times_2)); 9540 lea(dst, Address(dst, len, Address::times_1)); 9541 negptr(len); 9542 9543 
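// Added commentary (sketch only, not from the original sources): the scalar tail below
// uses the usual negative-index idiom. src and dst were advanced past the end of the
// tail and len was negated, so a single increment(len) both advances the index and sets
// the zero flag that terminates the loop. Roughly, with hypothetical end-of-tail
// pointers src_end/dst_end:
//
//   for (long i = -tail_len; i != 0; i++) {
//     int c = src_end[i] & 0xffff;
//     if ((c & 0xff00) != 0) break;        // non-latin1 char: fall through to reset_sp
//     dst_end[i] = (byte) c;
//   }
//   // result += i: zero when every char was compressed, otherwise the negative offset
//   // of the offending char (see the comment before reset_sp below).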
bind(copy_chars_loop); 9544 load_unsigned_short(tmp5, Address(src, len, Address::times_2)); 9545 testl(tmp5, 0xff00); // check if Unicode char 9546 jccb(Assembler::notZero, reset_sp); 9547 movb(Address(dst, len, Address::times_1), tmp5); // ASCII char; compress to 1 byte 9548 increment(len); 9549 jccb(Assembler::notZero, copy_chars_loop); 9550 9551 // add len then return (len will be zero if compress succeeded, otherwise negative) 9552 bind(reset_sp); 9553 addl(result, len); 9554 9555 bind(done); 9556 } 9557 9558 // Inflate byte[] array to char[]. 9559 // ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java 9560 // @IntrinsicCandidate 9561 // private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) { 9562 // for (int i = 0; i < len; i++) { 9563 // dst[dstOff++] = (char)(src[srcOff++] & 0xff); 9564 // } 9565 // } 9566 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, 9567 XMMRegister tmp1, Register tmp2, KRegister mask) { 9568 Label copy_chars_loop, done, below_threshold, avx3_threshold; 9569 // rsi: src 9570 // rdi: dst 9571 // rdx: len 9572 // rcx: tmp2 9573 9574 // rsi holds start addr of source byte[] to be inflated 9575 // rdi holds start addr of destination char[] 9576 // rdx holds length 9577 assert_different_registers(src, dst, len, tmp2); 9578 movl(tmp2, len); 9579 if ((UseAVX > 2) && // AVX512 9580 VM_Version::supports_avx512vlbw() && 9581 VM_Version::supports_bmi2()) { 9582 9583 Label copy_32_loop, copy_tail; 9584 Register tmp3_aliased = len; 9585 9586 // if length of the string is less than 16, handle it in an old fashioned way 9587 testl(len, -16); 9588 jcc(Assembler::zero, below_threshold); 9589 9590 testl(len, -1 * AVX3Threshold); 9591 jcc(Assembler::zero, avx3_threshold); 9592 9593 // In order to use only one arithmetic operation for the main loop we use 9594 // this pre-calculation 9595 andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop 9596 andl(len, -32); // vector count 9597 jccb(Assembler::zero, copy_tail); 9598 9599 lea(src, Address(src, len, Address::times_1)); 9600 lea(dst, Address(dst, len, Address::times_2)); 9601 negptr(len); 9602 9603 9604 // inflate 32 chars per iter 9605 bind(copy_32_loop); 9606 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit); 9607 evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit); 9608 addptr(len, 32); 9609 jcc(Assembler::notZero, copy_32_loop); 9610 9611 bind(copy_tail); 9612 // bail out when there is nothing to be done 9613 testl(tmp2, -1); // we don't destroy the contents of tmp2 here 9614 jcc(Assembler::zero, done); 9615 9616 // ~(~0 << length), where length is the # of remaining elements to process 9617 movl(tmp3_aliased, -1); 9618 shlxl(tmp3_aliased, tmp3_aliased, tmp2); 9619 notl(tmp3_aliased); 9620 kmovdl(mask, tmp3_aliased); 9621 evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit); 9622 evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit); 9623 9624 jmp(done); 9625 bind(avx3_threshold); 9626 } 9627 if (UseSSE42Intrinsics) { 9628 Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail; 9629 9630 if (UseAVX > 1) { 9631 andl(tmp2, (16 - 1)); 9632 andl(len, -16); 9633 jccb(Assembler::zero, copy_new_tail); 9634 } else { 9635 andl(tmp2, 0x00000007); // tail count (in chars) 9636 andl(len, 0xfffffff8); // vector count (in chars) 9637 jccb(Assembler::zero, copy_tail); 9638 } 9639 9640 // vectored inflation 9641 lea(src, Address(src, len, 
Address::times_1)); 9642 lea(dst, Address(dst, len, Address::times_2)); 9643 negptr(len); 9644 9645 if (UseAVX > 1) { 9646 bind(copy_16_loop); 9647 vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit); 9648 vmovdqu(Address(dst, len, Address::times_2), tmp1); 9649 addptr(len, 16); 9650 jcc(Assembler::notZero, copy_16_loop); 9651 9652 bind(below_threshold); 9653 bind(copy_new_tail); 9654 movl(len, tmp2); 9655 andl(tmp2, 0x00000007); 9656 andl(len, 0xFFFFFFF8); 9657 jccb(Assembler::zero, copy_tail); 9658 9659 pmovzxbw(tmp1, Address(src, 0)); 9660 movdqu(Address(dst, 0), tmp1); 9661 addptr(src, 8); 9662 addptr(dst, 2 * 8); 9663 9664 jmp(copy_tail, true); 9665 } 9666 9667 // inflate 8 chars per iter 9668 bind(copy_8_loop); 9669 pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words 9670 movdqu(Address(dst, len, Address::times_2), tmp1); 9671 addptr(len, 8); 9672 jcc(Assembler::notZero, copy_8_loop); 9673 9674 bind(copy_tail); 9675 movl(len, tmp2); 9676 9677 cmpl(len, 4); 9678 jccb(Assembler::less, copy_bytes); 9679 9680 movdl(tmp1, Address(src, 0)); // load 4 byte chars 9681 pmovzxbw(tmp1, tmp1); 9682 movq(Address(dst, 0), tmp1); 9683 subptr(len, 4); 9684 addptr(src, 4); 9685 addptr(dst, 8); 9686 9687 bind(copy_bytes); 9688 } else { 9689 bind(below_threshold); 9690 } 9691 9692 testl(len, len); 9693 jccb(Assembler::zero, done); 9694 lea(src, Address(src, len, Address::times_1)); 9695 lea(dst, Address(dst, len, Address::times_2)); 9696 negptr(len); 9697 9698 // inflate 1 char per iter 9699 bind(copy_chars_loop); 9700 load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char 9701 movw(Address(dst, len, Address::times_2), tmp2); // inflate byte char to word 9702 increment(len); 9703 jcc(Assembler::notZero, copy_chars_loop); 9704 9705 bind(done); 9706 } 9707 9708 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, XMMRegister src, bool merge, int vector_len) { 9709 switch(type) { 9710 case T_BYTE: 9711 case T_BOOLEAN: 9712 evmovdqub(dst, kmask, src, merge, vector_len); 9713 break; 9714 case T_CHAR: 9715 case T_SHORT: 9716 evmovdquw(dst, kmask, src, merge, vector_len); 9717 break; 9718 case T_INT: 9719 case T_FLOAT: 9720 evmovdqul(dst, kmask, src, merge, vector_len); 9721 break; 9722 case T_LONG: 9723 case T_DOUBLE: 9724 evmovdquq(dst, kmask, src, merge, vector_len); 9725 break; 9726 default: 9727 fatal("Unexpected type argument %s", type2name(type)); 9728 break; 9729 } 9730 } 9731 9732 9733 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) { 9734 switch(type) { 9735 case T_BYTE: 9736 case T_BOOLEAN: 9737 evmovdqub(dst, kmask, src, merge, vector_len); 9738 break; 9739 case T_CHAR: 9740 case T_SHORT: 9741 evmovdquw(dst, kmask, src, merge, vector_len); 9742 break; 9743 case T_INT: 9744 case T_FLOAT: 9745 evmovdqul(dst, kmask, src, merge, vector_len); 9746 break; 9747 case T_LONG: 9748 case T_DOUBLE: 9749 evmovdquq(dst, kmask, src, merge, vector_len); 9750 break; 9751 default: 9752 fatal("Unexpected type argument %s", type2name(type)); 9753 break; 9754 } 9755 } 9756 9757 void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, bool merge, int vector_len) { 9758 switch(type) { 9759 case T_BYTE: 9760 case T_BOOLEAN: 9761 evmovdqub(dst, kmask, src, merge, vector_len); 9762 break; 9763 case T_CHAR: 9764 case T_SHORT: 9765 evmovdquw(dst, kmask, src, merge, vector_len); 9766 break; 9767 case T_INT: 9768 case 
T_FLOAT: 9769 evmovdqul(dst, kmask, src, merge, vector_len); 9770 break; 9771 case T_LONG: 9772 case T_DOUBLE: 9773 evmovdquq(dst, kmask, src, merge, vector_len); 9774 break; 9775 default: 9776 fatal("Unexpected type argument %s", type2name(type)); 9777 break; 9778 } 9779 } 9780 9781 void MacroAssembler::knot(uint masklen, KRegister dst, KRegister src, KRegister ktmp, Register rtmp) { 9782 switch(masklen) { 9783 case 2: 9784 knotbl(dst, src); 9785 movl(rtmp, 3); 9786 kmovbl(ktmp, rtmp); 9787 kandbl(dst, ktmp, dst); 9788 break; 9789 case 4: 9790 knotbl(dst, src); 9791 movl(rtmp, 15); 9792 kmovbl(ktmp, rtmp); 9793 kandbl(dst, ktmp, dst); 9794 break; 9795 case 8: 9796 knotbl(dst, src); 9797 break; 9798 case 16: 9799 knotwl(dst, src); 9800 break; 9801 case 32: 9802 knotdl(dst, src); 9803 break; 9804 case 64: 9805 knotql(dst, src); 9806 break; 9807 default: 9808 fatal("Unexpected vector length %d", masklen); 9809 break; 9810 } 9811 } 9812 9813 void MacroAssembler::kand(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 9814 switch(type) { 9815 case T_BOOLEAN: 9816 case T_BYTE: 9817 kandbl(dst, src1, src2); 9818 break; 9819 case T_CHAR: 9820 case T_SHORT: 9821 kandwl(dst, src1, src2); 9822 break; 9823 case T_INT: 9824 case T_FLOAT: 9825 kanddl(dst, src1, src2); 9826 break; 9827 case T_LONG: 9828 case T_DOUBLE: 9829 kandql(dst, src1, src2); 9830 break; 9831 default: 9832 fatal("Unexpected type argument %s", type2name(type)); 9833 break; 9834 } 9835 } 9836 9837 void MacroAssembler::kor(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 9838 switch(type) { 9839 case T_BOOLEAN: 9840 case T_BYTE: 9841 korbl(dst, src1, src2); 9842 break; 9843 case T_CHAR: 9844 case T_SHORT: 9845 korwl(dst, src1, src2); 9846 break; 9847 case T_INT: 9848 case T_FLOAT: 9849 kordl(dst, src1, src2); 9850 break; 9851 case T_LONG: 9852 case T_DOUBLE: 9853 korql(dst, src1, src2); 9854 break; 9855 default: 9856 fatal("Unexpected type argument %s", type2name(type)); 9857 break; 9858 } 9859 } 9860 9861 void MacroAssembler::kxor(BasicType type, KRegister dst, KRegister src1, KRegister src2) { 9862 switch(type) { 9863 case T_BOOLEAN: 9864 case T_BYTE: 9865 kxorbl(dst, src1, src2); 9866 break; 9867 case T_CHAR: 9868 case T_SHORT: 9869 kxorwl(dst, src1, src2); 9870 break; 9871 case T_INT: 9872 case T_FLOAT: 9873 kxordl(dst, src1, src2); 9874 break; 9875 case T_LONG: 9876 case T_DOUBLE: 9877 kxorql(dst, src1, src2); 9878 break; 9879 default: 9880 fatal("Unexpected type argument %s", type2name(type)); 9881 break; 9882 } 9883 } 9884 9885 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9886 switch(type) { 9887 case T_BOOLEAN: 9888 case T_BYTE: 9889 evpermb(dst, mask, nds, src, merge, vector_len); break; 9890 case T_CHAR: 9891 case T_SHORT: 9892 evpermw(dst, mask, nds, src, merge, vector_len); break; 9893 case T_INT: 9894 case T_FLOAT: 9895 evpermd(dst, mask, nds, src, merge, vector_len); break; 9896 case T_LONG: 9897 case T_DOUBLE: 9898 evpermq(dst, mask, nds, src, merge, vector_len); break; 9899 default: 9900 fatal("Unexpected type argument %s", type2name(type)); break; 9901 } 9902 } 9903 9904 void MacroAssembler::evperm(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9905 switch(type) { 9906 case T_BOOLEAN: 9907 case T_BYTE: 9908 evpermb(dst, mask, nds, src, merge, vector_len); break; 9909 case T_CHAR: 9910 case T_SHORT: 9911 evpermw(dst, mask, nds, src, 
merge, vector_len); break; 9912 case T_INT: 9913 case T_FLOAT: 9914 evpermd(dst, mask, nds, src, merge, vector_len); break; 9915 case T_LONG: 9916 case T_DOUBLE: 9917 evpermq(dst, mask, nds, src, merge, vector_len); break; 9918 default: 9919 fatal("Unexpected type argument %s", type2name(type)); break; 9920 } 9921 } 9922 9923 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9924 switch(type) { 9925 case T_BYTE: 9926 evpminub(dst, mask, nds, src, merge, vector_len); break; 9927 case T_SHORT: 9928 evpminuw(dst, mask, nds, src, merge, vector_len); break; 9929 case T_INT: 9930 evpminud(dst, mask, nds, src, merge, vector_len); break; 9931 case T_LONG: 9932 evpminuq(dst, mask, nds, src, merge, vector_len); break; 9933 default: 9934 fatal("Unexpected type argument %s", type2name(type)); break; 9935 } 9936 } 9937 9938 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9939 switch(type) { 9940 case T_BYTE: 9941 evpmaxub(dst, mask, nds, src, merge, vector_len); break; 9942 case T_SHORT: 9943 evpmaxuw(dst, mask, nds, src, merge, vector_len); break; 9944 case T_INT: 9945 evpmaxud(dst, mask, nds, src, merge, vector_len); break; 9946 case T_LONG: 9947 evpmaxuq(dst, mask, nds, src, merge, vector_len); break; 9948 default: 9949 fatal("Unexpected type argument %s", type2name(type)); break; 9950 } 9951 } 9952 9953 void MacroAssembler::evpminu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9954 switch(type) { 9955 case T_BYTE: 9956 evpminub(dst, mask, nds, src, merge, vector_len); break; 9957 case T_SHORT: 9958 evpminuw(dst, mask, nds, src, merge, vector_len); break; 9959 case T_INT: 9960 evpminud(dst, mask, nds, src, merge, vector_len); break; 9961 case T_LONG: 9962 evpminuq(dst, mask, nds, src, merge, vector_len); break; 9963 default: 9964 fatal("Unexpected type argument %s", type2name(type)); break; 9965 } 9966 } 9967 9968 void MacroAssembler::evpmaxu(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 9969 switch(type) { 9970 case T_BYTE: 9971 evpmaxub(dst, mask, nds, src, merge, vector_len); break; 9972 case T_SHORT: 9973 evpmaxuw(dst, mask, nds, src, merge, vector_len); break; 9974 case T_INT: 9975 evpmaxud(dst, mask, nds, src, merge, vector_len); break; 9976 case T_LONG: 9977 evpmaxuq(dst, mask, nds, src, merge, vector_len); break; 9978 default: 9979 fatal("Unexpected type argument %s", type2name(type)); break; 9980 } 9981 } 9982 9983 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9984 switch(type) { 9985 case T_BYTE: 9986 evpminsb(dst, mask, nds, src, merge, vector_len); break; 9987 case T_SHORT: 9988 evpminsw(dst, mask, nds, src, merge, vector_len); break; 9989 case T_INT: 9990 evpminsd(dst, mask, nds, src, merge, vector_len); break; 9991 case T_LONG: 9992 evpminsq(dst, mask, nds, src, merge, vector_len); break; 9993 default: 9994 fatal("Unexpected type argument %s", type2name(type)); break; 9995 } 9996 } 9997 9998 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 9999 switch(type) { 10000 case T_BYTE: 10001 evpmaxsb(dst, mask, nds, src, merge, vector_len); break; 10002 case T_SHORT: 10003 evpmaxsw(dst, mask, nds, src, merge, 
vector_len); break; 10004 case T_INT: 10005 evpmaxsd(dst, mask, nds, src, merge, vector_len); break; 10006 case T_LONG: 10007 evpmaxsq(dst, mask, nds, src, merge, vector_len); break; 10008 default: 10009 fatal("Unexpected type argument %s", type2name(type)); break; 10010 } 10011 } 10012 10013 void MacroAssembler::evpmins(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10014 switch(type) { 10015 case T_BYTE: 10016 evpminsb(dst, mask, nds, src, merge, vector_len); break; 10017 case T_SHORT: 10018 evpminsw(dst, mask, nds, src, merge, vector_len); break; 10019 case T_INT: 10020 evpminsd(dst, mask, nds, src, merge, vector_len); break; 10021 case T_LONG: 10022 evpminsq(dst, mask, nds, src, merge, vector_len); break; 10023 default: 10024 fatal("Unexpected type argument %s", type2name(type)); break; 10025 } 10026 } 10027 10028 void MacroAssembler::evpmaxs(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10029 switch(type) { 10030 case T_BYTE: 10031 evpmaxsb(dst, mask, nds, src, merge, vector_len); break; 10032 case T_SHORT: 10033 evpmaxsw(dst, mask, nds, src, merge, vector_len); break; 10034 case T_INT: 10035 evpmaxsd(dst, mask, nds, src, merge, vector_len); break; 10036 case T_LONG: 10037 evpmaxsq(dst, mask, nds, src, merge, vector_len); break; 10038 default: 10039 fatal("Unexpected type argument %s", type2name(type)); break; 10040 } 10041 } 10042 10043 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10044 switch(type) { 10045 case T_INT: 10046 evpxord(dst, mask, nds, src, merge, vector_len); break; 10047 case T_LONG: 10048 evpxorq(dst, mask, nds, src, merge, vector_len); break; 10049 default: 10050 fatal("Unexpected type argument %s", type2name(type)); break; 10051 } 10052 } 10053 10054 void MacroAssembler::evxor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 10055 switch(type) { 10056 case T_INT: 10057 evpxord(dst, mask, nds, src, merge, vector_len); break; 10058 case T_LONG: 10059 evpxorq(dst, mask, nds, src, merge, vector_len); break; 10060 default: 10061 fatal("Unexpected type argument %s", type2name(type)); break; 10062 } 10063 } 10064 10065 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10066 switch(type) { 10067 case T_INT: 10068 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break; 10069 case T_LONG: 10070 evporq(dst, mask, nds, src, merge, vector_len); break; 10071 default: 10072 fatal("Unexpected type argument %s", type2name(type)); break; 10073 } 10074 } 10075 10076 void MacroAssembler::evor(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 10077 switch(type) { 10078 case T_INT: 10079 Assembler::evpord(dst, mask, nds, src, merge, vector_len); break; 10080 case T_LONG: 10081 evporq(dst, mask, nds, src, merge, vector_len); break; 10082 default: 10083 fatal("Unexpected type argument %s", type2name(type)); break; 10084 } 10085 } 10086 10087 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { 10088 switch(type) { 10089 case T_INT: 10090 evpandd(dst, mask, nds, src, merge, vector_len); break; 10091 case T_LONG: 10092 evpandq(dst, mask, nds, src, merge, vector_len); 
break; 10093 default: 10094 fatal("Unexpected type argument %s", type2name(type)); break; 10095 } 10096 } 10097 10098 void MacroAssembler::evand(BasicType type, XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { 10099 switch(type) { 10100 case T_INT: 10101 evpandd(dst, mask, nds, src, merge, vector_len); break; 10102 case T_LONG: 10103 evpandq(dst, mask, nds, src, merge, vector_len); break; 10104 default: 10105 fatal("Unexpected type argument %s", type2name(type)); break; 10106 } 10107 } 10108 10109 void MacroAssembler::kortest(uint masklen, KRegister src1, KRegister src2) { 10110 switch(masklen) { 10111 case 8: 10112 kortestbl(src1, src2); 10113 break; 10114 case 16: 10115 kortestwl(src1, src2); 10116 break; 10117 case 32: 10118 kortestdl(src1, src2); 10119 break; 10120 case 64: 10121 kortestql(src1, src2); 10122 break; 10123 default: 10124 fatal("Unexpected mask length %d", masklen); 10125 break; 10126 } 10127 } 10128 10129 10130 void MacroAssembler::ktest(uint masklen, KRegister src1, KRegister src2) { 10131 switch(masklen) { 10132 case 8: 10133 ktestbl(src1, src2); 10134 break; 10135 case 16: 10136 ktestwl(src1, src2); 10137 break; 10138 case 32: 10139 ktestdl(src1, src2); 10140 break; 10141 case 64: 10142 ktestql(src1, src2); 10143 break; 10144 default: 10145 fatal("Unexpected mask length %d", masklen); 10146 break; 10147 } 10148 } 10149 10150 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) { 10151 switch(type) { 10152 case T_INT: 10153 evprold(dst, mask, src, shift, merge, vlen_enc); break; 10154 case T_LONG: 10155 evprolq(dst, mask, src, shift, merge, vlen_enc); break; 10156 default: 10157 fatal("Unexpected type argument %s", type2name(type)); break; 10158 break; 10159 } 10160 } 10161 10162 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc) { 10163 switch(type) { 10164 case T_INT: 10165 evprord(dst, mask, src, shift, merge, vlen_enc); break; 10166 case T_LONG: 10167 evprorq(dst, mask, src, shift, merge, vlen_enc); break; 10168 default: 10169 fatal("Unexpected type argument %s", type2name(type)); break; 10170 } 10171 } 10172 10173 void MacroAssembler::evrold(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 10174 switch(type) { 10175 case T_INT: 10176 evprolvd(dst, mask, src1, src2, merge, vlen_enc); break; 10177 case T_LONG: 10178 evprolvq(dst, mask, src1, src2, merge, vlen_enc); break; 10179 default: 10180 fatal("Unexpected type argument %s", type2name(type)); break; 10181 } 10182 } 10183 10184 void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc) { 10185 switch(type) { 10186 case T_INT: 10187 evprorvd(dst, mask, src1, src2, merge, vlen_enc); break; 10188 case T_LONG: 10189 evprorvq(dst, mask, src1, src2, merge, vlen_enc); break; 10190 default: 10191 fatal("Unexpected type argument %s", type2name(type)); break; 10192 } 10193 } 10194 10195 void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 10196 assert(rscratch != noreg || always_reachable(src), "missing"); 10197 10198 if (reachable(src)) { 10199 evpandq(dst, nds, as_Address(src), vector_len); 10200 } else { 10201 lea(rscratch, src); 10202 evpandq(dst, nds, Address(rscratch, 0), vector_len); 10203 } 10204 } 10205 10206 
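// Added note (illustration of the existing pattern, no new functionality): the
// AddressLiteral wrappers in this area all follow the same reachability idiom. The
// literal can be used as a direct RIP-relative memory operand only when it is reachable
// from the code being emitted; otherwise its address is first materialized into the
// caller-supplied scratch register:
//
//   assert(rscratch != noreg || always_reachable(src), "missing");
//   if (reachable(src)) {
//     op(dst, nds, as_Address(src), vector_len);       // direct, RIP-relative
//   } else {
//     lea(rscratch, src);                              // materialize the address
//     op(dst, nds, Address(rscratch, 0), vector_len);  // indirect through rscratch
//   }
//
// evpaddq, evporq, vpshufb, vpor and vpternlogq below are instances of this shape.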
void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) { 10207 assert(rscratch != noreg || always_reachable(src), "missing"); 10208 10209 if (reachable(src)) { 10210 Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len); 10211 } else { 10212 lea(rscratch, src); 10213 Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len); 10214 } 10215 } 10216 10217 void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 10218 assert(rscratch != noreg || always_reachable(src), "missing"); 10219 10220 if (reachable(src)) { 10221 evporq(dst, nds, as_Address(src), vector_len); 10222 } else { 10223 lea(rscratch, src); 10224 evporq(dst, nds, Address(rscratch, 0), vector_len); 10225 } 10226 } 10227 10228 void MacroAssembler::vpshufb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 10229 assert(rscratch != noreg || always_reachable(src), "missing"); 10230 10231 if (reachable(src)) { 10232 vpshufb(dst, nds, as_Address(src), vector_len); 10233 } else { 10234 lea(rscratch, src); 10235 vpshufb(dst, nds, Address(rscratch, 0), vector_len); 10236 } 10237 } 10238 10239 void MacroAssembler::vpor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 10240 assert(rscratch != noreg || always_reachable(src), "missing"); 10241 10242 if (reachable(src)) { 10243 Assembler::vpor(dst, nds, as_Address(src), vector_len); 10244 } else { 10245 lea(rscratch, src); 10246 Assembler::vpor(dst, nds, Address(rscratch, 0), vector_len); 10247 } 10248 } 10249 10250 void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) { 10251 assert(rscratch != noreg || always_reachable(src3), "missing"); 10252 10253 if (reachable(src3)) { 10254 vpternlogq(dst, imm8, src2, as_Address(src3), vector_len); 10255 } else { 10256 lea(rscratch, src3); 10257 vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len); 10258 } 10259 } 10260 10261 #if COMPILER2_OR_JVMCI 10262 10263 void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask, 10264 Register length, Register temp, int vec_enc) { 10265 // Computing mask for predicated vector store. 10266 movptr(temp, -1); 10267 bzhiq(temp, temp, length); 10268 kmov(mask, temp); 10269 evmovdqu(bt, mask, dst, xmm, true, vec_enc); 10270 } 10271 10272 // Set memory operation for length "less than" 64 bytes. 
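// Added commentary (sketch of the approach): lengths below 64 bytes are handled without
// a scalar tail loop. When 64-byte vectors are not in use, one unmasked 32-byte fill is
// emitted and the remainder goes through a masked 32-byte fill; with 64-byte vectors a
// single masked ZMM store suffices. The predicate comes from fill_masked() above, which
// builds a mask with the low `length` bits set (one bit per element):
//
//   temp = -1;                  // all ones
//   temp = bzhi(temp, length);  // keep only the low `length` bits
//   kmov(mask, temp);           // move the predicate into an opmask register
//   masked vector store of xmm  // elements beyond `length` are left untouched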
10273 void MacroAssembler::fill64_masked(uint shift, Register dst, int disp, 10274 XMMRegister xmm, KRegister mask, Register length, 10275 Register temp, bool use64byteVector) { 10276 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 10277 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 10278 if (!use64byteVector) { 10279 fill32(dst, disp, xmm); 10280 subptr(length, 32 >> shift); 10281 fill32_masked(shift, dst, disp + 32, xmm, mask, length, temp); 10282 } else { 10283 assert(MaxVectorSize == 64, "vector length != 64"); 10284 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_512bit); 10285 } 10286 } 10287 10288 10289 void MacroAssembler::fill32_masked(uint shift, Register dst, int disp, 10290 XMMRegister xmm, KRegister mask, Register length, 10291 Register temp) { 10292 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 10293 const BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; 10294 fill_masked(type[shift], Address(dst, disp), xmm, mask, length, temp, Assembler::AVX_256bit); 10295 } 10296 10297 10298 void MacroAssembler::fill32(Address dst, XMMRegister xmm) { 10299 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 10300 vmovdqu(dst, xmm); 10301 } 10302 10303 void MacroAssembler::fill32(Register dst, int disp, XMMRegister xmm) { 10304 fill32(Address(dst, disp), xmm); 10305 } 10306 10307 void MacroAssembler::fill64(Address dst, XMMRegister xmm, bool use64byteVector) { 10308 assert(MaxVectorSize >= 32, "vector length should be >= 32"); 10309 if (!use64byteVector) { 10310 fill32(dst, xmm); 10311 fill32(dst.plus_disp(32), xmm); 10312 } else { 10313 evmovdquq(dst, xmm, Assembler::AVX_512bit); 10314 } 10315 } 10316 10317 void MacroAssembler::fill64(Register dst, int disp, XMMRegister xmm, bool use64byteVector) { 10318 fill64(Address(dst, disp), xmm, use64byteVector); 10319 } 10320 10321 #ifdef _LP64 10322 void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value, 10323 Register count, Register rtmp, XMMRegister xtmp) { 10324 Label L_exit; 10325 Label L_fill_start; 10326 Label L_fill_64_bytes; 10327 Label L_fill_96_bytes; 10328 Label L_fill_128_bytes; 10329 Label L_fill_128_bytes_loop; 10330 Label L_fill_128_loop_header; 10331 Label L_fill_128_bytes_loop_header; 10332 Label L_fill_128_bytes_loop_pre_header; 10333 Label L_fill_zmm_sequence; 10334 10335 int shift = -1; 10336 int avx3threshold = VM_Version::avx3_threshold(); 10337 switch(type) { 10338 case T_BYTE: shift = 0; 10339 break; 10340 case T_SHORT: shift = 1; 10341 break; 10342 case T_INT: shift = 2; 10343 break; 10344 /* Uncomment when LONG fill stubs are supported. 
#ifdef _LP64
void MacroAssembler::generate_fill_avx3(BasicType type, Register to, Register value,
                                        Register count, Register rtmp, XMMRegister xtmp) {
  Label L_exit;
  Label L_fill_start;
  Label L_fill_64_bytes;
  Label L_fill_96_bytes;
  Label L_fill_128_bytes;
  Label L_fill_128_bytes_loop;
  Label L_fill_128_loop_header;
  Label L_fill_128_bytes_loop_header;
  Label L_fill_128_bytes_loop_pre_header;
  Label L_fill_zmm_sequence;

  int shift = -1;
  int avx3threshold = VM_Version::avx3_threshold();
  switch(type) {
    case T_BYTE:  shift = 0;
      break;
    case T_SHORT: shift = 1;
      break;
    case T_INT:   shift = 2;
      break;
    /* Uncomment when LONG fill stubs are supported.
    case T_LONG:  shift = 3;
      break;
    */
    default:
      fatal("Unhandled type: %s\n", type2name(type));
  }

  if ((avx3threshold != 0) || (MaxVectorSize == 32)) {

    if (MaxVectorSize == 64) {
      cmpq(count, avx3threshold >> shift);
      jcc(Assembler::greater, L_fill_zmm_sequence);
    }

    evpbroadcast(type, xtmp, value, Assembler::AVX_256bit);

    bind(L_fill_start);

    cmpq(count, 32 >> shift);
    jccb(Assembler::greater, L_fill_64_bytes);
    fill32_masked(shift, to, 0, xtmp, k2, count, rtmp);
    jmp(L_exit);

    bind(L_fill_64_bytes);
    cmpq(count, 64 >> shift);
    jccb(Assembler::greater, L_fill_96_bytes);
    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp);
    jmp(L_exit);

    bind(L_fill_96_bytes);
    cmpq(count, 96 >> shift);
    jccb(Assembler::greater, L_fill_128_bytes);
    fill64(to, 0, xtmp);
    subq(count, 64 >> shift);
    fill32_masked(shift, to, 64, xtmp, k2, count, rtmp);
    jmp(L_exit);

    bind(L_fill_128_bytes);
    cmpq(count, 128 >> shift);
    jccb(Assembler::greater, L_fill_128_bytes_loop_pre_header);
    fill64(to, 0, xtmp);
    fill32(to, 64, xtmp);
    subq(count, 96 >> shift);
    fill32_masked(shift, to, 96, xtmp, k2, count, rtmp);
    jmp(L_exit);

    bind(L_fill_128_bytes_loop_pre_header);
    {
      mov(rtmp, to);
      andq(rtmp, 31);
      jccb(Assembler::zero, L_fill_128_bytes_loop_header);
      negq(rtmp);
      addq(rtmp, 32);
      mov64(r8, -1L);
      bzhiq(r8, r8, rtmp);
      kmovql(k2, r8);
      evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_256bit);
      addq(to, rtmp);
      shrq(rtmp, shift);
      subq(count, rtmp);
    }

    cmpq(count, 128 >> shift);
    jcc(Assembler::less, L_fill_start);

    bind(L_fill_128_bytes_loop_header);
    subq(count, 128 >> shift);

    align32();
    bind(L_fill_128_bytes_loop);
    fill64(to, 0, xtmp);
    fill64(to, 64, xtmp);
    addq(to, 128);
    subq(count, 128 >> shift);
    jccb(Assembler::greaterEqual, L_fill_128_bytes_loop);

    addq(count, 128 >> shift);
    jcc(Assembler::zero, L_exit);
    jmp(L_fill_start);
  }
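  // Note on the 128-byte loop pre-header above (arithmetic derived from the
  // code): rtmp = 32 - (to & 31) is the byte distance to the next 32-byte
  // boundary; those head bytes are stored under a bzhi-generated byte mask,
  // `to` is advanced past them and `count` is reduced by rtmp >> shift
  // elements. For example, to == 0x...27 gives rtmp == 25, so 25 bytes are
  // written under mask and the main loop then operates on 32-byte aligned
  // 128-byte chunks. The ZMM sequence below applies the same idea at 64-byte
  // granularity.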
  if (MaxVectorSize == 64) {
    // Sequence using 64 byte ZMM register.
    Label L_fill_128_bytes_zmm;
    Label L_fill_192_bytes_zmm;
    Label L_fill_192_bytes_loop_zmm;
    Label L_fill_192_bytes_loop_header_zmm;
    Label L_fill_192_bytes_loop_pre_header_zmm;
    Label L_fill_start_zmm_sequence;

    bind(L_fill_zmm_sequence);
    evpbroadcast(type, xtmp, value, Assembler::AVX_512bit);

    bind(L_fill_start_zmm_sequence);
    cmpq(count, 64 >> shift);
    jccb(Assembler::greater, L_fill_128_bytes_zmm);
    fill64_masked(shift, to, 0, xtmp, k2, count, rtmp, true);
    jmp(L_exit);

    bind(L_fill_128_bytes_zmm);
    cmpq(count, 128 >> shift);
    jccb(Assembler::greater, L_fill_192_bytes_zmm);
    fill64(to, 0, xtmp, true);
    subq(count, 64 >> shift);
    fill64_masked(shift, to, 64, xtmp, k2, count, rtmp, true);
    jmp(L_exit);

    bind(L_fill_192_bytes_zmm);
    cmpq(count, 192 >> shift);
    jccb(Assembler::greater, L_fill_192_bytes_loop_pre_header_zmm);
    fill64(to, 0, xtmp, true);
    fill64(to, 64, xtmp, true);
    subq(count, 128 >> shift);
    fill64_masked(shift, to, 128, xtmp, k2, count, rtmp, true);
    jmp(L_exit);

    bind(L_fill_192_bytes_loop_pre_header_zmm);
    {
      movq(rtmp, to);
      andq(rtmp, 63);
      jccb(Assembler::zero, L_fill_192_bytes_loop_header_zmm);
      negq(rtmp);
      addq(rtmp, 64);
      mov64(r8, -1L);
      bzhiq(r8, r8, rtmp);
      kmovql(k2, r8);
      evmovdqu(T_BYTE, k2, Address(to, 0), xtmp, true, Assembler::AVX_512bit);
      addq(to, rtmp);
      shrq(rtmp, shift);
      subq(count, rtmp);
    }

    cmpq(count, 192 >> shift);
    jcc(Assembler::less, L_fill_start_zmm_sequence);

    bind(L_fill_192_bytes_loop_header_zmm);
    subq(count, 192 >> shift);

    align32();
    bind(L_fill_192_bytes_loop_zmm);
    fill64(to, 0, xtmp, true);
    fill64(to, 64, xtmp, true);
    fill64(to, 128, xtmp, true);
    addq(to, 192);
    subq(count, 192 >> shift);
    jccb(Assembler::greaterEqual, L_fill_192_bytes_loop_zmm);

    addq(count, 192 >> shift);
    jcc(Assembler::zero, L_exit);
    jmp(L_fill_start_zmm_sequence);
  }
  bind(L_exit);
}
#endif
#endif //COMPILER2_OR_JVMCI


#ifdef _LP64
void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
  Label done;
  cvttss2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
  Label done;
  cvttsd2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
  pop(dst);
  bind(done);
}
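// Note: an x86 cvtt* conversion that overflows or sees a NaN returns the
// "integer indefinite" value (0x80000000 for 32-bit results,
// 0x8000000000000000 for 64-bit results). The comparisons against that
// sentinel above, and in the f2l/d2l variants below, detect a potentially
// invalid result and call the fixup stubs, which re-examine the original
// operand saved on the stack and produce the JLS-mandated value (0 for NaN,
// the type's MIN/MAX on overflow).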
void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
  Label done;
  cvttss2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::round_float(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // The following code is a line-by-line assembly translation of the rounding algorithm.
  // Please refer to the java.lang.Math.round(float) algorithm for details.
  const int32_t FloatConsts_EXP_BIT_MASK = 0x7F800000;
  const int32_t FloatConsts_SIGNIFICAND_WIDTH = 24;
  const int32_t FloatConsts_EXP_BIAS = 127;
  const int32_t FloatConsts_SIGNIF_BIT_MASK = 0x007FFFFF;
  const int32_t MINUS_32 = 0xFFFFFFE0;
  Label L_special_case, L_block1, L_exit;
  movl(rtmp, FloatConsts_EXP_BIT_MASK);
  movdl(dst, src);
  andl(dst, rtmp);
  sarl(dst, FloatConsts_SIGNIFICAND_WIDTH - 1);
  movl(rtmp, FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS);
  subl(rtmp, dst);
  movl(rcx, rtmp);
  movl(dst, MINUS_32);
  testl(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  movdl(dst, src);
  andl(dst, FloatConsts_SIGNIF_BIT_MASK);
  orl(dst, FloatConsts_SIGNIF_BIT_MASK + 1);
  movdl(rtmp, src);
  testl(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negl(dst);
  bind(L_block1);
  sarl(dst);
  addl(dst, 0x1);
  sarl(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  convert_f2i(dst, src);
  bind(L_exit);
}
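// For reference, the shape of the algorithm being translated (a paraphrased
// C++ sketch of java.lang.Math.round(float), illustration only, not the
// authoritative JDK source; round_float_sketch is a hypothetical helper and
// the constant names mirror the ones used above):
//
//   int round_float_sketch(float a) {
//     int32_t bits  = std::bit_cast<int32_t>(a);    // Float.floatToRawIntBits(a)
//     int32_t shift = FloatConsts_SIGNIFICAND_WIDTH - 2 + FloatConsts_EXP_BIAS
//                     - ((bits & FloatConsts_EXP_BIT_MASK) >> (FloatConsts_SIGNIFICAND_WIDTH - 1));
//     if ((shift & -32) == 0) {                     // 0 <= shift < 32: round in integer math
//       int32_t r = (bits & FloatConsts_SIGNIF_BIT_MASK) | (FloatConsts_SIGNIF_BIT_MASK + 1);
//       if (bits < 0) r = -r;
//       return ((r >> shift) + 1) >> 1;             // shift, then round half up (sarl; addl; sarl)
//     }
//     return (int) a;                               // special case, handled by convert_f2i above
//   }
//
// round_double below is the 64-bit analog of the same translation.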
void MacroAssembler::round_double(Register dst, XMMRegister src, Register rtmp, Register rcx) {
  // The following code is a line-by-line assembly translation of the rounding algorithm.
  // Please refer to the java.lang.Math.round(double) algorithm for details.
  const int64_t DoubleConsts_EXP_BIT_MASK = 0x7FF0000000000000L;
  const int64_t DoubleConsts_SIGNIFICAND_WIDTH = 53;
  const int64_t DoubleConsts_EXP_BIAS = 1023;
  const int64_t DoubleConsts_SIGNIF_BIT_MASK = 0x000FFFFFFFFFFFFFL;
  const int64_t MINUS_64 = 0xFFFFFFFFFFFFFFC0L;
  Label L_special_case, L_block1, L_exit;
  mov64(rtmp, DoubleConsts_EXP_BIT_MASK);
  movq(dst, src);
  andq(dst, rtmp);
  sarq(dst, DoubleConsts_SIGNIFICAND_WIDTH - 1);
  mov64(rtmp, DoubleConsts_SIGNIFICAND_WIDTH - 2 + DoubleConsts_EXP_BIAS);
  subq(rtmp, dst);
  movq(rcx, rtmp);
  mov64(dst, MINUS_64);
  testq(rtmp, dst);
  jccb(Assembler::notEqual, L_special_case);
  movq(dst, src);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK);
  andq(dst, rtmp);
  mov64(rtmp, DoubleConsts_SIGNIF_BIT_MASK + 1);
  orq(dst, rtmp);
  movq(rtmp, src);
  testq(rtmp, rtmp);
  jccb(Assembler::greaterEqual, L_block1);
  negq(dst);
  bind(L_block1);
  sarq(dst);
  addq(dst, 0x1);
  sarq(dst, 0x1);
  jmp(L_exit);
  bind(L_special_case);
  convert_d2l(dst, src);
  bind(L_exit);
}

void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
  Label done;
  cvttsd2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::cache_wb(Address line)
{
  // 64 bit cpus always support clflush
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // prefer clwb (writeback without evict), otherwise
  // prefer clflushopt (potentially parallel writeback with evict),
  // otherwise fall back on clflush (serial writeback with evict)

  if (optimized) {
    if (no_evict) {
      clwb(line);
    } else {
      clflushopt(line);
    }
  } else {
    // no need for fence when using CLFLUSH
    clflush(line);
  }
}

void MacroAssembler::cache_wbsync(bool is_pre)
{
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // pick the correct implementation

  if (!is_pre && (optimized || no_evict)) {
    // need an sfence for post flush when using clflushopt or clwb,
    // otherwise no need for any synchronization

    sfence();
  }
}
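// Note on how these two are meant to pair (a usage sketch, not mandated by
// this file): clflush is ordered with respect to stores and other clflush
// executions, so it needs no extra fencing, whereas clflushopt and clwb are
// weakly ordered. A caller performing a bulk writeback therefore typically
// brackets a run of cache_wb(line) calls with cache_wbsync(true) before and
// cache_wbsync(false) after; only the post call emits the sfence, and only on
// hardware where clflushopt or clwb was used.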
#endif // _LP64

Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
  switch (cond) {
    // Note some conditions are synonyms for others
    case Assembler::zero:         return Assembler::notZero;
    case Assembler::notZero:      return Assembler::zero;
    case Assembler::less:         return Assembler::greaterEqual;
    case Assembler::lessEqual:    return Assembler::greater;
    case Assembler::greater:      return Assembler::lessEqual;
    case Assembler::greaterEqual: return Assembler::less;
    case Assembler::below:        return Assembler::aboveEqual;
    case Assembler::belowEqual:   return Assembler::above;
    case Assembler::above:        return Assembler::belowEqual;
    case Assembler::aboveEqual:   return Assembler::below;
    case Assembler::overflow:     return Assembler::noOverflow;
    case Assembler::noOverflow:   return Assembler::overflow;
    case Assembler::negative:     return Assembler::positive;
    case Assembler::positive:     return Assembler::negative;
    case Assembler::parity:       return Assembler::noParity;
    case Assembler::noParity:     return Assembler::parity;
  }
  ShouldNotReachHere(); return Assembler::overflow;
}

// This is simply a call to Thread::current()
void MacroAssembler::get_thread_slow(Register thread) {
  if (thread != rax) {
    push(rax);
  }
  push(rdi);
  push(rsi);
  push(rdx);
  push(rcx);
  push(r8);
  push(r9);
  push(r10);
  push(r11);

  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);

  pop(r11);
  pop(r10);
  pop(r9);
  pop(r8);
  pop(rcx);
  pop(rdx);
  pop(rsi);
  pop(rdi);
  if (thread != rax) {
    mov(thread, rax);
    pop(rax);
  }
}

void MacroAssembler::check_stack_alignment(Register sp, const char* msg, unsigned bias, Register tmp) {
  Label L_stack_ok;
  if (bias == 0) {
    testptr(sp, 2 * wordSize - 1);
  } else {
    // lea(tmp, Address(rsp, bias));
    mov(tmp, sp);
    addptr(tmp, bias);
    testptr(tmp, 2 * wordSize - 1);
  }
  jcc(Assembler::equal, L_stack_ok);
  block_comment(msg);
  stop(msg);
  bind(L_stack_ok);
}

// Implements lightweight-locking.
//
// obj: the object to be locked
// reg_rax: rax
// thread: the thread which attempts to lock obj
// tmp: a temporary register
void MacroAssembler::lightweight_lock(Register basic_lock, Register obj, Register reg_rax, Register tmp, Label& slow) {
  Register thread = r15_thread;

  assert(reg_rax == rax, "");
  assert_different_registers(basic_lock, obj, reg_rax, thread, tmp);

  Label push;
  const Register top = tmp;

  // Preload the markWord. It is important that this is the first
  // instruction emitted as it is part of C1's null check semantics.
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds.
    movptr(Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))), 0);
  }

  // Load top.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  // Check if the lock-stack is full.
  cmpl(top, LockStack::end_offset());
  jcc(Assembler::greaterEqual, slow);

  // Check for recursion.
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::equal, push);

  // Check header for monitor (0b10).
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, slow);

  // Try to lock. Transition lock bits 0b01 => 0b00
  movptr(tmp, reg_rax);
  andptr(tmp, ~(int32_t)markWord::unlocked_value);
  orptr(reg_rax, markWord::unlocked_value);
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::notEqual, slow);
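  // A note on the CAS above, derived from the markWord encoding used in this
  // file: the low two mark bits are 0b01 for an unlocked header, 0b00 for a
  // fast-locked header and 0b10 for an inflated monitor. tmp holds the
  // desired new value (header with the unlocked bit cleared) while rax holds
  // the compare value (header with the unlocked bit set), so the cmpxchg
  // succeeds only if the object is still unlocked with an otherwise unchanged
  // header; any interference branches to the slow path.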
  // Restore top, CAS clobbers register.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));

  bind(push);
  // After successful lock, push object on lock-stack.
  movptr(Address(thread, top), obj);
  incrementl(top, oopSize);
  movl(Address(thread, JavaThread::lock_stack_top_offset()), top);
}

// Implements lightweight-unlocking.
//
// obj: the object to be unlocked
// reg_rax: rax
// thread: the thread
// tmp: a temporary register
void MacroAssembler::lightweight_unlock(Register obj, Register reg_rax, Register tmp, Label& slow) {
  Register thread = r15_thread;

  assert(reg_rax == rax, "");
  assert_different_registers(obj, reg_rax, thread, tmp);

  Label unlocked, push_and_slow;
  const Register top = tmp;

  // Check if obj is top of lock-stack.
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  cmpptr(obj, Address(thread, top, Address::times_1, -oopSize));
  jcc(Assembler::notEqual, slow);

  // Pop lock-stack.
  DEBUG_ONLY(movptr(Address(thread, top, Address::times_1, -oopSize), 0);)
  subl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);

  // Check if recursive.
  cmpptr(obj, Address(thread, top, Address::times_1, -2 * oopSize));
  jcc(Assembler::equal, unlocked);

  // Not recursive. Check header for monitor (0b10).
  movptr(reg_rax, Address(obj, oopDesc::mark_offset_in_bytes()));
  testptr(reg_rax, markWord::monitor_value);
  jcc(Assembler::notZero, push_and_slow);

#ifdef ASSERT
  // Check header not unlocked (0b01).
  Label not_unlocked;
  testptr(reg_rax, markWord::unlocked_value);
  jcc(Assembler::zero, not_unlocked);
  stop("lightweight_unlock already unlocked");
  bind(not_unlocked);
#endif

  // Try to unlock. Transition lock bits 0b00 => 0b01
  movptr(tmp, reg_rax);
  orptr(tmp, markWord::unlocked_value);
  lock(); cmpxchgptr(tmp, Address(obj, oopDesc::mark_offset_in_bytes()));
  jcc(Assembler::equal, unlocked);

  bind(push_and_slow);
  // Restore lock-stack and handle the unlock in runtime.
#ifdef ASSERT
  movl(top, Address(thread, JavaThread::lock_stack_top_offset()));
  movptr(Address(thread, top), obj);
#endif
  addl(Address(thread, JavaThread::lock_stack_top_offset()), oopSize);
  jmp(slow);

  bind(unlocked);
}

#ifdef _LP64
// Saves legacy GPRs state on stack.
void MacroAssembler::save_legacy_gprs() {
  subq(rsp, 16 * wordSize);
  movq(Address(rsp, 15 * wordSize), rax);
  movq(Address(rsp, 14 * wordSize), rcx);
  movq(Address(rsp, 13 * wordSize), rdx);
  movq(Address(rsp, 12 * wordSize), rbx);
  movq(Address(rsp, 10 * wordSize), rbp);
  movq(Address(rsp, 9 * wordSize), rsi);
  movq(Address(rsp, 8 * wordSize), rdi);
  movq(Address(rsp, 7 * wordSize), r8);
  movq(Address(rsp, 6 * wordSize), r9);
  movq(Address(rsp, 5 * wordSize), r10);
  movq(Address(rsp, 4 * wordSize), r11);
  movq(Address(rsp, 3 * wordSize), r12);
  movq(Address(rsp, 2 * wordSize), r13);
  movq(Address(rsp, wordSize), r14);
  movq(Address(rsp, 0), r15);
}
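// Note: the frame above reserves 16 slots but writes only 15; the slot at
// 11 * wordSize is left empty. The offsets appear to follow 15 minus the
// register's hardware encoding (rax = 0 ... r15 = 15), and encoding 4 is rsp,
// which is not saved, so slot 11 stays unused. restore_legacy_gprs below reads
// the same offsets back.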
// Restores legacy GPRs state from the stack.
void MacroAssembler::restore_legacy_gprs() {
  movq(r15, Address(rsp, 0));
  movq(r14, Address(rsp, wordSize));
  movq(r13, Address(rsp, 2 * wordSize));
  movq(r12, Address(rsp, 3 * wordSize));
  movq(r11, Address(rsp, 4 * wordSize));
  movq(r10, Address(rsp, 5 * wordSize));
  movq(r9, Address(rsp, 6 * wordSize));
  movq(r8, Address(rsp, 7 * wordSize));
  movq(rdi, Address(rsp, 8 * wordSize));
  movq(rsi, Address(rsp, 9 * wordSize));
  movq(rbp, Address(rsp, 10 * wordSize));
  movq(rbx, Address(rsp, 12 * wordSize));
  movq(rdx, Address(rsp, 13 * wordSize));
  movq(rcx, Address(rsp, 14 * wordSize));
  movq(rax, Address(rsp, 15 * wordSize));
  addq(rsp, 16 * wordSize);
}

void MacroAssembler::setcc(Assembler::Condition comparison, Register dst) {
  if (VM_Version::supports_apx_f()) {
    esetzucc(comparison, dst);
  } else {
    setb(comparison, dst);
    movzbl(dst, dst);
  }
}
#endif